Working With Video Intelligence API Text Tracking In Python



Working With Video Intelligence API Text Tracking In Python


You can use the Video Intelligence API to detect and track text in a video.

Copy the following code into your IPython session:

from google.cloud import videointelligence
from google.cloud.videointelligence import enums, types


def detect_text(video_uri, language_hints=None, segments=None):
    """Run TEXT_DETECTION on a GCS-hosted video and block until it finishes.

    Args:
        video_uri: `gs://` URI of the video to analyze.
        language_hints: optional list of language codes to bias detection
            (automatic language detection is used when omitted).
        segments: optional list of VideoSegment protos restricting which
            portions of the video are analyzed.

    Returns:
        The AnnotateVideoResponse produced by the long-running operation.
    """
    client = videointelligence.VideoIntelligenceServiceClient()
    text_config = types.TextDetectionConfig(language_hints=language_hints)
    context = types.VideoContext(
        segments=segments,
        text_detection_config=text_config,
    )

    print(f'Processing video "{video_uri}"...')
    operation = client.annotate_video(
        input_uri=video_uri,
        features=[enums.Feature.TEXT_DETECTION],
        video_context=context,
    )
    # Blocks until the server-side annotation job completes.
    return operation.result()

Take a moment to study the code and see how it uses the annotate_video client library method with the TEXT_DETECTION feature to analyze a video and detect text.

Call the function to analyze the video from seconds 13 to 27:

# Analyze only the 13s-27s portion of the video by passing one VideoSegment.
video_uri = 'gs://cloudmleap/video/next/JaneGoodall.mp4'
segment = types.VideoSegment()
segment.start_time_offset.FromSeconds(13)
segment.end_time_offset.FromSeconds(27)
response = detect_text(video_uri, segments=[segment])

Note: Automatic language detection is performed by default. language_hints can be specified if the languages to be detected are known in advance, which can improve the accuracy of the detection.

Wait a moment for the video to be processed:

Processing video "gs://cloudmleap/video/next/JaneGoodall.mp4"...

Add this function to print out detected text:

def print_video_text(response, min_frames=15):
    """Print each detected text with confidence, start time, and duration.

    Segments observed in fewer than `min_frames` frames are skipped, which
    filters out text that only flashes by briefly.
    """
    # Only one video was submitted, so only the first result is relevant.
    annotations = response.annotation_results[0].text_annotations
    sort_by_first_segment_start(annotations)

    print(f' Detected Text '.center(80, '-'))
    for annotation in annotations:
        # Keep only segments that persisted long enough on screen.
        lasting = (s for s in annotation.segments
                   if len(s.frames) >= min_frames)
        for seg in lasting:
            start = seg.segment.start_time_offset.ToTimedelta()
            print(annotation.text)
            print(f'  {seg.confidence:4.0%}',
                  f'{start} + {segment_seconds(seg.segment):.1f}s',
                  f'{len(seg.frames)} fr.',
                  sep=' | ')


def sort_by_first_segment_start(annotations):
    """Sort annotations in place by the start time of their first segment."""
    annotations.sort(
        key=lambda a: a.segments[0].segment.start_time_offset.ToTimedelta()
    )


def segment_seconds(segment):
    """Return the duration of a video segment as a float number of seconds."""
    duration = (segment.end_time_offset.ToTimedelta()
                - segment.start_time_offset.ToTimedelta())
    return duration.total_seconds()
Call the function:

# Print detections kept for at least 15 frames (the default threshold).
print_video_text(response)

You should see something like this:

-------------------------------- Detected Text ---------------------------------
GOMBE NATIONAL PARK
   99% | 0:00:15.760000 + 1.7s | 15 fr.
TANZANIA
  100% | 0:00:15.760000 + 4.8s | 39 fr.
Jane Goodall
   99% | 0:00:23.080000 + 3.8s | 33 fr.
With words and narration by
  100% | 0:00:23.200000 + 3.6s | 31 fr.
  
Add this function to print out the list of detected text frames and bounding boxes:
 
def print_text_frames(response, contained_text):
    """Print time offset and bounding-box corners for every frame of each
    annotation whose text contains `contained_text`."""
    # Vertex order: top-left, top-right, bottom-right, bottom-left
    def corner(vertex):
        return f'({vertex.x:.5f}, {vertex.y:.5f})'

    # First result only, as a single video is processed
    annotations = response.annotation_results[0].text_annotations
    for annotation in annotations:
        if contained_text not in annotation.text:
            continue
        print(f' {annotation.text} '.center(80, '-'))
        for text_segment in annotation.segments:
            for frame in text_segment.frames:
                box = frame.rotated_bounding_box
                print(f'{frame.time_offset.ToMilliseconds():>7,}',
                      corner(box.vertices[0]),
                      corner(box.vertices[2]),
                      sep=' | ')

Call the function to check which frames show the narrator's name:

# Show frame-level detail only for annotations containing 'Goodall'.
contained_text = 'Goodall'
print_text_frames(response, contained_text)

You should see something like this:

--------------------------------- Jane Goodall ---------------------------------
 23,080 | (0.39922, 0.49861) | (0.62752, 0.55888)
 23,200 | (0.38750, 0.49028) | (0.62692, 0.56306)
...
 26,800 | (0.36016, 0.49583) | (0.61094, 0.56048)
 26,920 | (0.45859, 0.49583) | (0.60365, 0.56174)
 
 

Comments