Working With Video Intelligence API Text Tracking In Python

You can use the Video Intelligence API to detect and track text in a video.

Copy the following code into your IPython session:

from google.cloud import videointelligence
from google.cloud.videointelligence import enums, types

def detect_text(video_uri, language_hints=None, segments=None):
    """Request text detection on a video and return the operation's result.

    Args:
        video_uri: URI of the video, passed to ``annotate_video`` as
            ``input_uri``.
        language_hints: Optional language hints forwarded to
            ``TextDetectionConfig``.
        segments: Optional list of video segments forwarded to
            ``VideoContext``.

    Returns:
        Whatever ``operation.result()`` returns for the annotate request.
    """
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [enums.Feature.TEXT_DETECTION]
    config = types.TextDetectionConfig(
        language_hints=language_hints,
    )
    context = types.VideoContext(
        segments=segments,
        text_detection_config=config,
    )

    print(f'Processing video "{video_uri}"...')
    operation = video_client.annotate_video(
        input_uri=video_uri,
        features=features,
        video_context=context,
    )
    return operation.result()

Take a moment to study the code and see how it calls the annotate_video client library method with the TEXT_DETECTION feature to analyze a video and detect text.

Call the function to analyze the video from seconds 13 to 27:

# gs:// URI of the sample video to analyze.
video_uri = 'gs://cloudmleap/video/next/JaneGoodall.mp4'
# Restrict the analysis to the window from second 13 to second 27.
segment = types.VideoSegment()
segment.start_time_offset.FromSeconds(13)
segment.end_time_offset.FromSeconds(27)
# Keep the response; the printing helpers below take it as input.
response = detect_text(video_uri, segments=[segment])

Note: Automatic language detection is performed by default. language_hints can be specified if the languages to be detected are known in advance, which can improve the accuracy of the detection.

Wait a moment for the video to be processed:

Processing video "gs://cloudmleap/video/next/JaneGoodall.mp4"...

Add this function to print out detected text:

def print_video_text(response, min_frames=15):
    """Print every detected text with confidence, start, duration and frames.

    Annotations are first sorted in place by the start of their first
    segment. Segments with fewer than ``min_frames`` frames are skipped.

    Args:
        response: Annotation response; only ``annotation_results[0]`` is read.
        min_frames: Minimum number of frames a segment needs to be printed.
    """
    # First result only, as a single video is processed
    annotations = response.annotation_results[0].text_annotations
    sort_by_first_segment_start(annotations)

    print(' Detected Text '.center(80, '-'))
    for annotation in annotations:
        for segment in annotation.segments:
            frames = len(segment.frames)
            if frames < min_frames:
                continue
            text = annotation.text
            confidence = segment.confidence
            start = segment.segment.start_time_offset.ToTimedelta()
            seconds = segment_seconds(segment.segment)
            print(text)
            print(f' {confidence:4.0%}',
                  f'{start} + {seconds:.1f}s',
                  f'{frames} fr.',
                  sep=' | ')

def sort_by_first_segment_start(annotations):
    """Sort ``annotations`` in place by the start time of their first segment.

    Each annotation must have at least one entry in ``segments``; the key is
    ``segments[0].segment.start_time_offset`` converted to a timedelta.
    """
    def first_segment_start(annotation):
        return annotation.segments[0].segment.start_time_offset.ToTimedelta()

    annotations.sort(key=first_segment_start)

def segment_seconds(segment):
    """Return the length of ``segment`` in seconds as a float.

    The length is ``end_time_offset - start_time_offset``, with both offsets
    converted through their ``ToTimedelta()`` method.
    """
    t1 = segment.start_time_offset.ToTimedelta()
    t2 = segment.end_time_offset.ToTimedelta()
    return (t2 - t1).total_seconds()

Call the function:

# Print the detected text using the default min_frames threshold.
print_video_text(response)

You should see something like this:

-------------------------------- Detected Text ---------------------------------

GOMBE NATIONAL PARK

99% | 0:00:15.760000 + 1.7s | 15 fr.

TANZANIA

100% | 0:00:15.760000 + 4.8s | 39 fr.

Jane Goodall

99% | 0:00:23.080000 + 3.8s | 33 fr.

With words and narration by

100% | 0:00:23.200000 + 3.6s | 31 fr.

Add this function to print out the list of detected text frames and bounding boxes:

def print_text_frames(response, contained_text):
    """Print frame offsets and bounding-box corners for matching text.

    For every text annotation whose text contains ``contained_text``, a
    centered header line is printed, followed by one line per frame with the
    frame's time offset in milliseconds and the top-left and bottom-right
    vertices of its rotated bounding box.

    Args:
        response: Annotation response; only ``annotation_results[0]`` is read.
        contained_text: Substring an annotation's text must contain.
    """
    # Vertex order: top-left, top-right, bottom-right, bottom-left
    def box_top_left(box):
        tl = box.vertices[0]
        return f'({tl.x:.5f}, {tl.y:.5f})'

    def box_bottom_right(box):
        br = box.vertices[2]
        return f'({br.x:.5f}, {br.y:.5f})'

    # First result only, as a single video is processed
    annotations = response.annotation_results[0].text_annotations
    annotations = [a for a in annotations if contained_text in a.text]
    for annotation in annotations:
        print(f' {annotation.text} '.center(80, '-'))
        for text_segment in annotation.segments:
            for frame in text_segment.frames:
                frame_ms = frame.time_offset.ToMilliseconds()
                box = frame.rotated_bounding_box
                print(f'{frame_ms:>7,}',
                      box_top_left(box),
                      box_bottom_right(box),
                      sep=' | ')

Call the function to check which frames show the narrator's name:

# Only annotations whose text contains this substring are listed.
contained_text = 'Goodall'
print_text_frames(response, contained_text)

You should see something like this:

--------------------------------- Jane Goodall ---------------------------------

23,080 | (0.39922, 0.49861) | (0.62752, 0.55888)

23,200 | (0.38750, 0.49028) | (0.62692, 0.56306)

...

26,800 | (0.36016, 0.49583) | (0.61094, 0.56048)

26,920 | (0.45859, 0.49583) | (0.60365, 0.56174)

Mr & Mrs Tamilan - Social Blog and Tamil Technical Information Site

Search This Blog

Working With Video Intelligence API Text Tracking In Python

Comments

Post a Comment