Working With Video Intelligence API Text Tracking In Python
You can use the Video Intelligence API to detect and track text in a video.
Copy the following code into your IPython session:
from google.cloud import videointelligence
from google.cloud.videointelligence import enums, types


def detect_text(video_uri, language_hints=None, segments=None):
    """Request text detection on a video and return the finished result.

    Args:
        video_uri: URI of the video to analyze; passed to ``annotate_video``
            as ``input_uri``.
        language_hints: Optional language hints forwarded to
            ``TextDetectionConfig``. ``None`` leaves the field unset.
        segments: Optional list of ``VideoSegment`` objects forwarded to
            ``VideoContext``. ``None`` leaves the field unset.

    Returns:
        Whatever ``operation.result()`` returns for the ``annotate_video``
        request (callers read ``annotation_results`` from it).
    """
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [enums.Feature.TEXT_DETECTION]
    config = types.TextDetectionConfig(
        language_hints=language_hints,
    )
    context = types.VideoContext(
        segments=segments,
        text_detection_config=config,
    )

    print(f'Processing video "{video_uri}"...')
    operation = video_client.annotate_video(
        input_uri=video_uri,
        features=features,
        video_context=context,
    )

    # annotate_video returns an operation handle; result() waits for it.
    return operation.result()
Take a moment to study the code and see how it uses the annotate_video client library method with the TEXT_DETECTION feature to analyze a video and detect text.
Call the function to analyze the video from seconds 13 to 27:
video_uri = 'gs://cloudmleap/video/next/JaneGoodall.mp4'

# Restrict the analysis to the part of the video between seconds 13 and 27.
segment = types.VideoSegment()
segment.start_time_offset.FromSeconds(13)
segment.end_time_offset.FromSeconds(27)

response = detect_text(video_uri, segments=[segment])
Note: Automatic language detection is performed by default. language_hints can be specified if the languages to be detected are known in advance, which can improve the accuracy of the detection.
Wait a moment for the video to be processed:
Processing video "gs://cloudmleap/video/next/JaneGoodall.mp4"...
Add this function to print out detected text:
def print_video_text(response, min_frames=15):
    """Print each detected text with its confidence, start time and duration.

    Annotations are sorted in place by the start of their first segment
    before printing, so the order of ``text_annotations`` on ``response``
    changes as a side effect.

    Args:
        response: Object exposing ``annotation_results``; only the first
            result is read.
        min_frames: Segments with fewer frames than this are skipped.
    """
    # First result only, as a single video is processed
    annotations = response.annotation_results[0].text_annotations
    sort_by_first_segment_start(annotations)

    print(' Detected Text '.center(80, '-'))
    for annotation in annotations:
        for segment in annotation.segments:
            frames = len(segment.frames)
            if frames < min_frames:
                continue
            text = annotation.text
            confidence = segment.confidence
            start = segment.segment.start_time_offset.ToTimedelta()
            seconds = segment_seconds(segment.segment)
            print(text)
            print(
                f' {confidence:4.0%}',
                f'{start} + {seconds:.1f}s',
                f'{frames} fr.',
                sep=' | ',
            )


def sort_by_first_segment_start(annotations):
    """Sort ``annotations`` in place by the start time of their first segment."""

    def first_segment_start(annotation):
        return annotation.segments[0].segment.start_time_offset.ToTimedelta()

    annotations.sort(key=first_segment_start)


def segment_seconds(segment):
    """Return the length of ``segment`` in seconds (end offset minus start offset)."""
    t1 = segment.start_time_offset.ToTimedelta()
    t2 = segment.end_time_offset.ToTimedelta()
    return (t2 - t1).total_seconds()
Call the function:
# Print the detected text, keeping the default min_frames threshold.
print_video_text(response)
You should see something like this:
-------------------------------- Detected Text ---------------------------------
GOMBE NATIONAL PARK
99% | 0:00:15.760000 + 1.7s | 15 fr.
TANZANIA
100% | 0:00:15.760000 + 4.8s | 39 fr.
Jane Goodall
99% | 0:00:23.080000 + 3.8s | 33 fr.
With words and narration by
100% | 0:00:23.200000 + 3.6s | 31 fr.
Add this function to print out the list of detected text frames and bounding boxes:
def print_text_frames(response, contained_text):
    """Print the time offset and bounding-box corners of matching text frames.

    For every text annotation whose text contains ``contained_text``, prints
    a header with the text, then one line per frame with the frame's time
    offset in milliseconds and the top-left and bottom-right box vertices.

    Args:
        response: Object exposing ``annotation_results``; only the first
            result is read.
        contained_text: Substring an annotation's text must contain to be
            printed.
    """
    # Vertex order: top-left, top-right, bottom-right, bottom-left
    def box_top_left(box):
        tl = box.vertices[0]
        return f'({tl.x:.5f}, {tl.y:.5f})'

    def box_bottom_right(box):
        br = box.vertices[2]
        return f'({br.x:.5f}, {br.y:.5f})'

    # First result only, as a single video is processed
    annotations = response.annotation_results[0].text_annotations
    annotations = [a for a in annotations if contained_text in a.text]
    for annotation in annotations:
        print(f' {annotation.text} '.center(80, '-'))
        for text_segment in annotation.segments:
            for frame in text_segment.frames:
                frame_ms = frame.time_offset.ToMilliseconds()
                box = frame.rotated_bounding_box
                print(
                    f'{frame_ms:>7,}',
                    box_top_left(box),
                    box_bottom_right(box),
                    sep=' | ',
                )
Call the function to check which frames show the narrator's name:
# Only annotations whose text contains this substring are printed.
contained_text = 'Goodall'
print_text_frames(response, contained_text)
You should see something like this:
--------------------------------- Jane Goodall ---------------------------------
23,080 | (0.39922, 0.49861) | (0.62752, 0.55888)
23,200 | (0.38750, 0.49028) | (0.62692, 0.56306)
...
26,800 | (0.36016, 0.49583) | (0.61094, 0.56048)
26,920 | (0.45859, 0.49583) | (0.60365, 0.56174)
Comments
Post a Comment