import json
import pprint


class Formatter:
    """Abstract base class for transcript formatters.

    Subclasses must implement their own ``.format()`` method, which should
    return a string.

    A transcript is represented by a list of dictionary items.

    :param transcript: list representing 1 or more transcripts
    :type transcript: list
    :raises TypeError: if ``transcript`` is not a list.
    """

    def __init__(self, transcript):
        if not isinstance(transcript, list):
            raise TypeError("'transcript' must be of type: List")

        self._transcript = transcript

    def format(self, **kwargs):
        """Return the transcript rendered as a string.

        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError(
            'A subclass of Formatter must implement '
            'their own .format() method.'
        )


class PrettyPrintFormatter(Formatter):
    def format(self, **kwargs):
        """Pretty print a transcript.

        :param kwargs: forwarded unchanged to :func:`pprint.pformat`.
        :return: A pretty printed string representation of the transcript.
        :rtype: str
        """
        return pprint.pformat(self._transcript, **kwargs)


class JSONFormatter(Formatter):
    def format(self, **kwargs):
        """Convert a transcript into a JSON string.

        :param kwargs: forwarded unchanged to :func:`json.dumps`.
        :return: A JSON string representation of the transcript.
        :rtype: str
        """
        return json.dumps(self._transcript, **kwargs)


class TextFormatter(Formatter):
    def format(self, **kwargs):
        """Convert a transcript into plain text with no timestamps.

        Reads the ``'text'`` key of every transcript item; ``kwargs`` are
        accepted for interface compatibility and ignored.

        :return: all transcript text lines separated by newline breaks.
        :rtype: str
        """
        return "\n".join(line['text'] for line in self._transcript)


class WebVTTFormatter(Formatter):
    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a float representing time in seconds.
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.MS'
        :rtype: str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        >>> self._seconds_to_timestamp(3661.5)
        '01:01:01.500'
        """
        # Work in whole milliseconds so every field is derived from a single
        # integer and carries propagate correctly (ms -> s -> min -> h).
        # Rounding to 2 decimals first absorbs float noise such as
        # 929.9999999999997 before truncating to an integer.
        total_ms = int(round(float(time) * 1000, 2))
        total_secs, ms = divmod(total_ms, 1000)
        total_mins, secs = divmod(total_secs, 60)
        # Minutes must wrap at 60; the remainder goes to the hours field.
        hours, mins = divmod(total_mins, 60)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format(self, **kwargs):
        """A basic implementation of WEBVTT formatting.

        Each item must provide ``'text'`` and ``'start'``; the final item
        must also provide ``'duration'``. ``kwargs`` are ignored.

        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        :return: the transcript as a WEBVTT document.
        :rtype: str
        """
        cues = []
        last_index = len(self._transcript) - 1
        for i, line in enumerate(self._transcript):
            if i < last_index:
                # Look ahead and end this cue where the next one starts;
                # using the duration value would make cues overlap.
                end = self._transcript[i + 1]['start']
            else:
                # Reached the end, cannot look ahead, use duration now.
                end = line['start'] + line['duration']
            cues.append("{} --> {}\n{}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end),
                line['text'],
            ))

        return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"


class FormatterLoader:
    """Look up a formatter class by its short name and instantiate it.

    :param formatter_type: one of the keys of ``FormatterLoader.TYPES``.
    :type formatter_type: str
    :raises FormatterLoader.UnknownFormatterType: if the name is unknown.
    """

    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
        # Historical misspelling, kept so existing callers keep working.
        'webvvt': WebVTTFormatter,
    }

    # Names still accepted but not advertised in error messages.
    _DEPRECATED_ALIASES = frozenset({'webvvt'})

    class UnknownFormatterType(Exception):
        """Raised when an unsupported formatter name is requested."""

        def __init__(self, formatter_type):
            supported = ", ".join(
                name for name in FormatterLoader.TYPES
                if name not in FormatterLoader._DEPRECATED_ALIASES
            )
            super().__init__(
                f'The format \'{formatter_type}\' is not supported. '
                f'Choose one of the following formats: {supported}'
            )

    def __init__(self, formatter_type='pretty'):
        if formatter_type not in FormatterLoader.TYPES:
            raise FormatterLoader.UnknownFormatterType(formatter_type)
        self._formatter = FormatterLoader.TYPES[formatter_type]

    def load(self, transcript):
        """Instantiate the selected formatter for ``transcript``.

        :param transcript: list representing 1 or more transcripts
        :type transcript: list
        :return: an instance of the selected formatter class.
        """
        return self._formatter(transcript)