From dca4021dd74f90d57dd3659ff84a5381feb4c9f2 Mon Sep 17 00:00:00 2001
From: Chris Howell
Date: Wed, 8 Jul 2020 15:05:17 -0700
Subject: [PATCH] Add formatters module

Add formatters module that contains a few basic built-in formatters for
JSON, SRT, and an arbitrary plain TEXT. Mainly for examples of
implementing your own.

Module contains an abstract base class `TranscriptFormatter` so that
anyone can create their own TranscriptFormatter class.

Module contains at runtime a `formats` object, which is basically an
instance of a factory of classes (not sure if that is the correct term
to use). It is an instance that folks can add custom formatter classes
to, which the API can then make use of.
---
 youtube_transcript_api/formatters.py | 198 +++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100644 youtube_transcript_api/formatters.py

diff --git a/youtube_transcript_api/formatters.py b/youtube_transcript_api/formatters.py
new file mode 100644
index 0000000..a5f30cd
--- /dev/null
+++ b/youtube_transcript_api/formatters.py
@@ -0,0 +1,198 @@
+from abc import ABCMeta
+from abc import abstractclassmethod
+from abc import abstractmethod
+from collections import defaultdict
+import json
+import re
+
+from xml.etree import ElementTree
+
+from ._html_unescaping import unescape
+
+
+def parse_timecode(time):
+    """Converts a `time` into a formatted transcript timecode.
+
+    :param time: a float representing time in seconds.
+    :type time: float
+    :return: a string formatted as a timecode, 'HH:MM:SS,MS'
+    :rtype: str
+
+    :example:
+    >>> parse_timecode(6.93)
+    '00:00:06,930'
+    >>> parse_timecode(3661.5)
+    '01:01:01,500'
+    """
+    # Work in whole milliseconds so every field is derived from one
+    # integer: minutes wrap at 60 and a fraction that rounds up to a
+    # full second carries over instead of being rendered as '1000'.
+    # Rounding to 2 places first only removes binary float noise
+    # (e.g. 929.9999999999997 -> 930.0) before truncating.
+    total_ms = int(round(float(time) * 1000, 2))
+    total_secs, ms = divmod(total_ms, 1000)
+    total_mins, secs = divmod(total_secs, 60)
+    hours, mins = divmod(total_mins, 60)
+    return f"{hours:02d}:{mins:02d}:{secs:02d},{ms:03d}"
+
+
+class TranscriptFormatter(metaclass=ABCMeta):
+    """
+    Abstract Base TranscriptFormatter class
+
+    This class should be inherited from to create additional
+    custom transcript formatters.
+
+    """
+    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+
+    # `abc.abstractclassmethod` is deprecated; stacking `classmethod`
+    # on top of `abstractmethod` is the supported spelling.
+    @classmethod
+    @abstractmethod
+    def format(cls, transcript_data):
+        """Any subclass must implement this format class method.
+
+        :param transcript_data: a list of transcripts, 1 or more.
+        :type transcript_data: list[list[dict], list[dict]]
+        :return: A list where each item is an individual transcript
+            as a string.
+        :rtype: list[str]
+        """
+        pass
+
+
+class JSONTranscriptFormatter(TranscriptFormatter):
+    """Formatter for outputting JSON data"""
+    @classmethod
+    def format(cls, transcript_data):
+        """Serialize all transcripts into a single JSON document.
+
+        :param transcript_data: a list of transcripts, 1 or more.
+        :type transcript_data: list[list[dict]]
+        :return: a one-item list holding the JSON string, or an empty
+            list when `transcript_data` is empty.
+        :rtype: list[str]
+        """
+        return [json.dumps(transcript_data)] if transcript_data else []
+
+
+class TextTranscriptFormatter(TranscriptFormatter):
+    r"""Formatter for outputting a Plain Text Format
+
+    Converts the fetched transcript data into separated lines of
+    plain text separated by newline breaks (\n) with no timecodes.
+
+    """
+    @classmethod
+    def format(cls, transcript_data):
+        """Join the text of every line of every transcript.
+
+        :param transcript_data: a list of transcripts, 1 or more.
+        :type transcript_data: list[list[dict]]
+        :return: a one-item list holding the newline-joined text.
+        :rtype: list[str]
+        """
+        return ['\n'.join(line['text'] for transcript in transcript_data
+                          for line in transcript)]
+
+
+class SRTTranscriptFormatter(TranscriptFormatter):
+    """Formatter for outputting the SRT Format
+
+    Converts the fetched transcript data into a simple .srt file format.
+
+    """
+    @classmethod
+    def format(cls, transcript_data):
+        """Render every transcript as numbered SRT cues.
+
+        :param transcript_data: a list of transcripts, 1 or more.
+        :type transcript_data: list[list[dict]]
+        :return: a one-item list holding the SRT text; the rendered
+            transcripts are joined with two newlines and cue numbering
+            restarts at 1 for each transcript.
+        :rtype: list[str]
+        """
+        contents = []
+        for transcript in transcript_data:
+            content = []
+            for frame, item in enumerate(transcript, start=1):
+                start = float(item.get('start'))
+                # NOTE(review): fetched entries appear to carry the
+                # length under 'duration'; 'dur' is kept as a fallback
+                # so existing callers keep working -- confirm the key.
+                duration = float(
+                    item.get('duration', item.get('dur', '0.0')))
+
+                content.append(f"{frame}\n")
+                content.append(
+                    f"{parse_timecode(start)} --> "
+                    f"{parse_timecode(start + duration)}\n")
+                content.append(f"{item.get('text')}\n\n")
+
+            contents.append(''.join(content))
+        return ['\n\n'.join(contents)]
+
+
+class TranscriptFormatterFactory:
+    """A Transcript Class Factory
+
+    Allows for adding additional custom Transcript classes for the API
+    to use. Custom Transcript classes must inherit from the
+    TranscriptFormatter abstract base class.
+    """
+    def __init__(self):
+        # The default factory must hand back the class itself; passing
+        # the class directly would make defaultdict call it and store an
+        # instance for every unknown name.
+        self._formatters = defaultdict(lambda: JSONTranscriptFormatter)
+
+    def add_formatter(self, name, formatter_class):
+        """Allows for creating additional transcript formatters.
+
+        :param name: a name given to the `formatter_class`
+        :type name: str
+        :param formatter_class: a subclass of TranscriptFormatter
+        :type formatter_class: class
+        :raises TypeError: if `formatter_class` is not a subclass of
+            TranscriptFormatter.
+        :rtype: None
+        """
+        # Check for a class first so non-class values get the same error
+        # message instead of the one raised by issubclass() itself.
+        if not (isinstance(formatter_class, type)
+                and issubclass(formatter_class, TranscriptFormatter)):
+            raise TypeError(
+                f'{formatter_class} must be a subclass of TranscriptFormatter')
+        self._formatters[name] = formatter_class
+
+    def add_formatters(self, formatters_dict):
+        """Allow creation of multiple transcript formatters at a time.
+
+        :param formatters_dict: key(s) are the string name to be given
+            to the formatter class, value for each key should be a subclass
+            of TranscriptFormatter.
+        :type formatters_dict: dict
+        :rtype: None
+        """
+        for name, formatter_class in formatters_dict.items():
+            self.add_formatter(name, formatter_class)
+
+    def get_formatter(self, name):
+        """Retrieve a formatter class by its assigned name.
+
+        :param name: the string name given to the formatter class.
+        :type name: str
+        :return: a subclass of `TranscriptFormatter`; unknown names
+            fall back to `JSONTranscriptFormatter`.
+        """
+        return self._formatters[name]
+
+
+formats = TranscriptFormatterFactory()
+formats.add_formatters({
+    'json': JSONTranscriptFormatter,
+    'srt': SRTTranscriptFormatter,
+    'text': TextTranscriptFormatter
+})