Add new formatters.py module

2020-09-01 15:21:47 -07:00 · 2020-09-01 15:21:47 -07:00 · f3dc6f508f
parent 7a47fc83ad
commit f3dc6f508f
1 changed files with 91 additions and 0 deletions
--- a/youtube_transcript_api/formatters.py
+++ b/youtube_transcript_api/formatters.py
@ -0,0 +1,91 @@
 import json
 class Formatter(object):
    """Abstract base for every transcript formatter.

    Concrete formatters inherit from this class and override .format(),
    which should return a string. A transcript is represented by a list
    of dictionary items.

    :param transcript: list representing 1 or more transcripts
    :type transcript: list
    :raises TypeError: if `transcript` is not a list
    """

    def __init__(self, transcript):
        # Anything that is not a list (or list subclass) is rejected
        # before it gets stored on the instance.
        if isinstance(transcript, list):
            self._transcript = transcript
        else:
            raise TypeError("'transcript' must be of type: List")

    def format(self, **kwargs):
        """Placeholder that subclasses must override.

        :param kwargs: formatter-specific options; ignored here.
        :raises NotImplementedError: always.
        """
        message = (
            'A subclass of Formatter must implement '
            'their own .format() method.'
        )
        raise NotImplementedError(message)
 class JSONFormatter(Formatter):
    """Formatter that serializes the transcript with json.dumps."""

    def format(self, **kwargs):
        """Converts a transcript into a JSON string.

        :param kwargs: keyword arguments handed through unchanged to
            json.dumps.
        :return: A JSON string representation of the transcript.
        :rtype: str
        """
        transcript = self._transcript
        encoded = json.dumps(transcript, **kwargs)
        return encoded
 class TextFormatter(Formatter):
    """Formatter that keeps only the text of each transcript entry."""

    def format(self, **kwargs):
        """Converts a transcript into plain text with no timestamps.

        :param kwargs: accepted for interface compatibility; not used.
        :return: all transcript text lines separated by newline breaks.
        :rtype: str
        """
        # Collect the 'text' value of every entry, then glue them together.
        texts = [entry['text'] for entry in self._transcript]
        return "\n".join(texts)
 class WebVTTFormatter(Formatter):
    """Formatter that renders the transcript as a basic WebVTT document."""

    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a float representing time in seconds.
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.MS'
        :rtype: str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        >>> self._seconds_to_timestamp(3725.5)
        '01:02:05.500'
        """
        time = float(time)
        whole_seconds = int(time)
        # Peel off hours first, then split what is left into minutes and
        # seconds, so both fields stay within 0-59 as the timestamp
        # grammar requires (a plain `// 60` would yield e.g. '01:62:05').
        hours, remainder = divmod(whole_seconds, 3600)
        mins, secs = divmod(remainder, 60)
        # round(..., 2) absorbs float noise (929.9999999 -> 930.0) before
        # the value is truncated to whole milliseconds; the cap keeps the
        # field at three digits when the fraction rounds up to 1000.
        ms = min(int(round((time - whole_seconds) * 1000, 2)), 999)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format(self, **kwargs):
        """A basic implementation of WEBVTT formatting.

        Every cue ends where the following cue starts; only the final
        cue uses its own start + duration as the end time.

        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption

        :param kwargs: accepted for interface compatibility; not used.
        :return: the 'WEBVTT' header followed by one cue per transcript
            entry, cues separated by blank lines.
        :rtype: str
        """
        lines = []
        last_index = len(self._transcript) - 1
        for i, line in enumerate(self._transcript):
            if i < last_index:
                # Looks ahead, use next start time since duration value
                # would create an overlap between start times.
                end_time = self._transcript[i + 1]['start']
            else:
                # Reached the end, cannot look ahead, use duration now.
                end_time = line['start'] + line['duration']
            time_text = "{} --> {}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end_time)
            )
            lines.append("{}\n{}".format(time_text, line['text']))
        return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"