added FormatterLoader

2021-03-15 17:16:15 +01:00 · 2021-03-15 17:16:15 +01:00 · d314139329
parent 71268dfad9
commit d314139329
3 changed files with 73 additions and 7 deletions
--- a/youtube_transcript_api/_errors.py
+++ b/youtube_transcript_api/_errors.py
@ -38,12 +38,14 @@ class CouldNotRetrieveTranscript(Exception):
 class VideoUnavailable(CouldNotRetrieveTranscript):
    # Error type for a video that is no longer available.
    # CAUSE_MESSAGE is presumably read by CouldNotRetrieveTranscript when it
    # builds the final error text -- confirm against the base class.
    CAUSE_MESSAGE = 'The video is no longer available'
 class TooManyRequests(CouldNotRetrieveTranscript):
    # Error type for the case where YouTube demands a captcha because of too
    # many requests from the current IP; the message lists the workarounds.
    # NOTE: the message is ONE string literal continued with trailing
    # backslashes, so the leading whitespace of every continuation line is
    # part of the message text. Do not re-indent these lines.
    CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\
    - Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
    - Use a different IP address\n\
    - Wait until the ban on your IP has been lifted")
 class TranscriptsDisabled(CouldNotRetrieveTranscript):
    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
--- a/youtube_transcript_api/formatters.py
+++ b/youtube_transcript_api/formatters.py
@ -1,5 +1,7 @@
 import json
 import pprint
 class Formatter(object):
    """Formatter should be used as an abstract base class.
@ -22,6 +24,16 @@ class Formatter(object):
            'their own .format() method.')
 class PrettyPrintFormatter(Formatter):
    def format(self, **kwargs):
        """Render the stored transcript as a pretty printed string.
        :param kwargs: Keyword arguments forwarded unchanged to
            ``pprint.pformat``.
        :return: A pretty printed string representation of the transcript.
        :rtype: str
        """
        pretty_text = pprint.pformat(self._transcript, **kwargs)
        return pretty_text
 class JSONFormatter(Formatter):
    def format(self, **kwargs):
        """Converts a transcript into a JSON string.
@ -72,12 +84,12 @@ class WebVTTFormatter(Formatter):
        """
        lines = []
        for i, line in enumerate(self._transcript):
-            if i < len(self._transcript)-1:
+            if i < len(self._transcript) - 1:
                # Looks ahead, use next start time since duration value
                # would create an overlap between start times.
                time_text = "{} --> {}".format(
                    self._seconds_to_timestamp(line['start']),
-                    self._seconds_to_timestamp(self._transcript[i+1]['start'])
+                    self._seconds_to_timestamp(self._transcript[i + 1]['start'])
                )
            else:
                # Reached the end, cannot look ahead, use duration now.
@ -89,3 +101,27 @@ class WebVTTFormatter(Formatter):
            lines.append("{}\n{}".format(time_text, line['text']))
        return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"
 class FormatterLoader(object):
    """Resolve a format name to a formatter class and build formatters.
    The loader is created with the name of an output format (one of the
    keys of ``TYPES``) and afterwards turns transcripts into instances of
    the matching formatter class via :meth:`load`.
    """
    # Public format names mapped to the formatter class implementing them.
    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
    }
    # The WebVTT key was originally misspelled as 'webvvt'. The misspelling
    # is still accepted so existing callers keep working, but it is kept out
    # of TYPES so it is not advertised in error messages.
    _LEGACY_ALIASES = {
        'webvvt': 'webvtt',
    }
    class UnknownFormatterType(Exception):
        """Raised when a format name has no registered formatter class."""
        def __init__(self, formatter_type):
            # str.format instead of f-strings: the module otherwise sticks to
            # syntax that also parses on interpreters without f-string
            # support (explicit ``object`` bases, two-argument ``super``).
            super(FormatterLoader.UnknownFormatterType, self).__init__(
                'The format \'{formatter_type}\' is not supported. '
                'Choose one of the following formats: {supported_types}'.format(
                    formatter_type=formatter_type,
                    supported_types=', '.join(FormatterLoader.TYPES),
                )
            )
    def __init__(self, formatter_type='pretty'):
        """Select the formatter class for ``formatter_type``.
        :param formatter_type: Name of the output format; must be a key of
            ``FormatterLoader.TYPES`` (the legacy spelling 'webvvt' is also
            accepted). Defaults to 'pretty'.
        :raises FormatterLoader.UnknownFormatterType: If the name is not a
            supported format.
        """
        resolved_type = FormatterLoader._LEGACY_ALIASES.get(
            formatter_type, formatter_type
        )
        if resolved_type not in FormatterLoader.TYPES:
            raise FormatterLoader.UnknownFormatterType(formatter_type)
        self._formatter = FormatterLoader.TYPES[resolved_type]
    def load(self, transcript):
        """Instantiate the selected formatter class for ``transcript``.
        :param transcript: The transcript handed to the formatter class.
        :return: A new instance of the formatter class chosen in ``__init__``.
        """
        return self._formatter(transcript)
--- a/youtube_transcript_api/test/test_formatters.py
+++ b/youtube_transcript_api/test/test_formatters.py
@ -1,12 +1,15 @@
 import json
 from mock import MagicMock
 from unittest import TestCase
 import json
 import pprint
 from youtube_transcript_api.formatters import (
    Formatter,
    JSONFormatter,
    TextFormatter,
-    WebVTTFormatter
+    WebVTTFormatter,
    PrettyPrintFormatter, FormatterLoader
 )
@ -35,6 +38,7 @@ class TestFormatters(TestCase):
    def test_webvtt_formatter_starting(self):
        """The WebVTT output opens with the "WEBVTT" header and an empty line."""
        rendered = WebVTTFormatter(self.transcript).format()
        output_lines = rendered.split('\n')
        # The first two lines are the header and the separator after it.
        header_line = output_lines[0]
        self.assertEqual(header_line, "WEBVTT")
        separator_line = output_lines[1]
        self.assertEqual(separator_line, "")
@ -42,16 +46,40 @@ class TestFormatters(TestCase):
    def test_webvtt_formatter_ending(self):
        """The WebVTT output ends with the last cue text and a trailing newline."""
        rendered = WebVTTFormatter(self.transcript).format()
        output_lines = rendered.split('\n')
        # The trailing newline leaves an empty final element after the split,
        # so the last cue text sits in the second-to-last position.
        last_cue_text = self.transcript[-1]['text']
        self.assertEqual(output_lines[-2], last_cue_text)
        self.assertEqual(output_lines[-1], "")
    def test_pretty_print_formatter(self):
        """PrettyPrintFormatter output equals pprint.pformat of the transcript."""
        formatter = PrettyPrintFormatter(self.transcript)
        rendered = formatter.format()
        expected = pprint.pformat(self.transcript)
        self.assertEqual(rendered, expected)
    def test_json_formatter(self):
        """JSONFormatter output parses back into the original transcript."""
        formatter = JSONFormatter(self.transcript)
        decoded = json.loads(formatter.format())
        self.assertEqual(decoded, self.transcript)
    def test_text_formatter(self):
        """TextFormatter output starts and ends with the first/last cue text."""
        rendered = TextFormatter(self.transcript).format()
        output_lines = rendered.split('\n')
        first_cue, last_cue = self.transcript[0], self.transcript[-1]
        self.assertEqual(output_lines[0], first_cue["text"])
        self.assertEqual(output_lines[-1], last_cue["text"])
    def test_formatter_loader(self):
        """A loader created for 'json' builds JSONFormatter instances."""
        loader = FormatterLoader('json')
        formatter = loader.load(self.transcript)
        # assertIsInstance reports the actual type on failure, whereas
        # assertTrue(isinstance(...)) only says "False is not true".
        self.assertIsInstance(formatter, JSONFormatter)
    def test_formatter_loader__default_formatter(self):
        """A loader created without arguments builds PrettyPrintFormatter instances."""
        loader = FormatterLoader()
        formatter = loader.load(self.transcript)
        # assertIsInstance reports the actual type on failure, whereas
        # assertTrue(isinstance(...)) only says "False is not true".
        self.assertIsInstance(formatter, PrettyPrintFormatter)
    def test_formatter_loader__unknown_format(self):
        """Creating a loader for an unsupported format name raises UnknownFormatterType."""
        expected_error = FormatterLoader.UnknownFormatterType
        self.assertRaises(expected_error, FormatterLoader, 'png')