diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 1b8360a..c3afb32 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -37,13 +37,15 @@ class CouldNotRetrieveTranscript(Exception): class VideoUnavailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The video is no longer available' - + + class TooManyRequests(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\ - Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\ - Use a different IP address\n\ - Wait until the ban on your IP has been lifted") + class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video' diff --git a/youtube_transcript_api/formatters.py b/youtube_transcript_api/formatters.py index 6696215..1cc6e9d 100644 --- a/youtube_transcript_api/formatters.py +++ b/youtube_transcript_api/formatters.py @@ -1,5 +1,7 @@ import json +import pprint + class Formatter(object): """Formatter should be used as an abstract base class. @@ -22,6 +24,16 @@ class Formatter(object): 'their own .format() method.') +class PrettyPrintFormatter(Formatter): + def format(self, **kwargs): + """Pretty prints a transcript. + + :return: A pretty printed string representation of the transcript dict. + :rtype: str + """ + return pprint.pformat(self._transcript, **kwargs) + + class JSONFormatter(Formatter): def format(self, **kwargs): """Converts a transcript into a JSON string. 
@@ -72,12 +84,12 @@ class WebVTTFormatter(Formatter): """ lines = [] for i, line in enumerate(self._transcript): - if i < len(self._transcript)-1: + if i < len(self._transcript) - 1: # Looks ahead, use next start time since duration value # would create an overlap between start times. time_text = "{} --> {}".format( self._seconds_to_timestamp(line['start']), - self._seconds_to_timestamp(self._transcript[i+1]['start']) + self._seconds_to_timestamp(self._transcript[i + 1]['start']) ) else: # Reached the end, cannot look ahead, use duration now. @@ -89,3 +101,27 @@ class WebVTTFormatter(Formatter): lines.append("{}\n{}".format(time_text, line['text'])) return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" + + +class FormatterLoader(object): + TYPES = { + 'json': JSONFormatter, + 'pretty': PrettyPrintFormatter, + 'text': TextFormatter, + 'webvtt': WebVTTFormatter, + } + + class UnknownFormatterType(Exception): + def __init__(self, formatter_type): + super(FormatterLoader.UnknownFormatterType, self).__init__( + f'The format \'{formatter_type}\' is not supported. 
' + f'Choose one of the following formats: {", ".join(FormatterLoader.TYPES.keys())}' + ) + + def __init__(self, formatter_type='pretty'): + if formatter_type not in FormatterLoader.TYPES.keys(): + raise FormatterLoader.UnknownFormatterType(formatter_type) + self._formatter = FormatterLoader.TYPES[formatter_type] + + def load(self, transcript): + return self._formatter(transcript) diff --git a/youtube_transcript_api/test/test_formatters.py b/youtube_transcript_api/test/test_formatters.py index 9550845..bb0b274 100644 --- a/youtube_transcript_api/test/test_formatters.py +++ b/youtube_transcript_api/test/test_formatters.py @@ -1,12 +1,15 @@ -import json -from mock import MagicMock from unittest import TestCase +import json + +import pprint + from youtube_transcript_api.formatters import ( Formatter, JSONFormatter, TextFormatter, - WebVTTFormatter + WebVTTFormatter, + PrettyPrintFormatter, FormatterLoader ) @@ -35,6 +38,7 @@ class TestFormatters(TestCase): def test_webvtt_formatter_starting(self): content = WebVTTFormatter(self.transcript).format() lines = content.split('\n') + # test starting lines self.assertEqual(lines[0], "WEBVTT") self.assertEqual(lines[1], "") @@ -42,16 +46,40 @@ class TestFormatters(TestCase): def test_webvtt_formatter_ending(self): content = WebVTTFormatter(self.transcript).format() lines = content.split('\n') + # test ending lines self.assertEqual(lines[-2], self.transcript[-1]['text']) self.assertEqual(lines[-1], "") - + + def test_pretty_print_formatter(self): + content = PrettyPrintFormatter(self.transcript).format() + + self.assertEqual(content, pprint.pformat(self.transcript)) + def test_json_formatter(self): content = JSONFormatter(self.transcript).format() + self.assertEqual(json.loads(content), self.transcript) def test_text_formatter(self): content = TextFormatter(self.transcript).format() lines = content.split('\n') + self.assertEqual(lines[0], self.transcript[0]["text"]) self.assertEqual(lines[-1], self.transcript[-1]["text"]) + + 
def test_formatter_loader(self): + loader = FormatterLoader('json') + formatter = loader.load(self.transcript) + + self.assertTrue(isinstance(formatter, JSONFormatter)) + + def test_formatter_loader__default_formatter(self): + loader = FormatterLoader() + formatter = loader.load(self.transcript) + + self.assertTrue(isinstance(formatter, PrettyPrintFormatter)) + + def test_formatter_loader__unknown_format(self): + with self.assertRaises(FormatterLoader.UnknownFormatterType): + FormatterLoader('png')