commit
						297fe03752
					
				
							
								
								
									
										78
									
								
								README.md
								
								
								
								
							
							
						
						
									
										78
									
								
								README.md
								
								
								
								
							|  | @ -1,7 +1,7 @@ | |||
| 
 | ||||
| # YouTube Transcript/Subtitle API (including automatically generated subtitles and subtitle translations)   | ||||
| 
 | ||||
| [](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) [](https://travis-ci.com/jdepoix/youtube-transcript-api) [](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) [](http://opensource.org/licenses/MIT) [](https://pypi.org/project/youtube-transcript-api/) [](https://pypi.org/project/youtube-transcript-api/) | ||||
| [](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) [](https://travis-ci.com/jdepoix/youtube-transcript-api) [](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) [](http://opensource.org/licenses/MIT) [](https://pypi.org/project/youtube-transcript-api/) [](https://pypi.org/project/youtube-transcript-api/) | ||||
| 
 | ||||
| This is a python API which allows you to get the transcript/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do! | ||||
| 
 | ||||
|  | @ -123,6 +123,8 @@ print(translated_transcript.fetch()) | |||
| 
 | ||||
| ### By example | ||||
| ```python | ||||
| from youtube_transcript_api import YouTubeTranscriptApi | ||||
| 
 | ||||
| # retrieve the available transcripts | ||||
| transcript_list = YouTubeTranscriptApi.list_transcripts('video_id') | ||||
| 
 | ||||
|  | @ -158,6 +160,78 @@ transcript = transcript_list.find_manually_created_transcript(['de', 'en']) | |||
| transcript = transcript_list.find_generated_transcript(['de', 'en']) | ||||
| ``` | ||||
| 
 | ||||
| ### Using Formatters | ||||
| Formatters are meant to be an additional layer of processing of the transcript you pass them. The goal is to convert the transcript from its Python data type into a consistent string of a given "format", such as basic text (`.txt`) or formats that have a defined specification such as JSON (`.json`), WebVTT (`.vtt`), comma-separated values (`.csv`), etc. | ||||
| 
 | ||||
| The `formatters` submodule provides a few basic formatters to wrap around your transcript data in cases where you might want to output a specific format and then write that format to a file, for example to back up/store it and run another script against it at a later time. | ||||
| 
 | ||||
| We provided a few subclasses of formatters to use: | ||||
| 
 | ||||
| - JSONFormatter | ||||
| - PrettyPrintFormatter | ||||
| - TextFormatter | ||||
| - WebVTTFormatter (a basic implementation) | ||||
| 
 | ||||
| Here is how to import from the `formatters` module. | ||||
| 
 | ||||
| ```python | ||||
| # the base class to inherit from when creating your own formatter. | ||||
| from youtube_transcript_api.formatters import Formatter | ||||
| 
 | ||||
| # some provided subclasses, each outputs a different string format. | ||||
| from youtube_transcript_api.formatters import JSONFormatter | ||||
| from youtube_transcript_api.formatters import TextFormatter | ||||
| from youtube_transcript_api.formatters import WebVTTFormatter | ||||
| ``` | ||||
| 
 | ||||
| ### Provided Formatter Example | ||||
| Let's say we wanted to retrieve a transcript and write it to a JSON file in the same format in which the API returned it. That would look something like this: | ||||
| 
 | ||||
| ```python | ||||
| # your_custom_script.py | ||||
| 
 | ||||
| from youtube_transcript_api import YouTubeTranscriptApi | ||||
| from youtube_transcript_api.formatters import JSONFormatter | ||||
| 
 | ||||
| # Must be a single transcript. | ||||
| transcript = YouTubeTranscriptApi.get_transcript(video_id) | ||||
| 
 | ||||
| formatter = JSONFormatter() | ||||
| 
 | ||||
| # .format_transcript(transcript) turns the transcript into a JSON string. | ||||
| json_formatted = formatter.format_transcript(transcript) | ||||
| 
 | ||||
| 
 | ||||
| # Now we can write it out to a file. | ||||
| with open('your_filename.json', 'w', encoding='utf-8') as json_file: | ||||
|     json_file.write(json_formatted) | ||||
| 
 | ||||
| # Now you should have a new JSON file that you can easily read back into Python. | ||||
| ``` | ||||
| 
 | ||||
| **Passing extra keyword arguments** | ||||
| 
 | ||||
| Since JSONFormatter leverages `json.dumps()` you can also forward keyword arguments into `.format_transcript(transcript)` such as making your file output prettier by forwarding the `indent=2` keyword argument. | ||||
| 
 | ||||
| ```python | ||||
| json_formatted = JSONFormatter().format_transcript(transcript, indent=2) | ||||
| ``` | ||||
| 
 | ||||
| ### Custom Formatter Example | ||||
| You can implement your own formatter class. Just inherit from the `Formatter` base class and ensure you implement the `format_transcript(self, transcript, **kwargs)` and `format_transcripts(self, transcripts, **kwargs)` methods which should ultimately return a string when called on your formatter instance. | ||||
| 
 | ||||
| ```python | ||||
| 
 | ||||
| class MyCustomFormatter(Formatter): | ||||
|     def format_transcript(self, transcript, **kwargs): | ||||
|         # Do your custom work in here, but return a string. | ||||
|         return 'your processed output data as a string.' | ||||
| 
 | ||||
|     def format_transcripts(self, transcripts, **kwargs): | ||||
|         # Do your custom work in here to format a list of transcripts, but return a string. | ||||
|         return 'your processed output data as a string.' | ||||
| ``` | ||||
| 
 | ||||
| ## CLI | ||||
| 
 | ||||
| Execute the CLI script using the video ids as parameters and the results will be printed out to the command line:   | ||||
|  | @ -182,7 +256,7 @@ youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en | |||
| If you would prefer to write it into a file or pipe it into another application, you can also output the results as json using the following line:   | ||||
| 
 | ||||
| ```   | ||||
| youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json   | ||||
| youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --format json > transcripts.json | ||||
| ```   | ||||
| 
 | ||||
| Translating transcripts using the CLI is also possible: | ||||
|  |  | |||
|  | @ -1,11 +1,9 @@ | |||
| import json | ||||
| 
 | ||||
| import pprint | ||||
| 
 | ||||
| import argparse | ||||
| 
 | ||||
| from ._api import YouTubeTranscriptApi | ||||
| 
 | ||||
| from .formatters import FormatterLoader | ||||
| 
 | ||||
| 
 | ||||
| class YouTubeTranscriptCli(object): | ||||
|     def __init__(self, args): | ||||
|  | @ -34,7 +32,7 @@ class YouTubeTranscriptCli(object): | |||
| 
 | ||||
|         return '\n\n'.join( | ||||
|             [str(exception) for exception in exceptions] | ||||
|             + ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else []) | ||||
|             + ([FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)] if transcripts else []) | ||||
|         ) | ||||
| 
 | ||||
|     def _fetch_transcript(self, parsed_args, proxies, cookies, video_id): | ||||
|  | @ -98,11 +96,10 @@ class YouTubeTranscriptCli(object): | |||
|             help='If this flag is set transcripts which have been manually created will not be retrieved.', | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             '--json', | ||||
|             action='store_const', | ||||
|             const=True, | ||||
|             default=False, | ||||
|             help='If this flag is set the output will be JSON formatted.', | ||||
|             '--format', | ||||
|             type=str, | ||||
|             default='pretty', | ||||
|             choices=tuple(FormatterLoader.TYPES.keys()), | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             '--translate', | ||||
|  |  | |||
|  | @ -38,12 +38,14 @@ class CouldNotRetrieveTranscript(Exception): | |||
| class VideoUnavailable(CouldNotRetrieveTranscript): | ||||
|     CAUSE_MESSAGE = 'The video is no longer available' | ||||
| 
 | ||||
| 
 | ||||
| class TooManyRequests(CouldNotRetrieveTranscript): | ||||
|     CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\ | ||||
|     - Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\ | ||||
|     - Use a different IP address\n\ | ||||
|     - Wait until the ban on your IP has been lifted") | ||||
| 
 | ||||
| 
 | ||||
| class TranscriptsDisabled(CouldNotRetrieveTranscript): | ||||
|     CAUSE_MESSAGE = 'Subtitles are disabled for this video' | ||||
| 
 | ||||
|  |  | |||
|  | @ -0,0 +1,168 @@ | |||
| import json | ||||
| 
 | ||||
| import pprint | ||||
| 
 | ||||
| 
 | ||||
class Formatter(object):
    """Abstract base class for transcript formatters.

    Subclasses must override both ``format_transcript`` (a single
    transcript) and ``format_transcripts`` (a list of transcripts); each
    should return a string. A transcript is represented by a list of
    dictionary items.
    """

    def format_transcript(self, transcript, **kwargs):
        """Format a single transcript; must be overridden by subclasses.

        :param transcript: the transcript to format.
        :param kwargs: formatter-specific options.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            'A subclass of Formatter must implement '
            'their own .format_transcript() method.'
        )

    def format_transcripts(self, transcripts, **kwargs):
        """Format a list of transcripts; must be overridden by subclasses.

        :param transcripts: the list of transcripts to format.
        :param kwargs: formatter-specific options.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            'A subclass of Formatter must implement '
            'their own .format_transcripts() method.'
        )
| 
 | ||||
| 
 | ||||
class PrettyPrintFormatter(Formatter):
    """Formatter that renders transcript data via ``pprint.pformat``."""

    def format_transcript(self, transcript, **kwargs):
        """Render a single transcript as a pretty-printed string.

        :param transcript: the transcript to render.
        :param kwargs: forwarded unchanged to ``pprint.pformat``.
        :return: a pretty-printed string representation of the transcript.
        :rtype: str
        """
        rendered = pprint.pformat(transcript, **kwargs)
        return rendered

    def format_transcripts(self, transcripts, **kwargs):
        """Render a list of transcripts as a pretty-printed string.

        The whole list is handed to ``format_transcript`` as one object.

        :param transcripts: the list of transcripts to render.
        :param kwargs: forwarded unchanged to ``format_transcript``.
        :return: a pretty-printed string representation of the transcripts.
        :rtype: str
        """
        rendered = self.format_transcript(transcripts, **kwargs)
        return rendered
| 
 | ||||
| 
 | ||||
class JSONFormatter(Formatter):
    """Formatter that serializes transcript data via ``json.dumps``."""

    def format_transcript(self, transcript, **kwargs):
        """Serialize a single transcript to a JSON string.

        :param transcript: the transcript to serialize.
        :param kwargs: forwarded unchanged to ``json.dumps``
            (for example ``indent=2``).
        :return: a JSON string representation of the transcript.
        :rtype: str
        """
        serialized = json.dumps(transcript, **kwargs)
        return serialized

    def format_transcripts(self, transcripts, **kwargs):
        """Serialize a list of transcripts to a JSON string.

        The whole list is handed to ``format_transcript`` as one object.

        :param transcripts: the list of transcripts to serialize.
        :param kwargs: forwarded unchanged to ``format_transcript``.
        :return: a JSON string representation of the transcripts.
        :rtype: str
        """
        serialized = self.format_transcript(transcripts, **kwargs)
        return serialized
| 
 | ||||
| 
 | ||||
class TextFormatter(Formatter):
    """Formatter that emits only the text of each transcript line."""

    def format_transcript(self, transcript, **kwargs):
        """Convert a transcript into plain text with no timestamps.

        :param transcript: the transcript; each item must have a 'text' key.
        :param kwargs: accepted for interface compatibility; not used here.
        :return: all transcript text lines separated by newline breaks.
        :rtype: str
        """
        text_lines = [entry['text'] for entry in transcript]
        return '\n'.join(text_lines)

    def format_transcripts(self, transcripts, **kwargs):
        """Convert a list of transcripts into plain text with no timestamps.

        :param transcripts: the list of transcripts to convert.
        :param kwargs: forwarded unchanged to ``format_transcript``.
        :return: each transcript's text, with transcripts separated from one
            another by three newline characters.
        :rtype: str
        """
        rendered = []
        for single in transcripts:
            rendered.append(self.format_transcript(single, **kwargs))
        return '\n\n\n'.join(rendered)
| 
 | ||||
| 
 | ||||
class WebVTTFormatter(Formatter):
    """Formatter producing a basic WebVTT document from transcript data."""

    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a time in seconds (anything ``float()`` accepts).
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
        :rtype: str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        >>> self._seconds_to_timestamp(3661.5)
        '01:01:01.500'
        """
        # Work in whole milliseconds so that rounding carries into the
        # seconds/minutes/hours fields instead of producing a 4-digit
        # millisecond part (e.g. 1.9999999 s -> '00:00:02.000').
        total_ms = int(round(float(time) * 1000))
        total_secs, ms = divmod(total_ms, 1000)
        total_mins, secs = divmod(total_secs, 60)
        # Minutes must wrap at 60; the overflow belongs to the hours field.
        hours, mins = divmod(total_mins, 60)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format_transcript(self, transcript, **kwargs):
        """A basic implementation of WEBVTT formatting.

        :param transcript: list of items with 'text', 'start' and
            'duration' keys.
        :param kwargs: accepted for interface compatibility; not used here.
        :return: a 'WEBVTT' header followed by one cue per transcript item.
        :rtype: str
        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        """
        cues = []
        last_index = len(transcript) - 1
        for i, line in enumerate(transcript):
            if i < last_index:
                # Look ahead and end this cue where the next one starts;
                # using the duration would make consecutive cues overlap.
                end = transcript[i + 1]['start']
            else:
                # Last cue: nothing to look ahead to, fall back to duration.
                end = line['start'] + line['duration']
            cues.append("{} --> {}\n{}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end),
                line['text'],
            ))

        return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"

    def format_transcripts(self, transcripts, **kwargs):
        """A basic implementation of WEBVTT formatting for a list of transcripts.

        Each transcript is formatted on its own (including its own 'WEBVTT'
        header) and the results are joined with three newline characters.

        :param transcripts: the list of transcripts to format.
        :param kwargs: forwarded unchanged to ``format_transcript``.
        :rtype: str
        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        """
        return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
| 
 | ||||
| 
 | ||||
class FormatterLoader(object):
    """Looks up and instantiates a formatter class by its format name."""

    # Maps each supported format name to the formatter class implementing it.
    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
        # Misspelled key that was shipped originally; kept as an alias so
        # callers already passing 'webvvt' keep working.
        'webvvt': WebVTTFormatter,
    }

    class UnknownFormatterType(Exception):
        """Raised when a format name is not a key of ``FormatterLoader.TYPES``."""

        def __init__(self, formatter_type):
            super(FormatterLoader.UnknownFormatterType, self).__init__(
                'The format \'{formatter_type}\' is not supported. '
                'Choose one of the following formats: {supported_formatter_types}'.format(
                    formatter_type=formatter_type,
                    supported_formatter_types=', '.join(FormatterLoader.TYPES.keys()),
                )
            )

    def load(self, formatter_type='pretty'):
        """
        Loads the Formatter for the given formatter type.

        :param formatter_type: one of the keys of ``FormatterLoader.TYPES``;
            defaults to 'pretty'.
        :return: Formatter object (a new instance of the matching class)
        :raises FormatterLoader.UnknownFormatterType: if `formatter_type`
            is not a supported format name.
        """
        if formatter_type not in FormatterLoader.TYPES:
            raise FormatterLoader.UnknownFormatterType(formatter_type)
        return FormatterLoader.TYPES[formatter_type]()
|  | @ -25,50 +25,52 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|         YouTubeTranscriptApi.list_transcripts = MagicMock(return_value=self.transcript_list_mock) | ||||
| 
 | ||||
|     def test_argument_parsing(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args() | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --format json --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args() | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --format json'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args() | ||||
|         parsed_args = YouTubeTranscriptCli(' --format json v1 v2 --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() | ||||
|             'v1 v2 --languages de en --format json ' | ||||
|             '--http-proxy http://user:pass@domain:port ' | ||||
|             '--https-proxy https://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split() | ||||
|             'v1 v2 --languages de en --format json --http-proxy http://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split() | ||||
|             'v1 v2 --languages de en --format json --https-proxy https://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|  | @ -76,34 +78,34 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|     def test_argument_parsing__only_video_ids(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.format, 'pretty') | ||||
|         self.assertEqual(parsed_args.languages, ['en']) | ||||
| 
 | ||||
|     def test_argument_parsing__video_ids_starting_with_dash(self): | ||||
|         parsed_args = YouTubeTranscriptCli('\-v1 \-\-v2 \--v3'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['-v1', '--v2', '--v3']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.format, 'pretty') | ||||
|         self.assertEqual(parsed_args.languages, ['en']) | ||||
| 
 | ||||
|     def test_argument_parsing__fail_without_video_ids(self): | ||||
|         with self.assertRaises(SystemExit): | ||||
|             YouTubeTranscriptCli('--json'.split())._parse_args() | ||||
|             YouTubeTranscriptCli('--format json'.split())._parse_args() | ||||
| 
 | ||||
|     def test_argument_parsing__json(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args() | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --format json'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['en']) | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args() | ||||
|         parsed_args = YouTubeTranscriptCli('--format json v1 v2'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.format, 'json') | ||||
|         self.assertEqual(parsed_args.languages, ['en']) | ||||
| 
 | ||||
|     def test_argument_parsing__languages(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.format, 'pretty') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
| 
 | ||||
|     def test_argument_parsing__proxies(self): | ||||
|  | @ -141,13 +143,13 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|     def test_argument_parsing__translate(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.format, 'pretty') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.translate, 'cz') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --translate cz --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.format, 'pretty') | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.translate, 'cz') | ||||
| 
 | ||||
|  | @ -194,7 +196,9 @@ class TestYouTubeTranscriptCli(TestCase): | |||
| 
 | ||||
|     def test_run__exclude_manually_created_and_generated(self): | ||||
|         self.assertEqual( | ||||
|             YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split()).run(), | ||||
|             YouTubeTranscriptCli( | ||||
|                 'v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split() | ||||
|             ).run(), | ||||
|             '' | ||||
|         ) | ||||
| 
 | ||||
|  | @ -210,7 +214,7 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|         YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None) | ||||
| 
 | ||||
|     def test_run__json_output(self): | ||||
|         output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run() | ||||
|         output = YouTubeTranscriptCli('v1 v2 --languages de en --format json'.split()).run() | ||||
| 
 | ||||
|         # will fail if output is not valid json | ||||
|         json.loads(output) | ||||
|  |  | |||
|  | @ -0,0 +1,102 @@ | |||
| from unittest import TestCase | ||||
| 
 | ||||
| import json | ||||
| 
 | ||||
| import pprint | ||||
| 
 | ||||
| from youtube_transcript_api.formatters import ( | ||||
|     Formatter, | ||||
|     JSONFormatter, | ||||
|     TextFormatter, | ||||
|     WebVTTFormatter, | ||||
|     PrettyPrintFormatter, FormatterLoader | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
class TestFormatters(TestCase):
    """Unit tests for the formatter classes and ``FormatterLoader``."""

    def setUp(self):
        # A three-line sample transcript, plus a list holding it twice for
        # the ``format_transcripts`` tests.
        first = {'text': 'Test line 1', 'start': 0.0, 'duration': 1.50}
        middle = {'text': 'line between', 'start': 1.5, 'duration': 2.0}
        last = {'text': 'testing the end line', 'start': 2.5, 'duration': 3.25}
        self.transcript = [first, middle, last]
        self.transcripts = [self.transcript, self.transcript]

    def test_base_formatter_format_call(self):
        # The base class must refuse both entry points.
        base = Formatter()
        self.assertRaises(NotImplementedError, base.format_transcript, self.transcript)
        self.assertRaises(NotImplementedError, base.format_transcripts, [self.transcript])

    def test_webvtt_formatter_starting(self):
        output_lines = WebVTTFormatter().format_transcript(self.transcript).split('\n')

        # The output starts with the header followed by an empty line.
        self.assertEqual(output_lines[0], "WEBVTT")
        self.assertEqual(output_lines[1], "")

    def test_webvtt_formatter_ending(self):
        output_lines = WebVTTFormatter().format_transcript(self.transcript).split('\n')

        # The output ends with the last cue text and a trailing newline.
        self.assertEqual(output_lines[-2], self.transcript[-1]['text'])
        self.assertEqual(output_lines[-1], "")

    def test_webvtt_formatter_many(self):
        webvtt = WebVTTFormatter()
        single = webvtt.format_transcript(self.transcript)
        expected = single + '\n\n\n' + single

        self.assertEqual(webvtt.format_transcripts(self.transcripts), expected)

    def test_pretty_print_formatter(self):
        expected = pprint.pformat(self.transcript)

        self.assertEqual(PrettyPrintFormatter().format_transcript(self.transcript), expected)

    def test_pretty_print_formatter_many(self):
        expected = pprint.pformat(self.transcripts)

        self.assertEqual(PrettyPrintFormatter().format_transcripts(self.transcripts), expected)

    def test_json_formatter(self):
        # Round-trip through JSON must give back the original data.
        decoded = json.loads(JSONFormatter().format_transcript(self.transcript))

        self.assertEqual(decoded, self.transcript)

    def test_json_formatter_many(self):
        decoded = json.loads(JSONFormatter().format_transcripts(self.transcripts))

        self.assertEqual(decoded, self.transcripts)

    def test_text_formatter(self):
        output_lines = TextFormatter().format_transcript(self.transcript).split('\n')

        self.assertEqual(output_lines[0], self.transcript[0]["text"])
        self.assertEqual(output_lines[-1], self.transcript[-1]["text"])

    def test_text_formatter_many(self):
        text = TextFormatter()
        single = text.format_transcript(self.transcript)
        expected = single + '\n\n\n' + single

        self.assertEqual(text.format_transcripts(self.transcripts), expected)

    def test_formatter_loader(self):
        loaded = FormatterLoader().load('json')

        self.assertIsInstance(loaded, JSONFormatter)

    def test_formatter_loader__default_formatter(self):
        # With no argument the loader falls back to the pretty printer.
        loaded = FormatterLoader().load()

        self.assertIsInstance(loaded, PrettyPrintFormatter)

    def test_formatter_loader__unknown_format(self):
        self.assertRaises(FormatterLoader.UnknownFormatterType, FormatterLoader().load, 'png')
		Loading…
	
		Reference in New Issue