commit
						9fa8bb0d70
					
				|  | @ -0,0 +1,30 @@ | |||
| [run] | ||||
| source = youtube_transcript_api | ||||
| 
 | ||||
| 
 | ||||
| [report] | ||||
| omit = | ||||
|   */__main__.py | ||||
| 
 | ||||
| exclude_lines = | ||||
|   pragma: no cover | ||||
| 
 | ||||
|     # Don't complain about missing debug-only code: | ||||
|     def __unicode__ | ||||
|     def __repr__ | ||||
|     if self\.debug | ||||
| 
 | ||||
|     # Don't complain if tests don't hit defensive assertion code: | ||||
|     raise AssertionError | ||||
|     raise NotImplementedError | ||||
| 
 | ||||
|     # Don't complain if non-runnable code isn't run: | ||||
|     if 0: | ||||
|     if __name__ == .__main__.: | ||||
| 
 | ||||
|     # Don't complain about empty stubs of abstract methods | ||||
|     @abstractmethod | ||||
|     @abstractclassmethod | ||||
|     @abstractstaticmethod | ||||
| 
 | ||||
| show_missing = True | ||||
|  | @ -6,3 +6,4 @@ dist | |||
| build | ||||
| *.egg-info | ||||
| upload_new_version.sh | ||||
| .coverage | ||||
|  | @ -0,0 +1,18 @@ | |||
| language: python | ||||
| python: | ||||
|   - "2.7" | ||||
|   - "3.3" | ||||
|   - "3.4" | ||||
|   - "3.5" | ||||
|   - "3.6" | ||||
| matrix: | ||||
|   include: | ||||
|     - python: 3.7 | ||||
|       dist: xenial | ||||
|       sudo: true | ||||
| install: | ||||
|   - pip install -r requirements.txt | ||||
| script: | ||||
|   - coverage run -m unittest discover | ||||
| after_success: | ||||
|   - coveralls | ||||
|  | @ -1,5 +1,11 @@ | |||
| # YouTube Transcript/Subtitle API (including automatically generated subtitles) | ||||
| 
 | ||||
| [Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api) | ||||
| [Coverage Status](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) | ||||
| [MIT license](http://opensource.org/licenses/MIT) | ||||
| [PyPI package](https://pypi.org/project/youtube-transcript-api/) | ||||
| [Supported Python versions](https://pypi.org/project/youtube-transcript-api/) | ||||
| 
 | ||||
| This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other Selenium-based solutions do! | ||||
| 
 | ||||
| ## Install | ||||
|  |  | |||
|  | @ -0,0 +1,3 @@ | |||
| #!/usr/bin/env bash | ||||
| 
 | ||||
| .venv/bin/coverage run -m unittest discover && .venv/bin/coverage report | ||||
|  | @ -1 +1,7 @@ | |||
| requests | ||||
| 
 | ||||
| # testing | ||||
| mock | ||||
| httpretty | ||||
| coverage | ||||
| coveralls | ||||
							
								
								
									
										20
									
								
								setup.py
								
								
								
								
							
							
						
						
									
										20
									
								
								setup.py
								
								
								
								
							|  | @ -1,3 +1,7 @@ | |||
| import os | ||||
| 
 | ||||
| import unittest | ||||
| 
 | ||||
| import setuptools | ||||
| 
 | ||||
| 
 | ||||
|  | @ -9,6 +13,15 @@ def get_long_description(): | |||
|     return _get_file_content('README.md') | ||||
| 
 | ||||
| 
 | ||||
| def get_test_suite(): | ||||
|     test_loader = unittest.TestLoader() | ||||
|     test_suite = test_loader.discover( | ||||
|         'test', pattern='test_*.py', | ||||
|         top_level_dir='{dirname}/youtube_transcript_api'.format(dirname=os.path.dirname(__file__)) | ||||
|     ) | ||||
|     return test_suite | ||||
| 
 | ||||
| 
 | ||||
| setuptools.setup( | ||||
|     name="youtube_transcript_api", | ||||
|     version="0.1.3", | ||||
|  | @ -29,6 +42,13 @@ setuptools.setup( | |||
|     install_requires=[ | ||||
|         'requests', | ||||
|     ], | ||||
|     tests_require=[ | ||||
|         'mock', | ||||
|         'httpretty', | ||||
|         'coverage', | ||||
|         'coveralls', | ||||
|     ], | ||||
|     test_suite='setup.get_test_suite', | ||||
|     entry_points={ | ||||
|         'console_scripts': [ | ||||
|             'youtube_transcript_api = youtube_transcript_api.__main__:main', | ||||
|  |  | |||
|  | @ -1,62 +1,14 @@ | |||
| import sys | ||||
| 
 | ||||
| import json | ||||
| 
 | ||||
| from pprint import pprint | ||||
| 
 | ||||
| import logging | ||||
| 
 | ||||
| import argparse | ||||
| 
 | ||||
| from ._api import YouTubeTranscriptApi | ||||
| 
 | ||||
| 
 | ||||
| def parse_args(args): | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description=( | ||||
|             'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. ' | ||||
|             'It also works for automatically generated subtitles and it does not require a headless browser, like ' | ||||
|             'other selenium based solutions do!' | ||||
|         ) | ||||
|     ) | ||||
|     parser.add_argument('video_ids', nargs='*', type=str, help='List of YouTube video IDs.') | ||||
|     parser.add_argument( | ||||
|         '--languages', | ||||
|         nargs='*', | ||||
|         default=[], | ||||
|         type=str, | ||||
|         help=( | ||||
|             'A list of language codes in a descending priority. For example, if this is set to "de en" it will first ' | ||||
|             'try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to do so. ' | ||||
|             'As I can\'t provide a complete list of all working language codes with full certainty, you may have to ' | ||||
|             'play around with the language codes a bit, to find the one which is working for you!' | ||||
|         ), | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--json', | ||||
|         action='store_const', | ||||
|         const=True, | ||||
|         default=False, | ||||
|         help='If this flag is set the output will be JSON formatted.', | ||||
|     ) | ||||
| 
 | ||||
|     return parser.parse_args(args) | ||||
| from ._cli import YouTubeTranscriptCli | ||||
| 
 | ||||
| 
 | ||||
| def main(): | ||||
|     logging.basicConfig() | ||||
| 
 | ||||
|     parsed_args = parse_args(sys.argv[1:]) | ||||
|     transcripts, _ = YouTubeTranscriptApi.get_transcripts( | ||||
|         parsed_args.video_ids, | ||||
|         languages=parsed_args.languages, | ||||
|         continue_after_error=True | ||||
|     ) | ||||
| 
 | ||||
|     if parsed_args.json: | ||||
|         print(json.dumps(transcripts)) | ||||
|     else: | ||||
|         pprint(transcripts) | ||||
|     print(YouTubeTranscriptCli(sys.argv[1:]).run()) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| import sys | ||||
| 
 | ||||
| if sys.version_info.major == 2: | ||||
| # This can only be tested by using different python versions, therefore it is not covered by coverage.py | ||||
| if sys.version_info.major == 2: # pragma: no cover | ||||
|     reload(sys) | ||||
|     sys.setdefaultencoding('utf-8') | ||||
| 
 | ||||
|  | @ -36,8 +37,8 @@ class YouTubeTranscriptApi(): | |||
|             ) | ||||
|             self.video_id = video_id | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def get_transcripts(video_ids, languages=None, continue_after_error=False): | ||||
|     @classmethod | ||||
|     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False): | ||||
|         """ | ||||
|         Retrieves the transcripts for a list of videos. | ||||
| 
 | ||||
|  | @ -60,7 +61,7 @@ class YouTubeTranscriptApi(): | |||
| 
 | ||||
|         for video_id in video_ids: | ||||
|             try: | ||||
|                 data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages) | ||||
|                 data[video_id] = cls.get_transcript(video_id, languages) | ||||
|             except Exception as exception: | ||||
|                 if not continue_after_error: | ||||
|                     raise exception | ||||
|  | @ -69,15 +70,15 @@ class YouTubeTranscriptApi(): | |||
| 
 | ||||
|         return data, unretrievable_videos | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def get_transcript(video_id, languages=None): | ||||
|     @classmethod | ||||
|     def get_transcript(cls, video_id, languages=None): | ||||
|         """ | ||||
|         Retrieves the transcript for a single video. | ||||
| 
 | ||||
|         :param video_id: the youtube video id | ||||
|         :type video_id: str | ||||
|         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to | ||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to | ||||
|         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to | ||||
|         play around with the language codes a bit, to find the one which is working for you! | ||||
|         :type languages: [str] | ||||
|  |  | |||
|  | @ -0,0 +1,57 @@ | |||
| import json | ||||
| 
 | ||||
| import pprint | ||||
| 
 | ||||
| import argparse | ||||
| 
 | ||||
| from ._api import YouTubeTranscriptApi | ||||
| 
 | ||||
| 
 | ||||
| class YouTubeTranscriptCli(): | ||||
|     def __init__(self, args): | ||||
|         self._args = args | ||||
| 
 | ||||
|     def run(self): | ||||
|         parsed_args = self._parse_args() | ||||
| 
 | ||||
|         transcripts, _ = YouTubeTranscriptApi.get_transcripts( | ||||
|             parsed_args.video_ids, | ||||
|             languages=parsed_args.languages, | ||||
|             continue_after_error=True | ||||
|         ) | ||||
| 
 | ||||
|         if parsed_args.json: | ||||
|             return json.dumps(transcripts) | ||||
|         else: | ||||
|             return pprint.pformat(transcripts) | ||||
| 
 | ||||
|     def _parse_args(self): | ||||
|         parser = argparse.ArgumentParser( | ||||
|             description=( | ||||
|                 'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. ' | ||||
|                 'It also works for automatically generated subtitles and it does not require a headless browser, like ' | ||||
|                 'other selenium based solutions do!' | ||||
|             ) | ||||
|         ) | ||||
|         parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.') | ||||
|         parser.add_argument( | ||||
|             '--languages', | ||||
|             nargs='*', | ||||
|             default=[], | ||||
|             type=str, | ||||
|             help=( | ||||
|                 'A list of language codes in a descending priority. For example, if this is set to "de en" it will ' | ||||
|                 'first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails ' | ||||
|                 'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you ' | ||||
|                 'may have to play around with the language codes a bit, to find the one which is working for you!' | ||||
|             ), | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             '--json', | ||||
|             action='store_const', | ||||
|             const=True, | ||||
|             default=False, | ||||
|             help='If this flag is set the output will be JSON formatted.', | ||||
|         ) | ||||
| 
 | ||||
|         return parser.parse_args(self._args) | ||||
|  | @ -1,9 +1,11 @@ | |||
| import sys | ||||
| 
 | ||||
| if sys.version_info.major == 3 and sys.version_info.minor >= 4: | ||||
| 
 | ||||
| # This can only be tested by using different python versions, therefore it is not covered by coverage.py | ||||
| if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover | ||||
|     # Python 3.4+ | ||||
|     from html import unescape | ||||
| else: | ||||
| else: # pragma: no cover | ||||
|     if sys.version_info.major <= 2: | ||||
|         # Python 2 | ||||
|         import HTMLParser | ||||
|  |  | |||
|  | @ -0,0 +1 @@ | |||
| 
 | ||||
|  | @ -0,0 +1 @@ | |||
| 
 | ||||
|  | @ -0,0 +1,6 @@ | |||
| <?xml version="1.0" encoding="utf-8" ?> | ||||
| <transcript> | ||||
|     <text start="0" dur="1.54">Hey, this is just a test</text> | ||||
|     <text start="1.54" dur="4.16">this is not the original transcript</text> | ||||
|     <text start="5.7" dur="3.239">just something shorter, I made up for testing</text> | ||||
| </transcript> | ||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							|  | @ -0,0 +1,103 @@ | |||
| from unittest import TestCase | ||||
| from mock import MagicMock | ||||
| 
 | ||||
| import os | ||||
| 
 | ||||
| import httpretty | ||||
| 
 | ||||
| from youtube_transcript_api._api import YouTubeTranscriptApi | ||||
| 
 | ||||
| 
 | ||||
| def load_asset(filename): | ||||
|     with open('{dirname}/assets/{filename}'.format(dirname=os.path.dirname(__file__), filename=filename)) as file: | ||||
|         return file.read() | ||||
| 
 | ||||
| 
 | ||||
| class TestYouTubeTranscriptApi(TestCase): | ||||
|     def setUp(self): | ||||
|         httpretty.enable() | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/watch', | ||||
|             body=load_asset('youtube.html') | ||||
|         ) | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/api/timedtext', | ||||
|             body=load_asset('transcript.xml') | ||||
|         ) | ||||
| 
 | ||||
|     def tearDown(self): | ||||
|         httpretty.disable() | ||||
| 
 | ||||
|     def test_get_transcript(self): | ||||
|         transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') | ||||
| 
 | ||||
|         self.assertEqual( | ||||
|             transcript, | ||||
|             [ | ||||
|                 {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, | ||||
|                 {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, | ||||
|                 {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|     def test_get_transcript__correct_language_is_used(self): | ||||
|         YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) | ||||
|         query_string = httpretty.last_request().querystring | ||||
| 
 | ||||
|         self.assertIn('lang', query_string) | ||||
|         self.assertEqual(len(query_string['lang']), 1) | ||||
|         self.assertEqual(query_string['lang'][0], 'de') | ||||
| 
 | ||||
|     def test_get_transcript__fallback_language_is_used(self): | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/api/timedtext', | ||||
|             body='' | ||||
|         ) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) | ||||
|         query_string = httpretty.last_request().querystring | ||||
| 
 | ||||
|         self.assertIn('lang', query_string) | ||||
|         self.assertEqual(len(query_string['lang']), 1) | ||||
|         self.assertEqual(query_string['lang'][0], 'en') | ||||
| 
 | ||||
|     def test_get_transcript__exception_is_raised_when_not_available(self): | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/api/timedtext', | ||||
|             body='' | ||||
|         ) | ||||
| 
 | ||||
|         with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript): | ||||
|             YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') | ||||
| 
 | ||||
|     def test_get_transcripts(self): | ||||
|         video_id_1 = 'video_id_1' | ||||
|         video_id_2 = 'video_id_2' | ||||
|         languages = ['de', 'en'] | ||||
|         YouTubeTranscriptApi.get_transcript = MagicMock() | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages) | ||||
|         self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) | ||||
| 
 | ||||
|     def test_get_transcripts__stop_on_error(self): | ||||
|         YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error')) | ||||
| 
 | ||||
|         with self.assertRaises(Exception): | ||||
|             YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2']) | ||||
| 
 | ||||
|     def test_get_transcripts__continue_on_error(self): | ||||
|         video_id_1 = 'video_id_1' | ||||
|         video_id_2 = 'video_id_2' | ||||
|         YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error')) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None) | ||||
|  | @ -0,0 +1,68 @@ | |||
| from unittest import TestCase | ||||
| from mock import MagicMock | ||||
| 
 | ||||
| import json | ||||
| 
 | ||||
| from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi | ||||
| 
 | ||||
| 
 | ||||
| class TestYouTubeTranscriptCli(TestCase): | ||||
|     def test_argument_parsing(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
| 
 | ||||
|     def test_argument_parsing__only_video_ids(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.languages, []) | ||||
| 
 | ||||
|     def test_argument_parsing__fail_without_video_ids(self): | ||||
|         with self.assertRaises(SystemExit): | ||||
|             YouTubeTranscriptCli('--json'.split())._parse_args() | ||||
| 
 | ||||
|     def test_argument_parsing__json(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, []) | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, []) | ||||
| 
 | ||||
|     def test_argument_parsing__languages(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
| 
 | ||||
|     def test_run(self): | ||||
|         YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) | ||||
|         YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcripts.assert_called_once_with( | ||||
|             ['v1', 'v2'], | ||||
|             languages=['de', 'en'], | ||||
|             continue_after_error=True | ||||
|         ) | ||||
| 
 | ||||
|     def test_run__json_output(self): | ||||
|         YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([{'boolean': True}], [])) | ||||
|         output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run() | ||||
| 
 | ||||
|         # will fail if output is not valid json | ||||
|         json.loads(output) | ||||
		Loading…
	
		Reference in New Issue