Rebased on PR #11 and added tests
This commit is contained in:
		
							parent
							
								
									4a564743df
								
							
						
					
					
						commit
						86cd1666c0
					
				
							
								
								
									
										25
									
								
								README.md
								
								
								
								
							
							
						
						
									
										25
									
								
								README.md
								
								
								
								
							|  | @ -1,11 +1,5 @@ | ||||||
| # YouTube Transcript/Subtitle API (including automatically generated subtitles) | # YouTube Transcript/Subtitle API (including automatically generated subtitles) | ||||||
| 
 | 
 | ||||||
| [](https://travis-ci.org/jdepoix/youtube-transcript-api) |  | ||||||
| [](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) |  | ||||||
| [](http://opensource.org/licenses/MIT) |  | ||||||
| [](https://pypi.org/project/youtube-transcript-api/) |  | ||||||
| [](https://pypi.org/project/youtube-transcript-api/) |  | ||||||
| 
 |  | ||||||
| This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other Selenium-based solutions do! | This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other Selenium-based solutions do! | ||||||
| 
 | 
 | ||||||
| ## Install | ## Install | ||||||
|  | @ -90,6 +84,25 @@ If you would prefer to write it into a file or pipe it into another application, | ||||||
| youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json | youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | ### Proxy | ||||||
|  | 
 | ||||||
|  | You can pass a dictionary of proxies to be used for the network requests. | ||||||
|  | 
 | ||||||
|  | Code: | ||||||
|  | ```python | ||||||
|  | from youtube_transcript_api import YouTubeTranscriptApi | ||||||
|  | 
 | ||||||
|  | YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"}) | ||||||
|  | 
 | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | CLI: | ||||||
|  | ``` | ||||||
|  | youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | Find out more about using proxies and the type of proxies you can use here: http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|  | 
 | ||||||
| ## Warning | ## Warning | ||||||
| 
 | 
 | ||||||
| This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to get things working again as soon as possible if that happens. So if it stops working, let me know! | This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to get things working again as soon as possible if that happens. So if it stops working, let me know! | ||||||
|  | @ -38,7 +38,7 @@ class YouTubeTranscriptApi(): | ||||||
|             self.video_id = video_id |             self.video_id = video_id | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False): |     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcripts for a list of videos. |         Retrieves the transcripts for a list of videos. | ||||||
| 
 | 
 | ||||||
|  | @ -55,13 +55,15 @@ class YouTubeTranscriptApi(): | ||||||
|         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of |         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of | ||||||
|         video ids, which could not be retrieved |         video ids, which could not be retrieved | ||||||
|         :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} |         :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} | ||||||
|  |         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests | ||||||
|  |         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|         """ |         """ | ||||||
|         data = {} |         data = {} | ||||||
|         unretrievable_videos = [] |         unretrievable_videos = [] | ||||||
| 
 | 
 | ||||||
|         for video_id in video_ids: |         for video_id in video_ids: | ||||||
|             try: |             try: | ||||||
|                 data[video_id] = cls.get_transcript(video_id, languages) |                 data[video_id] = cls.get_transcript(video_id, languages, proxies) | ||||||
|             except Exception as exception: |             except Exception as exception: | ||||||
|                 if not continue_after_error: |                 if not continue_after_error: | ||||||
|                     raise exception |                     raise exception | ||||||
|  | @ -71,7 +73,7 @@ class YouTubeTranscriptApi(): | ||||||
|         return data, unretrievable_videos |         return data, unretrievable_videos | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_transcript(cls, video_id, languages=None): |     def get_transcript(cls, video_id, languages=None, proxies=None): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcript for a single video. |         Retrieves the transcript for a single video. | ||||||
| 
 | 
 | ||||||
|  | @ -84,9 +86,11 @@ class YouTubeTranscriptApi(): | ||||||
|         :type languages: [str] |         :type languages: [str] | ||||||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys |         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||||
|         :rtype: [{'text': str, 'start': float, 'end': float}] |         :rtype: [{'text': str, 'start': float, 'end': float}] | ||||||
|  |         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests | ||||||
|  |         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|         """ |         """ | ||||||
|         try: |         try: | ||||||
|             return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse() |             return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse() | ||||||
|         except Exception: |         except Exception: | ||||||
|             logger.error( |             logger.error( | ||||||
|                 YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( |                 YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( | ||||||
|  | @ -101,12 +105,16 @@ class _TranscriptFetcher(): | ||||||
|     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' |     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' | ||||||
|     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') |     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') | ||||||
| 
 | 
 | ||||||
|     def __init__(self, video_id, languages): |     def __init__(self, video_id, languages, proxies): | ||||||
|         self.video_id = video_id |         self.video_id = video_id | ||||||
|         self.languages = languages |         self.languages = languages | ||||||
|  |         self.proxies = proxies | ||||||
| 
 | 
 | ||||||
|     def fetch(self): |     def fetch(self): | ||||||
|         fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text |         if self.proxies: | ||||||
|  |             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text | ||||||
|  |         else: | ||||||
|  |             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||||
|         timedtext_url_start = fetched_site.find('timedtext') |         timedtext_url_start = fetched_site.find('timedtext') | ||||||
| 
 | 
 | ||||||
|         for language in (self.languages if self.languages else [None,]): |         for language in (self.languages if self.languages else [None,]): | ||||||
|  | @ -128,7 +136,10 @@ class _TranscriptFetcher(): | ||||||
|         ) |         ) | ||||||
|         if language: |         if language: | ||||||
|             url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) |             url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) | ||||||
|         return requests.get(url).text |         if self.proxies: | ||||||
|  |             return requests.get(url, proxies=self.proxies).text | ||||||
|  |         else: | ||||||
|  |             return requests.get(url).text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class _TranscriptParser(): | class _TranscriptParser(): | ||||||
|  |  | ||||||
|  | @ -14,10 +14,13 @@ class YouTubeTranscriptCli(): | ||||||
|     def run(self): |     def run(self): | ||||||
|         parsed_args = self._parse_args() |         parsed_args = self._parse_args() | ||||||
| 
 | 
 | ||||||
|  |         proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} | ||||||
|  | 
 | ||||||
|         transcripts, _ = YouTubeTranscriptApi.get_transcripts( |         transcripts, _ = YouTubeTranscriptApi.get_transcripts( | ||||||
|             parsed_args.video_ids, |             parsed_args.video_ids, | ||||||
|             languages=parsed_args.languages, |             languages=parsed_args.languages, | ||||||
|             continue_after_error=True |             continue_after_error=True, | ||||||
|  |             proxies=proxies | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         if parsed_args.json: |         if parsed_args.json: | ||||||
|  | @ -53,5 +56,15 @@ class YouTubeTranscriptCli(): | ||||||
|             default=False, |             default=False, | ||||||
|             help='If this flag is set the output will be JSON formatted.', |             help='If this flag is set the output will be JSON formatted.', | ||||||
|         ) |         ) | ||||||
|  |         parser.add_argument( | ||||||
|  |             '--http-proxy', dest='http_proxy', | ||||||
|  |             default='', metavar='URL', | ||||||
|  |             help='Use the specified HTTP proxy.' | ||||||
|  |         ) | ||||||
|  |         parser.add_argument( | ||||||
|  |             '--https-proxy', dest='https_proxy', | ||||||
|  |             default='', metavar='URL', | ||||||
|  |             help='Use the specified HTTPS proxy.' | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         return parser.parse_args(self._args) |         return parser.parse_args(self._args) | ||||||
|  |  | ||||||
|  | @ -82,8 +82,8 @@ class TestYouTubeTranscriptApi(TestCase): | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) |         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages) |         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None) | ||||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages) |         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None) | ||||||
|         self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) |         self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) | ||||||
| 
 | 
 | ||||||
|     def test_get_transcripts__stop_on_error(self): |     def test_get_transcripts__stop_on_error(self): | ||||||
|  | @ -99,5 +99,19 @@ class TestYouTubeTranscriptApi(TestCase): | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) |         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None) |         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None) | ||||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None) |         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None) | ||||||
|  | 
 | ||||||
|  |     def test_get_transcript__with_proxies(self): | ||||||
|  |         transcript = YouTubeTranscriptApi.get_transcript( | ||||||
|  |             'GJLlxj_dtq8', proxies={'http': '', 'https:': ''} | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         self.assertEqual( | ||||||
|  |             transcript, | ||||||
|  |             [ | ||||||
|  |                 {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, | ||||||
|  |                 {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, | ||||||
|  |                 {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} | ||||||
|  |             ] | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  | @ -23,6 +23,31 @@ class TestYouTubeTranscriptCli(TestCase): | ||||||
|         self.assertEqual(parsed_args.json, True) |         self.assertEqual(parsed_args.json, True) | ||||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) |         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||||
| 
 | 
 | ||||||
|  |         parsed_args = YouTubeTranscriptCli( | ||||||
|  |             'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() | ||||||
|  |         )._parse_args() | ||||||
|  |         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||||
|  |         self.assertEqual(parsed_args.json, True) | ||||||
|  |         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||||
|  |         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||||
|  |         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||||
|  | 
 | ||||||
|  |         parsed_args = YouTubeTranscriptCli( | ||||||
|  |             'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split() | ||||||
|  |         )._parse_args() | ||||||
|  |         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||||
|  |         self.assertEqual(parsed_args.json, True) | ||||||
|  |         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||||
|  |         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||||
|  | 
 | ||||||
|  |         parsed_args = YouTubeTranscriptCli( | ||||||
|  |             'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split() | ||||||
|  |         )._parse_args() | ||||||
|  |         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||||
|  |         self.assertEqual(parsed_args.json, True) | ||||||
|  |         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||||
|  |         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||||
|  | 
 | ||||||
|     def test_argument_parsing__only_video_ids(self): |     def test_argument_parsing__only_video_ids(self): | ||||||
|         parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() |         parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() | ||||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) |         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||||
|  | @ -50,6 +75,17 @@ class TestYouTubeTranscriptCli(TestCase): | ||||||
|         self.assertEqual(parsed_args.json, False) |         self.assertEqual(parsed_args.json, False) | ||||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) |         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||||
| 
 | 
 | ||||||
|  |     def test_argument_parsing__proxies(self): | ||||||
|  |         parsed_args = YouTubeTranscriptCli( | ||||||
|  |             'v1 v2 --http-proxy http://user:pass@domain:port'.split() | ||||||
|  |         )._parse_args() | ||||||
|  |         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||||
|  | 
 | ||||||
|  |         parsed_args = YouTubeTranscriptCli( | ||||||
|  |             'v1 v2 --https-proxy https://user:pass@domain:port'.split() | ||||||
|  |         )._parse_args() | ||||||
|  |         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||||
|  | 
 | ||||||
|     def test_run(self): |     def test_run(self): | ||||||
|         YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) |         YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) | ||||||
|         YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() |         YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() | ||||||
|  | @ -57,7 +93,8 @@ class TestYouTubeTranscriptCli(TestCase): | ||||||
|         YouTubeTranscriptApi.get_transcripts.assert_called_once_with( |         YouTubeTranscriptApi.get_transcripts.assert_called_once_with( | ||||||
|             ['v1', 'v2'], |             ['v1', 'v2'], | ||||||
|             languages=['de', 'en'], |             languages=['de', 'en'], | ||||||
|             continue_after_error=True |             continue_after_error=True, | ||||||
|  |             proxies={"http": "", "https": ""} | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     def test_run__json_output(self): |     def test_run__json_output(self): | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue