commit
						f9d7d337f4
					
				
							
								
								
									
										27
									
								
								README.md
								
								
								
								
							
							
						
						
									
										27
									
								
								README.md
								
								
								
								
							|  | @ -1,11 +1,5 @@ | |||
| # YouTube Transcript/Subtitle API (including automatically generated subtitles) | ||||
| 
 | ||||
| [](https://travis-ci.org/jdepoix/youtube-transcript-api) | ||||
| [](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) | ||||
| [](http://opensource.org/licenses/MIT) | ||||
| [](https://pypi.org/project/youtube-transcript-api/) | ||||
| [](https://pypi.org/project/youtube-transcript-api/) | ||||
| 
 | ||||
| This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other Selenium-based solutions do! | ||||
| 
 | ||||
| ## Install | ||||
|  | @ -90,6 +84,25 @@ If you would prefer to write it into a file or pipe it into another application, | |||
| youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json | ||||
| ``` | ||||
| 
 | ||||
| ### Proxy | ||||
| 
 | ||||
| You can pass a dictionary of proxies to be used for the network requests. | ||||
| 
 | ||||
| Code: | ||||
| ```python | ||||
| from youtube_transcript_api import YouTubeTranscriptApi | ||||
| 
 | ||||
| YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"}) | ||||
| 
 | ||||
| ``` | ||||
| 
 | ||||
| CLI: | ||||
| ``` | ||||
| youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port | ||||
| ``` | ||||
| 
 | ||||
| Find out more about using proxies and the type of proxies you can use here: http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||
| 
 | ||||
| ## Warning | ||||
| 
 | ||||
| This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! | ||||
| This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! | ||||
|  | @ -38,7 +38,7 @@ class YouTubeTranscriptApi(): | |||
|             self.video_id = video_id | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False): | ||||
|     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None): | ||||
|         """ | ||||
|         Retrieves the transcripts for a list of videos. | ||||
| 
 | ||||
|  | @ -55,13 +55,15 @@ class YouTubeTranscriptApi(): | |||
|         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of | ||||
|         video ids, which could not be retrieved | ||||
|         :rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]) | ||||
|         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests | ||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||
|         """ | ||||
|         data = {} | ||||
|         unretrievable_videos = [] | ||||
| 
 | ||||
|         for video_id in video_ids: | ||||
|             try: | ||||
|                 data[video_id] = cls.get_transcript(video_id, languages) | ||||
|                 data[video_id] = cls.get_transcript(video_id, languages, proxies) | ||||
|             except Exception as exception: | ||||
|                 if not continue_after_error: | ||||
|                     raise exception | ||||
|  | @ -71,7 +73,7 @@ class YouTubeTranscriptApi(): | |||
|         return data, unretrievable_videos | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_transcript(cls, video_id, languages=None): | ||||
|     def get_transcript(cls, video_id, languages=None, proxies=None): | ||||
|         """ | ||||
|         Retrieves the transcript for a single video. | ||||
| 
 | ||||
|  | @ -84,9 +86,11 @@ class YouTubeTranscriptApi(): | |||
|         :type languages: [str] | ||||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||
|         :rtype: [{'text': str, 'start': float, 'duration': float}] | ||||
|         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests | ||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||
|         """ | ||||
|         try: | ||||
|             return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse() | ||||
|             return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse() | ||||
|         except Exception: | ||||
|             logger.error( | ||||
|                 YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( | ||||
|  | @ -101,12 +105,16 @@ class _TranscriptFetcher(): | |||
|     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' | ||||
|     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') | ||||
| 
 | ||||
|     def __init__(self, video_id, languages): | ||||
|     def __init__(self, video_id, languages, proxies): | ||||
|         self.video_id = video_id | ||||
|         self.languages = languages | ||||
|         self.proxies = proxies | ||||
| 
 | ||||
|     def fetch(self): | ||||
|         fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||
|         if self.proxies: | ||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text | ||||
|         else: | ||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||
|         timedtext_url_start = fetched_site.find('timedtext') | ||||
| 
 | ||||
|         for language in (self.languages if self.languages else [None,]): | ||||
|  | @ -128,7 +136,10 @@ class _TranscriptFetcher(): | |||
|         ) | ||||
|         if language: | ||||
|             url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) | ||||
|         return requests.get(url).text | ||||
|         if self.proxies: | ||||
|             return requests.get(url, proxies=self.proxies).text | ||||
|         else: | ||||
|             return requests.get(url).text | ||||
| 
 | ||||
| 
 | ||||
| class _TranscriptParser(): | ||||
|  |  | |||
|  | @ -14,10 +14,15 @@ class YouTubeTranscriptCli(): | |||
|     def run(self): | ||||
|         parsed_args = self._parse_args() | ||||
| 
 | ||||
|         proxies = None | ||||
|         if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': | ||||
|             proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} | ||||
| 
 | ||||
|         transcripts, _ = YouTubeTranscriptApi.get_transcripts( | ||||
|             parsed_args.video_ids, | ||||
|             languages=parsed_args.languages, | ||||
|             continue_after_error=True | ||||
|             continue_after_error=True, | ||||
|             proxies=proxies | ||||
|         ) | ||||
| 
 | ||||
|         if parsed_args.json: | ||||
|  | @ -53,5 +58,15 @@ class YouTubeTranscriptCli(): | |||
|             default=False, | ||||
|             help='If this flag is set the output will be JSON formatted.', | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             '--http-proxy', dest='http_proxy', | ||||
|             default='', metavar='URL', | ||||
|             help='Use the specified HTTP proxy.' | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             '--https-proxy', dest='https_proxy', | ||||
|             default='', metavar='URL', | ||||
|             help='Use the specified HTTPS proxy.' | ||||
|         ) | ||||
| 
 | ||||
|         return parser.parse_args(self._args) | ||||
|  |  | |||
|  | @ -82,8 +82,8 @@ class TestYouTubeTranscriptApi(TestCase): | |||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None) | ||||
|         self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) | ||||
| 
 | ||||
|     def test_get_transcripts__stop_on_error(self): | ||||
|  | @ -99,5 +99,23 @@ class TestYouTubeTranscriptApi(TestCase): | |||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None) | ||||
| 
 | ||||
|     def test_get_transcript__with_proxies(self): | ||||
|         proxies = {'http': '', 'https:': ''} | ||||
|         transcript = YouTubeTranscriptApi.get_transcript( | ||||
|             'GJLlxj_dtq8', proxies=proxies | ||||
|         ) | ||||
| 
 | ||||
|         self.assertEqual( | ||||
|             transcript, | ||||
|             [ | ||||
|                 {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, | ||||
|                 {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, | ||||
|                 {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} | ||||
|             ] | ||||
|         ) | ||||
|         YouTubeTranscriptApi.get_transcript = MagicMock() | ||||
|         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) | ||||
|         YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', None, proxies) | ||||
|  |  | |||
|  | @ -12,16 +12,49 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) | ||||
|         self.assertEqual(parsed_args.json, True) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
|         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
| 
 | ||||
|     def test_argument_parsing__only_video_ids(self): | ||||
|         parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() | ||||
|  | @ -50,6 +83,29 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|         self.assertEqual(parsed_args.json, False) | ||||
|         self.assertEqual(parsed_args.languages, ['de', 'en']) | ||||
| 
 | ||||
|     def test_argument_parsing__proxies(self): | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --http-proxy http://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --https-proxy https://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') | ||||
|         self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') | ||||
| 
 | ||||
|         parsed_args = YouTubeTranscriptCli( | ||||
|             'v1 v2'.split() | ||||
|         )._parse_args() | ||||
|         self.assertEqual(parsed_args.http_proxy, '') | ||||
|         self.assertEqual(parsed_args.https_proxy, '') | ||||
| 
 | ||||
|     def test_run(self): | ||||
|         YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) | ||||
|         YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() | ||||
|  | @ -57,7 +113,8 @@ class TestYouTubeTranscriptCli(TestCase): | |||
|         YouTubeTranscriptApi.get_transcripts.assert_called_once_with( | ||||
|             ['v1', 'v2'], | ||||
|             languages=['de', 'en'], | ||||
|             continue_after_error=True | ||||
|             continue_after_error=True, | ||||
|             proxies=None | ||||
|         ) | ||||
| 
 | ||||
|     def test_run__json_output(self): | ||||
|  | @ -66,3 +123,15 @@ class TestYouTubeTranscriptCli(TestCase): | |||
| 
 | ||||
|         # will fail if output is not valid json | ||||
|         json.loads(output) | ||||
| 
 | ||||
|     def test_run__proxies(self): | ||||
|         YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) | ||||
|         YouTubeTranscriptCli( | ||||
|             'v1 v2 --languages de en --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()).run() | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcripts.assert_called_once_with( | ||||
|             ['v1', 'v2'], | ||||
|             languages=['de', 'en'], | ||||
|             continue_after_error=True, | ||||
|             proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} | ||||
|         ) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue