Implemented code to more thoroughly find a language's captions
This commit is contained in:
		
							parent
							
								
									eee2b9ad01
								
							
						
					
					
						commit
						de1ddf0824
					
				|  | @ -99,7 +99,7 @@ class YouTubeTranscriptApi(): | |||
| 
 | ||||
| class _TranscriptFetcher(): | ||||
|     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' | ||||
|     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' | ||||
|     API_BASE_URL = 'https://www.youtube.com/api/' | ||||
|     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') | ||||
|     TIMEDTEXT_STRING = 'timedtext?v=' | ||||
| 
 | ||||
|  | @ -107,39 +107,28 @@ class _TranscriptFetcher(): | |||
|         self.video_id = video_id | ||||
|         self.languages = languages | ||||
|         self.proxies = proxies | ||||
|         self.matched_splits = [] | ||||
| 
 | ||||
|     def fetch(self): | ||||
|         if self.proxies: | ||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text | ||||
|         else: | ||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||
|         timedtext_splits = fetched_site.split(self.TIMEDTEXT_STRING) | ||||
|         timedtext_url_start = ( | ||||
|             timedtext_splits[2].find(self.TIMEDTEXT_STRING) | ||||
|             + len(timedtext_splits[0]) | ||||
|             + len(timedtext_splits[1]) | ||||
|             + len(self.TIMEDTEXT_STRING) + 1 | ||||
|         ) | ||||
| 
 | ||||
|         for language in (self.languages if self.languages else [None,]): | ||||
|             response = self._execute_api_request(fetched_site, timedtext_url_start, language) | ||||
|         timedtext_splits = [split[:split.find('"')].replace('\\u0026', '&').replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)] | ||||
|         for language in (self.languages if self.languages else ['en']): | ||||
|             self.matched_splits = [split for split in timedtext_splits if f'&lang={language}' in split] | ||||
|             if self.matched_splits: | ||||
|                 break | ||||
|         if self.matched_splits: | ||||
|             timedtext_url = min(self.matched_splits, key=len) | ||||
|             response = self._execute_api_request(timedtext_url, language) | ||||
|             if response: | ||||
|                 return response | ||||
| 
 | ||||
|         return None | ||||
| 
 | ||||
|     def _execute_api_request(self, fetched_site, timedtext_url_start, language): | ||||
|         url = self.API_BASE_URL.format( | ||||
|             api_url=fetched_site[ | ||||
|                 timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') | ||||
|             ].replace( | ||||
|                 '\\u0026', '&' | ||||
|             ).replace( | ||||
|                 '\\', '' | ||||
|             ) | ||||
|         ) | ||||
|         if language: | ||||
|             url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) | ||||
|     def _execute_api_request(self, timedtext_url, language): | ||||
|         url = f'{self.API_BASE_URL}{self.TIMEDTEXT_STRING}{timedtext_url}' | ||||
|         if self.proxies: | ||||
|             return requests.get(url, proxies=self.proxies).text | ||||
|         else: | ||||
|  |  | |||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							|  | @ -53,11 +53,11 @@ class TestYouTubeTranscriptApi(TestCase): | |||
|     def test_get_transcript__fallback_language_is_used(self): | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/api/timedtext', | ||||
|             body='' | ||||
|             'https://www.youtube.com/watch', | ||||
|             body=load_asset('youtubeWW1.html.static') | ||||
|         ) | ||||
| 
 | ||||
|         YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) | ||||
|         YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en']) | ||||
|         query_string = httpretty.last_request().querystring | ||||
| 
 | ||||
|         self.assertIn('lang', query_string) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue