Implemented code to more thoroughly find a language's captions
This commit is contained in:
		
							parent
							
								
									eee2b9ad01
								
							
						
					
					
						commit
						de1ddf0824
					
				|  | @ -99,7 +99,7 @@ class YouTubeTranscriptApi(): | ||||||
| 
 | 
 | ||||||
| class _TranscriptFetcher(): | class _TranscriptFetcher(): | ||||||
|     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' |     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' | ||||||
|     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' |     API_BASE_URL = 'https://www.youtube.com/api/' | ||||||
|     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') |     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') | ||||||
|     TIMEDTEXT_STRING = 'timedtext?v=' |     TIMEDTEXT_STRING = 'timedtext?v=' | ||||||
| 
 | 
 | ||||||
|  | @ -107,39 +107,28 @@ class _TranscriptFetcher(): | ||||||
|         self.video_id = video_id |         self.video_id = video_id | ||||||
|         self.languages = languages |         self.languages = languages | ||||||
|         self.proxies = proxies |         self.proxies = proxies | ||||||
|  |         self.matched_splits = [] | ||||||
| 
 | 
 | ||||||
|     def fetch(self): |     def fetch(self): | ||||||
|         if self.proxies: |         if self.proxies: | ||||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text |             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text | ||||||
|         else: |         else: | ||||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text |             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||||
|         timedtext_splits = fetched_site.split(self.TIMEDTEXT_STRING) |         timedtext_splits = [split[:split.find('"')].replace('\\u0026', '&').replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)] | ||||||
|         timedtext_url_start = ( |         for language in (self.languages if self.languages else ['en']): | ||||||
|             timedtext_splits[2].find(self.TIMEDTEXT_STRING) |             self.matched_splits = [split for split in timedtext_splits if f'&lang={language}' in split] | ||||||
|             + len(timedtext_splits[0]) |             if self.matched_splits: | ||||||
|             + len(timedtext_splits[1]) |                 break | ||||||
|             + len(self.TIMEDTEXT_STRING) + 1 |         if self.matched_splits: | ||||||
|         ) |             timedtext_url = min(self.matched_splits, key=len) | ||||||
| 
 |             response = self._execute_api_request(timedtext_url, language) | ||||||
|         for language in (self.languages if self.languages else [None,]): |  | ||||||
|             response = self._execute_api_request(fetched_site, timedtext_url_start, language) |  | ||||||
|             if response: |             if response: | ||||||
|                 return response |                 return response | ||||||
| 
 | 
 | ||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
|     def _execute_api_request(self, fetched_site, timedtext_url_start, language): |     def _execute_api_request(self, timedtext_url, language): | ||||||
|         url = self.API_BASE_URL.format( |         url = f'{self.API_BASE_URL}{self.TIMEDTEXT_STRING}{timedtext_url}' | ||||||
|             api_url=fetched_site[ |  | ||||||
|                 timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') |  | ||||||
|             ].replace( |  | ||||||
|                 '\\u0026', '&' |  | ||||||
|             ).replace( |  | ||||||
|                 '\\', '' |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|         if language: |  | ||||||
|             url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) |  | ||||||
|         if self.proxies: |         if self.proxies: | ||||||
|             return requests.get(url, proxies=self.proxies).text |             return requests.get(url, proxies=self.proxies).text | ||||||
|         else: |         else: | ||||||
|  |  | ||||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							|  | @ -53,11 +53,11 @@ class TestYouTubeTranscriptApi(TestCase): | ||||||
|     def test_get_transcript__fallback_language_is_used(self): |     def test_get_transcript__fallback_language_is_used(self): | ||||||
|         httpretty.register_uri( |         httpretty.register_uri( | ||||||
|             httpretty.GET, |             httpretty.GET, | ||||||
|             'https://www.youtube.com/api/timedtext', |             'https://www.youtube.com/watch', | ||||||
|             body='' |             body=load_asset('youtubeWW1.html.static') | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) |         YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en']) | ||||||
|         query_string = httpretty.last_request().querystring |         query_string = httpretty.last_request().querystring | ||||||
| 
 | 
 | ||||||
|         self.assertIn('lang', query_string) |         self.assertIn('lang', query_string) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue