Removed unnecessary language variables, sort split matches by length while ignoring the name argument
This commit is contained in:
		
							parent
							
								
									7ac7d3266b
								
							
						
					
					
						commit
						c7cb3117be
					
				|  | @ -40,7 +40,7 @@ class YouTubeTranscriptApi(): | ||||||
|             self.video_id = video_id |             self.video_id = video_id | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None): |     def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcripts for a list of videos. |         Retrieves the transcripts for a list of videos. | ||||||
| 
 | 
 | ||||||
|  | @ -75,7 +75,7 @@ class YouTubeTranscriptApi(): | ||||||
|         return data, unretrievable_videos |         return data, unretrievable_videos | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_transcript(cls, video_id, languages=None, proxies=None): |     def get_transcript(cls, video_id, languages=['en'], proxies=None): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcript for a single video. |         Retrieves the transcript for a single video. | ||||||
| 
 | 
 | ||||||
|  | @ -100,14 +100,14 @@ class YouTubeTranscriptApi(): | ||||||
| class _TranscriptFetcher(): | class _TranscriptFetcher(): | ||||||
|     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' |     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' | ||||||
|     API_BASE_URL = 'https://www.youtube.com/api/' |     API_BASE_URL = 'https://www.youtube.com/api/' | ||||||
|     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') |  | ||||||
|     TIMEDTEXT_STRING = 'timedtext?v=' |     TIMEDTEXT_STRING = 'timedtext?v=' | ||||||
|  |     NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)') | ||||||
| 
 | 
 | ||||||
|     def __init__(self, video_id, languages, proxies): |     def __init__(self, video_id, languages, proxies): | ||||||
|         self.video_id = video_id |         self.video_id = video_id | ||||||
|         self.languages = languages |         self.languages = languages | ||||||
|  |         print(languages) | ||||||
|         self.proxies = proxies |         self.proxies = proxies | ||||||
|         self.matched_splits = [] |  | ||||||
| 
 | 
 | ||||||
|     def fetch(self): |     def fetch(self): | ||||||
|         if self.proxies: |         if self.proxies: | ||||||
|  | @ -118,19 +118,25 @@ class _TranscriptFetcher(): | ||||||
|                 .replace('\\u0026', '&') |                 .replace('\\u0026', '&') | ||||||
|                 .replace('\\', '')  |                 .replace('\\', '')  | ||||||
|                 for split in fetched_site.split(self.TIMEDTEXT_STRING)] |                 for split in fetched_site.split(self.TIMEDTEXT_STRING)] | ||||||
|         for language in (self.languages if self.languages else ['en']): |         matched_splits = [] | ||||||
|             self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] |         for language in self.languages: | ||||||
|             if self.matched_splits: |             matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] | ||||||
|  |             if matched_splits: | ||||||
|                 break |                 break | ||||||
|         if self.matched_splits: |         if matched_splits: | ||||||
|             timedtext_url = min(self.matched_splits, key=len) |             timedtext_url = min(matched_splits, key=self._sort_splits) | ||||||
|             response = self._execute_api_request(timedtext_url, language) |             response = self._execute_api_request(timedtext_url) | ||||||
|             if response: |             if response: | ||||||
|                 return response |                 return response | ||||||
| 
 | 
 | ||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
|     def _execute_api_request(self, timedtext_url, language): |     #Sorting the matched splits by string length because we want non-asr options returned first | ||||||
|  |     #However, we don't want to include the length of the 'name' argument as it could possibly throw this off | ||||||
|  |     def _sort_splits(self, matched_split): | ||||||
|  |         return len(re.sub(self.NAME_REGEX, r'\1', matched_split)) | ||||||
|  | 
 | ||||||
|  |     def _execute_api_request(self, timedtext_url): | ||||||
|         url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) |         url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) | ||||||
|         if self.proxies: |         if self.proxies: | ||||||
|             return requests.get(url, proxies=self.proxies).text |             return requests.get(url, proxies=self.proxies).text | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue