Removed unnecessary language variables, sort split matches by len while ignoring name arguement
This commit is contained in:
		
							parent
							
								
									7ac7d3266b
								
							
						
					
					
						commit
						c7cb3117be
					
				|  | @ -40,7 +40,7 @@ class YouTubeTranscriptApi(): | |||
|             self.video_id = video_id | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None): | ||||
|     def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None): | ||||
|         """ | ||||
|         Retrieves the transcripts for a list of videos. | ||||
| 
 | ||||
|  | @ -75,7 +75,7 @@ class YouTubeTranscriptApi(): | |||
|         return data, unretrievable_videos | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_transcript(cls, video_id, languages=None, proxies=None): | ||||
|     def get_transcript(cls, video_id, languages=['en'], proxies=None): | ||||
|         """ | ||||
|         Retrieves the transcript for a single video. | ||||
| 
 | ||||
|  | @ -100,14 +100,14 @@ class YouTubeTranscriptApi(): | |||
| class _TranscriptFetcher(): | ||||
|     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' | ||||
|     API_BASE_URL = 'https://www.youtube.com/api/' | ||||
|     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') | ||||
|     TIMEDTEXT_STRING = 'timedtext?v=' | ||||
|     NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)') | ||||
| 
 | ||||
|     def __init__(self, video_id, languages, proxies): | ||||
|         self.video_id = video_id | ||||
|         self.languages = languages | ||||
|         print(languages) | ||||
|         self.proxies = proxies | ||||
|         self.matched_splits = [] | ||||
| 
 | ||||
|     def fetch(self): | ||||
|         if self.proxies: | ||||
|  | @ -118,19 +118,25 @@ class _TranscriptFetcher(): | |||
|                 .replace('\\u0026', '&') | ||||
|                 .replace('\\', '')  | ||||
|                 for split in fetched_site.split(self.TIMEDTEXT_STRING)] | ||||
|         for language in (self.languages if self.languages else ['en']): | ||||
|             self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] | ||||
|             if self.matched_splits: | ||||
|         matched_splits = [] | ||||
|         for language in self.languages: | ||||
|             matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] | ||||
|             if matched_splits: | ||||
|                 break | ||||
|         if self.matched_splits: | ||||
|             timedtext_url = min(self.matched_splits, key=len) | ||||
|             response = self._execute_api_request(timedtext_url, language) | ||||
|         if matched_splits: | ||||
|             timedtext_url = min(matched_splits, key=self._sort_splits) | ||||
|             response = self._execute_api_request(timedtext_url) | ||||
|             if response: | ||||
|                 return response | ||||
| 
 | ||||
|         return None | ||||
| 
 | ||||
|     def _execute_api_request(self, timedtext_url, language): | ||||
|     #Sorting the matched splits by string length because we want non-asr options returned first | ||||
|     #However, we don't want to include the length of the 'name' argument as it could possible throw this off | ||||
|     def _sort_splits(self, matched_split): | ||||
|         return len(re.sub(self.NAME_REGEX, r'\1', matched_split)) | ||||
| 
 | ||||
|     def _execute_api_request(self, timedtext_url): | ||||
|         url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) | ||||
|         if self.proxies: | ||||
|             return requests.get(url, proxies=self.proxies).text | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue