Implemented code to more thoroughly find a language's captions

This commit is contained in:
danielcliu 2019-10-16 22:01:30 -07:00
parent eee2b9ad01
commit de1ddf0824
3 changed files with 1534 additions and 26 deletions

View File

@ -99,7 +99,7 @@ class YouTubeTranscriptApi():
class _TranscriptFetcher(): class _TranscriptFetcher():
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
API_BASE_URL = 'https://www.youtube.com/api/{api_url}' API_BASE_URL = 'https://www.youtube.com/api/'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
TIMEDTEXT_STRING = 'timedtext?v=' TIMEDTEXT_STRING = 'timedtext?v='
@ -107,39 +107,28 @@ class _TranscriptFetcher():
self.video_id = video_id self.video_id = video_id
self.languages = languages self.languages = languages
self.proxies = proxies self.proxies = proxies
self.matched_splits = []
def fetch(self): def fetch(self):
if self.proxies: if self.proxies:
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
else: else:
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
timedtext_splits = fetched_site.split(self.TIMEDTEXT_STRING) timedtext_splits = [split[:split.find('"')].replace('\\u0026', '&').replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)]
timedtext_url_start = ( for language in (self.languages if self.languages else ['en']):
timedtext_splits[2].find(self.TIMEDTEXT_STRING) self.matched_splits = [split for split in timedtext_splits if f'&lang={language}' in split]
+ len(timedtext_splits[0]) if self.matched_splits:
+ len(timedtext_splits[1]) break
+ len(self.TIMEDTEXT_STRING) + 1 if self.matched_splits:
) timedtext_url = min(self.matched_splits, key=len)
response = self._execute_api_request(timedtext_url, language)
for language in (self.languages if self.languages else [None,]):
response = self._execute_api_request(fetched_site, timedtext_url_start, language)
if response: if response:
return response return response
return None return None
def _execute_api_request(self, fetched_site, timedtext_url_start, language): def _execute_api_request(self, timedtext_url, language):
url = self.API_BASE_URL.format( url = f'{self.API_BASE_URL}{self.TIMEDTEXT_STRING}{timedtext_url}'
api_url=fetched_site[
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
].replace(
'\\u0026', '&'
).replace(
'\\', ''
)
)
if language:
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
if self.proxies: if self.proxies:
return requests.get(url, proxies=self.proxies).text return requests.get(url, proxies=self.proxies).text
else: else:

File diff suppressed because one or more lines are too long

View File

@ -53,11 +53,11 @@ class TestYouTubeTranscriptApi(TestCase):
def test_get_transcript__fallback_language_is_used(self): def test_get_transcript__fallback_language_is_used(self):
httpretty.register_uri( httpretty.register_uri(
httpretty.GET, httpretty.GET,
'https://www.youtube.com/api/timedtext', 'https://www.youtube.com/watch',
body='' body=load_asset('youtubeWW1.html.static')
) )
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en'])
query_string = httpretty.last_request().querystring query_string = httpretty.last_request().querystring
self.assertIn('lang', query_string) self.assertIn('lang', query_string)