Implemented code to more thoroughly find a lanuages captions
This commit is contained in:
parent
eee2b9ad01
commit
de1ddf0824
|
@ -99,7 +99,7 @@ class YouTubeTranscriptApi():
|
|||
|
||||
class _TranscriptFetcher():
|
||||
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
||||
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
|
||||
API_BASE_URL = 'https://www.youtube.com/api/'
|
||||
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
|
||||
TIMEDTEXT_STRING = 'timedtext?v='
|
||||
|
||||
|
@ -107,39 +107,28 @@ class _TranscriptFetcher():
|
|||
self.video_id = video_id
|
||||
self.languages = languages
|
||||
self.proxies = proxies
|
||||
self.matched_splits = []
|
||||
|
||||
def fetch(self):
|
||||
if self.proxies:
|
||||
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
|
||||
else:
|
||||
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
|
||||
timedtext_splits = fetched_site.split(self.TIMEDTEXT_STRING)
|
||||
timedtext_url_start = (
|
||||
timedtext_splits[2].find(self.TIMEDTEXT_STRING)
|
||||
+ len(timedtext_splits[0])
|
||||
+ len(timedtext_splits[1])
|
||||
+ len(self.TIMEDTEXT_STRING) + 1
|
||||
)
|
||||
|
||||
for language in (self.languages if self.languages else [None,]):
|
||||
response = self._execute_api_request(fetched_site, timedtext_url_start, language)
|
||||
timedtext_splits = [split[:split.find('"')].replace('\\u0026', '&').replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)]
|
||||
for language in (self.languages if self.languages else ['en']):
|
||||
self.matched_splits = [split for split in timedtext_splits if f'&lang={language}' in split]
|
||||
if self.matched_splits:
|
||||
break
|
||||
if self.matched_splits:
|
||||
timedtext_url = min(self.matched_splits, key=len)
|
||||
response = self._execute_api_request(timedtext_url, language)
|
||||
if response:
|
||||
return response
|
||||
|
||||
return None
|
||||
|
||||
def _execute_api_request(self, fetched_site, timedtext_url_start, language):
|
||||
url = self.API_BASE_URL.format(
|
||||
api_url=fetched_site[
|
||||
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
|
||||
].replace(
|
||||
'\\u0026', '&'
|
||||
).replace(
|
||||
'\\', ''
|
||||
)
|
||||
)
|
||||
if language:
|
||||
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
|
||||
def _execute_api_request(self, timedtext_url, language):
|
||||
url = f'{self.API_BASE_URL}{self.TIMEDTEXT_STRING}{timedtext_url}'
|
||||
if self.proxies:
|
||||
return requests.get(url, proxies=self.proxies).text
|
||||
else:
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -53,11 +53,11 @@ class TestYouTubeTranscriptApi(TestCase):
|
|||
def test_get_transcript__fallback_language_is_used(self):
|
||||
httpretty.register_uri(
|
||||
httpretty.GET,
|
||||
'https://www.youtube.com/api/timedtext',
|
||||
body=''
|
||||
'https://www.youtube.com/watch',
|
||||
body=load_asset('youtubeWW1.html.static')
|
||||
)
|
||||
|
||||
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
|
||||
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en'])
|
||||
query_string = httpretty.last_request().querystring
|
||||
|
||||
self.assertIn('lang', query_string)
|
||||
|
|
Loading…
Reference in New Issue