refactored the way transcript information is retrieved and thereby improved error messages
This commit is contained in:
		
							parent
							
								
									54ef72fafd
								
							
						
					
					
						commit
						df417be915
					
				|  | @ -1 +1,3 @@ | |||
| from ._api import YouTubeTranscriptApi | ||||
| from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript | ||||
| from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable | ||||
|  |  | |||
|  | @ -1,44 +1,9 @@ | |||
| import sys | ||||
| 
 | ||||
| # This can only be tested by using different python versions, therefore it is not covered by coverage.py | ||||
| if sys.version_info.major == 2: # pragma: no cover | ||||
|     reload(sys) | ||||
|     sys.setdefaultencoding('utf-8') | ||||
| 
 | ||||
| from xml.etree import ElementTree | ||||
| 
 | ||||
| import re | ||||
| 
 | ||||
| import requests | ||||
| 
 | ||||
| from ._html_unescaping import unescape | ||||
| from ._transcripts import TranscriptDataFetcher | ||||
| 
 | ||||
| 
 | ||||
| class YouTubeTranscriptApi(): | ||||
|     class CouldNotRetrieveTranscript(Exception): | ||||
|         """ | ||||
|         Raised if a transcript could not be retrieved. | ||||
|         """ | ||||
| 
 | ||||
|         ERROR_MESSAGE = ( | ||||
|             'Could not get the transcript for the video {video_url}! ' | ||||
|             'This usually happens if one of the following things is the case:\n' | ||||
|             ' - subtitles have been disabled by the uploader\n' | ||||
|             ' - none of the language codes you provided are valid\n' | ||||
|             ' - none of the languages you provided are supported by the video\n' | ||||
|             ' - the video is no longer available.\n\n' | ||||
|             'If none of these things is the case, please create an issue at ' | ||||
|             'https://github.com/jdepoix/youtube-transcript-api/issues.' | ||||
|             'Please add which version of youtube_transcript_api you are using and make sure that there ' | ||||
|             'are no open issues which already describe your problem!' | ||||
|         ) | ||||
| 
 | ||||
|         def __init__(self, video_id): | ||||
|             super(YouTubeTranscriptApi.CouldNotRetrieveTranscript, self).__init__( | ||||
|                 self.ERROR_MESSAGE.format(video_url=_TranscriptFetcher.WATCH_URL.format(video_id=video_id)) | ||||
|             ) | ||||
|             self.video_id = video_id | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): | ||||
|         """ | ||||
|  | @ -47,7 +12,7 @@ class YouTubeTranscriptApi(): | |||
|         :param video_ids: a list of youtube video ids | ||||
|         :type video_ids: [str] | ||||
|         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to | ||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to | ||||
|         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to | ||||
|         play around with the language codes a bit, to find the one which is working for you! | ||||
|         :type languages: [str] | ||||
|  | @ -91,78 +56,6 @@ class YouTubeTranscriptApi(): | |||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||
|         :rtype: [{'text': str, 'start': float, 'duration': float}] | ||||
|         """ | ||||
|         try: | ||||
|             return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse() | ||||
|         except Exception: | ||||
|             raise YouTubeTranscriptApi.CouldNotRetrieveTranscript(video_id) | ||||
| 
 | ||||
| 
 | ||||
| class _TranscriptFetcher(): | ||||
|     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' | ||||
|     API_BASE_URL = 'https://www.youtube.com/api/' | ||||
|     TIMEDTEXT_STRING = 'timedtext?v=' | ||||
|     NAME_REGEX = re.compile(r'&name=.*?(&)|&name=.*') | ||||
| 
 | ||||
|     def __init__(self, video_id, languages, proxies): | ||||
|         self.video_id = video_id | ||||
|         self.languages = languages | ||||
|         self.proxies = proxies | ||||
| 
 | ||||
|     def fetch(self): | ||||
|         if self.proxies: | ||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text | ||||
|         else: | ||||
|             fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||
|         timedtext_splits = [split[:split.find('"')] | ||||
|                 .replace('\\u0026', '&') | ||||
|                 .replace('\\', '')  | ||||
|                 for split in fetched_site.split(self.TIMEDTEXT_STRING)] | ||||
|         matched_splits = [] | ||||
|         for language in self.languages: | ||||
|             matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] | ||||
|             if matched_splits: | ||||
|                 break | ||||
|         if matched_splits: | ||||
|             timedtext_url = min(matched_splits, key=self._sort_splits) | ||||
|             response = self._execute_api_request(timedtext_url) | ||||
|             if response: | ||||
|                 return response | ||||
| 
 | ||||
|         return None | ||||
| 
 | ||||
|     def _sort_splits(self, matched_split): | ||||
|         """Returns a value related to a given caption track url. | ||||
| 
 | ||||
|         This function is used to sort the matched splits by string  | ||||
|         length because we want non-asr and non-dialect options returned first. | ||||
|         With this in mind, it is remove the 'name' arugument from the url as  | ||||
|         it could possibly make the values inaccurate to what we desire. | ||||
| 
 | ||||
|         matched_split: The caption track url we want to return a value for.         | ||||
|         """ | ||||
|         return len(re.sub(self.NAME_REGEX, r'\1', matched_split)) | ||||
| 
 | ||||
|     def _execute_api_request(self, timedtext_url): | ||||
|         url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) | ||||
|         if self.proxies: | ||||
|             return requests.get(url, proxies=self.proxies).text | ||||
|         else: | ||||
|             return requests.get(url).text | ||||
| 
 | ||||
| 
 | ||||
| class _TranscriptParser(): | ||||
|     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) | ||||
| 
 | ||||
|     def __init__(self, plain_data): | ||||
|         self.plain_data = plain_data | ||||
| 
 | ||||
|     def parse(self): | ||||
|         return [ | ||||
|             { | ||||
|                 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), | ||||
|                 'start': float(xml_element.attrib['start']), | ||||
|                 'duration': float(xml_element.attrib['dur']), | ||||
|             } | ||||
|             for xml_element in ElementTree.fromstring(self.plain_data) | ||||
|             if xml_element.text is not None | ||||
|         ] | ||||
|         with requests.Session() as http_client: | ||||
|             http_client.proxies = proxies if proxies else {} | ||||
|             return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() | ||||
|  |  | |||
|  | @ -0,0 +1,62 @@ | |||
| from ._settings import WATCH_URL | ||||
| 
 | ||||
| 
 | ||||
class CouldNotRetrieveTranscript(Exception):
    """
    Raised if a transcript could not be retrieved.

    The exception message always names the url of the affected video. If a cause is available (see `cause`), the
    message additionally describes this cause and refers to the GitHub issue tracker.
    """
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    # Overridden by subclasses to describe why the transcript could not be retrieved.
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        # The trailing space keeps the url's sentence and the following one from running together.
        'https://github.com/jdepoix/youtube-transcript-api/issues. '
        'Please add which version of youtube_transcript_api you are using '
        'and provide the information needed to replicate the error. '
        'Also make sure that there are no open issues which already describe your problem!'
    )

    def __init__(self, video_id):
        """
        :param video_id: the id of the video for which no transcript could be retrieved
        :type video_id: str
        """
        # video_id has to be set first, as it is read while the message is built.
        self.video_id = video_id
        super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message())

    def _build_error_message(self):
        """
        :return: the error message for this exception; the cause and the GitHub referral are only appended if the
        cause is not empty
        :rtype: str
        """
        cause = self.cause
        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))

        if cause:
            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL

        return error_message

    @property
    def cause(self):
        """
        :return: the description of what caused this error; defaults to CAUSE_MESSAGE
        :rtype: str
        """
        return self.CAUSE_MESSAGE
| 
 | ||||
| 
 | ||||
class VideoUnavailable(CouldNotRetrieveTranscript):
    """
    Raised if the video itself is not available.
    """
    CAUSE_MESSAGE = 'The video is no longer available'
| 
 | ||||
| 
 | ||||
class TranscriptsDisabled(CouldNotRetrieveTranscript):
    """
    Raised if subtitles are disabled for the video.
    """
    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
| 
 | ||||
| 
 | ||||
class NoTranscriptFound(CouldNotRetrieveTranscript):
    """
    Raised if none of the requested language codes matches a transcript of the video.
    """
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        """
        :param video_id: the id of the video for which no transcript was found
        :param requested_language_codes: the language codes which have been requested
        :param transcript_data: object whose string representation is embedded into the error message
        """
        # Both attributes are read by `cause` while the parent constructor builds the message.
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super(NoTranscriptFound, self).__init__(video_id)

    @property
    def cause(self):
        """
        :return: CAUSE_MESSAGE filled with the requested language codes and the transcript data
        :rtype: str
        """
        placeholders = {
            'requested_language_codes': self._requested_language_codes,
            'transcript_data': str(self._transcript_data),
        }
        return self.CAUSE_MESSAGE.format(**placeholders)
|  | @ -0,0 +1 @@ | |||
# URL template of a video's watch page; the ``video_id`` placeholder is filled in via ``str.format``.
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|  | @ -0,0 +1,202 @@ | |||
| import sys | ||||
| 
 | ||||
| # This can only be tested by using different python versions, therefore it is not covered by coverage.py | ||||
| if sys.version_info.major == 2: # pragma: no cover | ||||
|     reload(sys) | ||||
|     sys.setdefaultencoding('utf-8') | ||||
| 
 | ||||
| import json | ||||
| 
 | ||||
| from xml.etree import ElementTree | ||||
| 
 | ||||
| import re | ||||
| 
 | ||||
| from ._html_unescaping import unescape | ||||
| from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled | ||||
| from ._settings import WATCH_URL | ||||
| 
 | ||||
| 
 | ||||
class TranscriptDataFetcher():
    """
    Retrieves the transcript data of a video by extracting the captions JSON from the video's watch page.
    """

    def __init__(self, http_client):
        """
        :param http_client: object used for all requests; its ``get(url)`` has to return an object with a ``text``
        attribute
        """
        self._http_client = http_client

    def fetch(self, video_id):
        """
        :param video_id: the id of the video whose transcript data should be fetched
        :return: the object built by ``TranscriptData.build`` from the captions JSON of the video
        """
        html = self._fetch_html(video_id)
        captions_json = self._extract_captions_json(html, video_id)
        return TranscriptData.build(self._http_client, video_id, captions_json)

    def _extract_captions_json(self, html, video_id):
        """
        :param html: the (unescaped) html of the watch page
        :param video_id: the id of the video, only used for the raised exceptions
        :return: the 'playerCaptionsTracklistRenderer' object of the captions JSON
        :raises VideoUnavailable: if the html contains neither '"captions":' nor '"playabilityStatus":'
        :raises TranscriptsDisabled: if the html contains '"playabilityStatus":', but not '"captions":'
        """
        html_parts = html.split('"captions":')

        if len(html_parts) > 1:
            # The captions JSON reaches from the '"captions":' key up to the following '"videoDetails' key.
            raw_json = html_parts[1].split(',"videoDetails')[0].replace('\n', '')
            return json.loads(raw_json)['playerCaptionsTracklistRenderer']

        if '"playabilityStatus":' in html:
            raise TranscriptsDisabled(video_id)

        raise VideoUnavailable(video_id)

    def _fetch_html(self, video_id):
        """
        :return: the html of the watch page with '\\u0026' replaced by '&' and all remaining backslashes removed
        """
        response = self._http_client.get(WATCH_URL.format(video_id=video_id))
        html = response.text
        html = html.replace('\\u0026', '&')
        return html.replace('\\', '')
| 
 | ||||
| 
 | ||||
class TranscriptData():
    """
    Holds the transcripts which are available for a video, separated into manually created and generated ones,
    and allows looking up a transcript by language code.
    """
    # TODO implement iterator

    def __init__(
        self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages
    ):
        """
        Use `TranscriptData.build` to create an instance from the captions JSON of a video.

        :param http_client: object passed on to the `Transcript` objects returned by the find methods
        :param video_id: the id of the video the transcripts belong to
        :param manually_created_transcripts: dictionaries with the keys 'url', 'language', 'language_code',
        'is_generated' and 'is_translatable'
        :param generated_transcripts: dictionaries with the same keys as manually_created_transcripts
        :param translation_languages: dictionaries with the keys 'language' and 'language_code'
        """
        self._http_client = http_client
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Creates a TranscriptData object from the captions JSON of a video.

        :param http_client: object passed on to the created TranscriptData object
        :param video_id: the id of the video the captions belong to
        :param captions_json: dictionary with the keys 'captionTracks' and 'translationLanguages'
        :return: the created TranscriptData object
        :rtype: TranscriptData
        """
        manually_created_transcripts = []
        generated_transcripts = []

        for caption in captions_json['captionTracks']:
            # Tracks of the kind 'asr' are the generated ones, everything else has been created manually.
            is_generated = caption.get('kind', '') == 'asr'
            transcript = {
                'url': caption['baseUrl'],
                'language': caption['name']['simpleText'],
                'language_code': caption['languageCode'],
                'is_generated': is_generated,
                'is_translatable': caption['isTranslatable'],
            }

            if is_generated:
                generated_transcripts.append(transcript)
            else:
                manually_created_transcripts.append(transcript)

        return TranscriptData(
            http_client,
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            [
                {
                    'language': translation_language['languageName']['simpleText'],
                    'language_code': translation_language['languageCode'],
                } for translation_language in captions_json['translationLanguages']
            ],
        )

    def find_transcript(self, language_codes):
        """
        Finds a transcript for the given language codes. Manually created transcripts are preferred; generated
        ones are only considered if no manually created transcript matches any of the language codes.

        :param language_codes: language codes in a descending priority
        :return: the found transcript
        :rtype: Transcript
        :raises NoTranscriptFound: if neither a manually created nor a generated transcript matches
        """
        try:
            return self.find_manually_created_transcript(language_codes)
        except NoTranscriptFound:
            pass

        return self.find_generated_transcript(language_codes)

    def find_generated_transcript(self, language_codes):
        """
        Finds a generated transcript for the given language codes.

        :param language_codes: language codes in a descending priority
        :return: the found transcript
        :rtype: Transcript
        :raises NoTranscriptFound: if no generated transcript matches any of the language codes
        """
        return self._find_transcript(language_codes, generated=True)

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for the given language codes.

        :param language_codes: language codes in a descending priority
        :return: the found transcript
        :rtype: Transcript
        :raises NoTranscriptFound: if no manually created transcript matches any of the language codes
        """
        return self._find_transcript(language_codes, generated=False)

    def _find_transcript(self, language_codes, generated):
        """
        Returns the transcript for the first of the language codes for which one exists.

        :param language_codes: language codes in a descending priority
        :param generated: whether to search the generated or the manually created transcripts
        :raises NoTranscriptFound: if none of the language codes matches
        """
        transcripts = self._generated_transcripts if generated else self._manually_created_transcripts

        for language_code in language_codes:
            for transcript in transcripts:
                if transcript['language_code'] == language_code:
                    return Transcript(
                        self._http_client,
                        transcript['url'],
                        transcript['language'],
                        transcript['language_code'],
                        transcript['is_generated'],
                        self._translation_languages if transcript['is_translatable'] else []
                    )

        raise NoTranscriptFound(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        """
        :return: a description of the languages in which transcripts are available for this video
        """
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                self._manually_created_transcripts
            ),
            available_generated_transcripts=self._get_language_description(
                self._generated_transcripts
            ),
        )

    def _get_language_description(self, transcripts):
        """
        :return: one line per transcript naming its language code and language, or 'None' if there are none
        """
        return '\n'.join(
            ' - {language_code} ("{language}")'.format(
                language=transcript['language'],
                language_code=transcript['language_code'],
            ) for transcript in transcripts
        ) if transcripts else 'None'
| 
 | ||||
| 
 | ||||
class Transcript():
    """
    A single transcript of a video, which can be downloaded by calling `fetch`.
    """

    def __init__(self, http_client, url, language, language_code, is_generated, translation_languages):
        """
        :param http_client: object used to request the transcript; its ``get(url)`` has to return an object with
        a ``text`` attribute
        :param url: the url the transcript is requested from
        :param language: the name of the language of this transcript
        :param language_code: the language code of this transcript
        :param is_generated: whether this transcript is a generated one
        :param translation_languages: dictionaries describing the available translation languages
        """
        self._http_client = http_client
        self.url, self.language, self.language_code = url, language, language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages

    def fetch(self):
        """
        Requests the transcript and parses the response.

        :return: the result of parsing the response body with `_TranscriptParser`
        """
        parser = _TranscriptParser()
        response = self._http_client.get(self.url)
        return parser.parse(response.text)
| 
 | ||||
| # TODO integrate translations in future release | ||||
| #     @property | ||||
| #     def is_translatable(self): | ||||
| #         return len(self.translation_languages) > 0 | ||||
| # | ||||
| # | ||||
| # class TranslatableTranscript(Transcript): | ||||
| #     def __init__(self, http_client, url, translation_languages): | ||||
| #         super(TranslatableTranscript, self).__init__(http_client, url) | ||||
| #         self._translation_languages = translation_languages | ||||
| #         self._translation_language_codes = {language['language_code'] for language in translation_languages} | ||||
| # | ||||
| # | ||||
| #     def translate(self, language_code): | ||||
| #         if language_code not in self._translation_language_codes: | ||||
| #             raise TranslatableTranscript.TranslationLanguageNotAvailable() | ||||
| # | ||||
| #         return Transcript( | ||||
| #             self._http_client, | ||||
| #             '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code) | ||||
| #         ) | ||||
| 
 | ||||
| 
 | ||||
| class _TranscriptParser(): | ||||
|     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) | ||||
| 
 | ||||
|     def parse(self, plain_data): | ||||
|         return [ | ||||
|             { | ||||
|                 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), | ||||
|                 'start': float(xml_element.attrib['start']), | ||||
|                 'duration': float(xml_element.attrib['dur']), | ||||
|             } | ||||
|             for xml_element in ElementTree.fromstring(plain_data) | ||||
|             if xml_element.text is not None | ||||
|         ] | ||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							|  | @ -5,7 +5,7 @@ import os | |||
| 
 | ||||
| import httpretty | ||||
| 
 | ||||
| from youtube_transcript_api._api import YouTubeTranscriptApi | ||||
| from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable, NoTranscriptFound, TranscriptsDisabled | ||||
| 
 | ||||
| 
 | ||||
| def load_asset(filename): | ||||
|  | @ -64,15 +64,29 @@ class TestYouTubeTranscriptApi(TestCase): | |||
|         self.assertEqual(len(query_string['lang']), 1) | ||||
|         self.assertEqual(query_string['lang'][0], 'en') | ||||
| 
 | ||||
|     def test_get_transcript__exception_is_raised_when_not_available(self): | ||||
|     def test_get_transcript__exception_if_video_unavailable(self): | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/api/timedtext', | ||||
|             body='' | ||||
|             'https://www.youtube.com/watch', | ||||
|             body=load_asset('youtube_video_unavailable.html.static') | ||||
|         ) | ||||
| 
 | ||||
|         with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript): | ||||
|             YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') | ||||
|         with self.assertRaises(VideoUnavailable): | ||||
|             YouTubeTranscriptApi.get_transcript('abc') | ||||
| 
 | ||||
|     def test_get_transcript__exception_if_transcripts_disabled(self): | ||||
|         httpretty.register_uri( | ||||
|             httpretty.GET, | ||||
|             'https://www.youtube.com/watch', | ||||
|             body=load_asset('youtube_transcripts_disabled.html.static') | ||||
|         ) | ||||
| 
 | ||||
|         with self.assertRaises(TranscriptsDisabled): | ||||
|             YouTubeTranscriptApi.get_transcript('dsMFmonKDD4') | ||||
| 
 | ||||
|     def test_get_transcript__exception_if_language_unavailable(self): | ||||
|         with self.assertRaises(NoTranscriptFound): | ||||
|             YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz']) | ||||
| 
 | ||||
|     def test_get_transcripts(self): | ||||
|         video_id_1 = 'video_id_1' | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue