fixed bug; added docstrings for public methods
This commit is contained in:
		
							parent
							
								
									df417be915
								
							
						
					
					
						commit
						c2c49c3c17
					
				|  | @ -1,3 +1,3 @@ | ||||||
| from ._api import YouTubeTranscriptApi | from ._api import YouTubeTranscriptApi | ||||||
| from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript | from ._transcripts import TranscriptList, Transcript | ||||||
| from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable | from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| import requests | import requests | ||||||
| 
 | 
 | ||||||
| from ._transcripts import TranscriptDataFetcher | from ._transcripts import TranscriptListFetcher | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class YouTubeTranscriptApi(): | class YouTubeTranscriptApi(): | ||||||
|  | @ -13,8 +13,7 @@ class YouTubeTranscriptApi(): | ||||||
|         :type video_ids: [str] |         :type video_ids: [str] | ||||||
|         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] |         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to |         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to | ||||||
|         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to |         do so. | ||||||
|         play around with the language codes a bit, to find the one which is working for you! |  | ||||||
|         :type languages: [str] |         :type languages: [str] | ||||||
|         :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving |         :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving | ||||||
|         one of the video transcripts |         one of the video transcripts | ||||||
|  | @ -23,7 +22,7 @@ class YouTubeTranscriptApi(): | ||||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies |         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of |         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of | ||||||
|         video ids, which could not be retrieved |         video ids, which could not be retrieved | ||||||
|         :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} |         :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}) | ||||||
|         """ |         """ | ||||||
|         data = {} |         data = {} | ||||||
|         unretrievable_videos = [] |         unretrievable_videos = [] | ||||||
|  | @ -48,8 +47,7 @@ class YouTubeTranscriptApi(): | ||||||
|         :type video_id: str |         :type video_id: str | ||||||
|         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] |         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to |         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to | ||||||
|         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to |         do so. | ||||||
|         play around with the language codes a bit, to find the one which is working for you! |  | ||||||
|         :type languages: [str] |         :type languages: [str] | ||||||
|         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests |         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests | ||||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies |         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|  | @ -58,4 +56,4 @@ class YouTubeTranscriptApi(): | ||||||
|         """ |         """ | ||||||
|         with requests.Session() as http_client: |         with requests.Session() as http_client: | ||||||
|             http_client.proxies = proxies if proxies else {} |             http_client.proxies = proxies if proxies else {} | ||||||
|             return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() |             return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() | ||||||
|  |  | ||||||
|  | @ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled | ||||||
| from ._settings import WATCH_URL | from ._settings import WATCH_URL | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TranscriptDataFetcher(): | class TranscriptListFetcher(): | ||||||
|     def __init__(self, http_client): |     def __init__(self, http_client): | ||||||
|         self._http_client = http_client |         self._http_client = http_client | ||||||
| 
 | 
 | ||||||
|     def fetch(self, video_id): |     def fetch(self, video_id): | ||||||
|         return TranscriptData.build( |         return TranscriptList.build( | ||||||
|             self._http_client, |             self._http_client, | ||||||
|             video_id, |             video_id, | ||||||
|             self._extract_captions_json(self._fetch_html(video_id), video_id) |             self._extract_captions_json(self._fetch_html(video_id), video_id) | ||||||
|  | @ -48,48 +48,89 @@ class TranscriptDataFetcher(): | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TranscriptData(): | class TranscriptList(): | ||||||
|  |     """ | ||||||
|  |     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available | ||||||
|  |     for a given YouTube video. Also it provides functionality to search for a transcript in a given language. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|     # TODO implement iterator |     # TODO implement iterator | ||||||
| 
 | 
 | ||||||
|     def __init__( |     def __init__(self, video_id, manually_created_transcripts, generated_transcripts): | ||||||
|         self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages |         """ | ||||||
|     ): |         The constructor is only for internal use. Use the static build method instead. | ||||||
|         self._http_client = http_client | 
 | ||||||
|  |         :param video_id: the id of the video this TranscriptList is for | ||||||
|  |         :type video_id: str | ||||||
|  |         :param manually_created_transcripts: dict mapping language codes to the manually created transcripts | ||||||
|  |         :type manually_created_transcripts: dict[str, Transcript] | ||||||
|  |         :param generated_transcripts: dict mapping language codes to the generated transcripts | ||||||
|  |         :type generated_transcripts: dict[str, Transcript] | ||||||
|  |         """ | ||||||
|         self.video_id = video_id |         self.video_id = video_id | ||||||
|         self._manually_created_transcripts = manually_created_transcripts |         self._manually_created_transcripts = manually_created_transcripts | ||||||
|         self._generated_transcripts = generated_transcripts |         self._generated_transcripts = generated_transcripts | ||||||
|         self._translation_languages = translation_languages |  | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def build(http_client, video_id, captions_json): |     def build(http_client, video_id, captions_json): | ||||||
|         manually_created_transcripts = [] |         """ | ||||||
|         generated_transcripts = [] |         Factory method for TranscriptList. | ||||||
|  | 
 | ||||||
|  |         :param http_client: http client which is used to make the transcript retrieving http calls | ||||||
|  |         :type http_client: requests.Session | ||||||
|  |         :param video_id: the id of the video this TranscriptList is for | ||||||
|  |         :type video_id: str | ||||||
|  |         :param captions_json: the JSON parsed from the YouTube page's static HTML | ||||||
|  |         :type captions_json: dict | ||||||
|  |         :return: the created TranscriptList | ||||||
|  |         :rtype: TranscriptList | ||||||
|  |         """ | ||||||
|  |         translation_languages = [ | ||||||
|  |             { | ||||||
|  |                 'language': translation_language['languageName']['simpleText'], | ||||||
|  |                 'language_code': translation_language['languageCode'], | ||||||
|  |             } for translation_language in captions_json['translationLanguages'] | ||||||
|  |         ] | ||||||
|  | 
 | ||||||
|  |         manually_created_transcripts = {} | ||||||
|  |         generated_transcripts = {} | ||||||
| 
 | 
 | ||||||
|         for caption in captions_json['captionTracks']: |         for caption in captions_json['captionTracks']: | ||||||
|             (generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append( |             if caption.get('kind', '') == 'asr': | ||||||
|                 { |                 transcript_dict = generated_transcripts | ||||||
|                     'url': caption['baseUrl'], |             else: | ||||||
|                     'language': caption['name']['simpleText'], |                 transcript_dict = manually_created_transcripts | ||||||
|                     'language_code': caption['languageCode'], | 
 | ||||||
|                     'is_generated': caption.get('kind', '') == 'asr', |             transcript_dict[caption['languageCode']] = Transcript( | ||||||
|                     'is_translatable': caption['isTranslatable'], |                 http_client, | ||||||
|                 } |                 video_id, | ||||||
|  |                 caption['baseUrl'], | ||||||
|  |                 caption['name']['simpleText'], | ||||||
|  |                 caption['languageCode'], | ||||||
|  |                 caption.get('kind', '') == 'asr', | ||||||
|  |                 translation_languages if caption['isTranslatable'] else [] | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|         return TranscriptData( |         return TranscriptList( | ||||||
|             http_client, |  | ||||||
|             video_id, |             video_id, | ||||||
|             manually_created_transcripts, |             manually_created_transcripts, | ||||||
|             generated_transcripts, |             generated_transcripts, | ||||||
|             [ |  | ||||||
|                 { |  | ||||||
|                     'language': translation_language['languageName']['simpleText'], |  | ||||||
|                     'language_code': translation_language['languageCode'], |  | ||||||
|                 } for translation_language in captions_json['translationLanguages'] |  | ||||||
|             ], |  | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     def find_transcript(self, language_codes): |     def find_transcript(self, language_codes): | ||||||
|  |         """ | ||||||
|  |         Finds a transcript for a given language code. Manually created transcripts are returned first and only if none | ||||||
|  |         are found, generated transcripts are used. If you only want manually created transcripts use | ||||||
|  |         find_manually_created_transcript instead. | ||||||
|  | 
 | ||||||
|  |         :param language_codes: A list of language codes in a descending priority. For example, if this is set to | ||||||
|  |         ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if | ||||||
|  |         it fails to do so. | ||||||
|  |         :type language_codes: [str] | ||||||
|  |         :return: the found Transcript | ||||||
|  |         :rtype: Transcript | ||||||
|  |         :raises: NoTranscriptFound | ||||||
|  |         """ | ||||||
|         try: |         try: | ||||||
|             return self.find_manually_created_transcript(language_codes) |             return self.find_manually_created_transcript(language_codes) | ||||||
|         except NoTranscriptFound: |         except NoTranscriptFound: | ||||||
|  | @ -98,25 +139,39 @@ class TranscriptData(): | ||||||
|         return self.find_generated_transcript(language_codes) |         return self.find_generated_transcript(language_codes) | ||||||
| 
 | 
 | ||||||
|     def find_generated_transcript(self, language_codes): |     def find_generated_transcript(self, language_codes): | ||||||
|  |         """ | ||||||
|  |         Finds an automatically generated transcript for a given language code. | ||||||
|  | 
 | ||||||
|  |         :param language_codes: A list of language codes in a descending priority. For example, if this is set to | ||||||
|  |         ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if | ||||||
|  |         it fails to do so. | ||||||
|  |         :type language_codes: [str] | ||||||
|  |         :return: the found Transcript | ||||||
|  |         :rtype: Transcript | ||||||
|  |         :raises: NoTranscriptFound | ||||||
|  |         """ | ||||||
|         return self._find_transcript(language_codes, generated=True) |         return self._find_transcript(language_codes, generated=True) | ||||||
| 
 | 
 | ||||||
|     def find_manually_created_transcript(self, language_codes): |     def find_manually_created_transcript(self, language_codes): | ||||||
|  |         """ | ||||||
|  |         Finds a manually created transcript for a given language code. | ||||||
|  | 
 | ||||||
|  |         :param language_codes: A list of language codes in a descending priority. For example, if this is set to | ||||||
|  |         ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if | ||||||
|  |         it fails to do so. | ||||||
|  |         :type language_codes: [str] | ||||||
|  |         :return: the found Transcript | ||||||
|  |         :rtype: Transcript | ||||||
|  |         :raises: NoTranscriptFound | ||||||
|  |         """ | ||||||
|         return self._find_transcript(language_codes, generated=False) |         return self._find_transcript(language_codes, generated=False) | ||||||
| 
 | 
 | ||||||
|     def _find_transcript(self, language_codes, generated): |     def _find_transcript(self, language_codes, generated): | ||||||
|         transcripts = self._generated_transcripts if generated else self._manually_created_transcripts |         transcripts = self._generated_transcripts if generated else self._manually_created_transcripts | ||||||
| 
 | 
 | ||||||
|         for language_code in language_codes: |         for language_code in language_codes: | ||||||
|             for transcript in transcripts: |             if language_code in transcripts: | ||||||
|                 if transcript['language_code'] == language_code: |                 return transcripts[language_code] | ||||||
|                     return Transcript( |  | ||||||
|                         self._http_client, |  | ||||||
|                         transcript['url'], |  | ||||||
|                         transcript['language'], |  | ||||||
|                         transcript['language_code'], |  | ||||||
|                         transcript['is_generated'], |  | ||||||
|                         self._translation_languages if transcript['is_translatable'] else [] |  | ||||||
|                     ) |  | ||||||
| 
 | 
 | ||||||
|         raise NoTranscriptFound( |         raise NoTranscriptFound( | ||||||
|             self.video_id, |             self.video_id, | ||||||
|  | @ -134,34 +189,59 @@ class TranscriptData(): | ||||||
|         ).format( |         ).format( | ||||||
|             video_id=self.video_id, |             video_id=self.video_id, | ||||||
|             available_manually_created_transcript_languages=self._get_language_description( |             available_manually_created_transcript_languages=self._get_language_description( | ||||||
|                 self._manually_created_transcripts |                 self._manually_created_transcripts.values() | ||||||
|             ), |             ), | ||||||
|             available_generated_transcripts=self._get_language_description( |             available_generated_transcripts=self._get_language_description( | ||||||
|                 self._generated_transcripts |                 self._generated_transcripts.values() | ||||||
|             ), |             ), | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     def _get_language_description(self, transcripts): |     def _get_language_description(self, transcripts): | ||||||
|         return '\n'.join( |         return '\n'.join( | ||||||
|             ' - {language_code} ("{language}")'.format( |             ' - {transcript}'.format(transcript=str(transcript)) | ||||||
|                 language=transcript['language'], |             for transcript in transcripts | ||||||
|                 language_code=transcript['language_code'], |  | ||||||
|             ) for transcript in transcripts |  | ||||||
|         ) if transcripts else 'None' |         ) if transcripts else 'None' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Transcript(): | class Transcript(): | ||||||
|     def __init__(self, http_client, url, language, language_code, is_generated, translation_languages): |     def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): | ||||||
|  |         """ | ||||||
|  |         You probably don't want to initialize this directly. Usually you'll access Transcript objects using a | ||||||
|  |         TranscriptList. | ||||||
|  | 
 | ||||||
|  |         :param http_client: http client which is used to make the transcript retrieving http calls | ||||||
|  |         :type http_client: requests.Session | ||||||
|  |         :param video_id: the id of the video this Transcript is for | ||||||
|  |         :type video_id: str | ||||||
|  |         :param url: the url which needs to be called to fetch the transcript | ||||||
|  |         :param language: the name of the language this transcript uses | ||||||
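|  |         :param language_code: the language code of the language this transcript uses | ||||||
|  |         :param is_generated: True if this transcript was automatically generated, False if it was manually created | ||||||
|  |         :param translation_languages: list of dicts holding the 'language' and 'language_code' of every language this transcript can be translated to (empty if it is not translatable) | ||||||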
|  |         """ | ||||||
|         self._http_client = http_client |         self._http_client = http_client | ||||||
|         self.url = url |         self.video_id = video_id | ||||||
|  |         self._url = url | ||||||
|         self.language = language |         self.language = language | ||||||
|         self.language_code = language_code |         self.language_code = language_code | ||||||
|         self.is_generated = is_generated |         self.is_generated = is_generated | ||||||
|         self.translation_languages = translation_languages |         self.translation_languages = translation_languages | ||||||
| 
 | 
 | ||||||
|     def fetch(self): |     def fetch(self): | ||||||
|  |         """ | ||||||
|  |         Loads the actual transcript data. | ||||||
|  | 
 | ||||||
|  |         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||||
|  |         :rtype: [{'text': str, 'start': float, 'duration': float}] | ||||||
|  |         """ | ||||||
|         return _TranscriptParser().parse( |         return _TranscriptParser().parse( | ||||||
|             self._http_client.get(self.url).text |             self._http_client.get(self._url).text | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def __str__(self): | ||||||
|  |         return '{language_code} ("{language}")'.format( | ||||||
|  |             language=self.language, | ||||||
|  |             language_code=self.language_code, | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
| # TODO integrate translations in future release | # TODO integrate translations in future release | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue