fixed bug; added docstrings for public methods
This commit is contained in:
		
							parent
							
								
									df417be915
								
							
						
					
					
						commit
						c2c49c3c17
					
				|  | @ -1,3 +1,3 @@ | |||
| from ._api import YouTubeTranscriptApi | ||||
| from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript | ||||
| from ._transcripts import TranscriptList, Transcript | ||||
| from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| import requests | ||||
| 
 | ||||
| from ._transcripts import TranscriptDataFetcher | ||||
| from ._transcripts import TranscriptListFetcher | ||||
| 
 | ||||
| 
 | ||||
| class YouTubeTranscriptApi(): | ||||
|  | @ -13,8 +13,7 @@ class YouTubeTranscriptApi(): | |||
|         :type video_ids: [str] | ||||
|         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to | ||||
|         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to | ||||
|         play around with the language codes a bit, to find the one which is working for you! | ||||
|         do so. | ||||
|         :type languages: [str] | ||||
|         :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving | ||||
|         one of the video transcripts | ||||
|  | @ -23,7 +22,7 @@ class YouTubeTranscriptApi(): | |||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||
|         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of | ||||
|         video ids, which could not be retrieved | ||||
|         :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} | ||||
|         :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}) | ||||
|         """ | ||||
|         data = {} | ||||
|         unretrievable_videos = [] | ||||
|  | @ -48,8 +47,7 @@ class YouTubeTranscriptApi(): | |||
|         :type video_id: str | ||||
|         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||
|         it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to | ||||
|         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to | ||||
|         play around with the language codes a bit, to find the one which is working for you! | ||||
|         do so. | ||||
|         :type languages: [str] | ||||
|         :param proxies: a dictionary mapping of http and https proxies to be used for the network requests | ||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||
|  | @ -58,4 +56,4 @@ class YouTubeTranscriptApi(): | |||
|         """ | ||||
|         with requests.Session() as http_client: | ||||
|             http_client.proxies = proxies if proxies else {} | ||||
|             return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() | ||||
|             return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() | ||||
|  |  | |||
|  | @ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled | |||
| from ._settings import WATCH_URL | ||||
| 
 | ||||
| 
 | ||||
| class TranscriptDataFetcher(): | ||||
| class TranscriptListFetcher(): | ||||
|     def __init__(self, http_client): | ||||
|         self._http_client = http_client | ||||
| 
 | ||||
|     def fetch(self, video_id): | ||||
|         return TranscriptData.build( | ||||
|         return TranscriptList.build( | ||||
|             self._http_client, | ||||
|             video_id, | ||||
|             self._extract_captions_json(self._fetch_html(video_id), video_id) | ||||
|  | @ -48,48 +48,89 @@ class TranscriptDataFetcher(): | |||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| class TranscriptData(): | ||||
| class TranscriptList(): | ||||
|     """ | ||||
|     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available | ||||
|     for a given YouTube video. Also it provides functionality to search for a transcript in a given language. | ||||
|     """ | ||||
| 
 | ||||
|     # TODO implement iterator | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages | ||||
|     ): | ||||
|         self._http_client = http_client | ||||
|     def __init__(self, video_id, manually_created_transcripts, generated_transcripts): | ||||
|         """ | ||||
|         The constructor is only for internal use. Use the static build method instead. | ||||
| 
 | ||||
|         :param video_id: the id of the video this TranscriptList is for | ||||
|         :type video_id: str | ||||
|         :param manually_created_transcripts: dict mapping language codes to the manually created transcripts | ||||
|         :type manually_created_transcripts: dict[str, Transcript] | ||||
|         :param generated_transcripts: dict mapping language codes to the generated transcripts | ||||
|         :type generated_transcripts: dict[str, Transcript] | ||||
|         """ | ||||
|         self.video_id = video_id | ||||
|         self._manually_created_transcripts = manually_created_transcripts | ||||
|         self._generated_transcripts = generated_transcripts | ||||
|         self._translation_languages = translation_languages | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def build(http_client, video_id, captions_json): | ||||
|         manually_created_transcripts = [] | ||||
|         generated_transcripts = [] | ||||
|         """ | ||||
|         Factory method for TranscriptList. | ||||
| 
 | ||||
|         for caption in captions_json['captionTracks']: | ||||
|             (generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append( | ||||
|                 { | ||||
|                     'url': caption['baseUrl'], | ||||
|                     'language': caption['name']['simpleText'], | ||||
|                     'language_code': caption['languageCode'], | ||||
|                     'is_generated': caption.get('kind', '') == 'asr', | ||||
|                     'is_translatable': caption['isTranslatable'], | ||||
|                 } | ||||
|             ) | ||||
| 
 | ||||
|         return TranscriptData( | ||||
|             http_client, | ||||
|             video_id, | ||||
|             manually_created_transcripts, | ||||
|             generated_transcripts, | ||||
|             [ | ||||
|         :param http_client: http client which is used to make the transcript retrieving http calls | ||||
|         :type http_client: requests.Session | ||||
|         :param video_id: the id of the video this TranscriptList is for | ||||
|         :type video_id: str | ||||
|         :param captions_json: the JSON parsed from the YouTube page's static HTML | ||||
|         :type captions_json: dict | ||||
|         :return: the created TranscriptList | ||||
|         :rtype: TranscriptList | ||||
|         """ | ||||
|         translation_languages = [ | ||||
|             { | ||||
|                 'language': translation_language['languageName']['simpleText'], | ||||
|                 'language_code': translation_language['languageCode'], | ||||
|             } for translation_language in captions_json['translationLanguages'] | ||||
|             ], | ||||
|         ] | ||||
| 
 | ||||
|         manually_created_transcripts = {} | ||||
|         generated_transcripts = {} | ||||
| 
 | ||||
|         for caption in captions_json['captionTracks']: | ||||
|             if caption.get('kind', '') == 'asr': | ||||
|                 transcript_dict = generated_transcripts | ||||
|             else: | ||||
|                 transcript_dict = manually_created_transcripts | ||||
| 
 | ||||
|             transcript_dict[caption['languageCode']] = Transcript( | ||||
|                 http_client, | ||||
|                 video_id, | ||||
|                 caption['baseUrl'], | ||||
|                 caption['name']['simpleText'], | ||||
|                 caption['languageCode'], | ||||
|                 caption.get('kind', '') == 'asr', | ||||
|                 translation_languages if caption['isTranslatable'] else [] | ||||
|             ) | ||||
| 
 | ||||
|         return TranscriptList( | ||||
|             video_id, | ||||
|             manually_created_transcripts, | ||||
|             generated_transcripts, | ||||
|         ) | ||||
| 
 | ||||
|     def find_transcript(self, language_codes): | ||||
|         """ | ||||
|         Finds a transcript for a given language code. Manually created transcripts are returned first and only if none | ||||
|         are found, generated transcripts are used. If you only want manually created transcripts use | ||||
|         find_manually_created_transcript instead. | ||||
| 
 | ||||
|         :param language_codes: A list of language codes in a descending priority. For example, if this is set to | ||||
|         ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if | ||||
|         it fails to do so. | ||||
|         :type language_codes: [str] | ||||
|         :return: the found Transcript | ||||
|         :rtype: Transcript | ||||
|         :raises: NoTranscriptFound | ||||
|         """ | ||||
|         try: | ||||
|             return self.find_manually_created_transcript(language_codes) | ||||
|         except NoTranscriptFound: | ||||
|  | @ -98,25 +139,39 @@ class TranscriptData(): | |||
|         return self.find_generated_transcript(language_codes) | ||||
| 
 | ||||
|     def find_generated_transcript(self, language_codes): | ||||
|         """ | ||||
|         Finds an automatically generated transcript for a given language code. | ||||
| 
 | ||||
|         :param language_codes: A list of language codes in a descending priority. For example, if this is set to | ||||
|         ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if | ||||
|         it fails to do so. | ||||
|         :type language_codes: [str] | ||||
|         :return: the found Transcript | ||||
|         :rtype: Transcript | ||||
|         :raises: NoTranscriptFound | ||||
|         """ | ||||
|         return self._find_transcript(language_codes, generated=True) | ||||
| 
 | ||||
|     def find_manually_created_transcript(self, language_codes): | ||||
|         """ | ||||
|         Finds a manually created transcript for a given language code. | ||||
| 
 | ||||
|         :param language_codes: A list of language codes in a descending priority. For example, if this is set to | ||||
|         ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if | ||||
|         it fails to do so. | ||||
|         :type language_codes: [str] | ||||
|         :return: the found Transcript | ||||
|         :rtype: Transcript | ||||
|         :raises: NoTranscriptFound | ||||
|         """ | ||||
|         return self._find_transcript(language_codes, generated=False) | ||||
| 
 | ||||
|     def _find_transcript(self, language_codes, generated): | ||||
|         transcripts = self._generated_transcripts if generated else self._manually_created_transcripts | ||||
| 
 | ||||
|         for language_code in language_codes: | ||||
|             for transcript in transcripts: | ||||
|                 if transcript['language_code'] == language_code: | ||||
|                     return Transcript( | ||||
|                         self._http_client, | ||||
|                         transcript['url'], | ||||
|                         transcript['language'], | ||||
|                         transcript['language_code'], | ||||
|                         transcript['is_generated'], | ||||
|                         self._translation_languages if transcript['is_translatable'] else [] | ||||
|                     ) | ||||
|             if language_code in transcripts: | ||||
|                 return transcripts[language_code] | ||||
| 
 | ||||
|         raise NoTranscriptFound( | ||||
|             self.video_id, | ||||
|  | @ -134,34 +189,59 @@ class TranscriptData(): | |||
|         ).format( | ||||
|             video_id=self.video_id, | ||||
|             available_manually_created_transcript_languages=self._get_language_description( | ||||
|                 self._manually_created_transcripts | ||||
|                 self._manually_created_transcripts.values() | ||||
|             ), | ||||
|             available_generated_transcripts=self._get_language_description( | ||||
|                 self._generated_transcripts | ||||
|                 self._generated_transcripts.values() | ||||
|             ), | ||||
|         ) | ||||
| 
 | ||||
|     def _get_language_description(self, transcripts): | ||||
|         return '\n'.join( | ||||
|             ' - {language_code} ("{language}")'.format( | ||||
|                 language=transcript['language'], | ||||
|                 language_code=transcript['language_code'], | ||||
|             ) for transcript in transcripts | ||||
|             ' - {transcript}'.format(transcript=str(transcript)) | ||||
|             for transcript in transcripts | ||||
|         ) if transcripts else 'None' | ||||
| 
 | ||||
| 
 | ||||
| class Transcript(): | ||||
|     def __init__(self, http_client, url, language, language_code, is_generated, translation_languages): | ||||
|     def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): | ||||
|         """ | ||||
|         You probably don't want to initialize this directly. Usually you'll access Transcript objects using a | ||||
|         TranscriptList. | ||||
| 
 | ||||
|         :param http_client: http client which is used to make the transcript retrieving http calls | ||||
|         :type http_client: requests.Session | ||||
|         :param video_id: the id of the video this Transcript is for | ||||
|         :type video_id: str | ||||
|         :param url: the url which needs to be called to fetch the transcript | ||||
|         :param language: the name of the language this transcript uses | ||||
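|         :param language_code: the language code of the language this transcript uses | ||||
|         :param is_generated: whether this transcript was generated automatically rather than created manually | ||||
|         :param translation_languages: the languages available for translating this transcript (empty if it is not translatable) | ||||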
|         """ | ||||
|         self._http_client = http_client | ||||
|         self.url = url | ||||
|         self.video_id = video_id | ||||
|         self._url = url | ||||
|         self.language = language | ||||
|         self.language_code = language_code | ||||
|         self.is_generated = is_generated | ||||
|         self.translation_languages = translation_languages | ||||
| 
 | ||||
|     def fetch(self): | ||||
|         """ | ||||
|         Loads the actual transcript data. | ||||
| 
 | ||||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||
|         :rtype: [{'text': str, 'start': float, 'duration': float}] | ||||
|         """ | ||||
|         return _TranscriptParser().parse( | ||||
|             self._http_client.get(self.url).text | ||||
|             self._http_client.get(self._url).text | ||||
|         ) | ||||
| 
 | ||||
|     def __str__(self): | ||||
|         return '{language_code} ("{language}")'.format( | ||||
|             language=self.language, | ||||
|             language_code=self.language_code, | ||||
|         ) | ||||
| 
 | ||||
| # TODO integrate translations in future release | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue