diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index e2ed0aa..c9bb4eb 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -1,3 +1,3 @@ from ._api import YouTubeTranscriptApi -from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript +from ._transcripts import TranscriptList, Transcript from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 98ce16c..3476b9b 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,6 +1,6 @@ import requests -from ._transcripts import TranscriptDataFetcher +from ._transcripts import TranscriptListFetcher class YouTubeTranscriptApi(): @@ -13,8 +13,7 @@ class YouTubeTranscriptApi(): :type video_ids: [str] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to - do so. As I can't provide a complete list of all working language codes with full certainty, you may have to - play around with the language codes a bit, to find the one which is working for you! + do so. 
:type languages: [str] :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving one of the video transcripts @@ -23,7 +22,7 @@ class YouTubeTranscriptApi(): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved - :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} + :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]) """ data = {} unretrievable_videos = [] @@ -48,8 +47,7 @@ class YouTubeTranscriptApi(): :type video_id: str :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to - do so. As I can't provide a complete list of all working language codes with full certainty, you may have to - play around with the language codes a bit, to find the one which is working for you! + do so. 
:type languages: [str] :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies @@ -58,4 +56,4 @@ class YouTubeTranscriptApi(): """ with requests.Session() as http_client: http_client.proxies = proxies if proxies else {} - return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() + return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 37a272b..9e09258 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled from ._settings import WATCH_URL -class TranscriptDataFetcher(): +class TranscriptListFetcher(): def __init__(self, http_client): self._http_client = http_client def fetch(self, video_id): - return TranscriptData.build( + return TranscriptList.build( self._http_client, video_id, self._extract_captions_json(self._fetch_html(video_id), video_id) @@ -48,48 +48,89 @@ class TranscriptDataFetcher(): ) -class TranscriptData(): +class TranscriptList(): + """ + This object represents a list of transcripts. It can be iterated over to list all transcripts which are available + for a given YouTube video. Also it provides functionality to search for a transcript in a given language. + """ + # TODO implement iterator - def __init__( - self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages - ): - self._http_client = http_client + def __init__(self, video_id, manually_created_transcripts, generated_transcripts): + """ + The constructor is only for internal use. Use the static build method instead. 
+ :param video_id: the id of the video this TranscriptList is for + :type video_id: str + :param manually_created_transcripts: dict mapping language codes to the manually created transcripts + :type manually_created_transcripts: dict[str, Transcript] + :param generated_transcripts: dict mapping language codes to the generated transcripts + :type generated_transcripts: dict[str, Transcript] + """ self.video_id = video_id self._manually_created_transcripts = manually_created_transcripts self._generated_transcripts = generated_transcripts - self._translation_languages = translation_languages @staticmethod def build(http_client, video_id, captions_json): - manually_created_transcripts = [] - generated_transcripts = [] + """ + Factory method for TranscriptList. + + :param http_client: http client which is used to make the transcript retrieving http calls + :type http_client: requests.Session + :param video_id: the id of the video this TranscriptList is for + :type video_id: str + :param captions_json: the JSON parsed from the YouTube page's static HTML + :type captions_json: dict + :return: the created TranscriptList + :rtype: TranscriptList + """ + translation_languages = [ + { + 'language': translation_language['languageName']['simpleText'], + 'language_code': translation_language['languageCode'], + } for translation_language in captions_json['translationLanguages'] + ] + + manually_created_transcripts = {} + generated_transcripts = {} for caption in captions_json['captionTracks']: - (generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append( - { - 'url': caption['baseUrl'], - 'language': caption['name']['simpleText'], - 'language_code': caption['languageCode'], - 'is_generated': caption.get('kind', '') == 'asr', - 'is_translatable': caption['isTranslatable'], - } + if caption.get('kind', '') == 'asr': + transcript_dict = generated_transcripts + else: + transcript_dict = manually_created_transcripts + + 
transcript_dict[caption['languageCode']] = Transcript( + http_client, + video_id, + caption['baseUrl'], + caption['name']['simpleText'], + caption['languageCode'], + caption.get('kind', '') == 'asr', + translation_languages if caption['isTranslatable'] else [] ) - return TranscriptData( - http_client, + return TranscriptList( video_id, manually_created_transcripts, generated_transcripts, - [ - { - 'language': translation_language['languageName']['simpleText'], - 'language_code': translation_language['languageCode'], - } for translation_language in captions_json['translationLanguages'] - ], ) def find_transcript(self, language_codes): + """ + Finds a transcript for a given language code. Manually created transcripts are returned first and only if none + are found, generated transcripts are used. If you only want generated transcripts use + find_generated_transcript instead. + + :param language_codes: A list of language codes in a descending priority. For example, if this is set to + ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if + it fails to do so. + :type language_codes: [str] + :return: the found Transcript + :rtype: Transcript + :raises: NoTranscriptFound + """ try: return self.find_manually_created_transcript(language_codes) except NoTranscriptFound: @@ -98,25 +139,39 @@ class TranscriptData(): return self.find_generated_transcript(language_codes) def find_generated_transcript(self, language_codes): + """ + Finds an automatically generated transcript for a given language code. + + :param language_codes: A list of language codes in a descending priority. For example, if this is set to + ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if + it fails to do so. 
+ :type language_codes: [str] + :return: the found Transcript + :rtype: Transcript + :raises: NoTranscriptFound + """ return self._find_transcript(language_codes, generated=True) def find_manually_created_transcript(self, language_codes): + """ + Finds a manually created transcript for a given language code. + + :param language_codes: A list of language codes in a descending priority. For example, if this is set to + ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if + it fails to do so. + :type language_codes: [str] + :return: the found Transcript + :rtype: Transcript + :raises: NoTranscriptFound + """ return self._find_transcript(language_codes, generated=False) def _find_transcript(self, language_codes, generated): transcripts = self._generated_transcripts if generated else self._manually_created_transcripts for language_code in language_codes: - for transcript in transcripts: - if transcript['language_code'] == language_code: - return Transcript( - self._http_client, - transcript['url'], - transcript['language'], - transcript['language_code'], - transcript['is_generated'], - self._translation_languages if transcript['is_translatable'] else [] - ) + if language_code in transcripts: + return transcripts[language_code] raise NoTranscriptFound( self.video_id, @@ -134,34 +189,59 @@ class TranscriptData(): ).format( video_id=self.video_id, available_manually_created_transcript_languages=self._get_language_description( - self._manually_created_transcripts + self._manually_created_transcripts.values() ), available_generated_transcripts=self._get_language_description( - self._generated_transcripts + self._generated_transcripts.values() ), ) def _get_language_description(self, transcripts): return '\n'.join( - ' - {language_code} ("{language}")'.format( - language=transcript['language'], - language_code=transcript['language_code'], - ) for transcript in transcripts + ' - {transcript}'.format(transcript=str(transcript)) 
+ for transcript in transcripts ) if transcripts else 'None' class Transcript(): - def __init__(self, http_client, url, language, language_code, is_generated, translation_languages): + def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): + """ + You probably don't want to initialize this directly. Usually you'll access Transcript objects using a + TranscriptList. + + :param http_client: http client which is used to make the transcript retrieving http calls + :type http_client: requests.Session + :param video_id: the id of the video this Transcript is for + :type video_id: str + :param url: the url which needs to be called to fetch the transcript + :param language: the name of the language this transcript uses + :param language_code: the language code of the language this transcript uses + :param is_generated: whether this transcript was automatically generated + :param translation_languages: the languages this transcript can be translated to (empty if not translatable) + """ self._http_client = http_client - self.url = url + self.video_id = video_id + self._url = url self.language = language self.language_code = language_code self.is_generated = is_generated self.translation_languages = translation_languages def fetch(self): + """ + Loads the actual transcript data. + + :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys + :rtype: [{'text': str, 'start': float, 'duration': float}] + """ return _TranscriptParser().parse( - self._http_client.get(self.url).text + self._http_client.get(self._url).text + ) + + def __str__(self): + return '{language_code} ("{language}")'.format( + language=self.language, + language_code=self.language_code, ) # TODO integrate translations in future release