fixed bug; added docstrings for public methods

2019-12-11 11:42:14 +01:00 · 2019-12-11 11:42:14 +01:00 · c2c49c3c17
parent df417be915
commit c2c49c3c17
3 changed files with 131 additions and 53 deletions
--- a/youtube_transcript_api/init.py
+++ b/youtube_transcript_api/init.py
@ -1,3 +1,3 @@
 from ._api import YouTubeTranscriptApi
-from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript
+from ._transcripts import TranscriptList, Transcript
 from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@ -1,6 +1,6 @@
 import requests

-from ._transcripts import TranscriptDataFetcher
+from ._transcripts import TranscriptListFetcher


 class YouTubeTranscriptApi():
@ -13,8 +13,7 @@ class YouTubeTranscriptApi():
        :type video_ids: [str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
-        do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
-        play around with the language codes a bit, to find the one which is working for you!
+        do so.
        :type languages: [str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
        one of the video transcripts
@ -23,7 +22,7 @@ class YouTubeTranscriptApi():
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
-        :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
+        :rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str])
        """
        data = {}
        unretrievable_videos = []
@ -48,8 +47,7 @@ class YouTubeTranscriptApi():
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
-        do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
-        play around with the language codes a bit, to find the one which is working for you!
+        do so.
        :type languages: [str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
@ -58,4 +56,4 @@ class YouTubeTranscriptApi():
        """
        with requests.Session() as http_client:
            http_client.proxies = proxies if proxies else {}
-            return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
+            return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
 from ._settings import WATCH_URL


-class TranscriptDataFetcher():
+class TranscriptListFetcher():
    def __init__(self, http_client):
        self._http_client = http_client

    def fetch(self, video_id):
-        return TranscriptData.build(
+        return TranscriptList.build(
            self._http_client,
            video_id,
            self._extract_captions_json(self._fetch_html(video_id), video_id)
@ -48,48 +48,89 @@ class TranscriptDataFetcher():
        )


-class TranscriptData():
+class TranscriptList():
+    """
+    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
+    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
+    """
+
    # TODO implement iterator

-    def __init__(
-        self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages
-    ):
-        self._http_client = http_client
+    def __init__(self, video_id, manually_created_transcripts, generated_transcripts):
+        """
+        The constructor is only for internal use. Use the static build method instead.
+
+        :param video_id: the id of the video this TranscriptList is for
+        :type video_id: str
+        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
+        :type manually_created_transcripts: dict[str, Transcript]
+        :param generated_transcripts: dict mapping language codes to the generated transcripts
+        :type generated_transcripts: dict[str, Transcript]
+        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
-        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
-        manually_created_transcripts = []
-        generated_transcripts = []
+        """
+        Factory method for TranscriptList.
+
+        :param http_client: http client which is used to make the transcript retrieving http calls
+        :type http_client: requests.Session
+        :param video_id: the id of the video this TranscriptList is for
+        :type video_id: str
+        :param captions_json: the JSON parsed from the YouTube page's static HTML
+        :type captions_json: dict
+        :return: the created TranscriptList
+        :rtype: TranscriptList
+        """
+        translation_languages = [
+            {
+                'language': translation_language['languageName']['simpleText'],
+                'language_code': translation_language['languageCode'],
+            } for translation_language in captions_json['translationLanguages']
+        ]
+
+        manually_created_transcripts = {}
+        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
-            (generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append(
-                {
-                    'url': caption['baseUrl'],
-                    'language': caption['name']['simpleText'],
-                    'language_code': caption['languageCode'],
-                    'is_generated': caption.get('kind', '') == 'asr',
-                    'is_translatable': caption['isTranslatable'],
-                }
+            if caption.get('kind', '') == 'asr':
+                transcript_dict = generated_transcripts
+            else:
+                transcript_dict = manually_created_transcripts
+
+            transcript_dict[caption['languageCode']] = Transcript(
+                http_client,
+                video_id,
+                caption['baseUrl'],
+                caption['name']['simpleText'],
+                caption['languageCode'],
+                caption.get('kind', '') == 'asr',
+                translation_languages if caption['isTranslatable'] else []
            )

-        return TranscriptData(
-            http_client,
+        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
-            [
-                {
-                    'language': translation_language['languageName']['simpleText'],
-                    'language_code': translation_language['languageCode'],
-                } for translation_language in captions_json['translationLanguages']
-            ],
        )

    def find_transcript(self, language_codes):
+        """
+        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
+        are found, generated transcripts are used. If you only want manually created transcripts use
+        find_manually_created_transcript instead.
+
+        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
+        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
+        it fails to do so.
+        :type language_codes: [str]
+        :return: the found Transcript
+        :rtype: Transcript
+        :raises: NoTranscriptFound
+        """
        try:
            return self.find_manually_created_transcript(language_codes)
        except NoTranscriptFound:
@ -98,25 +139,39 @@ class TranscriptData():
        return self.find_generated_transcript(language_codes)

    def find_generated_transcript(self, language_codes):
+        """
+        Finds an automatically generated transcript for a given language code.
+
+        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
+        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
+        it fails to do so.
+        :type language_codes: [str]
+        :return: the found Transcript
+        :rtype: Transcript
+        :raises: NoTranscriptFound
+        """
        return self._find_transcript(language_codes, generated=True)

    def find_manually_created_transcript(self, language_codes):
+        """
+        Finds a manually created transcript for a given language code.
+
+        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
+        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
+        it fails to do so.
+        :type language_codes: [str]
+        :return: the found Transcript
+        :rtype: Transcript
+        :raises: NoTranscriptFound
+        """
        return self._find_transcript(language_codes, generated=False)

    def _find_transcript(self, language_codes, generated):
        transcripts = self._generated_transcripts if generated else self._manually_created_transcripts

        for language_code in language_codes:
-            for transcript in transcripts:
-                if transcript['language_code'] == language_code:
-                    return Transcript(
-                        self._http_client,
-                        transcript['url'],
-                        transcript['language'],
-                        transcript['language_code'],
-                        transcript['is_generated'],
-                        self._translation_languages if transcript['is_translatable'] else []
-                    )
+            if language_code in transcripts:
+                return transcripts[language_code]

        raise NoTranscriptFound(
            self.video_id,
@ -134,34 +189,59 @@ class TranscriptData():
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
-                self._manually_created_transcripts
+                self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
-                self._generated_transcripts
+                self._generated_transcripts.values()
            ),
        )

    def _get_language_description(self, transcripts):
        return '\n'.join(
-            ' - {language_code} ("{language}")'.format(
-                language=transcript['language'],
-                language_code=transcript['language_code'],
-            ) for transcript in transcripts
+            ' - {transcript}'.format(transcript=str(transcript))
+            for transcript in transcripts
        ) if transcripts else 'None'


 class Transcript():
-    def __init__(self, http_client, url, language, language_code, is_generated, translation_languages):
+    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
+        """
+        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
+        TranscriptList.
+
+        :param http_client: http client which is used to make the transcript retrieving http calls
+        :type http_client: requests.Session
+        :param video_id: the id of the video this Transcript is for
+        :type video_id: str
+        :param url: the url which needs to be called to fetch the transcript
+        :param language: the name of the language this transcript uses
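+        :param language_code: the language code of the language this transcript uses (e.g. 'de' or 'en')
+        :param is_generated: True if this transcript was automatically generated (caption kind 'asr')
+        :param translation_languages: list of dicts with the 'language' and 'language_code' keys (empty if untranslatable)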
+        """
        self._http_client = http_client
-        self.url = url
+        self.video_id = video_id
+        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages

    def fetch(self):
+        """
+        Loads the actual transcript data.
+
+        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
+        :rtype: [{'text': str, 'start': float, 'duration': float}]
+        """
        return _TranscriptParser().parse(
-            self._http_client.get(self.url).text
+            self._http_client.get(self._url).text
+        )
+
+    def __str__(self):
+        return '{language_code} ("{language}")'.format(
+            language=self.language,
+            language_code=self.language_code,
        )

 # TODO integrate translations in future release