added public list_transcripts method

2019-12-30 15:20:47 +01:00 · 2019-12-30 15:20:47 +01:00 · 1bc5087575
parent 8287d1088e
commit 1bc5087575
2 changed files with 68 additions and 17 deletions
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@ -4,17 +4,68 @@ from ._transcripts import TranscriptListFetcher


 class YouTubeTranscriptApi():
+    @classmethod
+    def list_transcripts(cls, video_id, proxies=None):
+        """
+        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
+        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
+        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
+        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
+        `transcript.translate('en')`. Example::
+
+            # retrieve the available transcripts
+            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')
+
+            # iterate over all available transcripts
+            for transcript in transcript_list:
+                # the Transcript object provides metadata properties
+                print(
+                    transcript.video_id,
+                    transcript.language,
+                    transcript.language_code,
+                    # whether it has been manually created or generated by YouTube
+                    transcript.is_generated,
+                    # a list of languages the transcript can be translated to
+                    transcript.translation_languages,
+                )
+
+                # fetch the actual transcript data
+                print(transcript.fetch())
+
+                # translating the transcript will return another transcript object
+                print(transcript.translate('en').fetch())
+
+            # you can also directly filter for the language you are looking for, using the transcript list
+            transcript = transcript_list.find_transcript(['de', 'en'])
+
+            # or just filter for manually created transcripts
+            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])
+
+            # or automatically generated ones
+            transcript = transcript_list.find_generated_transcript(['de', 'en'])
+
+        :param video_id: the youtube video id
+        :type video_id: str
+        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
+        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :return: the list of available transcripts
+        :rtype: TranscriptList
+        """
+        with requests.Session() as http_client:
+            http_client.proxies = proxies if proxies else {}
+            return TranscriptListFetcher(http_client).fetch(video_id)
+
    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
        """
        Retrieves the transcripts for a list of videos.

        :param video_ids: a list of youtube video ids
-        :type video_ids: [str]
+        :type video_ids: list[str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
-        :type languages: [str]
+        :type languages: list[str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
        one of the video transcripts
        :type continue_after_error: bool
@ -22,7 +73,7 @@ class YouTubeTranscriptApi():
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
-        :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
+        :rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str])
        """
        data = {}
        unretrievable_videos = []
@ -41,19 +92,19 @@ class YouTubeTranscriptApi():
    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None):
        """
-        Retrieves the transcript for a single video.
+        Retrieves the transcript for a single video. This is just a shortcut for calling::
+
+            YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()

        :param video_id: the youtube video id
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
-        :type languages: [str]
+        :type languages: list[str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
-        :rtype: [{'text': str, 'start': float, 'end': float}]
+        :rtype: [{'text': str, 'start': float, 'duration': float}]
        """
-        with requests.Session() as http_client:
-            http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
+        return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@ -95,7 +95,7 @@ class TranscriptList():
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
-        :rtype TranscriptList
+        :rtype: TranscriptList
        """
        translation_languages = [
            {
@ -142,9 +142,9 @@ class TranscriptList():
        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
-        :type languages: [str]
+        :type language_codes: list[str]
        :return: the found Transcript
-        :rtype: Transcript
+        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
@ -156,9 +156,9 @@ class TranscriptList():
        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
-        :type languages: [str]
+        :type language_codes: list[str]
        :return: the found Transcript
-        :rtype: Transcript
+        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._generated_transcripts,])
@ -170,9 +170,9 @@ class TranscriptList():
        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
-        :type languages: [str]
+        :type language_codes: list[str]
        :return: the found Transcript
-        :rtype: Transcript
+        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts,])
@ -252,7 +252,7 @@ class Transcript():
        Loads the actual transcript data.

        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
-        :rtype: [{'text': str, 'start': float, 'end': float}]
+        :rtype: [{'text': str, 'start': float, 'duration': float}]
        """
        return _TranscriptParser().parse(
            self._http_client.get(self._url).text