YouTubeTranscriptApi now supports retrieving transcripts for given languages

2019-02-21 12:55:03 +01:00 · 2019-02-21 12:55:03 +01:00 · 18fb0cbaec
parent 48cb31fe3e
commit 18fb0cbaec
5 changed files with 61 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 .idea
 .venv
+virtualenv
 *.pyc
 dist
 build
--- a/README.md
+++ b/README.md
@ -48,12 +48,22 @@ This will return a list of dictionaries looking somewhat like this:
 ]
 ```

+You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to English).
+
+```python
+YouTubeTranscriptApi.get_transcript(video_id, languages=['de', 'en'])
+```
+
+It's a list of language codes in descending priority. In this example it will first try to fetch the German transcript (`'de'`) and then fetch the English transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit to find the one that works for you!
+
 To get transcripts for a list fo video ids you can call:

 ```python
-YouTubeTranscriptApi.get_transcripts(video_ids)
+YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
 ```

+`languages` is also optional here.
+
 ### CLI

 Execute the CLI script using the video ids as parameters and the results will be printed out to the command line:
--- a/setup.py
+++ b/setup.py
@ -8,13 +8,10 @@ def _get_file_content(file_name):
 def get_long_description():
    return _get_file_content('README.md')

-def get_requirements():
-    return list(filter(lambda line: line != '' and not line.startswith('#'), _get_file_content('requirements.txt').split('\n')))
-

 setuptools.setup(
    name="youtube_transcript_api",
-    version="0.1.1",
+    version="0.1.2",
    author="Jonas Depoix",
    author_email="jonas.depoix@web.de",
    description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!",
@ -29,7 +26,9 @@ setuptools.setup(
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ),
-    install_requires=get_requirements(),
+    install_requires=[
+        'requests',
+    ],
    entry_points={
        'console_scripts': [
            'youtube_transcript_api = youtube_transcript_api.__main__:main',
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@ -1,3 +1,9 @@
+import sys
+
+if sys.version_info.major == 2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+
 from xml.etree import ElementTree

 import re
@ -30,14 +36,18 @@ class YouTubeTranscriptApi():
            )
            self.video_id = video_id

-
    @staticmethod
-    def get_transcripts(video_ids, continue_after_error=False):
+    def get_transcripts(video_ids, languages=None, continue_after_error=False):
        """
        Retrieves the transcripts for a list of videos.

        :param video_ids: a list of youtube video ids
        :type video_ids: [str]
+        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
+        it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to
+        do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
+        play around with the language codes a bit, to find the one which is working for you!
+        :type languages: [str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
        one of the video transcripts
        :type continue_after_error: bool
@ -50,7 +60,7 @@ class YouTubeTranscriptApi():

        for video_id in video_ids:
            try:
-                data[video_id] = YouTubeTranscriptApi.get_transcript(video_id)
+                data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages)
            except Exception as exception:
                if not continue_after_error:
                    raise exception
@ -60,17 +70,22 @@ class YouTubeTranscriptApi():
        return data, unretrievable_videos

    @staticmethod
-    def get_transcript(video_id):
+    def get_transcript(video_id, languages=None):
        """
        Retrieves the transcript for a single video.

        :param video_id: the youtube video id
        :type video_id: str
+        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
+        it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to
+        do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
+        play around with the language codes a bit, to find the one which is working for you!
+        :type languages: [str]
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype: [{'text': str, 'start': float, 'end': float}]
        """
        try:
-            return _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse()
+            return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse()
        except Exception:
            logger.error(
                YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
@ -83,26 +98,36 @@ class YouTubeTranscriptApi():
 class _TranscriptFetcher():
    WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
    API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
+    LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')

-    def __init__(self, video_id):
+    def __init__(self, video_id, languages):
        self.video_id = video_id
+        self.languages = languages

    def fetch(self):
        fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
-
        timedtext_url_start = fetched_site.find('timedtext')

-        return requests.get(
-            self.API_BASE_URL.format(
-                api_url=fetched_site[
-                    timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
-                ].replace(
-                    '\\u0026', '&'
-                ).replace(
-                    '\\', ''
-                )
+        for language in (self.languages if self.languages else [None,]):
+            response = self._execute_api_request(fetched_site, timedtext_url_start, language)
+            if response:
+                return response
+
+        return None
+
+    def _execute_api_request(self, fetched_site, timedtext_url_start, language):
+        url = self.API_BASE_URL.format(
+            api_url=fetched_site[
+                timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
+            ].replace(
+                '\\u0026', '&'
+            ).replace(
+                '\\', ''
            )
-        ).text
+        )
+        if language:
+            url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
+        return requests.get(url).text


 class _TranscriptParser():