Merge pull request #46 from danielcliu/feature/issue-45-use-authentication-cookies

Feature/issue 45 use authentication cookies
2020-01-31 11:29:25 +01:00 · 2020-01-31 11:29:25 +01:00 · 6da4d19978
parent 7dfe20fde4 a0823ea36f
commit 6da4d19978
9 changed files with 188 additions and 49 deletions
--- a/README.md
+++ b/README.md
@ -213,8 +213,28 @@ Using the CLI:
  
 ```  
 youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port  
-```  
+```
+
+## Cookies
+
+Some videos are age-restricted, so this module won't be able to access them without some sort of authentication. To authenticate, you will need to have access to the desired video in a browser. Then, you will need to download that page's cookies into a text file. You can use the Chrome extension [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg?hl=en) or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/).
+
+Once you have the cookie file, you can pass its path to the module to access the captions of age-restricted videos like so:
+
+```python  
+from youtube_transcript_api import YouTubeTranscriptApi  
  
+YouTubeTranscriptApi.get_transcript(video_id, cookies='/path/to/your/cookies.txt')
+  
+YouTubeTranscriptApi.get_transcripts([video_id], cookies='/path/to/your/cookies.txt')
+```
+
+Using the CLI:
+
+```
+youtube_transcript_api <first_video_id> <second_video_id> --cookies /path/to/your/cookies.txt
+```
+
  
 ## Warning  
  
@ -224,4 +244,4 @@ youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://us
  
 If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :)  
  
-[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)
+[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)
--- a/youtube_transcript_api/init.py
+++ b/youtube_transcript_api/init.py
@ -8,4 +8,6 @@ from ._errors import (
    NotTranslatable,
    TranslationLanguageNotAvailable,
    NoTranscriptAvailable,
+    CookiePathInvalid,
+    CookiesInvalid
 )
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@ -1,11 +1,21 @@
 import requests
+try:
+    import http.cookiejar as cookiejar
+    CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
+except ImportError:
+    import cookielib as cookiejar
+    CookieLoadError = IOError

 from ._transcripts import TranscriptListFetcher

+from ._errors import (
+    CookiePathInvalid,
+    CookiesInvalid
+)

 class YouTubeTranscriptApi():
    @classmethod
-    def list_transcripts(cls, video_id, proxies=None):
+    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """
        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@ -48,15 +58,19 @@ class YouTubeTranscriptApi():
        :type video_id: str
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :param cookies: a string of the path to a text file containing youtube authorization cookies
+        :type cookies: str
        :return: the list of available transcripts
        :rtype TranscriptList:
        """
        with requests.Session() as http_client:
+            if cookies:
+                http_client.cookies = cls._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
-    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
+    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
        """
        Retrieves the transcripts for a list of videos.

@ -71,6 +85,8 @@ class YouTubeTranscriptApi():
        :type continue_after_error: bool
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :param cookies: a string of the path to a text file containing youtube authorization cookies
+        :type cookies: str
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
        :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
@ -80,7 +96,7 @@ class YouTubeTranscriptApi():

        for video_id in video_ids:
            try:
-                data[video_id] = cls.get_transcript(video_id, languages, proxies)
+                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
            except Exception as exception:
                if not continue_after_error:
                    raise exception
@ -90,7 +106,7 @@ class YouTubeTranscriptApi():
        return data, unretrievable_videos

    @classmethod
-    def get_transcript(cls, video_id, languages=('en',), proxies=None):
+    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
        """
        Retrieves the transcript for a single video. This is just a shortcut for calling::

@ -104,7 +120,21 @@ class YouTubeTranscriptApi():
        :type languages: list[str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
+        :param cookies: a string of the path to a text file containing youtube authorization cookies
+        :type cookies: str
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'end': float}]:
        """
-        return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
+        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
+    
+    @classmethod
+    def _load_cookies(cls, cookies, video_id):
+        cookie_jar = {}
+        try:
+            cookie_jar = cookiejar.MozillaCookieJar()
+            cookie_jar.load(cookies)
+        except CookieLoadError:
+            raise CookiePathInvalid(video_id)
+        if not cookie_jar:
+            raise CookiesInvalid(video_id)
+        return cookie_jar 
--- a/youtube_transcript_api/_cli.py
+++ b/youtube_transcript_api/_cli.py
@ -21,12 +21,14 @@ class YouTubeTranscriptCli():
        if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
            proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}

+        cookies = parsed_args.cookies
+
        transcripts = []
        exceptions = []

        for video_id in parsed_args.video_ids:
            try:
-                transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id))
+                transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
            except Exception as exception:
                exceptions.append(exception)

@ -35,8 +37,8 @@ class YouTubeTranscriptCli():
            + ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else [])
        )

-    def _fetch_transcript(self, parsed_args, proxies, video_id):
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies)
+    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
+        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)

        if parsed_args.list_transcripts:
            return str(transcript_list)
@ -123,5 +125,10 @@ class YouTubeTranscriptCli():
            metavar='URL',
            help='Use the specified HTTPS proxy.'
        )
-
+        parser.add_argument(
+            '--cookies',
+            default=None,
+            help='The cookie file that will be used for authorization with youtube.'
+        )
+            
        return parser.parse_args(self._args)
--- a/youtube_transcript_api/_errors.py
+++ b/youtube_transcript_api/_errors.py
@ -55,6 +55,14 @@ class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
    CAUSE_MESSAGE = 'The requested translation language is not available'


+class CookiePathInvalid(CouldNotRetrieveTranscript):
+    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
+
+
+class CookiesInvalid(CouldNotRetrieveTranscript):
+    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
+
+
 class NoTranscriptFound(CouldNotRetrieveTranscript):
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
--- a/youtube_transcript_api/test/example_cookies.txt
+++ b/youtube_transcript_api/test/example_cookies.txt
@ -0,0 +1,9 @@
+# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
+# This file can be used by wget, curl, aria2c and other standard compliant tools.
+# Usage Examples:
+#   1) wget -x --load-cookies cookies.txt "https://www.youtube.com/"
+#   2) curl --cookie cookies.txt "https://www.youtube.com/"
+#   3) aria2c --load-cookies cookies.txt "https://www.youtube.com/"
+#
+.example.com	TRUE	/	TRUE	3594431874	TEST_FIELD	TEST_VALUE
+.example.com	TRUE	/	TRUE	31874	BAD_TEST_FIELD	BAD_TEST_VALUE
--- a/youtube_transcript_api/test/expired_example_cookies.txt
+++ b/youtube_transcript_api/test/expired_example_cookies.txt
@ -0,0 +1,8 @@
+# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
+# This file can be used by wget, curl, aria2c and other standard compliant tools.
+# Usage Examples:
+#   1) wget -x --load-cookies cookies.txt "https://www.youtube.com/"
+#   2) curl --cookie cookies.txt "https://www.youtube.com/"
+#   3) aria2c --load-cookies cookies.txt "https://www.youtube.com/"
+#
+.example.com	TRUE	/	TRUE	31874	BAD_TEST_FIELD	BAD_TEST_VALUE
--- a/youtube_transcript_api/test/test_api.py
+++ b/youtube_transcript_api/test/test_api.py
@ -1,8 +1,10 @@
 from unittest import TestCase
-from mock import MagicMock
+from mock import patch

 import os

+import requests
+
 import httpretty

 from youtube_transcript_api import (
@ -13,6 +15,8 @@ from youtube_transcript_api import (
    NoTranscriptAvailable,
    NotTranslatable,
    TranslationLanguageNotAvailable,
+    CookiePathInvalid,
+    CookiesInvalid
 )


@ -151,39 +155,24 @@ class TestYouTubeTranscriptApi(TestCase):
        with self.assertRaises(NoTranscriptAvailable):
            YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E')

-    def test_get_transcripts(self):
-        video_id_1 = 'video_id_1'
-        video_id_2 = 'video_id_2'
-        languages = ['de', 'en']
-        YouTubeTranscriptApi.get_transcript = MagicMock()
-
-        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
-
-        YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None)
-        YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None)
-        self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
-
-    def test_get_transcripts__stop_on_error(self):
-        YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error'))
-
-        with self.assertRaises(Exception):
-            YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])
-
-    def test_get_transcripts__continue_on_error(self):
-        video_id_1 = 'video_id_1'
-        video_id_2 = 'video_id_2'
-        YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error'))
-
-        YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
-
-        YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None)
-        YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None)
-
-    def test_get_transcript__with_proxies(self):
+    def test_get_transcript__with_proxy(self):
        proxies = {'http': '', 'https:': ''}
        transcript = YouTubeTranscriptApi.get_transcript(
            'GJLlxj_dtq8', proxies=proxies
        )
+        self.assertEqual(
+            transcript,
+            [
+                {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
+                {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
+                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
+            ]
+        )
+    
+    def test_get_transcript__with_cookies(self):
+        dirname, filename = os.path.split(os.path.abspath(__file__))
+        cookies = dirname + '/example_cookies.txt'
+        transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', cookies=cookies)

        self.assertEqual(
            transcript,
@ -193,6 +182,59 @@ class TestYouTubeTranscriptApi(TestCase):
                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
            ]
        )
-        YouTubeTranscriptApi.get_transcript = MagicMock()
+
+    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
+    def test_get_transcripts(self, mock_get_transcript):
+        video_id_1 = 'video_id_1'
+        video_id_2 = 'video_id_2'
+        languages = ['de', 'en']
+
+        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
+
+        mock_get_transcript.assert_any_call(video_id_1, languages, None, None)
+        mock_get_transcript.assert_any_call(video_id_2, languages, None, None)
+        self.assertEqual(mock_get_transcript.call_count, 2)
+
+    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
+    def test_get_transcripts__stop_on_error(self, mock_get_transcript):
+        with self.assertRaises(Exception):
+            YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])
+
+    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
+    def test_get_transcripts__continue_on_error(self, mock_get_transcript):
+        video_id_1 = 'video_id_1'
+        video_id_2 = 'video_id_2'
+
+        YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
+
+        mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None)
+        mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None)
+    
+    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
+    def test_get_transcripts__with_cookies(self, mock_get_transcript):
+        cookies = '/example_cookies.txt'
+        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
+        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies)
+
+    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
+    def test_get_transcripts__with_proxies(self, mock_get_transcript):
+        proxies = {'http': '', 'https:': ''}
        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
-        YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies)
+        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None)
+
+    def test_load_cookies(self):
+        dirname, filename = os.path.split(os.path.abspath(__file__))
+        cookies = dirname + '/example_cookies.txt'
+        session_cookies = YouTubeTranscriptApi._load_cookies(cookies, 'GJLlxj_dtq8')
+        self.assertEqual({'TEST_FIELD': 'TEST_VALUE'},  requests.utils.dict_from_cookiejar(session_cookies))
+
+    def test_load_cookies__bad_file_path(self):
+        bad_cookies = 'nonexistent_cookies.txt'
+        with self.assertRaises(CookiePathInvalid):
+            YouTubeTranscriptApi._load_cookies(bad_cookies, 'GJLlxj_dtq8')
+
+    def test_load_cookies__no_valid_cookies(self):
+        dirname, filename = os.path.split(os.path.abspath(__file__))
+        expired_cookies = dirname + '/expired_example_cookies.txt'
+        with self.assertRaises(CookiesInvalid):
+            YouTubeTranscriptApi._load_cookies(expired_cookies, 'GJLlxj_dtq8')
--- a/youtube_transcript_api/test/test_cli.py
+++ b/youtube_transcript_api/test/test_cli.py
@ -164,8 +164,8 @@ class TestYouTubeTranscriptCli(TestCase):
    def test_run(self):
        YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()

-        YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None)
-        YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None)
+        YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
+        YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)

        self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en'])

@ -200,8 +200,8 @@ class TestYouTubeTranscriptCli(TestCase):
    def test_run__list_transcripts(self):
        YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run()

-        YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None)
-        YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None)
+        YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
+        YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)

    def test_run__json_output(self):
        output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()
@ -220,10 +220,23 @@ class TestYouTubeTranscriptCli(TestCase):

        YouTubeTranscriptApi.list_transcripts.assert_any_call(
            'v1',
-            proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
+            proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
+            cookies= None
        )

        YouTubeTranscriptApi.list_transcripts.assert_any_call(
            'v2',
-            proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
+            proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
+            cookies=None
        )
+
+    def test_run__cookies(self):
+        YouTubeTranscriptCli(
+            (
+                'v1 v2 --languages de en '
+                '--cookies blahblah.txt'
+            ).split()
+        ).run()
+        YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies='blahblah.txt')
+        YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies='blahblah.txt')
+