From 02b1978217e099925937b5fe4885ab528704c488 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Wed, 15 Jan 2020 22:48:00 -0800 Subject: [PATCH 1/8] Added cookies parameter to api class methods --- youtube_transcript_api/_api.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index c1519ae..dd91cfd 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,11 +1,15 @@ import requests +try: + import http.cookiejar as cookiejar +except ImportError: + import cookielib as cookiejar from ._transcripts import TranscriptListFetcher class YouTubeTranscriptApi(): @classmethod - def list_transcripts(cls, video_id, proxies=None): + def list_transcripts(cls, video_id, proxies=None, cookies=None): """ Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating @@ -51,12 +55,17 @@ class YouTubeTranscriptApi(): :return: the list of available transcripts :rtype TranscriptList: """ + print(cookies) with requests.Session() as http_client: + if cookies: + cj = cookiejar.MozillaCookieJar() + cj.load(cookies) + http_client.cookies = cj http_client.proxies = proxies if proxies else {} return TranscriptListFetcher(http_client).fetch(video_id) @classmethod - def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): + def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): """ Retrieves the transcripts for a list of videos. @@ -80,7 +89,7 @@ class YouTubeTranscriptApi(): for video_id in video_ids: try: - data[video_id] = cls.get_transcript(video_id, languages, proxies) + data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies) except Exception as exception: if not continue_after_error: raise exception @@ -90,7 +99,7 @@ class YouTubeTranscriptApi(): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=('en',), proxies=None): + def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): """ Retrieves the transcript for a single video. This is just a shortcut for calling:: @@ -107,4 +116,4 @@ class YouTubeTranscriptApi(): :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ - return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch() + return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() From dc9fc2ee9342649e09e311b3b87dfc70dace4833 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Wed, 15 Jan 2020 22:48:26 -0800 Subject: [PATCH 2/8] Updated Readme to include how to use cookies with the module --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f4516f6..b5b5d20 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,19 @@ Using the CLI: ``` youtube_transcript_api --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port ``` +## Cookies + +Some videos are age restricted, so this module won't be able to access those videos without some sort of authentication. To do this, you will need to have access to the desired video in a browser. Then, you will need to download that pages cookies into a text file. You can use the Chrome extension [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg?hl=en) or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/). + +Once you have that, you can use it with the module to access age-restricted videos' captions like so. + +```python +from youtube_transcript_api import YouTubeTranscriptApi +YouTubeTranscriptApi.get_transcript(video_id, cookies=) + +YouTubeTranscriptApi.get_transcripts([video_id], cookies=) +``` ## Warning @@ -224,4 +236,4 @@ youtube_transcript_api --http-proxy http://us If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :) -[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) \ No newline at end of file +[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) From f9e553ebafc1cd9ded9d095f4f0dff2b0f5716f8 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Mon, 20 Jan 2020 23:04:46 -0800 Subject: [PATCH 3/8] Added cli support, fixed testing --- youtube_transcript_api/_api.py | 19 +++++++++++++++---- youtube_transcript_api/_cli.py | 15 +++++++++++---- youtube_transcript_api/test/test_api.py | 18 ++++++++++++------ youtube_transcript_api/test/test_cli.py | 25 +++++++++++++++++++------ 4 files changed, 57 insertions(+), 20 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index dd91cfd..ca2a000 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -52,15 +52,22 @@ class YouTubeTranscriptApi(): :type video_id: str :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies + :param cookies: a string of the path to a text file containing youtube authorization cookies + :type cookies: str - cookies.txt :return: the list of available transcripts :rtype TranscriptList: """ - print(cookies) with requests.Session() as http_client: if cookies: - cj = cookiejar.MozillaCookieJar() - cj.load(cookies) - http_client.cookies = cj + try: + cj = cookiejar.MozillaCookieJar() + cj.load(cookies) + http_client.cookies = cj + except IOError as e: + print("Warning: Path for cookies file was not valid. Did not load any cookies") + except FileNotFoundError as e: + print("Warning: Path for cookies file was not valid. Did not load any cookies") + http_client.proxies = proxies if proxies else {} return TranscriptListFetcher(http_client).fetch(video_id) @@ -80,6 +87,8 @@ class YouTubeTranscriptApi(): :type continue_after_error: bool :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies + :param cookies: a string of the path to a text file containing youtube authorization cookies + :type cookies: str - cookies.txt :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): @@ -113,6 +122,8 @@ class YouTubeTranscriptApi(): :type languages: list[str] :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies + :param cookies: a string of the path to a text file containing youtube authorization cookies + :type cookies: str - cookies.txt :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index 043bf19..405d6e1 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -21,12 +21,14 @@ class YouTubeTranscriptCli(): if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} + cookies = parsed_args.cookies + transcripts = [] exceptions = [] for video_id in parsed_args.video_ids: try: - transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id)) + transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id)) except Exception as exception: exceptions.append(exception) @@ -35,8 +37,8 @@ class YouTubeTranscriptCli(): + ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else []) ) - def _fetch_transcript(self, parsed_args, proxies, video_id): - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies) + def _fetch_transcript(self, parsed_args, proxies, cookies, video_id): + transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies) if parsed_args.list_transcripts: return str(transcript_list) @@ -123,5 +125,10 @@ class YouTubeTranscriptCli(): metavar='URL', help='Use the specified HTTPS proxy.' ) - + parser.add_argument( + '--cookies', + default=None, + help='The cookie file that will be used for authorization with youtube.' + ) + return parser.parse_args(self._args) diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index f506d33..e13e7ac 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -159,8 +159,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None, None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None, None) self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) def test_get_transcripts__stop_on_error(self): @@ -176,15 +176,21 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None, None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None, None) + + def test_get_transcripts__check_cookies(self): + cookies='example_cookies.txt' + YouTubeTranscriptApi.get_transcript = MagicMock() + YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) + YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) + def test_get_transcript__with_proxies(self): proxies = {'http': '', 'https:': ''} transcript = YouTubeTranscriptApi.get_transcript( 'GJLlxj_dtq8', proxies=proxies ) - self.assertEqual( transcript, [ @@ -195,4 +201,4 @@ class TestYouTubeTranscriptApi(TestCase): ) YouTubeTranscriptApi.get_transcript = MagicMock() YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies) + YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index d2676d8..158cd35 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -164,8 +164,8 @@ class TestYouTubeTranscriptCli(TestCase): def test_run(self): YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() - YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) - YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None) self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en']) @@ -200,8 +200,8 @@ class TestYouTubeTranscriptCli(TestCase): def test_run__list_transcripts(self): YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run() - YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) - YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None) def test_run__json_output(self): output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run() @@ -220,10 +220,23 @@ class TestYouTubeTranscriptCli(TestCase): YouTubeTranscriptApi.list_transcripts.assert_any_call( 'v1', - proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} + proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}, + cookies= None ) YouTubeTranscriptApi.list_transcripts.assert_any_call( 'v2', - proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} + proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}, + cookies=None ) + + def test_run__cookies(self): + YouTubeTranscriptCli( + ( + 'v1 v2 --languages de en ' + '--cookies blahblah.txt' + ).split() + ).run() + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies='blahblah.txt') + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies='blahblah.txt') + From 67604ec46c7bdc9329899f27ce4e9526a28b3c0a Mon Sep 17 00:00:00 2001 From: danielcliu Date: Wed, 22 Jan 2020 22:35:35 -0800 Subject: [PATCH 4/8] Made testing more robust --- youtube_transcript_api/_api.py | 30 ++++++++++++++++--------- youtube_transcript_api/test/test_api.py | 2 ++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index ca2a000..93bbf48 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -53,20 +53,13 @@ class YouTubeTranscriptApi(): :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies - :type cookies: str - cookies.txt + :type cookies: str :return: the list of available transcripts :rtype TranscriptList: """ with requests.Session() as http_client: if cookies: - try: - cj = cookiejar.MozillaCookieJar() - cj.load(cookies) - http_client.cookies = cj - except IOError as e: - print("Warning: Path for cookies file was not valid. Did not load any cookies") - except FileNotFoundError as e: - print("Warning: Path for cookies file was not valid. Did not load any cookies") + http_client.cookies = cls.load_cookies(cookies) http_client.proxies = proxies if proxies else {} return TranscriptListFetcher(http_client).fetch(video_id) @@ -88,7 +81,7 @@ class YouTubeTranscriptApi(): :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies - :type cookies: str - cookies.txt + :type cookies: str :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): @@ -123,8 +116,23 @@ class YouTubeTranscriptApi(): :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies - :type cookies: str - cookies.txt + :type cookies: str :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() + + + @classmethod + def load_cookies(cls, cookies): + cj = {} + try: + cj = cookiejar.MozillaCookieJar() + cj.load(cookies) + except IOError as e: + print("Warning: Path for cookies file was not valid. Did not load any cookies") + except FileNotFoundError as e: + print("Warning: Path for cookies file was not valid. Did not load any cookies") + if not cj: + raise IOError + return cj diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index e13e7ac..5d050f0 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -185,6 +185,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) + session_cookies = YouTubeTranscriptApi.load_cookies(cookies) + print("here: ", session_cookies.items()) def test_get_transcript__with_proxies(self): proxies = {'http': '', 'https:': ''} From 42d4f59e016daf44c800af41b165433245ee5e8d Mon Sep 17 00:00:00 2001 From: danielcliu Date: Mon, 27 Jan 2020 21:46:37 -0800 Subject: [PATCH 5/8] Moved cookie loader to its own function, made errors for cookies failing --- youtube_transcript_api/_api.py | 12 +++++++----- youtube_transcript_api/_errors.py | 8 ++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 93bbf48..43a760b 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -6,6 +6,10 @@ except ImportError: from ._transcripts import TranscriptListFetcher +from ._errors import ( + CookiePathInvalid, + CookiesInvalid +) class YouTubeTranscriptApi(): @classmethod @@ -60,7 +64,6 @@ class YouTubeTranscriptApi(): with requests.Session() as http_client: if cookies: http_client.cookies = cls.load_cookies(cookies) - http_client.proxies = proxies if proxies else {} return TranscriptListFetcher(http_client).fetch(video_id) @@ -121,7 +124,6 @@ class YouTubeTranscriptApi(): :rtype [{'text': str, 'start': float, 'end': float}]: """ return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() - @classmethod def load_cookies(cls, cookies): @@ -130,9 +132,9 @@ class YouTubeTranscriptApi(): cj = cookiejar.MozillaCookieJar() cj.load(cookies) except IOError as e: - print("Warning: Path for cookies file was not valid. Did not load any cookies") + raise CookiePathInvalid except FileNotFoundError as e: - print("Warning: Path for cookies file was not valid. Did not load any cookies") + raise CookiePathInvalid if not cj: - raise IOError + raise CookiesInvalid return cj diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 2b67e9e..8a663e5 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -55,6 +55,14 @@ class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The requested translation language is not available' +class CookiePathInvalid(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'Path to cookie file was not valid' + + +class CookiesInvalid(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)' + + class NoTranscriptFound(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ( 'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n' From 49ccba7f957d5b83a75210cafd550f158efad63c Mon Sep 17 00:00:00 2001 From: danielcliu Date: Mon, 27 Jan 2020 21:47:04 -0800 Subject: [PATCH 6/8] Fixed testing with mock, added testing for cookies --- .../test/example_cookies.txt | 9 ++ .../test/expired_example_cookies.txt | 8 ++ youtube_transcript_api/test/test_api.py | 99 +++++++++++-------- 3 files changed, 75 insertions(+), 41 deletions(-) create mode 100644 youtube_transcript_api/test/example_cookies.txt create mode 100644 youtube_transcript_api/test/expired_example_cookies.txt diff --git a/youtube_transcript_api/test/example_cookies.txt b/youtube_transcript_api/test/example_cookies.txt new file mode 100644 index 0000000..12d5bd4 --- /dev/null +++ b/youtube_transcript_api/test/example_cookies.txt @@ -0,0 +1,9 @@ +# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous +# This file can be used by wget, curl, aria2c and other standard compliant tools. +# Usage Examples: +# 1) wget -x --load-cookies cookies.txt "https://www.youtube.com/" +# 2) curl --cookie cookies.txt "https://www.youtube.com/" +# 3) aria2c --load-cookies cookies.txt "https://www.youtube.com/" +# +.example.com TRUE / TRUE 3594431874 TEST_FIELD TEST_VALUE +.example.com TRUE / TRUE 31874 BAD_TEST_FIELD BAD_TEST_VALUE diff --git a/youtube_transcript_api/test/expired_example_cookies.txt b/youtube_transcript_api/test/expired_example_cookies.txt new file mode 100644 index 0000000..a6c09db --- /dev/null +++ b/youtube_transcript_api/test/expired_example_cookies.txt @@ -0,0 +1,8 @@ +# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous +# This file can be used by wget, curl, aria2c and other standard compliant tools. +# Usage Examples: +# 1) wget -x --load-cookies cookies.txt "https://www.youtube.com/" +# 2) curl --cookie cookies.txt "https://www.youtube.com/" +# 3) aria2c --load-cookies cookies.txt "https://www.youtube.com/" +# +.example.com TRUE / TRUE 31874 BAD_TEST_FIELD BAD_TEST_VALUE diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 5d050f0..1af6bd1 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -1,8 +1,10 @@ from unittest import TestCase -from mock import MagicMock +from mock import patch import os +import requests + import httpretty from youtube_transcript_api import ( @@ -151,44 +153,7 @@ class TestYouTubeTranscriptApi(TestCase): with self.assertRaises(NoTranscriptAvailable): YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E') - def test_get_transcripts(self): - video_id_1 = 'video_id_1' - video_id_2 = 'video_id_2' - languages = ['de', 'en'] - YouTubeTranscriptApi.get_transcript = MagicMock() - - YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) - - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None, None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None, None) - self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) - - def test_get_transcripts__stop_on_error(self): - YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error')) - - with self.assertRaises(Exception): - YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2']) - - def test_get_transcripts__continue_on_error(self): - video_id_1 = 'video_id_1' - video_id_2 = 'video_id_2' - YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error')) - - YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None, None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None, None) - - def test_get_transcripts__check_cookies(self): - cookies='example_cookies.txt' - YouTubeTranscriptApi.get_transcript = MagicMock() - YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) - YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) - - session_cookies = YouTubeTranscriptApi.load_cookies(cookies) - print("here: ", session_cookies.items()) - - def test_get_transcript__with_proxies(self): + def test_get_transcript__with_proxy(self): proxies = {'http': '', 'https:': ''} transcript = YouTubeTranscriptApi.get_transcript( 'GJLlxj_dtq8', proxies=proxies @@ -201,6 +166,58 @@ class TestYouTubeTranscriptApi(TestCase): {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} ] ) - YouTubeTranscriptApi.get_transcript = MagicMock() + + @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') + def test_get_transcripts(self, mock_get_transcript): + video_id_1 = 'video_id_1' + video_id_2 = 'video_id_2' + languages = ['de', 'en'] + + YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) + + mock_get_transcript.assert_any_call(video_id_1, languages, None, None) + mock_get_transcript.assert_any_call(video_id_2, languages, None, None) + self.assertEqual(mock_get_transcript.call_count, 2) + + @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) + def test_get_transcripts__stop_on_error(self, mock_get_transcript): + with self.assertRaises(Exception): + YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2']) + + @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) + def test_get_transcripts__continue_on_error(self, mock_get_transcript): + video_id_1 = 'video_id_1' + video_id_2 = 'video_id_2' + + YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) + + mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None) + mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None) + + @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') + def test_get_transcripts__with_cookies(self, mock_get_transcript): + cookies = '/example_cookies.txt' + YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) + mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) + + @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') + def test_get_transcripts__with_proxies(self, mock_get_transcript): + proxies = {'http': '', 'https:': ''} YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) + mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) + + def test_load_cookies(self): + dirname, filename = os.path.split(os.path.abspath(__file__)) + cookies = dirname + '/example_cookies.txt' + session_cookies = YouTubeTranscriptApi.load_cookies(cookies) + self.assertEqual({'TEST_FIELD': 'TEST_VALUE'}, requests.utils.dict_from_cookiejar(session_cookies)) + + def test_load_cookies__bad_files(self): + bad_cookies = 'nonexistent_cookies.txt' + with self.assertRaises(Exception): + YouTubeTranscriptApi.load_cookies(bad_cookies) + + dirname, filename = os.path.split(os.path.abspath(__file__)) + expired_cookies = dirname + '/expired_example_cookies.txt' + with self.assertRaises(Exception): + YouTubeTranscriptApi.load_cookies(expired_cookies) From 31b8f4a17913489c08963e7e11155c1694cc2c13 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Thu, 30 Jan 2020 21:53:18 -0800 Subject: [PATCH 7/8] Redid errors for cookies, improved testing coverage --- youtube_transcript_api/__init__.py | 2 ++ youtube_transcript_api/_api.py | 24 ++++++++++---------- youtube_transcript_api/_errors.py | 2 +- youtube_transcript_api/test/test_api.py | 29 ++++++++++++++++++++----- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index 34e9ba7..1fe0f73 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -8,4 +8,6 @@ from ._errors import ( NotTranslatable, TranslationLanguageNotAvailable, NoTranscriptAvailable, + CookiePathInvalid, + CookiesInvalid ) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 43a760b..389cf31 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,8 +1,10 @@ import requests try: import http.cookiejar as cookiejar + CookieLoadError = (FileNotFoundError, cookiejar.LoadError) except ImportError: import cookielib as cookiejar + CookieLoadError = IOError from ._transcripts import TranscriptListFetcher @@ -63,7 +65,7 @@ class YouTubeTranscriptApi(): """ with requests.Session() as http_client: if cookies: - http_client.cookies = cls.load_cookies(cookies) + http_client.cookies = cls._load_cookies(cookies, video_id) http_client.proxies = proxies if proxies else {} return TranscriptListFetcher(http_client).fetch(video_id) @@ -126,15 +128,13 @@ class YouTubeTranscriptApi(): return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() @classmethod - def load_cookies(cls, cookies): - cj = {} + def _load_cookies(cls, cookies, video_id): + cookie_jar = {} try: - cj = cookiejar.MozillaCookieJar() - cj.load(cookies) - except IOError as e: - raise CookiePathInvalid - except FileNotFoundError as e: - raise CookiePathInvalid - if not cj: - raise CookiesInvalid - return cj + cookie_jar = cookiejar.MozillaCookieJar() + cookie_jar.load(cookies) + except CookieLoadError: + raise CookiePathInvalid(video_id) + if not cookie_jar: + raise CookiesInvalid(video_id) + return cookie_jar diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 8a663e5..2f83a16 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -56,7 +56,7 @@ class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript): class CookiePathInvalid(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'Path to cookie file was not valid' + CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded' class CookiesInvalid(CouldNotRetrieveTranscript): diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 1af6bd1..a081711 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -15,6 +15,8 @@ from youtube_transcript_api import ( NoTranscriptAvailable, NotTranslatable, TranslationLanguageNotAvailable, + CookiePathInvalid, + CookiesInvalid ) @@ -166,6 +168,20 @@ class TestYouTubeTranscriptApi(TestCase): {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} ] ) + + def test_get_transcript__with_cookies(self): + dirname, filename = os.path.split(os.path.abspath(__file__)) + cookies = dirname + '/example_cookies.txt' + transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', cookies=cookies) + + self.assertEqual( + transcript, + [ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ] + ) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') def test_get_transcripts(self, mock_get_transcript): @@ -209,15 +225,16 @@ class TestYouTubeTranscriptApi(TestCase): def test_load_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) cookies = dirname + '/example_cookies.txt' - session_cookies = YouTubeTranscriptApi.load_cookies(cookies) + session_cookies = YouTubeTranscriptApi._load_cookies(cookies, 'GJLlxj_dtq8') self.assertEqual({'TEST_FIELD': 'TEST_VALUE'}, requests.utils.dict_from_cookiejar(session_cookies)) - def test_load_cookies__bad_files(self): + def test_load_cookies__bad_file_path(self): bad_cookies = 'nonexistent_cookies.txt' - with self.assertRaises(Exception): - YouTubeTranscriptApi.load_cookies(bad_cookies) + with self.assertRaises(CookiePathInvalid): + YouTubeTranscriptApi._load_cookies(bad_cookies, 'GJLlxj_dtq8') + def test_load_cookies__no_valid_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) expired_cookies = dirname + '/expired_example_cookies.txt' - with self.assertRaises(Exception): - YouTubeTranscriptApi.load_cookies(expired_cookies) + with self.assertRaises(CookiesInvalid): + YouTubeTranscriptApi._load_cookies(expired_cookies, 'GJLlxj_dtq8') From a0823ea36fcc07069e43019019004596700998f3 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Fri, 31 Jan 2020 11:26:39 +0100 Subject: [PATCH 8/8] added documentation for using cookies feature via CLI to the README --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b5b5d20..a03fa2e 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,8 @@ Using the CLI: ``` youtube_transcript_api --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port -``` +``` + ## Cookies Some videos are age restricted, so this module won't be able to access those videos without some sort of authentication. To do this, you will need to have access to the desired video in a browser. Then, you will need to download that pages cookies into a text file. You can use the Chrome extension [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg?hl=en) or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/). @@ -223,10 +224,17 @@ Once you have that, you can use it with the module to access age-restricted vide ```python from youtube_transcript_api import YouTubeTranscriptApi -YouTubeTranscriptApi.get_transcript(video_id, cookies=) +YouTubeTranscriptApi.get_transcript(video_id, cookies='/path/to/your/cookies.txt') -YouTubeTranscriptApi.get_transcripts([video_id], cookies=) -``` +YouTubeTranscriptApi.get_transcripts([video_id], cookies='/path/to/your/cookies.txt') +``` + +Using the CLI: + +``` +youtube_transcript_api --cookies /path/to/your/cookies.txt +``` + ## Warning