From 86cd1666c070258437271888317591db620f3d8c Mon Sep 17 00:00:00 2001 From: Danny Aziz Date: Sat, 16 Mar 2019 16:23:42 +0000 Subject: [PATCH 1/5] Rebased on PR #11 and added tests --- README.md | 27 ++++++++++++----- youtube_transcript_api/_api.py | 25 +++++++++++----- youtube_transcript_api/_cli.py | 15 +++++++++- youtube_transcript_api/test/test_api.py | 22 +++++++++++--- youtube_transcript_api/test/test_cli.py | 39 ++++++++++++++++++++++++- 5 files changed, 108 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 57bf78d..bf90a9e 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,5 @@ # YouTube Transcript/Subtitle API (including automatically generated subtitles) -[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) -[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) -[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) -[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) -[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) - This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do! ## Install @@ -90,6 +84,25 @@ If you would prefer to write it into a file or pipe it into another application, youtube_transcript_api ... 
--languages de en --json > transcripts.json ``` +### Proxy + +You can pass a proxy to use during the network requests + +Code: +```python +from youtube_transcript_api import YouTubeTranscriptApi + +YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"}) + +``` + +CLI: +``` +youtube_transcript_api --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port +``` + +Find out more about using proxies and the type of proxies you can use here: http://docs.python-requests.org/en/master/user/advanced/#proxies + ## Warning -This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! +This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! \ No newline at end of file diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 35a5abe..46a2154 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -38,7 +38,7 @@ class YouTubeTranscriptApi(): self.video_id = video_id @classmethod - def get_transcripts(cls, video_ids, languages=None, continue_after_error=False): + def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None): """ Retrieves the transcripts for a list of videos. 
@@ -55,13 +55,15 @@ class YouTubeTranscriptApi(): :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} + :param proxies: a dictionary mapping of http and https proxies to be used for the network requests + :rtype {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies """ data = {} unretrievable_videos = [] for video_id in video_ids: try: - data[video_id] = cls.get_transcript(video_id, languages) + data[video_id] = cls.get_transcript(video_id, languages, proxies) except Exception as exception: if not continue_after_error: raise exception @@ -71,7 +73,7 @@ class YouTubeTranscriptApi(): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=None): + def get_transcript(cls, video_id, languages=None, proxies=None): """ Retrieves the transcript for a single video. 
@@ -84,9 +86,11 @@ class YouTubeTranscriptApi(): :type languages: [str] :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype: [{'text': str, 'start': float, 'end': float}] + :param proxies: a dictionary mapping of http and https proxies to be used for the network requests + :rtype {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies """ try: - return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse() + return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse() except Exception: logger.error( YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( @@ -101,12 +105,16 @@ class _TranscriptFetcher(): API_BASE_URL = 'https://www.youtube.com/api/{api_url}' LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') - def __init__(self, video_id, languages): + def __init__(self, video_id, languages, proxies): self.video_id = video_id self.languages = languages + self.proxies = proxies def fetch(self): - fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text + if self.proxies: + fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text + else: + fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text timedtext_url_start = fetched_site.find('timedtext') for language in (self.languages if self.languages else [None,]): @@ -128,7 +136,10 @@ class _TranscriptFetcher(): ) if language: url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) - return requests.get(url).text + if self.proxies: + return requests.get(url, proxies=self.proxies).text + else: + return requests.get(url).text class _TranscriptParser(): diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index dc80934..d49684b 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -14,10 +14,13 @@ class 
YouTubeTranscriptCli(): def run(self): parsed_args = self._parse_args() + proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} + transcripts, _ = YouTubeTranscriptApi.get_transcripts( parsed_args.video_ids, languages=parsed_args.languages, - continue_after_error=True + continue_after_error=True, + proxies=proxies ) if parsed_args.json: @@ -53,5 +56,15 @@ class YouTubeTranscriptCli(): default=False, help='If this flag is set the output will be JSON formatted.', ) + parser.add_argument( + '--http-proxy', dest='http_proxy', + default='', metavar='URL', + help='Use the specified HTTP proxy.' + ) + parser.add_argument( + '--https-proxy', dest='https_proxy', + default='', metavar='URL', + help='Use the specified HTTPS proxy.' + ) return parser.parse_args(self._args) diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index a8105e7..a21a02f 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -82,8 +82,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None) self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) def test_get_transcripts__stop_on_error(self): @@ -99,5 +99,19 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None) \ No newline at end of file + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None) + 
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None) + + def test_get_transcript__with_proxies(self): + transcript = YouTubeTranscriptApi.get_transcript( + 'GJLlxj_dtq8', proxies={'http': '', 'https:': ''} + ) + + self.assertEqual( + transcript, + [ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ] + ) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index 8a63c23..f525e66 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -23,6 +23,31 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.languages, ['de', 'en']) + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, True) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') + self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, True) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') + + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + 
self.assertEqual(parsed_args.json, True) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + def test_argument_parsing__only_video_ids(self): parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) @@ -50,6 +75,17 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.json, False) self.assertEqual(parsed_args.languages, ['de', 'en']) + def test_argument_parsing__proxies(self): + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --http-proxy http://user:pass@domain:port'.split() + )._parse_args() + self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') + + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --https-proxy https://user:pass@domain:port'.split() + )._parse_args() + self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + def test_run(self): YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() @@ -57,7 +93,8 @@ class TestYouTubeTranscriptCli(TestCase): YouTubeTranscriptApi.get_transcripts.assert_called_once_with( ['v1', 'v2'], languages=['de', 'en'], - continue_after_error=True + continue_after_error=True, + proxies={"http": "", "https": ""} ) def test_run__json_output(self): From a9270f67fd70b9607bea4ada17116cb4cf005895 Mon Sep 17 00:00:00 2001 From: Danny Aziz Date: Tue, 19 Mar 2019 13:34:17 +0000 Subject: [PATCH 2/5] Refactored Tests --- youtube_transcript_api/_cli.py | 4 +++- youtube_transcript_api/test/test_api.py | 8 ++++++-- youtube_transcript_api/test/test_cli.py | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index d49684b..bc35c21 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -14,7 +14,9 @@ class YouTubeTranscriptCli(): def run(self): parsed_args = 
self._parse_args() - proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} + proxies = None + if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': + proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} transcripts, _ = YouTubeTranscriptApi.get_transcripts( parsed_args.video_ids, diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index a21a02f..a912d69 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -5,7 +5,7 @@ import os import httpretty -from youtube_transcript_api._api import YouTubeTranscriptApi +from youtube_transcript_api._api import YouTubeTranscriptApi, _TranscriptFetcher def load_asset(filename): @@ -103,8 +103,9 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None) def test_get_transcript__with_proxies(self): + proxies = {'http': '', 'https:': ''} transcript = YouTubeTranscriptApi.get_transcript( - 'GJLlxj_dtq8', proxies={'http': '', 'https:': ''} + 'GJLlxj_dtq8', proxies=proxies ) self.assertEqual( @@ -115,3 +116,6 @@ class TestYouTubeTranscriptApi(TestCase): {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} ] ) + YouTubeTranscriptApi.get_transcript = MagicMock() + YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) + YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', None, proxies) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index f525e66..11fdf29 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -39,6 +39,7 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.languages, ['de', 'en']) self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') + 
self.assertEqual(parsed_args.https_proxy, '') parsed_args = YouTubeTranscriptCli( 'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split() @@ -47,6 +48,7 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.languages, ['de', 'en']) self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + self.assertEqual(parsed_args.http_proxy, '') def test_argument_parsing__only_video_ids(self): parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() @@ -94,7 +96,7 @@ class TestYouTubeTranscriptCli(TestCase): ['v1', 'v2'], languages=['de', 'en'], continue_after_error=True, - proxies={"http": "", "https": ""} + proxies=None ) def test_run__json_output(self): From 2b8f213f3d4808bedde565ba34d4903bd86715db Mon Sep 17 00:00:00 2001 From: Danny Aziz Date: Tue, 19 Mar 2019 14:15:01 +0000 Subject: [PATCH 3/5] Removed _TranscriptFetcher --- youtube_transcript_api/test/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index a912d69..81da989 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -5,7 +5,7 @@ import os import httpretty -from youtube_transcript_api._api import YouTubeTranscriptApi, _TranscriptFetcher +from youtube_transcript_api._api import YouTubeTranscriptApi def load_asset(filename): From 7eb9e38eb65b3c78b9914c14e1f59456eec431f4 Mon Sep 17 00:00:00 2001 From: Danny Aziz Date: Wed, 20 Mar 2019 17:06:05 +0000 Subject: [PATCH 4/5] Added more proxies tests --- youtube_transcript_api/test/test_cli.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index 11fdf29..4e1495e 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -50,6 +50,15 @@ class 
TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') self.assertEqual(parsed_args.http_proxy, '') + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --languages de en --json'.split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, True) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.http_proxy, '') + self.assertEqual(parsed_args.https_proxy, '') + def test_argument_parsing__only_video_ids(self): parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) @@ -88,6 +97,18 @@ class TestYouTubeTranscriptCli(TestCase): )._parse_args() self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + parsed_args = YouTubeTranscriptCli( + 'v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() + )._parse_args() + self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') + self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + + parsed_args = YouTubeTranscriptCli( + 'v1 v2'.split() + )._parse_args() + self.assertEqual(parsed_args.http_proxy, '') + self.assertEqual(parsed_args.https_proxy, '') + def test_run(self): YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() From d6d301a612e0f0d7ae952e03cb389558b2a61724 Mon Sep 17 00:00:00 2001 From: Danny Aziz Date: Tue, 26 Mar 2019 17:32:28 +0000 Subject: [PATCH 5/5] Run proxies test --- youtube_transcript_api/test/test_cli.py | 27 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index 4e1495e..6f01967 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -12,16 +12,22 @@ class 
TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.http_proxy, '') + self.assertEqual(parsed_args.https_proxy, '') parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.http_proxy, '') + self.assertEqual(parsed_args.https_proxy, '') parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.http_proxy, '') + self.assertEqual(parsed_args.https_proxy, '') parsed_args = YouTubeTranscriptCli( 'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() @@ -50,15 +56,6 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') self.assertEqual(parsed_args.http_proxy, '') - parsed_args = YouTubeTranscriptCli( - 'v1 v2 --languages de en --json'.split() - )._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.json, True) - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.http_proxy, '') - self.assertEqual(parsed_args.https_proxy, '') - def test_argument_parsing__only_video_ids(self): parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) @@ -126,3 +123,15 @@ class TestYouTubeTranscriptCli(TestCase): # will fail if output is not valid json json.loads(output) + + def test_run__proxies(self): + 
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) + YouTubeTranscriptCli( + 'v1 v2 --languages de en --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()).run() + + YouTubeTranscriptApi.get_transcripts.assert_called_once_with( + ['v1', 'v2'], + languages=['de', 'en'], + continue_after_error=True, + proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} + )