From f8416ab0043025f56aa73c03280ffedbb4972b81 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 17:36:48 +0100 Subject: [PATCH] added new params to cli to make new features accessible using the cli --- youtube_transcript_api/_api.py | 10 +- youtube_transcript_api/_cli.py | 76 ++++++++++++--- youtube_transcript_api/test/test_cli.py | 120 +++++++++++++++++++++--- 3 files changed, 175 insertions(+), 31 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 2a321ea..c1519ae 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -72,11 +72,11 @@ class YouTubeTranscriptApi(): :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of - exceptions which occurred for the videos which could not be retrieved - :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [CouldNotRetrieveTranscript]}): + video ids, which could not be retrieved + :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): """ data = {} - exceptions = [] + unretrievable_videos = [] for video_id in video_ids: try: @@ -85,9 +85,9 @@ class YouTubeTranscriptApi(): if not continue_after_error: raise exception - exceptions.append(exception) + unretrievable_videos.append(video_id) - return data, exceptions + return data, unretrievable_videos @classmethod def get_transcript(cls, video_id, languages=('en',), proxies=None): diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index 34a3c91..4aa79f9 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -14,22 +14,42 @@ class YouTubeTranscriptCli(): def run(self): parsed_args = self._parse_args() + if parsed_args.exclude_manually_created and parsed_args.exclude_generated: + return '' + proxies = None if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} - transcripts, unretrievable_videos = YouTubeTranscriptApi.get_transcripts( - parsed_args.video_ids, - languages=parsed_args.languages, - continue_after_error=True, - proxies=proxies - ) + transcripts = [] + exceptions = [] + + for video_id in parsed_args.video_ids: + try: + transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id)) + except Exception as exception: + exceptions.append(exception) return '\n\n'.join( - [str(exception) for exception in unretrievable_videos] + [str(exception) for exception in exceptions] + ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else []) ) + def _fetch_transcript(self, parsed_args, proxies, video_id): + transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies) + + if parsed_args.exclude_manually_created: + transcript = transcript_list.find_generated_transcript(parsed_args.languages) + elif parsed_args.exclude_generated: + transcript = transcript_list.find_manually_created_transcript(parsed_args.languages) + else: + transcript = transcript_list.find_transcript(parsed_args.languages) + + if parsed_args.translate: + transcript = transcript.translate(parsed_args.translate) + + return transcript.fetch() + def _parse_args(self): parser = argparse.ArgumentParser( description=( @@ -38,6 +58,13 @@ class YouTubeTranscriptCli(): 'other selenium based solutions do!' ) ) + parser.add_argument( + '--list-transcripts', + action='store_const', + const=True, + default=False, + help='This will list the languages in which the given videos are available in.', + ) parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.') parser.add_argument( '--languages', @@ -46,11 +73,25 @@ class YouTubeTranscriptCli(): type=str, help=( 'A list of language codes in a descending priority. For example, if this is set to "de en" it will ' - 'first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails ' + 'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails ' 'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you ' 'may have to play around with the language codes a bit, to find the one which is working for you!' ), ) + parser.add_argument( + '--exclude-generated', + action='store_const', + const=True, + default=False, + help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.', + ) + parser.add_argument( + '--exclude-manually-created', + action='store_const', + const=True, + default=False, + help='If this flag is set transcripts which have been manually created will not be retrieved.', + ) parser.add_argument( '--json', action='store_const', @@ -59,13 +100,24 @@ class YouTubeTranscriptCli(): help='If this flag is set the output will be JSON formatted.', ) parser.add_argument( - '--http-proxy', dest='http_proxy', - default='', metavar='URL', + '--translate', + default='', + help=( + 'The language code for the language you want this transcript to be translated to. Use the ' + '--list-transcripts feature to find out which languages are translatable and which translation ' + 'languages are available.' + ) + ) + parser.add_argument( + '--http-proxy', + default='', + metavar='URL', help='Use the specified HTTP proxy.' ) parser.add_argument( - '--https-proxy', dest='https_proxy', - default='', metavar='URL', + '--https-proxy', + default='', + metavar='URL', help='Use the specified HTTPS proxy.' ) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index 6f01967..e46789e 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -3,10 +3,27 @@ from mock import MagicMock import json -from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi +from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable +from youtube_transcript_api._cli import YouTubeTranscriptCli class TestYouTubeTranscriptCli(TestCase): + def setUp(self): + self.transcript_mock = MagicMock() + self.transcript_mock.fetch = MagicMock(return_value=[ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ]) + self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) + + self.transcript_list_mock = MagicMock() + self.transcript_list_mock.find_generated_transcript = MagicMock(return_value=self.transcript_mock) + self.transcript_list_mock.find_manually_created_transcript = MagicMock(return_value=self.transcript_mock) + self.transcript_list_mock.find_transcript = MagicMock(return_value=self.transcript_mock) + + YouTubeTranscriptApi.list_transcripts = MagicMock(return_value=self.transcript_list_mock) + def test_argument_parsing(self): parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) @@ -106,32 +123,107 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.http_proxy, '') self.assertEqual(parsed_args.https_proxy, '') + def test_argument_parsing__list_transcripts(self): + parsed_args = YouTubeTranscriptCli('--list-transcripts v1 v2'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.list_transcripts) + + parsed_args = YouTubeTranscriptCli('v1 v2 --list-transcripts'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.list_transcripts) + + def test_argument_parsing__translate(self): + parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, False) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.translate, 'cz') + + parsed_args = YouTubeTranscriptCli('v1 v2 --translate cz --languages de en'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, False) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.translate, 'cz') + + def test_argument_parsing__manually_or_generated(self): + parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.exclude_manually_created) + self.assertFalse(parsed_args.exclude_generated) + + parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-generated'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertFalse(parsed_args.exclude_manually_created) + self.assertTrue(parsed_args.exclude_generated) + + parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created --exclude-generated'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.exclude_manually_created) + self.assertTrue(parsed_args.exclude_generated) + def test_run(self): - YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() - YouTubeTranscriptApi.get_transcripts.assert_called_once_with( - ['v1', 'v2'], - languages=['de', 'en'], - continue_after_error=True, - proxies=None + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) + + self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en']) + + def test_run__failing_transcripts(self): + YouTubeTranscriptApi.list_transcripts = MagicMock(side_effect=VideoUnavailable('video_id')) + + output = YouTubeTranscriptCli('v1 --languages de en'.split()).run() + + self.assertEqual(output, str(VideoUnavailable('video_id'))) + + def test_run__exclude_generated(self): + YouTubeTranscriptCli('v1 v2 --languages de en --exclude-generated'.split()).run() + + self.transcript_list_mock.find_manually_created_transcript.assert_any_call(['de', 'en']) + + def test_run__exclude_manually_created(self): + YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created'.split()).run() + + self.transcript_list_mock.find_generated_transcript.assert_any_call(['de', 'en']) + + def test_run__exclude_manually_created_and_generated(self): + self.assertEqual( + YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split()).run(), + '' ) + def test_run__translate(self): + YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split()).run(), + + self.transcript_mock.translate.assert_any_call('cz') + + def test_run__list_transcripts(self): + YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run() + + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) + def test_run__json_output(self): - YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([{'boolean': True}], [])) output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run() # will fail if output is not valid json json.loads(output) def test_run__proxies(self): - YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) YouTubeTranscriptCli( - 'v1 v2 --languages de en --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()).run() + ( + 'v1 v2 --languages de en ' + '--http-proxy http://user:pass@domain:port ' + '--https-proxy https://user:pass@domain:port' + ).split() + ).run() - YouTubeTranscriptApi.get_transcripts.assert_called_once_with( - ['v1', 'v2'], - languages=['de', 'en'], - continue_after_error=True, + YouTubeTranscriptApi.list_transcripts.assert_any_call( + 'v1', + proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} + ) + + YouTubeTranscriptApi.list_transcripts.assert_any_call( + 'v2', proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} )