From de1ddf0824efcb4182ebbc42395abe9505c6366f Mon Sep 17 00:00:00 2001 From: danielcliu Date: Wed, 16 Oct 2019 22:01:30 -0700 Subject: [PATCH 01/26] Implemented code to more thoroughly find a lanuages captions --- youtube_transcript_api/_api.py | 35 +- .../test/assets/youtubeWW1.html.static | 1519 +++++++++++++++++ youtube_transcript_api/test/test_api.py | 6 +- 3 files changed, 1534 insertions(+), 26 deletions(-) create mode 100644 youtube_transcript_api/test/assets/youtubeWW1.html.static diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 5664a1c..c418fd3 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -99,7 +99,7 @@ class YouTubeTranscriptApi(): class _TranscriptFetcher(): WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' - API_BASE_URL = 'https://www.youtube.com/api/{api_url}' + API_BASE_URL = 'https://www.youtube.com/api/' LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') TIMEDTEXT_STRING = 'timedtext?v=' @@ -107,39 +107,28 @@ class _TranscriptFetcher(): self.video_id = video_id self.languages = languages self.proxies = proxies + self.matched_splits = [] def fetch(self): if self.proxies: fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text else: fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text - timedtext_splits = fetched_site.split(self.TIMEDTEXT_STRING) - timedtext_url_start = ( - timedtext_splits[2].find(self.TIMEDTEXT_STRING) - + len(timedtext_splits[0]) - + len(timedtext_splits[1]) - + len(self.TIMEDTEXT_STRING) + 1 - ) - - for language in (self.languages if self.languages else [None,]): - response = self._execute_api_request(fetched_site, timedtext_url_start, language) + timedtext_splits = [split[:split.find('"')].replace('\\u0026', '&').replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)] + for language in (self.languages if self.languages else ['en']): + self.matched_splits = 
[split for split in timedtext_splits if f'&lang={language}' in split] + if self.matched_splits: + break + if self.matched_splits: + timedtext_url = min(self.matched_splits, key=len) + response = self._execute_api_request(timedtext_url, language) if response: return response return None - def _execute_api_request(self, fetched_site, timedtext_url_start, language): - url = self.API_BASE_URL.format( - api_url=fetched_site[ - timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') - ].replace( - '\\u0026', '&' - ).replace( - '\\', '' - ) - ) - if language: - url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) + def _execute_api_request(self, timedtext_url, language): + url = f'{self.API_BASE_URL}{self.TIMEDTEXT_STRING}{timedtext_url}' if self.proxies: return requests.get(url, proxies=self.proxies).text else: diff --git a/youtube_transcript_api/test/assets/youtubeWW1.html.static b/youtube_transcript_api/test/assets/youtubeWW1.html.static new file mode 100644 index 0000000..f5e5149 --- /dev/null +++ b/youtube_transcript_api/test/assets/youtubeWW1.html.static @@ -0,0 +1,1519 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + +The Tide Is Turning - Russian Civil War Fall 1919 I THE GREAT WAR 1919 - YouTube + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+
+ +
+
+
+ +
+
+
+

+ + + +Loading... + +

+ +
+
+
+ +
+
+
+ +
+
+ + +
+
+
+ +
+
+
+
+ + +
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+ +
+
+
+

+ + + + + The Tide Is Turning - Russian Civil War Fall 1919 I THE GREAT WAR 1919 + + +

+
+
+ + +
+ + + + + +
+
36,240 views
+
+
+
+
+
+ + + + + +
+
+ + + +
+
+
+
+

+ + + +Loading... + +

+ +
+
+
+ +
+ +
+
+
+

+ + + +Loading... + +

+ +
+
+
+

+Transcript +

+
+ +
+ + + +
+
+The interactive transcript could not be loaded. +
+ + +
+
+ +
+ +
+
+

+ + + +Loading... + +

+ +
+
+ + +
+
+ Rating is available when the video has been rented. +
+ +
+ +
+
+ This feature is not available right now. Please try again later. +
+
+ + +
+ + +
+ + +
Published on Oct 14, 2019

Support 16 Days in Berlin: https://realtimehistory.net/indiegogo

The White Russian advance on Moscow comes to a crashing end as the Red Army manages to turn the tide of the Russian Civil War in Fall 1919.

» SUPPORT THE CHANNEL
Patreon: https://www.patreon.com/thegreatwar
Merchandise: https://shop.spreadshirt.de/thegreatwar/
Become a member: https://www.youtube.com/channel/UCUcy...

» BUY OUR SOURCES IN OUR AMAZON STORES
Our Amazon US Store: https://www.amazon.com/shop/influence...
Our Amazon CA Store: https://www.amazon.ca/shop/influencer...
Our Amazon UK Store: https://www.amazon.co.uk/shop/influen...

» SOURCES
Figes, Orlando. A People’s Tragedy. The Russian Revolution (London: The Bodley Head, 2017 [1996]).
Mawdsley, Evan. The Russian Civil War (New York: Pegasus Books, 2005).
Smele, Jonathan. The ‘Russian’ Civil Wars 1916-1926 (London: Hurst, 2015).
Sumpf, Alexandre. “Russian Civil War,” in 1914-1918 online. International Encyclopedia of the First World War. https://encyclopedia.1914-1918-online.
Engelstein, Laura. Russia in Flames (Oxford University Press, 2017).

» SOCIAL MEDIA
Facebook: https://facebook.com/TheGreatWarYT
Instagram: https://instagram.com/the_great_war
Twitter: https://twitter.com/WW1_Series
Reddit: htpps://reddit.com/r/TheGreatWarChannel

»CREDITS
Presented by: Jesse Alexander
Written by: Jesse Alexander
Director: Toni Steller & Florian Wittig
Director of Photography: Toni Steller
Sound: Toni Steller
Editing: Toni Steller
Mixing, Mastering & Sound Design: http://above-zero.com
Maps: Daniel Kogosov (https://www.patreon.com/Zalezsky)
Research by: Jesse Alexander
Fact checking: Florian Wittig

Channel Design: Alexander Clark
Original Logo: David van Stephold


A Mediakraft Networks Original Channel

Contains licensed material by getty images
All rights reserved - Real Time History GmbH 2019

+ +
+
+ +
+ + +
+
+

+ + + +Loading... + +

+ +
+ +
+ + +
+
+
+ + + +
+
+ +
+ +
+
+
+Advertisement +
+
+
+
+ + +
+
+
+
+
+ + + +When autoplay is enabled, a suggested video will automatically play next. + + + +
+

+ Up next +

+ + +
+
+ + +
+
+
+ +
+
+ +
+
+ +
+
+
+ + +
+ +
+ +
+
+ + +
+
+ + +
+ to add this to Watch Later + +
+
+

+Add to +

+
+
+

+ + + + Loading playlists... + +

+ +
+
+ + + + + + + \ No newline at end of file diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index e298900..6fd48b5 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -53,11 +53,11 @@ class TestYouTubeTranscriptApi(TestCase): def test_get_transcript__fallback_language_is_used(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/api/timedtext', - body='' + 'https://www.youtube.com/watch', + body=load_asset('youtubeWW1.html.static') ) - YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) + YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en']) query_string = httpretty.last_request().querystring self.assertIn('lang', query_string) From 8fb9e75ba1abe156193553a7d57bec6973e4338e Mon Sep 17 00:00:00 2001 From: danielcliu Date: Mon, 21 Oct 2019 22:21:32 -0700 Subject: [PATCH 02/26] Formatted list comprehension for readability --- youtube_transcript_api/_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index c418fd3..493a42d 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -114,7 +114,10 @@ class _TranscriptFetcher(): fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text else: fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text - timedtext_splits = [split[:split.find('"')].replace('\\u0026', '&').replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)] + timedtext_splits = [split[:split.find('"')] + .replace('\\u0026', '&') + .replace('\\', '') + for split in fetched_site.split(self.TIMEDTEXT_STRING)] for language in (self.languages if self.languages else ['en']): self.matched_splits = [split for split in timedtext_splits if f'&lang={language}' in split] if self.matched_splits: From d8b5208824147b4c214cac0d6d35ac09004ff70e 
Mon Sep 17 00:00:00 2001 From: danielcliu Date: Mon, 21 Oct 2019 22:38:17 -0700 Subject: [PATCH 03/26] Modified README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 86f42dd..5eaff5a 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ This will return a list of dictionaries looking somewhat like this: ] ``` -You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to english). +You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). ```python YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) @@ -118,4 +118,4 @@ youtube_transcript_api --http-proxy http://us If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :) -[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) \ No newline at end of file +[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) From 630b33f3b86ae4238d491bf07fe21ea164819dba Mon Sep 17 00:00:00 2001 From: danielcliu Date: Tue, 22 Oct 2019 20:37:46 -0700 Subject: [PATCH 04/26] Removed fstrings to make compatable with python 3.5 and lower --- youtube_transcript_api/_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 493a42d..dd65e78 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -131,7 +131,7 @@ class _TranscriptFetcher(): return None def _execute_api_request(self, timedtext_url, language): - url = f'{self.API_BASE_URL}{self.TIMEDTEXT_STRING}{timedtext_url}' + url = 
'{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) if self.proxies: return requests.get(url, proxies=self.proxies).text else: From db7b8524dcf44cfd99bceef51d2b30bba83237d7 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Tue, 22 Oct 2019 21:03:41 -0700 Subject: [PATCH 05/26] Removed another f-string instance --- youtube_transcript_api/_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index dd65e78..964e4d0 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -119,7 +119,7 @@ class _TranscriptFetcher(): .replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)] for language in (self.languages if self.languages else ['en']): - self.matched_splits = [split for split in timedtext_splits if f'&lang={language}' in split] + self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] if self.matched_splits: break if self.matched_splits: From 7ac7d3266b0afc969e6b6a28961555588e3444e5 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Wed, 6 Nov 2019 21:19:24 -0800 Subject: [PATCH 06/26] Changed youtubeWWW1.html.static's name --- ...tubeWW1.html.static => youtube_ww1_nl_en.html.static} | 0 youtube_transcript_api/test/test_api.py | 9 +++++---- 2 files changed, 5 insertions(+), 4 deletions(-) rename youtube_transcript_api/test/assets/{youtubeWW1.html.static => youtube_ww1_nl_en.html.static} (100%) diff --git a/youtube_transcript_api/test/assets/youtubeWW1.html.static b/youtube_transcript_api/test/assets/youtube_ww1_nl_en.html.static similarity index 100% rename from youtube_transcript_api/test/assets/youtubeWW1.html.static rename to youtube_transcript_api/test/assets/youtube_ww1_nl_en.html.static diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 6fd48b5..c53f81c 100644 --- a/youtube_transcript_api/test/test_api.py +++ 
b/youtube_transcript_api/test/test_api.py @@ -54,7 +54,7 @@ class TestYouTubeTranscriptApi(TestCase): httpretty.register_uri( httpretty.GET, 'https://www.youtube.com/watch', - body=load_asset('youtubeWW1.html.static') + body=load_asset('youtube_ww1_nl_en.html.static') ) YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en']) @@ -99,8 +99,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ['en'], None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ['en'], None) def test_get_transcript__with_proxies(self): proxies = {'http': '', 'https:': ''} @@ -118,4 +118,5 @@ class TestYouTubeTranscriptApi(TestCase): ) YouTubeTranscriptApi.get_transcript = MagicMock() YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', None, proxies) + print(YouTubeTranscriptApi.get_transcript.mock_calls) + YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ['en'], proxies) From c7cb3117be607f3a4fa5ff90eb63a1a65e0b1391 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Wed, 6 Nov 2019 21:20:51 -0800 Subject: [PATCH 07/26] Removed unnecessary language variables, sort split matches by len while ignoring name arguement --- youtube_transcript_api/_api.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 964e4d0..9805f80 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -40,7 +40,7 @@ class YouTubeTranscriptApi(): self.video_id = video_id @classmethod - def get_transcripts(cls, video_ids, languages=None, 
continue_after_error=False, proxies=None): + def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None): """ Retrieves the transcripts for a list of videos. @@ -75,7 +75,7 @@ class YouTubeTranscriptApi(): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=None, proxies=None): + def get_transcript(cls, video_id, languages=['en'], proxies=None): """ Retrieves the transcript for a single video. @@ -100,14 +100,14 @@ class YouTubeTranscriptApi(): class _TranscriptFetcher(): WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' API_BASE_URL = 'https://www.youtube.com/api/' - LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') TIMEDTEXT_STRING = 'timedtext?v=' + NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)') def __init__(self, video_id, languages, proxies): self.video_id = video_id self.languages = languages + print(languages) self.proxies = proxies - self.matched_splits = [] def fetch(self): if self.proxies: @@ -118,19 +118,25 @@ class _TranscriptFetcher(): .replace('\\u0026', '&') .replace('\\', '') for split in fetched_site.split(self.TIMEDTEXT_STRING)] - for language in (self.languages if self.languages else ['en']): - self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] - if self.matched_splits: + matched_splits = [] + for language in self.languages: + matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] + if matched_splits: break - if self.matched_splits: - timedtext_url = min(self.matched_splits, key=len) - response = self._execute_api_request(timedtext_url, language) + if matched_splits: + timedtext_url = min(matched_splits, key=self._sort_splits) + response = self._execute_api_request(timedtext_url) if response: return response return None - def _execute_api_request(self, timedtext_url, language): + #Sorting the matched splits by string length because we want non-asr options returned 
first + #However, we don't want to include the length of the 'name' argument as it could possible throw this off + def _sort_splits(self, matched_split): + return len(re.sub(self.NAME_REGEX, r'\1', matched_split)) + + def _execute_api_request(self, timedtext_url): url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) if self.proxies: return requests.get(url, proxies=self.proxies).text From d224b02a80517de075178f6cb707f0a43516c176 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Sun, 10 Nov 2019 22:44:24 -0800 Subject: [PATCH 08/26] Languages argument defaults to a tuple instead of a list. --- youtube_transcript_api/_api.py | 16 +++++++++++----- youtube_transcript_api/test/test_api.py | 7 +++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 9805f80..2a38340 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -40,7 +40,7 @@ class YouTubeTranscriptApi(): self.video_id = video_id @classmethod - def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None): + def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): """ Retrieves the transcripts for a list of videos. @@ -75,7 +75,7 @@ class YouTubeTranscriptApi(): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=['en'], proxies=None): + def get_transcript(cls, video_id, languages=('en',), proxies=None): """ Retrieves the transcript for a single video. 
@@ -106,7 +106,6 @@ class _TranscriptFetcher(): def __init__(self, video_id, languages, proxies): self.video_id = video_id self.languages = languages - print(languages) self.proxies = proxies def fetch(self): @@ -131,9 +130,16 @@ class _TranscriptFetcher(): return None - #Sorting the matched splits by string length because we want non-asr options returned first - #However, we don't want to include the length of the 'name' argument as it could possible throw this off def _sort_splits(self, matched_split): + """Returns a value related to a given caption track url. + + This function is used to sort the matched splits by string + length because we want non-asr and non-dialect options returned first. + With this in mind, it is remove the 'name' arugument from the url as + it could possibly make the values inaccurate to what we desire. + + matched_split: The caption track url we want to return a value for. + """ return len(re.sub(self.NAME_REGEX, r'\1', matched_split)) def _execute_api_request(self, timedtext_url): diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index c53f81c..a151c6b 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -99,8 +99,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ['en'], None) - YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ['en'], None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None) def test_get_transcript__with_proxies(self): proxies = {'http': '', 'https:': ''} @@ -118,5 +118,4 @@ class TestYouTubeTranscriptApi(TestCase): ) YouTubeTranscriptApi.get_transcript = MagicMock() YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - 
print(YouTubeTranscriptApi.get_transcript.mock_calls) - YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ['en'], proxies) + YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies) From 54ef72fafdac8a8a5c8a8098732bde7eabd0b081 Mon Sep 17 00:00:00 2001 From: danielcliu Date: Mon, 11 Nov 2019 19:58:12 -0800 Subject: [PATCH 09/26] Improve Name regex to use 1 group, find first & --- youtube_transcript_api/_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 2a38340..df8d2e2 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -101,7 +101,7 @@ class _TranscriptFetcher(): WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' API_BASE_URL = 'https://www.youtube.com/api/' TIMEDTEXT_STRING = 'timedtext?v=' - NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)') + NAME_REGEX = re.compile(r'&name=.*?(&)|&name=.*') def __init__(self, video_id, languages, proxies): self.video_id = video_id From df417be9156911ff8a3b3aaf4f63f65b798de5af Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Sun, 8 Dec 2019 14:40:57 +0100 Subject: [PATCH 10/26] refactored the way transcript information is retrieved and thereby improved error messages --- youtube_transcript_api/__init__.py | 2 + youtube_transcript_api/_api.py | 117 +- youtube_transcript_api/_errors.py | 62 + youtube_transcript_api/_settings.py | 1 + youtube_transcript_api/_transcripts.py | 202 ++ .../youtube_transcripts_disabled.html.static | 2160 +++++++++++++++++ .../youtube_video_unavailable.html.static | 806 ++++++ youtube_transcript_api/test/test_api.py | 26 +- 8 files changed, 3258 insertions(+), 118 deletions(-) create mode 100644 youtube_transcript_api/_errors.py create mode 100644 youtube_transcript_api/_settings.py create mode 100644 youtube_transcript_api/_transcripts.py create mode 100644 
youtube_transcript_api/test/assets/youtube_transcripts_disabled.html.static create mode 100644 youtube_transcript_api/test/assets/youtube_video_unavailable.html.static diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index 3f22674..e2ed0aa 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -1 +1,3 @@ from ._api import YouTubeTranscriptApi +from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript +from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index df8d2e2..98ce16c 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,44 +1,9 @@ -import sys - -# This can only be tested by using different python versions, therefore it is not covered by coverage.py -if sys.version_info.major == 2: # pragma: no cover - reload(sys) - sys.setdefaultencoding('utf-8') - -from xml.etree import ElementTree - -import re - import requests -from ._html_unescaping import unescape +from ._transcripts import TranscriptDataFetcher class YouTubeTranscriptApi(): - class CouldNotRetrieveTranscript(Exception): - """ - Raised if a transcript could not be retrieved. - """ - - ERROR_MESSAGE = ( - 'Could not get the transcript for the video {video_url}! ' - 'This usually happens if one of the following things is the case:\n' - ' - subtitles have been disabled by the uploader\n' - ' - none of the language codes you provided are valid\n' - ' - none of the languages you provided are supported by the video\n' - ' - the video is no longer available.\n\n' - 'If none of these things is the case, please create an issue at ' - 'https://github.com/jdepoix/youtube-transcript-api/issues.' - 'Please add which version of youtube_transcript_api you are using and make sure that there ' - 'are no open issues which already describe your problem!' 
- ) - - def __init__(self, video_id): - super(YouTubeTranscriptApi.CouldNotRetrieveTranscript, self).__init__( - self.ERROR_MESSAGE.format(video_url=_TranscriptFetcher.WATCH_URL.format(video_id=video_id)) - ) - self.video_id = video_id - @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): """ @@ -47,7 +12,7 @@ class YouTubeTranscriptApi(): :param video_ids: a list of youtube video ids :type video_ids: [str] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] - it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to + it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit, to find the one which is working for you! 
:type languages: [str] @@ -91,78 +56,6 @@ class YouTubeTranscriptApi(): :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype: [{'text': str, 'start': float, 'end': float}] """ - try: - return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse() - except Exception: - raise YouTubeTranscriptApi.CouldNotRetrieveTranscript(video_id) - - -class _TranscriptFetcher(): - WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' - API_BASE_URL = 'https://www.youtube.com/api/' - TIMEDTEXT_STRING = 'timedtext?v=' - NAME_REGEX = re.compile(r'&name=.*?(&)|&name=.*') - - def __init__(self, video_id, languages, proxies): - self.video_id = video_id - self.languages = languages - self.proxies = proxies - - def fetch(self): - if self.proxies: - fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text - else: - fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text - timedtext_splits = [split[:split.find('"')] - .replace('\\u0026', '&') - .replace('\\', '') - for split in fetched_site.split(self.TIMEDTEXT_STRING)] - matched_splits = [] - for language in self.languages: - matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] - if matched_splits: - break - if matched_splits: - timedtext_url = min(matched_splits, key=self._sort_splits) - response = self._execute_api_request(timedtext_url) - if response: - return response - - return None - - def _sort_splits(self, matched_split): - """Returns a value related to a given caption track url. - - This function is used to sort the matched splits by string - length because we want non-asr and non-dialect options returned first. - With this in mind, it is remove the 'name' arugument from the url as - it could possibly make the values inaccurate to what we desire. - - matched_split: The caption track url we want to return a value for. 
- """ - return len(re.sub(self.NAME_REGEX, r'\1', matched_split)) - - def _execute_api_request(self, timedtext_url): - url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) - if self.proxies: - return requests.get(url, proxies=self.proxies).text - else: - return requests.get(url).text - - -class _TranscriptParser(): - HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) - - def __init__(self, plain_data): - self.plain_data = plain_data - - def parse(self): - return [ - { - 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), - 'start': float(xml_element.attrib['start']), - 'duration': float(xml_element.attrib['dur']), - } - for xml_element in ElementTree.fromstring(self.plain_data) - if xml_element.text is not None - ] + with requests.Session() as http_client: + http_client.proxies = proxies if proxies else {} + return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py new file mode 100644 index 0000000..5dc4d8e --- /dev/null +++ b/youtube_transcript_api/_errors.py @@ -0,0 +1,62 @@ +from ._settings import WATCH_URL + + +class CouldNotRetrieveTranscript(Exception): + """ + Raised if a transcript could not be retrieved. + """ + ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!' + CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}' + CAUSE_MESSAGE = '' + GITHUB_REFERRAL = ( + '\n\nIf you are sure that the described cause is not responsible for this error ' + 'and that a transcript should be retrievable, please create an issue at ' + 'https://github.com/jdepoix/youtube-transcript-api/issues.' + 'Please add which version of youtube_transcript_api you are using ' + 'and provide the information needed to replicate the error. ' + 'Also make sure that there are no open issues which already describe your problem!' 
+ ) + + def __init__(self, video_id): + self.video_id = video_id + super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message()) + + def _build_error_message(self): + cause = self.cause + error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id)) + + if cause: + error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL + + return error_message + + @property + def cause(self): + return self.CAUSE_MESSAGE + + +class VideoUnavailable(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'The video is no longer available' + + +class TranscriptsDisabled(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'Subtitles are disabled for this video' + + +class NoTranscriptFound(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = ( + 'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n' + '{transcript_data}' + ) + + def __init__(self, video_id, requested_language_codes, transcript_data): + self._requested_language_codes = requested_language_codes + self._transcript_data = transcript_data + super(NoTranscriptFound, self).__init__(video_id) + + @property + def cause(self): + return self.CAUSE_MESSAGE.format( + requested_language_codes=self._requested_language_codes, + transcript_data=str(self._transcript_data), + ) diff --git a/youtube_transcript_api/_settings.py b/youtube_transcript_api/_settings.py new file mode 100644 index 0000000..b1f7dfe --- /dev/null +++ b/youtube_transcript_api/_settings.py @@ -0,0 +1 @@ +WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py new file mode 100644 index 0000000..37a272b --- /dev/null +++ b/youtube_transcript_api/_transcripts.py @@ -0,0 +1,202 @@ +import sys + +# This can only be tested by using different python versions, therefore it is not covered by coverage.py +if sys.version_info.major == 2: # pragma: no cover + reload(sys) + 
sys.setdefaultencoding('utf-8') + +import json + +from xml.etree import ElementTree + +import re + +from ._html_unescaping import unescape +from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled +from ._settings import WATCH_URL + + +class TranscriptDataFetcher(): + def __init__(self, http_client): + self._http_client = http_client + + def fetch(self, video_id): + return TranscriptData.build( + self._http_client, + video_id, + self._extract_captions_json(self._fetch_html(video_id), video_id) + ) + + def _extract_captions_json(self, html, video_id): + splitted_html = html.split('"captions":') + + if len(splitted_html) <= 1: + if '"playabilityStatus":' not in html: + raise VideoUnavailable(video_id) + + raise TranscriptsDisabled(video_id) + + return json.loads(splitted_html[1].split(',"videoDetails')[0].replace('\n', ''))[ + 'playerCaptionsTracklistRenderer' + ] + + def _fetch_html(self, video_id): + return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace( + '\\u0026', '&' + ).replace( + '\\', '' + ) + + +class TranscriptData(): + # TODO implement iterator + + def __init__( + self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages + ): + self._http_client = http_client + self.video_id = video_id + self._manually_created_transcripts = manually_created_transcripts + self._generated_transcripts = generated_transcripts + self._translation_languages = translation_languages + + @staticmethod + def build(http_client, video_id, captions_json): + manually_created_transcripts = [] + generated_transcripts = [] + + for caption in captions_json['captionTracks']: + (generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append( + { + 'url': caption['baseUrl'], + 'language': caption['name']['simpleText'], + 'language_code': caption['languageCode'], + 'is_generated': caption.get('kind', '') == 'asr', + 'is_translatable': caption['isTranslatable'], + } + ) + + 
return TranscriptData( + http_client, + video_id, + manually_created_transcripts, + generated_transcripts, + [ + { + 'language': translation_language['languageName']['simpleText'], + 'language_code': translation_language['languageCode'], + } for translation_language in captions_json['translationLanguages'] + ], + ) + + def find_transcript(self, language_codes): + try: + return self.find_manually_created_transcript(language_codes) + except NoTranscriptFound: + pass + + return self.find_generated_transcript(language_codes) + + def find_generated_transcript(self, language_codes): + return self._find_transcript(language_codes, generated=True) + + def find_manually_created_transcript(self, language_codes): + return self._find_transcript(language_codes, generated=False) + + def _find_transcript(self, language_codes, generated): + transcripts = self._generated_transcripts if generated else self._manually_created_transcripts + + for language_code in language_codes: + for transcript in transcripts: + if transcript['language_code'] == language_code: + return Transcript( + self._http_client, + transcript['url'], + transcript['language'], + transcript['language_code'], + transcript['is_generated'], + self._translation_languages if transcript['is_translatable'] else [] + ) + + raise NoTranscriptFound( + self.video_id, + language_codes, + self + ) + + def __str__(self): + return ( + 'For this video ({video_id}) transcripts are available in the following languages:\n\n' + '(MANUALLY CREATED)\n' + '{available_manually_created_transcript_languages}\n\n' + '(GENERATED)\n' + '{available_generated_transcripts}' + ).format( + video_id=self.video_id, + available_manually_created_transcript_languages=self._get_language_description( + self._manually_created_transcripts + ), + available_generated_transcripts=self._get_language_description( + self._generated_transcripts + ), + ) + + def _get_language_description(self, transcripts): + return '\n'.join( + ' - {language_code} 
("{language}")'.format( + language=transcript['language'], + language_code=transcript['language_code'], + ) for transcript in transcripts + ) if transcripts else 'None' + + +class Transcript(): + def __init__(self, http_client, url, language, language_code, is_generated, translation_languages): + self._http_client = http_client + self.url = url + self.language = language + self.language_code = language_code + self.is_generated = is_generated + self.translation_languages = translation_languages + + def fetch(self): + return _TranscriptParser().parse( + self._http_client.get(self.url).text + ) + +# TODO integrate translations in future release +# @property +# def is_translatable(self): +# return len(self.translation_languages) > 0 +# +# +# class TranslatableTranscript(Transcript): +# def __init__(self, http_client, url, translation_languages): +# super(TranslatableTranscript, self).__init__(http_client, url) +# self._translation_languages = translation_languages +# self._translation_language_codes = {language['language_code'] for language in translation_languages} +# +# +# def translate(self, language_code): +# if language_code not in self._translation_language_codes: +# raise TranslatableTranscript.TranslationLanguageNotAvailable() +# +# return Transcript( +# self._http_client, +# '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code) +# ) + + +class _TranscriptParser(): + HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) + + def parse(self, plain_data): + return [ + { + 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), + 'start': float(xml_element.attrib['start']), + 'duration': float(xml_element.attrib['dur']), + } + for xml_element in ElementTree.fromstring(plain_data) + if xml_element.text is not None + ] diff --git a/youtube_transcript_api/test/assets/youtube_transcripts_disabled.html.static b/youtube_transcript_api/test/assets/youtube_transcripts_disabled.html.static new file mode 100644 index 0000000..626cc67 
--- /dev/null +++ b/youtube_transcript_api/test/assets/youtube_transcripts_disabled.html.static @@ -0,0 +1,2160 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Angèle - Eels x Richard Cocciante | A Take Away Show - YouTube + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+
+ + +
+
+
+
+ +
+ DE +
+
+ +
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+
+
+
+
+
+ +
+ +
+ +
+
+ + +
+
+ + +
+
+
+
+ +
+
+
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ #Angèle #BrolLaSuite #Blogothèque + +
+

+ + + + + Angèle - Eels x Richard Cocciante | A Take Away Show + + +

+
+
+ + +
+
+ + + + + +
+ + +
+
+
+
+
491.364 Aufrufe
+
+
+
+
+
+ + + + + +
+
+
+ + + +
+
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+ +
+
+

+ + + +Wird geladen... + +

+ +
+
+ + +
+
+ Die Bewertungsfunktion ist nach Ausleihen des Videos verfügbar. +
+ +
+ +
+
+ Diese Funktion ist gerade nicht verfügbar. Bitte versuche es später noch einmal. +
+
+ + +
+ + +
+ + +
+
+
+
+
Am 28.11.2019 veröffentlicht +
+
+

Abonnez-vous ! http://bit.ly/SubBlogo
Retrouvez le concert en intégralité sur CANAL+ via myCANAL : + http://bit.ly/2srC54F


La Blogothèque & Off Productions
avec la participation de Canal+

Filmé au Comptoir Général, Paris, en octobre 2019
Réalisation: Xavier Reim
Directeur de la photographie: Thibaut Charlut
Cadreur: Célidja Pornon

Réalisation son: Jean-Baptiste Aubonnet & Guillaume De La Villéon
Opérateur son: Alban Lejeune

Producteur délégué: Christophe Abric
Producteur Exécutif: Anousonne Savanchomkeo
Directeur de Production: Rémi Veyrié

#Angèle #BrolLaSuite #Blogothèque

— Follow La Blogothèque : +
http://blogotheque.net
http://facebook.com/blogotheque
http://instagram.com/blogotheque
http://twitter.com/blogotheque

— Stay a while :
Take Away Shows, the Very Best : + http://bit.ly/TASBest
Take Away Shows 2018 : http://bit.ly/TAShow18
Take Away Shows 2017 : http://bit.ly/TAShow17
Take Away Shows 2016 : http://bit.ly/TAShow16

For more than ten years, La Blogotheque has changed the way people experience music videos. We film beautiful, rare and intimate sessions with your favorite artists, and the ones you are soon to fall in love with. Come, stay a while, and be taken away.

+
+
+ +
+
+
+
+ + +
+ + +
+
+

+ + + +Wird geladen... + +

+ +
+ +
+ + +
+
+
+ + + +
+
+ +
+ +
+
+
+ Anzeige +
+
+
+
+ + +
+
+
+
+
+ + + + Wenn Autoplay aktiviert ist, wird die Wiedergabe automatisch mit einem der aktuellen Videovorschläge fortgesetzt. + + + + + +
+

+ Nächstes Video +

+ + +
+
+ + +
+
+
+ +
+
+ +
+
+ +
+
+
+ + +
+ +
+
+
+
+ + +
+
+ + +
+
+ + +
+
+ , um dieses Video zur Playlist "Später ansehen" hinzuzufügen. + +
+
+
+

+ Hinzufügen +

+
+
+

+ + + + Playlists werden geladen... + +

+ +
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/youtube_transcript_api/test/assets/youtube_video_unavailable.html.static b/youtube_transcript_api/test/assets/youtube_video_unavailable.html.static new file mode 100644 index 0000000..d9c0106 --- /dev/null +++ b/youtube_transcript_api/test/assets/youtube_video_unavailable.html.static @@ -0,0 +1,806 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + YouTube + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+
+ + +
+
+
+
+ +
+ DE +
+
+ +
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+
+
+
+
+ +
+ +
+ +
+
+
+
+
+ +
+ +
+ +
+
+ + +
+
+ + +
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ + +
+ +
+
+
+
+ + +
+
+ + +
+
+ + +
+
+ , um dieses Video zur Playlist "Später ansehen" hinzuzufügen. + +
+
+
+

+ Hinzufügen +

+
+
+

+ + + + Playlists werden geladen... + +

+ +
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index a151c6b..b2897af 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -5,7 +5,7 @@ import os import httpretty -from youtube_transcript_api._api import YouTubeTranscriptApi +from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable, NoTranscriptFound, TranscriptsDisabled def load_asset(filename): @@ -64,15 +64,29 @@ class TestYouTubeTranscriptApi(TestCase): self.assertEqual(len(query_string['lang']), 1) self.assertEqual(query_string['lang'][0], 'en') - def test_get_transcript__exception_is_raised_when_not_available(self): + def test_get_transcript__exception_if_video_unavailable(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/api/timedtext', - body='' + 'https://www.youtube.com/watch', + body=load_asset('youtube_video_unavailable.html.static') ) - with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript): - YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') + with self.assertRaises(VideoUnavailable): + YouTubeTranscriptApi.get_transcript('abc') + + def test_get_transcript__exception_if_transcripts_disabled(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/watch', + body=load_asset('youtube_transcripts_disabled.html.static') + ) + + with self.assertRaises(TranscriptsDisabled): + YouTubeTranscriptApi.get_transcript('dsMFmonKDD4') + + def test_get_transcript__exception_if_language_unavailable(self): + with self.assertRaises(NoTranscriptFound): + YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz']) def test_get_transcripts(self): video_id_1 = 'video_id_1' From c2c49c3c1751695f210fa2a5c3a9b0b380933293 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Wed, 11 Dec 2019 11:42:14 +0100 Subject: [PATCH 11/26] fixed bug; added doctstrings for public methods --- 
youtube_transcript_api/__init__.py | 2 +- youtube_transcript_api/_api.py | 12 +- youtube_transcript_api/_transcripts.py | 170 ++++++++++++++++++------- 3 files changed, 131 insertions(+), 53 deletions(-) diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index e2ed0aa..c9bb4eb 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -1,3 +1,3 @@ from ._api import YouTubeTranscriptApi -from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript +from ._transcripts import TranscriptList, Transcript from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 98ce16c..3476b9b 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,6 +1,6 @@ import requests -from ._transcripts import TranscriptDataFetcher +from ._transcripts import TranscriptListFetcher class YouTubeTranscriptApi(): @@ -13,8 +13,7 @@ class YouTubeTranscriptApi(): :type video_ids: [str] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to - do so. As I can't provide a complete list of all working language codes with full certainty, you may have to - play around with the language codes a bit, to find the one which is working for you! + do so. 
:type languages: [str] :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving one of the video transcripts @@ -23,7 +22,7 @@ class YouTubeTranscriptApi(): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved - :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} + :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}) """ data = {} unretrievable_videos = [] @@ -48,8 +47,7 @@ class YouTubeTranscriptApi(): :type video_id: str :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to - do so. As I can't provide a complete list of all working language codes with full certainty, you may have to - play around with the language codes a bit, to find the one which is working for you! + do so. 
:type languages: [str] :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies @@ -58,4 +56,4 @@ class YouTubeTranscriptApi(): """ with requests.Session() as http_client: http_client.proxies = proxies if proxies else {} - return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() + return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 37a272b..9e09258 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled from ._settings import WATCH_URL -class TranscriptDataFetcher(): +class TranscriptListFetcher(): def __init__(self, http_client): self._http_client = http_client def fetch(self, video_id): - return TranscriptData.build( + return TranscriptList.build( self._http_client, video_id, self._extract_captions_json(self._fetch_html(video_id), video_id) @@ -48,48 +48,89 @@ class TranscriptDataFetcher(): ) -class TranscriptData(): +class TranscriptList(): + """ + This object represents a list of transcripts. It can be iterated over to list all transcripts which are available + for a given YouTube video. Also it provides functionality to search for a transcript in a given language. + """ + # TODO implement iterator - def __init__( - self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages - ): - self._http_client = http_client + def __init__(self, video_id, manually_created_transcripts, generated_transcripts): + """ + The constructor is only for internal use. Use the static build method instead. 
+ + :param video_id: the id of the video this TranscriptList is for + :type video_id: str + :param manually_created_transcripts: dict mapping language codes to the manually created transcripts + :type manually_created_transcripts: dict[str, Transcript] + :param generated_transcripts: dict mapping language codes to the generated transcripts + :type generated_transcripts: dict[str, Transcript] + """ self.video_id = video_id self._manually_created_transcripts = manually_created_transcripts self._generated_transcripts = generated_transcripts - self._translation_languages = translation_languages @staticmethod def build(http_client, video_id, captions_json): - manually_created_transcripts = [] - generated_transcripts = [] + """ + Factory method for TranscriptList. + + :param http_client: http client which is used to make the transcript retrieving http calls + :type http_client: requests.Session + :param video_id: the id of the video this TranscriptList is for + :type video_id: str + :param captions_json: the JSON parsed from the YouTube pages static HTML + :type captions_json: dict + :return: the created TranscriptList + :rtype TranscriptList + """ + translation_languages = [ + { + 'language': translation_language['languageName']['simpleText'], + 'language_code': translation_language['languageCode'], + } for translation_language in captions_json['translationLanguages'] + ] + + manually_created_transcripts = {} + generated_transcripts = {} for caption in captions_json['captionTracks']: - (generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append( - { - 'url': caption['baseUrl'], - 'language': caption['name']['simpleText'], - 'language_code': caption['languageCode'], - 'is_generated': caption.get('kind', '') == 'asr', - 'is_translatable': caption['isTranslatable'], - } + if caption.get('kind', '') == 'asr': + transcript_dict = generated_transcripts + else: + transcript_dict = manually_created_transcripts + + 
transcript_dict[caption['languageCode']] = Transcript( + http_client, + video_id, + caption['baseUrl'], + caption['name']['simpleText'], + caption['languageCode'], + caption.get('kind', '') == 'asr', + translation_languages if caption['isTranslatable'] else [] ) - return TranscriptData( - http_client, + return TranscriptList( video_id, manually_created_transcripts, generated_transcripts, - [ - { - 'language': translation_language['languageName']['simpleText'], - 'language_code': translation_language['languageCode'], - } for translation_language in captions_json['translationLanguages'] - ], ) def find_transcript(self, language_codes): + """ + Finds a transcript for a given language code. Manually created transcripts are returned first and only if none + are found, generated transcripts are used. If you only want generated transcripts use + find_manually_created_transcript instead. + + :param language_codes: A list of language codes in a descending priority. For example, if this is set to + ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if + it fails to do so. + :type languages: [str] + :return: the found Transcript + :rtype: Transcript + :raises: NoTranscriptFound + """ try: return self.find_manually_created_transcript(language_codes) except NoTranscriptFound: @@ -98,25 +139,39 @@ class TranscriptData(): return self.find_generated_transcript(language_codes) def find_generated_transcript(self, language_codes): + """ + Finds a automatically generated transcript for a given language code. + + :param language_codes: A list of language codes in a descending priority. For example, if this is set to + ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if + it fails to do so. 
+ :type languages: [str] + :return: the found Transcript + :rtype: Transcript + :raises: NoTranscriptFound + """ return self._find_transcript(language_codes, generated=True) def find_manually_created_transcript(self, language_codes): + """ + Finds a manually created transcript for a given language code. + + :param language_codes: A list of language codes in a descending priority. For example, if this is set to + ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if + it fails to do so. + :type languages: [str] + :return: the found Transcript + :rtype: Transcript + :raises: NoTranscriptFound + """ return self._find_transcript(language_codes, generated=False) def _find_transcript(self, language_codes, generated): transcripts = self._generated_transcripts if generated else self._manually_created_transcripts for language_code in language_codes: - for transcript in transcripts: - if transcript['language_code'] == language_code: - return Transcript( - self._http_client, - transcript['url'], - transcript['language'], - transcript['language_code'], - transcript['is_generated'], - self._translation_languages if transcript['is_translatable'] else [] - ) + if language_code in transcripts: + return transcripts[language_code] raise NoTranscriptFound( self.video_id, @@ -134,34 +189,59 @@ class TranscriptData(): ).format( video_id=self.video_id, available_manually_created_transcript_languages=self._get_language_description( - self._manually_created_transcripts + self._manually_created_transcripts.values() ), available_generated_transcripts=self._get_language_description( - self._generated_transcripts + self._generated_transcripts.values() ), ) def _get_language_description(self, transcripts): return '\n'.join( - ' - {language_code} ("{language}")'.format( - language=transcript['language'], - language_code=transcript['language_code'], - ) for transcript in transcripts + ' - {transcript}'.format(transcript=str(transcript)) 
+ for transcript in transcripts ) if transcripts else 'None' class Transcript(): - def __init__(self, http_client, url, language, language_code, is_generated, translation_languages): + def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): + """ + You probably don't want to initialize this directly. Usually you'll access Transcript objects using a + TranscriptList. + + :param http_client: http client which is used to make the transcript retrieving http calls + :type http_client: requests.Session + :param video_id: the id of the video this TranscriptList is for + :type video_id: str + :param url: the url which needs to be called to fetch the transcript + :param language: the name of the language this transcript uses + :param language_code: + :param is_generated: + :param translation_languages: + """ self._http_client = http_client - self.url = url + self.video_id = video_id + self._url = url self.language = language self.language_code = language_code self.is_generated = is_generated self.translation_languages = translation_languages def fetch(self): + """ + Loads the actual transcript data. 
+ + :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys + :rtype: [{'text': str, 'start': float, 'end': float}] + """ return _TranscriptParser().parse( - self._http_client.get(self.url).text + self._http_client.get(self._url).text + ) + + def __str__(self): + return '{language_code} ("{language}")'.format( + language=self.language, + language_code=self.language_code, ) # TODO integrate translations in future release From 41300585a15d468b63a0910795fe205c0a69e150 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Fri, 13 Dec 2019 11:15:32 +0100 Subject: [PATCH 12/26] fixed bug where undesirable results where returned if the desired language was only available as generated transcript --- youtube_transcript_api/_transcripts.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 9e09258..0f131ad 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -131,12 +131,7 @@ class TranscriptList(): :rtype: Transcript :raises: NoTranscriptFound """ - try: - return self.find_manually_created_transcript(language_codes) - except NoTranscriptFound: - pass - - return self.find_generated_transcript(language_codes) + return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts]) def find_generated_transcript(self, language_codes): """ @@ -150,7 +145,7 @@ class TranscriptList(): :rtype: Transcript :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, generated=True) + return self._find_transcript(language_codes, [self._generated_transcripts,]) def find_manually_created_transcript(self, language_codes): """ @@ -164,14 +159,13 @@ class TranscriptList(): :rtype: Transcript :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, generated=False) - - def _find_transcript(self, language_codes, generated): 
- transcripts = self._generated_transcripts if generated else self._manually_created_transcripts + return self._find_transcript(language_codes, [self._manually_created_transcripts,]) + def _find_transcript(self, language_codes, transcript_dicts): for language_code in language_codes: - if language_code in transcripts: - return transcripts[language_code] + for transcript_dict in transcript_dicts: + if language_code in transcript_dict: + return transcript_dict[language_code] raise NoTranscriptFound( self.video_id, From 409141ab51b761f3908976e3b2e9a6fdbad575a3 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 16 Dec 2019 16:58:26 +0100 Subject: [PATCH 13/26] added translate feature; added iterator to TranscriptList --- youtube_transcript_api/_errors.py | 8 ++++ youtube_transcript_api/_transcripts.py | 55 ++++++++++++++------------ 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 5dc4d8e..6f033c0 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -43,6 +43,14 @@ class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video' +class NotTranslatable(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'The requested language is not translatable' + + +class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'The requested translation language is not available' + + class NoTranscriptFound(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ( 'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n' diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 0f131ad..04ce8b9 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -12,7 +12,9 @@ from xml.etree import ElementTree import re from ._html_unescaping import unescape -from ._errors import 
VideoUnavailable, NoTranscriptFound, TranscriptsDisabled +from ._errors import ( + VideoUnavailable, NoTranscriptFound, TranscriptsDisabled, NotTranslatable, TranslationLanguageNotAvailable +) from ._settings import WATCH_URL @@ -53,9 +55,6 @@ class TranscriptList(): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ - - # TODO implement iterator - def __init__(self, video_id, manually_created_transcripts, generated_transcripts): """ The constructor is only for internal use. Use the static build method instead. @@ -117,6 +116,9 @@ class TranscriptList(): generated_transcripts, ) + def __iter__(self): + return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values())) + def find_transcript(self, language_codes): """ Finds a transcript for a given language code. Manually created transcripts are returned first and only if none @@ -220,6 +222,10 @@ class Transcript(): self.language_code = language_code self.is_generated = is_generated self.translation_languages = translation_languages + self._translation_languages_dict = { + translation_language['language_code']: translation_language['language'] + for translation_language in translation_languages + } def fetch(self): """ @@ -238,27 +244,26 @@ class Transcript(): language_code=self.language_code, ) -# TODO integrate translations in future release -# @property -# def is_translatable(self): -# return len(self.translation_languages) > 0 -# -# -# class TranslatableTranscript(Transcript): -# def __init__(self, http_client, url, translation_languages): -# super(TranslatableTranscript, self).__init__(http_client, url) -# self._translation_languages = translation_languages -# self._translation_language_codes = {language['language_code'] for language in translation_languages} -# -# -# def translate(self, 
language_code): -# if language_code not in self._translation_language_codes: -# raise TranslatableTranscript.TranslationLanguageNotAvailable() -# -# return Transcript( -# self._http_client, -# '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code) -# ) + @property + def is_translatable(self): + return len(self.translation_languages) > 0 + + def translate(self, language_code): + if not self.is_translatable: + raise NotTranslatable(self.video_id) + + if language_code not in self._translation_languages_dict: + raise TranslationLanguageNotAvailable(self.video_id) + + return Transcript( + self._http_client, + self.video_id, + '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code), + self._translation_languages_dict[language_code], + language_code, + True, + [], + ) class _TranscriptParser(): From 20612ea7a3820d40b523fab63b3867776d7bc880 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 16 Dec 2019 17:08:14 +0100 Subject: [PATCH 14/26] improved tostring methods --- youtube_transcript_api/_transcripts.py | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 04ce8b9..35601db 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -55,7 +55,7 @@ class TranscriptList(): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ - def __init__(self, video_id, manually_created_transcripts, generated_transcripts): + def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): """ The constructor is only for internal use. Use the static build method instead. 
@@ -65,10 +65,13 @@ class TranscriptList(): :type manually_created_transcripts: dict[str, Transcript] :param generated_transcripts: dict mapping language codes to the generated transcripts :type generated_transcripts: dict[str, Transcript] + :param translation_languages: list of languages which can be used for translatable languages + :type translation_languages: list[dict[str, str]] """ self.video_id = video_id self._manually_created_transcripts = manually_created_transcripts self._generated_transcripts = generated_transcripts + self._translation_languages = translation_languages @staticmethod def build(http_client, video_id, captions_json): @@ -114,6 +117,7 @@ class TranscriptList(): video_id, manually_created_transcripts, generated_transcripts, + translation_languages, ) def __iter__(self): @@ -181,22 +185,28 @@ class TranscriptList(): '(MANUALLY CREATED)\n' '{available_manually_created_transcript_languages}\n\n' '(GENERATED)\n' - '{available_generated_transcripts}' + '{available_generated_transcripts}\n\n' + '(TRANSLATION LANGUAGES)\n' + '{available_translation_languages}' ).format( video_id=self.video_id, available_manually_created_transcript_languages=self._get_language_description( - self._manually_created_transcripts.values() + str(transcript) for transcript in self._manually_created_transcripts.values() ), available_generated_transcripts=self._get_language_description( - self._generated_transcripts.values() + str(transcript) for transcript in self._generated_transcripts.values() ), + available_translation_languages=self._get_language_description( + '{language_code} ("{language}")'.format( + language=translation_language['language'], + language_code=translation_language['language_code'], + ) for translation_language in self._translation_languages + ) ) - def _get_language_description(self, transcripts): - return '\n'.join( - ' - {transcript}'.format(transcript=str(transcript)) - for transcript in transcripts - ) if transcripts else 'None' + def 
_get_language_description(self, transcript_strings): + description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings) + return description if description else 'None' class Transcript(): @@ -239,9 +249,10 @@ class Transcript(): ) def __str__(self): - return '{language_code} ("{language}")'.format( + return '{language_code} ("{language}"){translation_description}'.format( language=self.language, language_code=self.language_code, + translation_description='[TRANSLATABLE]' if self.is_translatable else '' ) @property From f1e4754ca47ed182f121eca60f1aec0d0dde8984 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Fri, 20 Dec 2019 16:28:35 +0100 Subject: [PATCH 15/26] added check if any transcripts are actually available --- youtube_transcript_api/__init__.py | 10 +- youtube_transcript_api/_errors.py | 6 +- youtube_transcript_api/_transcripts.py | 18 +- ...outube_no_transcript_available.html.static | 1349 +++++++++++++++++ youtube_transcript_api/test/test_api.py | 18 +- 5 files changed, 1394 insertions(+), 7 deletions(-) create mode 100644 youtube_transcript_api/test/assets/youtube_no_transcript_available.html.static diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index c9bb4eb..34e9ba7 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -1,3 +1,11 @@ from ._api import YouTubeTranscriptApi from ._transcripts import TranscriptList, Transcript -from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable +from ._errors import ( + TranscriptsDisabled, + NoTranscriptFound, + CouldNotRetrieveTranscript, + VideoUnavailable, + NotTranslatable, + TranslationLanguageNotAvailable, + NoTranscriptAvailable, +) diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 6f033c0..2b67e9e 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -11,7 +11,7 @@ 
class CouldNotRetrieveTranscript(Exception): GITHUB_REFERRAL = ( '\n\nIf you are sure that the described cause is not responsible for this error ' 'and that a transcript should be retrievable, please create an issue at ' - 'https://github.com/jdepoix/youtube-transcript-api/issues.' + 'https://github.com/jdepoix/youtube-transcript-api/issues. ' 'Please add which version of youtube_transcript_api you are using ' 'and provide the information needed to replicate the error. ' 'Also make sure that there are no open issues which already describe your problem!' @@ -43,6 +43,10 @@ class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video' +class NoTranscriptAvailable(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = 'No transcripts are available for this video' + + class NotTranslatable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The requested language is not translatable' diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 35601db..95f4ead 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -13,7 +13,12 @@ import re from ._html_unescaping import unescape from ._errors import ( - VideoUnavailable, NoTranscriptFound, TranscriptsDisabled, NotTranslatable, TranslationLanguageNotAvailable + VideoUnavailable, + NoTranscriptFound, + TranscriptsDisabled, + NotTranslatable, + TranslationLanguageNotAvailable, + NoTranscriptAvailable, ) from ._settings import WATCH_URL @@ -38,9 +43,14 @@ class TranscriptListFetcher(): raise TranscriptsDisabled(video_id) - return json.loads(splitted_html[1].split(',"videoDetails')[0].replace('\n', ''))[ - 'playerCaptionsTracklistRenderer' - ] + captions_json = json.loads( + splitted_html[1].split(',"videoDetails')[0].replace('\n', '') + )['playerCaptionsTracklistRenderer'] + + if 'captionTracks' not in captions_json: + raise NoTranscriptAvailable(video_id) + + return captions_json def _fetch_html(self, 
video_id): return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace( diff --git a/youtube_transcript_api/test/assets/youtube_no_transcript_available.html.static b/youtube_transcript_api/test/assets/youtube_no_transcript_available.html.static new file mode 100644 index 0000000..1ea7ed2 --- /dev/null +++ b/youtube_transcript_api/test/assets/youtube_no_transcript_available.html.static @@ -0,0 +1,1349 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + +MTG Top 10: BAD Cards That Suddenly Became Good - YouTube + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+
+ DE +
+
+
+ +
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+
+
+ +
+
+ + +
+
+
+ +
+
+
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

+ + + + + MTG Top 10: BAD Cards That Suddenly Became Good + + +

+
+
+ + +
+ + + + + +
+
305.276 Aufrufe
+
+
+
+
+
+ + + + + +
+
+ + + +
+
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+ +
+
+

+ + + +Wird geladen... + +

+ +
+
+ + +
+
+ Die Bewertungsfunktion ist nach Ausleihen des Videos verfügbar. +
+ +
+ +
+
+ Diese Funktion ist gerade nicht verfügbar. Bitte versuche es später noch einmal. +
+
+ + +
+ + +
+ + +
Am 25.07.2019 veröffentlicht

This video is sponsored by CardKingdom! Check out their awesome store here: http://www.cardkingdom.com/?utm_sourc...

Want to see me draft live? You can on Twitch! http://www.twitch.tv/Nizzahon

Want to support the channel? You can on Patreon!: https://www.patreon.com/Nizzahon_Magic

Follow me on Twitter for channel updates and other Magic musings: https://twitter.com/NizzahonMagic

Animations by Mike from Mythic Tales. Find his channel filled with awesome MTG animation here: https://www.youtube.com/user/RadioCom...

I Can Feel it Coming Kevin MacLeod (http://incompetech.com )
Licensed under Creative Commons: By Attribution 3.0 License
http://creativecommons.org/licenses/b...

+
    +
  • +

    + Kategorie +

    + +
  • + +
+
+
+ +
+ + +
+
+

+ + + +Wird geladen... + +

+ +
+ +
+ + +
+
+
+ + + +
+
+ +
+ +
+
+
+Anzeige +
+
+
+
+ + +
+
+
+
+
+ + + +Wenn Autoplay aktiviert ist, wird die Wiedergabe automatisch mit einem der aktuellen Videovorschläge fortgesetzt. + + + +
+

+ Nächstes Video +

+ + +
+
+ + +
+
+
+ +
+
+ +
+
+ +
+
+
+ + +
+ +
+ +
+
+ + +
+
+ + +
+ , um dieses Video zur Playlist "Später ansehen" hinzuzufügen. + +
+
+

+Hinzufügen +

+
+
+

+ + + + Playlists werden geladen... + +

+ +
+
+ + + + + + + \ No newline at end of file diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index b2897af..91a4de0 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -5,7 +5,13 @@ import os import httpretty -from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable, NoTranscriptFound, TranscriptsDisabled +from youtube_transcript_api import ( + YouTubeTranscriptApi, + TranscriptsDisabled, + NoTranscriptFound, + VideoUnavailable, + NoTranscriptAvailable, +) def load_asset(filename): @@ -88,6 +94,16 @@ class TestYouTubeTranscriptApi(TestCase): with self.assertRaises(NoTranscriptFound): YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz']) + def test_get_transcript__exception_if_no_transcript_available(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/watch', + body=load_asset('youtube_no_transcript_available.html.static') + ) + + with self.assertRaises(NoTranscriptAvailable): + YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E') + def test_get_transcripts(self): video_id_1 = 'video_id_1' video_id_2 = 'video_id_2' From 8287d1088ef0efcc2df08ec9547f7bfbdfa3aed4 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 14:38:27 +0100 Subject: [PATCH 16/26] fixed bug where nontranslatable transcripts would throw an exception --- youtube_transcript_api/_transcripts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 95f4ead..19e9044 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -120,7 +120,7 @@ class TranscriptList(): caption['name']['simpleText'], caption['languageCode'], caption.get('kind', '') == 'asr', - translation_languages if caption['isTranslatable'] else [] + translation_languages if caption.get('isTranslatable', False) else [] ) return 
TranscriptList( @@ -295,7 +295,7 @@ class _TranscriptParser(): { 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), 'start': float(xml_element.attrib['start']), - 'duration': float(xml_element.attrib['dur']), + 'duration': float(xml_element.attrib.get('dur', '0.0')), } for xml_element in ElementTree.fromstring(plain_data) if xml_element.text is not None From 1bc50875754d69aac0de519a07749b6ccc54eec3 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 15:20:47 +0100 Subject: [PATCH 17/26] added public list_transcripts method --- youtube_transcript_api/_api.py | 69 ++++++++++++++++++++++---- youtube_transcript_api/_transcripts.py | 16 +++--- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 3476b9b..c1519ae 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -4,17 +4,68 @@ from ._transcripts import TranscriptListFetcher class YouTubeTranscriptApi(): + @classmethod + def list_transcripts(cls, video_id, proxies=None): + """ + Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object + which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating + over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide + metadata and can either be fetched by calling `transcript.fetch()` or translated by calling + `transcript.translate('en')`. 
Example:: + + # retrieve the available transcripts + transcript_list = YouTubeTranscriptApi.get('video_id') + + # iterate over all available transcripts + for transcript in transcript_list: + # the Transcript object provides metadata properties + print( + transcript.video_id, + transcript.language, + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # a list of languages the transcript can be translated to + transcript.translation_languages, + ) + + # fetch the actual transcript data + print(transcript.fetch()) + + # translating the transcript will return another transcript object + print(transcript.translate('en').fetch()) + + # you can also directly filter for the language you are looking for, using the transcript list + transcript = transcript_list.find_transcript(['de', 'en']) + + # or just filter for manually created transcripts + transcript = transcript_list.find_manually_created_transcript(['de', 'en']) + + # or automatically generated ones + transcript = transcript_list.find_generated_transcript(['de', 'en']) + + :param video_id: the youtube video id + :type video_id: str + :param proxies: a dictionary mapping of http and https proxies to be used for the network requests + :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies + :return: the list of available transcripts + :rtype TranscriptList: + """ + with requests.Session() as http_client: + http_client.proxies = proxies if proxies else {} + return TranscriptListFetcher(http_client).fetch(video_id) + @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): """ Retrieves the transcripts for a list of videos. :param video_ids: a list of youtube video ids - :type video_ids: [str] + :type video_ids: list[str] :param languages: A list of language codes in a descending priority. 
For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type languages: list[str] :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving one of the video transcripts :type continue_after_error: bool @@ -22,7 +73,7 @@ class YouTubeTranscriptApi(): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved - :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}) + :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): """ data = {} unretrievable_videos = [] @@ -41,19 +92,19 @@ class YouTubeTranscriptApi(): @classmethod def get_transcript(cls, video_id, languages=('en',), proxies=None): """ - Retrieves the transcript for a single video. + Retrieves the transcript for a single video. This is just a shortcut for calling:: + + YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch() :param video_id: the youtube video id :type video_id: str :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. 
- :type languages: [str] + :type languages: list[str] :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys - :rtype: [{'text': str, 'start': float, 'end': float}] + :rtype [{'text': str, 'start': float, 'end': float}]: """ - with requests.Session() as http_client: - http_client.proxies = proxies if proxies else {} - return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() + return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch() diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 19e9044..6b767ff 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -95,7 +95,7 @@ class TranscriptList(): :param captions_json: the JSON parsed from the YouTube pages static HTML :type captions_json: dict :return: the created TranscriptList - :rtype TranscriptList + :rtype TranscriptList: """ translation_languages = [ { @@ -142,9 +142,9 @@ class TranscriptList(): :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type languages: list[str] :return: the found Transcript - :rtype: Transcript + :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts]) @@ -156,9 +156,9 @@ class TranscriptList(): :param language_codes: A list of language codes in a descending priority. 
For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type languages: list[str] :return: the found Transcript - :rtype: Transcript + :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._generated_transcripts,]) @@ -170,9 +170,9 @@ class TranscriptList(): :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type languages: list[str] :return: the found Transcript - :rtype: Transcript + :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._manually_created_transcripts,]) @@ -252,7 +252,7 @@ class Transcript(): Loads the actual transcript data. :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys - :rtype: [{'text': str, 'start': float, 'end': float}] + :rtype [{'text': str, 'start': float, 'end': float}]: """ return _TranscriptParser().parse( self._http_client.get(self._url).text From 66d02c08a177bd1f44a1c839f9cf9b3bcfb84be0 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 15:34:35 +0100 Subject: [PATCH 18/26] added tests for new public api method --- youtube_transcript_api/test/test_api.py | 47 +++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 91a4de0..f506d33 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -11,6 +11,8 @@ from youtube_transcript_api import ( NoTranscriptFound, VideoUnavailable, NoTranscriptAvailable, + NotTranslatable, + TranslationLanguageNotAvailable, ) @@ -48,6 +50,51 @@ class 
TestYouTubeTranscriptApi(TestCase): ] ) + def test_list_transcripts(self): + transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') + + language_codes = {transcript.language_code for transcript in transcript_list} + + self.assertEqual(language_codes, {'zh', 'de', 'en', 'hi', 'ja', 'ko', 'es', 'cs', 'en'}) + + def test_list_transcripts__find_manually_created(self): + transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') + transcript = transcript_list.find_manually_created_transcript(['cs']) + + self.assertFalse(transcript.is_generated) + + + def test_list_transcripts__find_generated(self): + transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') + + with self.assertRaises(NoTranscriptFound): + transcript_list.find_generated_transcript(['cs']) + + transcript = transcript_list.find_generated_transcript(['en']) + + self.assertTrue(transcript.is_generated) + + def test_translate_transcript(self): + transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en']) + + translated_transcript = transcript.translate('af') + + self.assertEqual(translated_transcript.language_code, 'af') + self.assertIn('&tlang=af', translated_transcript._url) + + def test_translate_transcript__translation_language_not_available(self): + transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en']) + + with self.assertRaises(TranslationLanguageNotAvailable): + transcript.translate('xyz') + + def test_translate_transcript__not_translatable(self): + transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en']) + transcript.translation_languages = [] + + with self.assertRaises(NotTranslatable): + transcript.translate('af') + def test_get_transcript__correct_language_is_used(self): YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) query_string = httpretty.last_request().querystring From 4b75a47a74ff2715eb098e79a6890c3cd8e99b50 Mon Sep 17 00:00:00 2001 From: 
Jonas Depoix Date: Mon, 30 Dec 2019 16:13:18 +0100 Subject: [PATCH 19/26] get_transcripts now returns a list of exceptions instead of video ids of failed videos --- youtube_transcript_api/_api.py | 10 +++++----- youtube_transcript_api/_cli.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index c1519ae..2a321ea 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -72,11 +72,11 @@ class YouTubeTranscriptApi(): :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of - video ids, which could not be retrieved - :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): + exceptions which occurred for the videos which could not be retrieved + :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [CouldNotRetrieveTranscript]}): """ data = {} - unretrievable_videos = [] + exceptions = [] for video_id in video_ids: try: @@ -85,9 +85,9 @@ class YouTubeTranscriptApi(): if not continue_after_error: raise exception - unretrievable_videos.append(video_id) + exceptions.append(exception) - return data, unretrievable_videos + return data, exceptions @classmethod def get_transcript(cls, video_id, languages=('en',), proxies=None): diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index c0fcf40..34a3c91 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -26,7 +26,7 @@ class YouTubeTranscriptCli(): ) return '\n\n'.join( - [str(YouTubeTranscriptApi.CouldNotRetrieveTranscript(video_id)) for video_id in unretrievable_videos] + [str(exception) for exception in unretrievable_videos] + ([json.dumps(transcripts) if parsed_args.json else 
pprint.pformat(transcripts)] if transcripts else []) ) From f8416ab0043025f56aa73c03280ffedbb4972b81 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 17:36:48 +0100 Subject: [PATCH 20/26] added new params to cli to make new features accessible using the cli --- youtube_transcript_api/_api.py | 10 +- youtube_transcript_api/_cli.py | 76 ++++++++++++--- youtube_transcript_api/test/test_cli.py | 120 +++++++++++++++++++++--- 3 files changed, 175 insertions(+), 31 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 2a321ea..c1519ae 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -72,11 +72,11 @@ class YouTubeTranscriptApi(): :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of - exceptions which occurred for the videos which could not be retrieved - :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [CouldNotRetrieveTranscript]}): + video ids, which could not be retrieved + :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): """ data = {} - exceptions = [] + unretrievable_videos = [] for video_id in video_ids: try: @@ -85,9 +85,9 @@ class YouTubeTranscriptApi(): if not continue_after_error: raise exception - exceptions.append(exception) + unretrievable_videos.append(video_id) - return data, exceptions + return data, unretrievable_videos @classmethod def get_transcript(cls, video_id, languages=('en',), proxies=None): diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index 34a3c91..4aa79f9 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -14,22 +14,42 @@ class YouTubeTranscriptCli(): def run(self): parsed_args = 
self._parse_args() + if parsed_args.exclude_manually_created and parsed_args.exclude_generated: + return '' + proxies = None if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} - transcripts, unretrievable_videos = YouTubeTranscriptApi.get_transcripts( - parsed_args.video_ids, - languages=parsed_args.languages, - continue_after_error=True, - proxies=proxies - ) + transcripts = [] + exceptions = [] + + for video_id in parsed_args.video_ids: + try: + transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id)) + except Exception as exception: + exceptions.append(exception) return '\n\n'.join( - [str(exception) for exception in unretrievable_videos] + [str(exception) for exception in exceptions] + ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else []) ) + def _fetch_transcript(self, parsed_args, proxies, video_id): + transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies) + + if parsed_args.exclude_manually_created: + transcript = transcript_list.find_generated_transcript(parsed_args.languages) + elif parsed_args.exclude_generated: + transcript = transcript_list.find_manually_created_transcript(parsed_args.languages) + else: + transcript = transcript_list.find_transcript(parsed_args.languages) + + if parsed_args.translate: + transcript = transcript.translate(parsed_args.translate) + + return transcript.fetch() + def _parse_args(self): parser = argparse.ArgumentParser( description=( @@ -38,6 +58,13 @@ class YouTubeTranscriptCli(): 'other selenium based solutions do!' 
) ) + parser.add_argument( + '--list-transcripts', + action='store_const', + const=True, + default=False, + help='This will list the languages in which the given videos are available in.', + ) parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.') parser.add_argument( '--languages', @@ -46,11 +73,25 @@ class YouTubeTranscriptCli(): type=str, help=( 'A list of language codes in a descending priority. For example, if this is set to "de en" it will ' - 'first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails ' + 'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails ' 'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you ' 'may have to play around with the language codes a bit, to find the one which is working for you!' ), ) + parser.add_argument( + '--exclude-generated', + action='store_const', + const=True, + default=False, + help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.', + ) + parser.add_argument( + '--exclude-manually-created', + action='store_const', + const=True, + default=False, + help='If this flag is set transcripts which have been manually created will not be retrieved.', + ) parser.add_argument( '--json', action='store_const', @@ -59,13 +100,24 @@ class YouTubeTranscriptCli(): help='If this flag is set the output will be JSON formatted.', ) parser.add_argument( - '--http-proxy', dest='http_proxy', - default='', metavar='URL', + '--translate', + default='', + help=( + 'The language code for the language you want this transcript to be translated to. Use the ' + '--list-transcripts feature to find out which languages are translatable and which translation ' + 'languages are available.' + ) + ) + parser.add_argument( + '--http-proxy', + default='', + metavar='URL', help='Use the specified HTTP proxy.' 
) parser.add_argument( - '--https-proxy', dest='https_proxy', - default='', metavar='URL', + '--https-proxy', + default='', + metavar='URL', help='Use the specified HTTPS proxy.' ) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index 6f01967..e46789e 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -3,10 +3,27 @@ from mock import MagicMock import json -from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi +from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable +from youtube_transcript_api._cli import YouTubeTranscriptCli class TestYouTubeTranscriptCli(TestCase): + def setUp(self): + self.transcript_mock = MagicMock() + self.transcript_mock.fetch = MagicMock(return_value=[ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ]) + self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) + + self.transcript_list_mock = MagicMock() + self.transcript_list_mock.find_generated_transcript = MagicMock(return_value=self.transcript_mock) + self.transcript_list_mock.find_manually_created_transcript = MagicMock(return_value=self.transcript_mock) + self.transcript_list_mock.find_transcript = MagicMock(return_value=self.transcript_mock) + + YouTubeTranscriptApi.list_transcripts = MagicMock(return_value=self.transcript_list_mock) + def test_argument_parsing(self): parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) @@ -106,32 +123,107 @@ class TestYouTubeTranscriptCli(TestCase): self.assertEqual(parsed_args.http_proxy, '') self.assertEqual(parsed_args.https_proxy, '') + def test_argument_parsing__list_transcripts(self): + 
parsed_args = YouTubeTranscriptCli('--list-transcripts v1 v2'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.list_transcripts) + + parsed_args = YouTubeTranscriptCli('v1 v2 --list-transcripts'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.list_transcripts) + + def test_argument_parsing__translate(self): + parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, False) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.translate, 'cz') + + parsed_args = YouTubeTranscriptCli('v1 v2 --translate cz --languages de en'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertEqual(parsed_args.json, False) + self.assertEqual(parsed_args.languages, ['de', 'en']) + self.assertEqual(parsed_args.translate, 'cz') + + def test_argument_parsing__manually_or_generated(self): + parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.exclude_manually_created) + self.assertFalse(parsed_args.exclude_generated) + + parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-generated'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertFalse(parsed_args.exclude_manually_created) + self.assertTrue(parsed_args.exclude_generated) + + parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created --exclude-generated'.split())._parse_args() + self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + self.assertTrue(parsed_args.exclude_manually_created) + self.assertTrue(parsed_args.exclude_generated) + def test_run(self): - YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) 
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() - YouTubeTranscriptApi.get_transcripts.assert_called_once_with( - ['v1', 'v2'], - languages=['de', 'en'], - continue_after_error=True, - proxies=None + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) + + self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en']) + + def test_run__failing_transcripts(self): + YouTubeTranscriptApi.list_transcripts = MagicMock(side_effect=VideoUnavailable('video_id')) + + output = YouTubeTranscriptCli('v1 --languages de en'.split()).run() + + self.assertEqual(output, str(VideoUnavailable('video_id'))) + + def test_run__exclude_generated(self): + YouTubeTranscriptCli('v1 v2 --languages de en --exclude-generated'.split()).run() + + self.transcript_list_mock.find_manually_created_transcript.assert_any_call(['de', 'en']) + + def test_run__exclude_manually_created(self): + YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created'.split()).run() + + self.transcript_list_mock.find_generated_transcript.assert_any_call(['de', 'en']) + + def test_run__exclude_manually_created_and_generated(self): + self.assertEqual( + YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split()).run(), + '' ) + def test_run__translate(self): + YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split()).run(), + + self.transcript_mock.translate.assert_any_call('cz') + + def test_run__list_transcripts(self): + YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run() + + YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) + def test_run__json_output(self): - YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([{'boolean': True}], [])) output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run() 
# will fail if output is not valid json json.loads(output) def test_run__proxies(self): - YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) YouTubeTranscriptCli( - 'v1 v2 --languages de en --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()).run() + ( + 'v1 v2 --languages de en ' + '--http-proxy http://user:pass@domain:port ' + '--https-proxy https://user:pass@domain:port' + ).split() + ).run() - YouTubeTranscriptApi.get_transcripts.assert_called_once_with( - ['v1', 'v2'], - languages=['de', 'en'], - continue_after_error=True, + YouTubeTranscriptApi.list_transcripts.assert_any_call( + 'v1', + proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} + ) + + YouTubeTranscriptApi.list_transcripts.assert_any_call( + 'v2', proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} ) From 936ef3c1d09e285c303ee94f4eaf98e4e1ca4755 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 17:40:32 +0100 Subject: [PATCH 21/26] added list-transcripts param to cli --- youtube_transcript_api/_cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index 4aa79f9..21f816b 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -38,6 +38,9 @@ class YouTubeTranscriptCli(): def _fetch_transcript(self, parsed_args, proxies, video_id): transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies) + if parsed_args.list_transcripts: + return str(transcript_list) + if parsed_args.exclude_manually_created: transcript = transcript_list.find_generated_transcript(parsed_args.languages) elif parsed_args.exclude_generated: From aa34a2ceb33d4edcaedcb508698e6babbd00e1b0 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 18:21:45 +0100 Subject: [PATCH 22/26] updated README --- README.md | 287 
+++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 199 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 5eaff5a..8d7ee03 100644 --- a/README.md +++ b/README.md @@ -1,121 +1,232 @@ -# YouTube Transcript/Subtitle API (including automatically generated subtitles) -[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) -[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) -[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) -[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) -[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) -[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) +# YouTube Transcript/Subtitle API (including automatically generated subtitles and subtitle translations) + +[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) +[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) +[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) +[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) +[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) 
+[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) + +This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do! + +## Install + +It is recommended to [install this module by using pip](https://pypi.org/project/youtube-transcript-api/): + +``` +pip install youtube_transcript_api +``` + +If you want to use it from source, you'll have to install the dependencies manually: + +``` +pip install -r requirements.txt +``` -This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do! +You can either integrate this module [into an existing application](#api), or just use it via an [CLI](#cli). + +## API + +The easiest way to get a transcript for a given video is to execute: + +```python +from youtube_transcript_api import YouTubeTranscriptApi + +YouTubeTranscriptApi.get_transcript(video_id) +``` + +This will return a list of dictionaries looking somewhat like this: + +```python +[ + { + 'text': 'Hey there', + 'start': 7.58, + 'duration': 6.13 + }, + { + 'text': 'how are you', + 'start': 14.08, + 'duration': 7.58 + }, + # ... +] +``` + +You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +``` + +It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. 
If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts) + +To get transcripts for a list of video ids you can call: + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +``` + +`languages` also is optional here. -## Install +### List available transcripts -It is recommended to [install this module by using pip](https://pypi.org/project/youtube-transcript-api/): - -``` -pip install youtube_transcript_api -``` - -If you want to use it from source, you'll have to install the dependencies manually: - -``` -pip install -r requirements.txt -``` - -## How to use it - -You could either integrate this module into an existing application, or just use it via an CLI - -### In code - -To get a transcript for a given video you can do: +If you want to list all transcripts which are available for a given video you can call ```python -from youtube_transcript_api import YouTubeTranscriptApi - -YouTubeTranscriptApi.get_transcript(video_id) +transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, languages=['de', 'en']) ``` -This will return a list of dictionaries looking somewhat like this: +This will return a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages and types, like: ```python -[ - { - 'text': 'Hey there', - 'start': 7.58, - 'duration': 6.13 - }, - { - 'text': 'how are you', - 'start': 14.08, - 'duration': 7.58 - }, - # ... -] +transcript = transcript_list.find_transcript(['de', 'en']) ``` -You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). +By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated. 
The `TranscriptList` allows you to bypass this default behaviour by searching for specific transcript types: ```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +# filter for manually created transcripts +transcript = transcript_list.find_manually_created_transcript(['de', 'en']) + +# or automatically generated ones +transcript = transcript_list.find_generated_transcript(['de', 'en']) ``` -It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit, to find the one which is working for you! - -To get transcripts for a list fo video ids you can call: +The methods `find_generated_transcript`, `find_manually_created_transcript`, `find_generated_transcript` return `Transcript` objects. They contain metadata regarding the transcript ```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +print( + transcript.video_id, + transcript.language, + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # whether this transcript can be translated or not + transcript.is_translatable, + # a list of languages the transcript can be translated to + transcript.translation_languages, +) ``` -`languages` also is optional here. - -### CLI - -Execute the CLI script using the video ids as parameters and the results will be printed out to the command line: - -``` -youtube_transcript_api ... -``` - -The CLI also gives you the option to provide a list of preferred languages: - -``` -youtube_transcript_api ... 
--languages de en -``` - -If you would prefer to write it into a file or pipe it into another application, you can also output the results as json using the following line: - -``` -youtube_transcript_api ... --languages de en --json > transcripts.json -``` - -### Proxy - -You can specify a https/http proxy, which will be used during the requests to YouTube: +and provide the method, which allows you to fetch the actual transcript data: ```python -from youtube_transcript_api import YouTubeTranscriptApi - -YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"}) +transcript.fetch() ``` -As the `proxies` dict is passed on to the `requests.get(...)` call, it follows the [format used by the requests library](http://docs.python-requests.org/en/master/user/advanced/#proxies). +### Translate transcript -Using the CLI: +YouTube has a feature which allows you to automatically translate subtitles. This module also makes it possible to access this feature. 
To do so `Transcript` objects provide a `translate()` method, which returns a new translated `Transcript` object: -``` -youtube_transcript_api --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port +```python +transcript = transcript_list.find_transcript(['en']) +translated_transcript = transcript.translate('de') +print(translated_transcript.fetch()) ``` +### By example +```python +# retrieve the available transcripts +transcript_list = YouTubeTranscriptApi.get('video_id') + +# iterate over all available transcripts +for transcript in transcript_list: -## Warning + # the Transcript object provides metadata properties + print( + transcript.video_id, + transcript.language, + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # whether this transcript can be translated or not + transcript.is_translatable, + # a list of languages the transcript can be translated to + transcript.translation_languages, + ) + + # fetch the actual transcript data + print(transcript.fetch()) + + # translating the transcript will return another transcript object + print(transcript.translate('en').fetch()) + +# you can also directly filter for the language you are looking for, using the transcript list +transcript = transcript_list.find_transcript(['de', 'en']) + +# or just filter for manually created transcripts +transcript = transcript_list.find_manually_created_transcript(['de', 'en']) + +# or automatically generated ones +transcript = transcript_list.find_generated_transcript(['de', 'en']) +``` + +## CLI + +Execute the CLI script using the video ids as parameters and the results will be printed out to the command line: + +``` +youtube_transcript_api ... +``` + +The CLI also gives you the option to provide a list of preferred languages: + +``` +youtube_transcript_api ... 
--languages de en +``` - This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! +You can also specify if you want to exclude automatically generated or manually created subtitles: -## Donation +``` +youtube_transcript_api ... --languages de en --exclude-generated +youtube_transcript_api ... --languages de en --exclude-manually-created +``` + +If you would prefer to write it into a file or pipe it into another application, you can also output the results as json using the following line: + +``` +youtube_transcript_api ... --languages de en --json > transcripts.json +``` -If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :) +Translating transcripts using the CLI is also possible: -[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) +``` +youtube_transcript_api ... --languages en --translate de +``` + +If you are not sure which languages are available for a given video you can call: + +``` +youtube_transcript_api --list-transcripts +``` + +## Proxy + +You can specify a https/http proxy, which will be used during the requests to YouTube: + +```python +from youtube_transcript_api import YouTubeTranscriptApi + +YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"}) +``` + +As the `proxies` dict is passed on to the `requests.get(...)` call, it follows the [format used by the requests library](http://docs.python-requests.org/en/master/user/advanced/#proxies). 
+ +Using the CLI: + +``` +youtube_transcript_api --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port +``` + + +## Warning + + This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! + +## Donation + +If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :) + +[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) \ No newline at end of file From 889cedcbf0d45da73a5858a00f68783ffa643812 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 18:41:08 +0100 Subject: [PATCH 23/26] fixed typo --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8d7ee03..4292061 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) ### List available transcripts -If you want to list all transcripts which are available for a given video you can call +If you want to list all transcripts which are available for a given video you can call: ```python transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, languages=['de', 'en']) @@ -94,7 +94,7 @@ transcript = transcript_list.find_manually_created_transcript(['de', 'en']) transcript = transcript_list.find_generated_transcript(['de', 'en']) ``` -The methods `find_generated_transcript`, `find_manually_created_transcript`, `find_generated_transcript` return `Transcript` objects. 
They contain metadata regarding the transcript +The methods `find_generated_transcript`, `find_manually_created_transcript`, `find_generated_transcript` return `Transcript` objects. They contain metadata regarding the transcript: ```python print( @@ -131,13 +131,13 @@ print(translated_transcript.fetch()) # retrieve the available transcripts transcript_list = YouTubeTranscriptApi.get('video_id') -# iterate over all available transcripts -for transcript in transcript_list: +# iterate over all available transcripts +for transcript in transcript_list: - # the Transcript object provides metadata properties + # the Transcript object provides metadata properties print( - transcript.video_id, - transcript.language, + transcript.video_id, + transcript.language, transcript.language_code, # whether it has been manually created or generated by YouTube transcript.is_generated, @@ -196,7 +196,7 @@ Translating transcripts using the CLI is also possible: youtube_transcript_api ... --languages en --translate de ``` -If you are not sure which languages are available for a given video you can call: +If you are not sure which languages are available for a given video you can call, to list all available transcripts: ``` youtube_transcript_api --list-transcripts From 0901fe3053116438c202457370b3058e57189d2b Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 23:50:43 +0100 Subject: [PATCH 24/26] v0.2.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2c6aed3..5fc01d9 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def get_test_suite(): setuptools.setup( name="youtube_transcript_api", - version="0.1.9", + version="0.2.0", author="Jonas Depoix", author_email="jonas.depoix@web.de", description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. 
It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!", From 7159f6082361ea9c944e18b550cfc928080f59a7 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Tue, 31 Dec 2019 00:11:52 +0100 Subject: [PATCH 25/26] fixed bug in cli where no transcript could be retrieved if no language was specified --- README.md | 191 ++++++++++++------------ youtube_transcript_api/_cli.py | 2 +- youtube_transcript_api/test/test_cli.py | 6 +- 3 files changed, 97 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index 4292061..f4516f6 100644 --- a/README.md +++ b/README.md @@ -1,74 +1,69 @@ # YouTube Transcript/Subtitle API (including automatically generated subtitles and subtitle translations) -[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) -[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) -[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) -[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) -[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) -[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) - -This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do! 
- -## Install - -It is recommended to [install this module by using pip](https://pypi.org/project/youtube-transcript-api/): - -``` -pip install youtube_transcript_api -``` - -If you want to use it from source, you'll have to install the dependencies manually: - -``` -pip install -r requirements.txt -``` +[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) [![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) [![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) [![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) [![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) [![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) + +This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do! + +## Install + +It is recommended to [install this module by using pip](https://pypi.org/project/youtube-transcript-api/): + +``` +pip install youtube_transcript_api +``` + +If you want to use it from source, you'll have to install the dependencies manually: + +``` +pip install -r requirements.txt +``` You can either integrate this module [into an existing application](#api), or just use it via an [CLI](#cli). 
- + ## API - -The easiest way to get a transcript for a given video is to execute: - -```python -from youtube_transcript_api import YouTubeTranscriptApi - -YouTubeTranscriptApi.get_transcript(video_id) -``` - -This will return a list of dictionaries looking somewhat like this: - -```python -[ - { - 'text': 'Hey there', - 'start': 7.58, - 'duration': 6.13 - }, - { - 'text': 'how are you', - 'start': 14.08, - 'duration': 7.58 - }, - # ... -] -``` - -You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). - -```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) -``` - + +The easiest way to get a transcript for a given video is to execute: + +```python +from youtube_transcript_api import YouTubeTranscriptApi + +YouTubeTranscriptApi.get_transcript(video_id) +``` + +This will return a list of dictionaries looking somewhat like this: + +```python +[ + { + 'text': 'Hey there', + 'start': 7.58, + 'duration': 6.13 + }, + { + 'text': 'how are you', + 'start': 14.08, + 'duration': 7.58 + }, + # ... +] +``` + +You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +``` + It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts) - -To get transcripts for a list of video ids you can call: - -```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) -``` - -`languages` also is optional here. 
+ +To get transcripts for a list of video ids you can call: + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +``` + +`languages` also is optional here. ### List available transcripts @@ -81,16 +76,16 @@ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, languages=['de This will return a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages and types, like: ```python -transcript = transcript_list.find_transcript(['de', 'en']) +transcript = transcript_list.find_transcript(['de', 'en']) ``` By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated. The `TranscriptList` allows you to bypass this default behaviour by searching for specific transcript types: ```python -# filter for manually created transcripts -transcript = transcript_list.find_manually_created_transcript(['de', 'en']) - -# or automatically generated ones +# filter for manually created transcripts +transcript = transcript_list.find_manually_created_transcript(['de', 'en']) + +# or automatically generated ones transcript = transcript_list.find_generated_transcript(['de', 'en']) ``` @@ -98,15 +93,15 @@ The methods `find_generated_transcript`, `find_manually_created_transcript`, `fi ```python print( - transcript.video_id, - transcript.language, - transcript.language_code, - # whether it has been manually created or generated by YouTube - transcript.is_generated, - # whether this transcript can be translated or not - transcript.is_translatable, - # a list of languages the transcript can be translated to - transcript.translation_languages, + transcript.video_id, + transcript.language, + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # whether this transcript can be translated or not + 
transcript.is_translatable, + # a list of languages the transcript can be translated to + transcript.translation_languages, ) ``` @@ -116,42 +111,42 @@ and provide the method, which allows you to fetch the actual transcript data: transcript.fetch() ``` -### Translate transcript +### Translate transcript YouTube has a feature which allows you to automatically translate subtitles. This module also makes it possible to access this feature. To do so `Transcript` objects provide a `translate()` method, which returns a new translated `Transcript` object: ```python -transcript = transcript_list.find_transcript(['en']) +transcript = transcript_list.find_transcript(['en']) translated_transcript = transcript.translate('de') print(translated_transcript.fetch()) ``` ### By example ```python -# retrieve the available transcripts -transcript_list = YouTubeTranscriptApi.get('video_id') - +# retrieve the available transcripts +transcript_list = YouTubeTranscriptApi.get('video_id') + # iterate over all available transcripts for transcript in transcript_list: # the Transcript object provides metadata properties - print( + print( transcript.video_id, transcript.language, - transcript.language_code, - # whether it has been manually created or generated by YouTube - transcript.is_generated, - # whether this transcript can be translated or not - transcript.is_translatable, - # a list of languages the transcript can be translated to - transcript.translation_languages, - ) - - # fetch the actual transcript data - print(transcript.fetch()) - - # translating the transcript will return another transcript object - print(transcript.translate('en').fetch()) + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # whether this transcript can be translated or not + transcript.is_translatable, + # a list of languages the transcript can be translated to + transcript.translation_languages, + ) + + # fetch the actual transcript data + 
print(transcript.fetch()) + + # translating the transcript will return another transcript object + print(transcript.translate('en').fetch()) # you can also directly filter for the language you are looking for, using the transcript list transcript = transcript_list.find_transcript(['de', 'en']) diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index 21f816b..043bf19 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -72,7 +72,7 @@ class YouTubeTranscriptCli(): parser.add_argument( '--languages', nargs='*', - default=[], + default=['en',], type=str, help=( 'A list of language codes in a descending priority. For example, if this is set to "de en" it will ' diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index e46789e..d2676d8 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -77,7 +77,7 @@ class TestYouTubeTranscriptCli(TestCase): parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, False) - self.assertEqual(parsed_args.languages, []) + self.assertEqual(parsed_args.languages, ['en']) def test_argument_parsing__fail_without_video_ids(self): with self.assertRaises(SystemExit): @@ -87,12 +87,12 @@ class TestYouTubeTranscriptCli(TestCase): parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) - self.assertEqual(parsed_args.languages, []) + self.assertEqual(parsed_args.languages, ['en']) parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) - self.assertEqual(parsed_args.languages, []) + self.assertEqual(parsed_args.languages, ['en']) def test_argument_parsing__languages(self): parsed_args = 
YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args() From 7dfe20fde4f8395514d55d969fc609cc4f0eec57 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Tue, 31 Dec 2019 00:13:13 +0100 Subject: [PATCH 26/26] v0.2.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5fc01d9..6152662 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def get_test_suite(): setuptools.setup( name="youtube_transcript_api", - version="0.2.0", + version="0.2.1", author="Jonas Depoix", author_email="jonas.depoix@web.de", description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!",