From 14c70359ba6a39cdc0e130e05925942780905e55 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 21 Jan 2021 19:43:29 +0100 Subject: [PATCH 1/4] Fix "video not available" being shown to the user when YouTube starts asking for captcha resolution due to receiving too many requests from the same IP. Show an appropriate message instead. To be able to keep making requests, the captcha must be solved in a browser and the browser cookie must be passed to youtube-transcript-api. --- youtube_transcript_api/__init__.py | 1 + youtube_transcript_api/_errors.py | 5 +- youtube_transcript_api/_transcripts.py | 3 + .../youtube_too_many_requests.html.static | 239 ++++++++++++++++++ youtube_transcript_api/test/test_api.py | 11 + 5 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 youtube_transcript_api/test/assets/youtube_too_many_requests.html.static diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index 1fe0f73..baefd02 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -5,6 +5,7 @@ from ._errors import ( NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable, + TooManyRequests, NotTranslatable, TranslationLanguageNotAvailable, NoTranscriptAvailable, diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 2f83a16..f7a5658 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -37,7 +37,10 @@ class CouldNotRetrieveTranscript(Exception): class VideoUnavailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The video is no longer available' - + +class TooManyRequests(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = ('YouTube is receiving too many requests from this IP,' + ' and now requires that a captcha must be solved in order to continue.') class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video' diff --git a/youtube_transcript_api/_transcripts.py 
b/youtube_transcript_api/_transcripts.py index 6b767ff..9400a1d 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -14,6 +14,7 @@ import re from ._html_unescaping import unescape from ._errors import ( VideoUnavailable, + TooManyRequests, NoTranscriptFound, TranscriptsDisabled, NotTranslatable, @@ -38,6 +39,8 @@ class TranscriptListFetcher(): splitted_html = html.split('"captions":') if len(splitted_html) <= 1: + if 'class="g-recaptcha"' in html: + raise TooManyRequests(video_id) if '"playabilityStatus":' not in html: raise VideoUnavailable(video_id) diff --git a/youtube_transcript_api/test/assets/youtube_too_many_requests.html.static b/youtube_transcript_api/test/assets/youtube_too_many_requests.html.static new file mode 100644 index 0000000..c63003f --- /dev/null +++ b/youtube_transcript_api/test/assets/youtube_too_many_requests.html.static @@ -0,0 +1,239 @@ + + + + YouTube + + + + + + + + + +
+
+

+ Perdón por la interrupción. Hemos recibido un gran número de + solicitudes de tu red. +

+

+ Para seguir disfrutando de YouTube, rellena el siguiente formulario. +

+
+
+
+
+ +
+ ES + +
+
+ +
+ + diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 5f95451..daf98f8 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -12,6 +12,7 @@ from youtube_transcript_api import ( TranscriptsDisabled, NoTranscriptFound, VideoUnavailable, + TooManyRequests, NoTranscriptAvailable, NotTranslatable, TranslationLanguageNotAvailable, @@ -134,6 +135,16 @@ class TestYouTubeTranscriptApi(TestCase): with self.assertRaises(VideoUnavailable): YouTubeTranscriptApi.get_transcript('abc') + def test_get_transcript__exception_if_video_unavailable(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/watch', + body=load_asset('youtube_too_many_requests.html.static') + ) + + with self.assertRaises(TooManyRequests): + YouTubeTranscriptApi.get_transcript('abc') + def test_get_transcript__exception_if_transcripts_disabled(self): httpretty.register_uri( httpretty.GET, From fb819c06e4d9c54b7e372de2a8040951357e0fcd Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 21 Jan 2021 19:53:06 +0100 Subject: [PATCH 2/4] Fix test case name --- youtube_transcript_api/test/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index daf98f8..7650cf4 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -135,7 +135,7 @@ class TestYouTubeTranscriptApi(TestCase): with self.assertRaises(VideoUnavailable): YouTubeTranscriptApi.get_transcript('abc') - def test_get_transcript__exception_if_video_unavailable(self): + def test_get_transcript__exception_if_youtube_request_limit_reached(self): httpretty.register_uri( httpretty.GET, 'https://www.youtube.com/watch', From dbf5eeafe69f7b5e0c0eb437d0debe1dbcf75d6a Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 22 Jan 2021 14:18:56 +0100 Subject: [PATCH 3/4] Error message more descriptive --- 
youtube_transcript_api/_errors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index f7a5658..c19a820 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -39,8 +39,11 @@ class VideoUnavailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The video is no longer available' class TooManyRequests(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = ('YouTube is receiving too many requests from this IP,' - ' and now requires that a captcha must be solved in order to continue.') + CAUSE_MESSAGE = ('YouTube is receiving too many requests from this IP, ' + 'and now requires that a captcha must be solved in order to continue. ' + 'You can solve the captcha in a browser and pass the generated cookie file to youtube-transcript-api, ' + 'or you can use a different IP, or maybe wait for the ban to be lifted.' + ) class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video' From 23798f205de55a4a5b3b1c787495524d34e6aea2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 25 Jan 2021 17:36:27 +0100 Subject: [PATCH 4/4] improve message as per jdepoix suggestion --- youtube_transcript_api/_errors.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index c19a820..1b8360a 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -39,11 +39,10 @@ class VideoUnavailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The video is no longer available' class TooManyRequests(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = ('YouTube is receiving too many requests from this IP, ' - 'and now requires that a captcha must be solved in order to continue. 
' - 'You can solve the captcha in a browser and pass the generated cookie file to youtube-transcript-api, ' - 'or you can use a different IP, or maybe wait for the ban to be lifted.' - ) + CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\ + - Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\ + - Use a different IP address\n\ + - Wait until the ban on your IP has been lifted") class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video'