Merge pull request #108 from jdepoix/bugfix/ISSUE-107

added ability to create consent cookie
This commit is contained in:
jdepoix 2021-03-31 16:03:20 +02:00 committed by GitHub
commit 46be97ae35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 395 additions and 11 deletions

View File

@ -10,5 +10,6 @@ from ._errors import (
TranslationLanguageNotAvailable,
NoTranscriptAvailable,
CookiePathInvalid,
CookiesInvalid
CookiesInvalid,
FailedToCreateConsentCookie,
)

View File

@ -129,12 +129,11 @@ class YouTubeTranscriptApi(object):
@classmethod
def _load_cookies(cls, cookies, video_id):
cookie_jar = {}
try:
cookie_jar = cookiejar.MozillaCookieJar()
cookie_jar.load(cookies)
if not cookie_jar:
raise CookiesInvalid(video_id)
return cookie_jar
except CookieLoadError:
raise CookiePathInvalid(video_id)
if not cookie_jar:
raise CookiesInvalid(video_id)
return cookie_jar

View File

@ -40,10 +40,15 @@ class VideoUnavailable(CouldNotRetrieveTranscript):
class TooManyRequests(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
- Use a different IP address\n\
- Wait until the ban on your IP has been lifted")
CAUSE_MESSAGE = (
'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
'One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. '
'Read here how to use that cookie with '
'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
- Use a different IP address\n\
- Wait until the ban on your IP has been lifted'
)
class TranscriptsDisabled(CouldNotRetrieveTranscript):
@ -70,6 +75,10 @@ class CookiesInvalid(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
class NoTranscriptFound(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = (
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'

View File

@ -20,6 +20,7 @@ from ._errors import (
NotTranslatable,
TranslationLanguageNotAvailable,
NoTranscriptAvailable,
FailedToCreateConsentCookie,
)
from ._settings import WATCH_URL
@ -32,7 +33,7 @@ class TranscriptListFetcher(object):
return TranscriptList.build(
self._http_client,
video_id,
self._extract_captions_json(self._fetch_html(video_id), video_id)
self._extract_captions_json(self._fetch_video_html(video_id), video_id)
)
def _extract_captions_json(self, html, video_id):
@ -55,6 +56,21 @@ class TranscriptListFetcher(object):
return captions_json
def _create_consent_cookie(self, html, video_id):
match = re.search('name="v" value="(.*?)"', html)
if match is None:
raise FailedToCreateConsentCookie(video_id)
self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
def _fetch_video_html(self, video_id):
html = self._fetch_html(video_id)
if 'action="https://consent.youtube.com/s"' in html:
self._create_consent_cookie(html, video_id)
html = self._fetch_html(video_id)
if 'action="https://consent.youtube.com/s"' in html:
raise FailedToCreateConsentCookie(video_id)
return html
def _fetch_html(self, video_id):
return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace(
'\\u0026', '&'

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -17,7 +17,8 @@ from youtube_transcript_api import (
NotTranslatable,
TranslationLanguageNotAvailable,
CookiePathInvalid,
CookiesInvalid
CookiesInvalid,
FailedToCreateConsentCookie,
)
@ -44,6 +45,7 @@ class TestYouTubeTranscriptApi(TestCase):
)
def tearDown(self):
httpretty.reset()
httpretty.disable()
def test_get_transcript(self):
@ -125,6 +127,43 @@ class TestYouTubeTranscriptApi(TestCase):
self.assertEqual(len(query_string['lang']), 1)
self.assertEqual(query_string['lang'][0], 'en')
def test_get_transcript__create_consent_cookie_if_needed(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page.html.static')
)
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
self.assertEqual(len(httpretty.latest_requests()), 3)
for request in httpretty.latest_requests()[1:]:
self.assertEqual(request.headers['cookie'], 'CONSENT=YES+cb.20210328-17-p0.de+FX+119')
def test_get_transcript__exception_if_create_consent_cookie_failed(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page.html.static')
)
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page.html.static')
)
with self.assertRaises(FailedToCreateConsentCookie):
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
def test_get_transcript__exception_if_consent_cookie_age_invalid(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page_invalid.html.static')
)
with self.assertRaises(FailedToCreateConsentCookie):
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
def test_get_transcript__exception_if_video_unavailable(self):
httpretty.register_uri(
httpretty.GET,