Merge pull request #46 from danielcliu/feature/issue-45-use-authentication-cookies
Feature/issue 45 use authentication cookies
This commit is contained in:
commit
6da4d19978
24
README.md
24
README.md
|
@ -213,8 +213,28 @@ Using the CLI:
|
|||
|
||||
```
|
||||
youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port
|
||||
```
|
||||
```
|
||||
|
||||
## Cookies
|
||||
|
||||
Some videos are age-restricted, so this module won't be able to access them without some sort of authentication. To authenticate, you first need to be able to access the desired video in a browser. Then download that page's cookies into a text file. You can use the Chrome extension [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg?hl=en) or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/).
|
||||
|
||||
Once you have the cookie file, you can pass its path to the module to access the captions of age-restricted videos like so:
|
||||
|
||||
```python
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
YouTubeTranscriptApi.get_transcript(video_id, cookies='/path/to/your/cookies.txt')
|
||||
|
||||
YouTubeTranscriptApi.get_transcripts([video_id], cookies='/path/to/your/cookies.txt')
|
||||
```
|
||||
|
||||
Using the CLI:
|
||||
|
||||
```
|
||||
youtube_transcript_api <first_video_id> <second_video_id> --cookies /path/to/your/cookies.txt
|
||||
```
|
||||
|
||||
|
||||
## Warning
|
||||
|
||||
|
@ -224,4 +244,4 @@ youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://us
|
|||
|
||||
If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :)
|
||||
|
||||
[Donate via PayPal](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)
|
||||
[Donate via PayPal](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)
|
||||
|
|
|
@ -8,4 +8,6 @@ from ._errors import (
|
|||
NotTranslatable,
|
||||
TranslationLanguageNotAvailable,
|
||||
NoTranscriptAvailable,
|
||||
CookiePathInvalid,
|
||||
CookiesInvalid
|
||||
)
|
||||
|
|
|
@ -1,11 +1,21 @@
|
|||
import requests
|
||||
try:
|
||||
import http.cookiejar as cookiejar
|
||||
CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
|
||||
except ImportError:
|
||||
import cookielib as cookiejar
|
||||
CookieLoadError = IOError
|
||||
|
||||
from ._transcripts import TranscriptListFetcher
|
||||
|
||||
from ._errors import (
|
||||
CookiePathInvalid,
|
||||
CookiesInvalid
|
||||
)
|
||||
|
||||
class YouTubeTranscriptApi():
|
||||
@classmethod
|
||||
def list_transcripts(cls, video_id, proxies=None):
|
||||
def list_transcripts(cls, video_id, proxies=None, cookies=None):
|
||||
"""
|
||||
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
|
||||
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
|
||||
|
@ -48,15 +58,19 @@ class YouTubeTranscriptApi():
|
|||
:type video_id: str
|
||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:param cookies: a string of the path to a text file containing youtube authorization cookies
|
||||
:type cookies: str
|
||||
:return: the list of available transcripts
|
||||
:rtype TranscriptList:
|
||||
"""
|
||||
with requests.Session() as http_client:
|
||||
if cookies:
|
||||
http_client.cookies = cls._load_cookies(cookies, video_id)
|
||||
http_client.proxies = proxies if proxies else {}
|
||||
return TranscriptListFetcher(http_client).fetch(video_id)
|
||||
|
||||
@classmethod
|
||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
|
||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
|
||||
"""
|
||||
Retrieves the transcripts for a list of videos.
|
||||
|
||||
|
@ -71,6 +85,8 @@ class YouTubeTranscriptApi():
|
|||
:type continue_after_error: bool
|
||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:param cookies: a string of the path to a text file containing youtube authorization cookies
|
||||
:type cookies: str
|
||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||
video ids, which could not be retrieved
|
||||
:rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
|
||||
|
@ -80,7 +96,7 @@ class YouTubeTranscriptApi():
|
|||
|
||||
for video_id in video_ids:
|
||||
try:
|
||||
data[video_id] = cls.get_transcript(video_id, languages, proxies)
|
||||
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
|
||||
except Exception as exception:
|
||||
if not continue_after_error:
|
||||
raise exception
|
||||
|
@ -90,7 +106,7 @@ class YouTubeTranscriptApi():
|
|||
return data, unretrievable_videos
|
||||
|
||||
@classmethod
|
||||
def get_transcript(cls, video_id, languages=('en',), proxies=None):
|
||||
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
|
||||
"""
|
||||
Retrieves the transcript for a single video. This is just a shortcut for calling::
|
||||
|
||||
|
@ -104,7 +120,21 @@ class YouTubeTranscriptApi():
|
|||
:type languages: list[str]
|
||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:param cookies: a string of the path to a text file containing youtube authorization cookies
|
||||
:type cookies: str
|
||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||
"""
|
||||
return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
|
||||
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
|
||||
|
||||
@classmethod
def _load_cookies(cls, cookies, video_id):
    """
    Loads the cookie file at the given path into a cookie jar which can be assigned to a requests session.

    :param cookies: a string of the path to a text file containing youtube authorization cookies
    :type cookies: str
    :param video_id: the id of the requested video; only used to build the raised exceptions
    :type video_id: str
    :return: the cookie jar holding the cookies loaded from the file
    :rtype MozillaCookieJar:
    :raises CookiePathInvalid: if the file cannot be opened, read or parsed
    :raises CookiesInvalid: if the file was loaded, but the resulting jar contains no cookies
    """
    cookie_jar = cookiejar.MozillaCookieJar()
    try:
        cookie_jar.load(cookies)
    except (IOError, OSError):
        # LoadError and FileNotFoundError are both subclasses of IOError/OSError, so this still covers
        # everything CookieLoadError did, and additionally maps other file system failures (e.g. missing
        # permissions or a path pointing to a directory) onto CookiePathInvalid instead of leaking them.
        raise CookiePathInvalid(video_id)
    # A jar is falsy when it holds no cookies. load() skips expired cookies by default, so a file
    # containing only expired entries ends up here.
    if not cookie_jar:
        raise CookiesInvalid(video_id)
    return cookie_jar
|
||||
|
|
|
@ -21,12 +21,14 @@ class YouTubeTranscriptCli():
|
|||
if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
|
||||
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
|
||||
|
||||
cookies = parsed_args.cookies
|
||||
|
||||
transcripts = []
|
||||
exceptions = []
|
||||
|
||||
for video_id in parsed_args.video_ids:
|
||||
try:
|
||||
transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id))
|
||||
transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
|
||||
except Exception as exception:
|
||||
exceptions.append(exception)
|
||||
|
||||
|
@ -35,8 +37,8 @@ class YouTubeTranscriptCli():
|
|||
+ ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else [])
|
||||
)
|
||||
|
||||
def _fetch_transcript(self, parsed_args, proxies, video_id):
|
||||
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies)
|
||||
def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
|
||||
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)
|
||||
|
||||
if parsed_args.list_transcripts:
|
||||
return str(transcript_list)
|
||||
|
@ -123,5 +125,10 @@ class YouTubeTranscriptCli():
|
|||
metavar='URL',
|
||||
help='Use the specified HTTPS proxy.'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--cookies',
|
||||
default=None,
|
||||
help='The cookie file that will be used for authorization with youtube.'
|
||||
)
|
||||
|
||||
return parser.parse_args(self._args)
|
||||
|
|
|
@ -55,6 +55,14 @@ class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
|
|||
CAUSE_MESSAGE = 'The requested translation language is not available'
|
||||
|
||||
|
||||
class CookiePathInvalid(CouldNotRetrieveTranscript):
|
||||
CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
|
||||
|
||||
|
||||
class CookiesInvalid(CouldNotRetrieveTranscript):
|
||||
CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
|
||||
|
||||
|
||||
class NoTranscriptFound(CouldNotRetrieveTranscript):
|
||||
CAUSE_MESSAGE = (
|
||||
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
|
||||
# This file can be used by wget, curl, aria2c and other standard compliant tools.
|
||||
# Usage Examples:
|
||||
# 1) wget -x --load-cookies cookies.txt "https://www.youtube.com/"
|
||||
# 2) curl --cookie cookies.txt "https://www.youtube.com/"
|
||||
# 3) aria2c --load-cookies cookies.txt "https://www.youtube.com/"
|
||||
#
|
||||
.example.com TRUE / TRUE 3594431874 TEST_FIELD TEST_VALUE
|
||||
.example.com TRUE / TRUE 31874 BAD_TEST_FIELD BAD_TEST_VALUE
|
|
@ -0,0 +1,8 @@
|
|||
# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
|
||||
# This file can be used by wget, curl, aria2c and other standard compliant tools.
|
||||
# Usage Examples:
|
||||
# 1) wget -x --load-cookies cookies.txt "https://www.youtube.com/"
|
||||
# 2) curl --cookie cookies.txt "https://www.youtube.com/"
|
||||
# 3) aria2c --load-cookies cookies.txt "https://www.youtube.com/"
|
||||
#
|
||||
.example.com TRUE / TRUE 31874 BAD_TEST_FIELD BAD_TEST_VALUE
|
|
@ -1,8 +1,10 @@
|
|||
from unittest import TestCase
|
||||
from mock import MagicMock
|
||||
from mock import patch
|
||||
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
import httpretty
|
||||
|
||||
from youtube_transcript_api import (
|
||||
|
@ -13,6 +15,8 @@ from youtube_transcript_api import (
|
|||
NoTranscriptAvailable,
|
||||
NotTranslatable,
|
||||
TranslationLanguageNotAvailable,
|
||||
CookiePathInvalid,
|
||||
CookiesInvalid
|
||||
)
|
||||
|
||||
|
||||
|
@ -151,39 +155,24 @@ class TestYouTubeTranscriptApi(TestCase):
|
|||
with self.assertRaises(NoTranscriptAvailable):
|
||||
YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E')
|
||||
|
||||
def test_get_transcripts(self):
|
||||
video_id_1 = 'video_id_1'
|
||||
video_id_2 = 'video_id_2'
|
||||
languages = ['de', 'en']
|
||||
YouTubeTranscriptApi.get_transcript = MagicMock()
|
||||
|
||||
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
|
||||
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None)
|
||||
self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
|
||||
|
||||
def test_get_transcripts__stop_on_error(self):
|
||||
YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error'))
|
||||
|
||||
with self.assertRaises(Exception):
|
||||
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])
|
||||
|
||||
def test_get_transcripts__continue_on_error(self):
|
||||
video_id_1 = 'video_id_1'
|
||||
video_id_2 = 'video_id_2'
|
||||
YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error'))
|
||||
|
||||
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
|
||||
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None)
|
||||
|
||||
def test_get_transcript__with_proxies(self):
|
||||
def test_get_transcript__with_proxy(self):
|
||||
proxies = {'http': '', 'https:': ''}
|
||||
transcript = YouTubeTranscriptApi.get_transcript(
|
||||
'GJLlxj_dtq8', proxies=proxies
|
||||
)
|
||||
self.assertEqual(
|
||||
transcript,
|
||||
[
|
||||
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
|
||||
{'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
|
||||
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
|
||||
]
|
||||
)
|
||||
|
||||
def test_get_transcript__with_cookies(self):
|
||||
dirname, filename = os.path.split(os.path.abspath(__file__))
|
||||
cookies = dirname + '/example_cookies.txt'
|
||||
transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', cookies=cookies)
|
||||
|
||||
self.assertEqual(
|
||||
transcript,
|
||||
|
@ -193,6 +182,59 @@ class TestYouTubeTranscriptApi(TestCase):
|
|||
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
|
||||
]
|
||||
)
|
||||
YouTubeTranscriptApi.get_transcript = MagicMock()
|
||||
|
||||
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
|
||||
def test_get_transcripts(self, mock_get_transcript):
|
||||
video_id_1 = 'video_id_1'
|
||||
video_id_2 = 'video_id_2'
|
||||
languages = ['de', 'en']
|
||||
|
||||
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
|
||||
|
||||
mock_get_transcript.assert_any_call(video_id_1, languages, None, None)
|
||||
mock_get_transcript.assert_any_call(video_id_2, languages, None, None)
|
||||
self.assertEqual(mock_get_transcript.call_count, 2)
|
||||
|
||||
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
|
||||
def test_get_transcripts__stop_on_error(self, mock_get_transcript):
|
||||
with self.assertRaises(Exception):
|
||||
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])
|
||||
|
||||
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
|
||||
def test_get_transcripts__continue_on_error(self, mock_get_transcript):
|
||||
video_id_1 = 'video_id_1'
|
||||
video_id_2 = 'video_id_2'
|
||||
|
||||
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
|
||||
|
||||
mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None)
|
||||
mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None)
|
||||
|
||||
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
|
||||
def test_get_transcripts__with_cookies(self, mock_get_transcript):
|
||||
cookies = '/example_cookies.txt'
|
||||
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
|
||||
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies)
|
||||
|
||||
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
|
||||
def test_get_transcripts__with_proxies(self, mock_get_transcript):
|
||||
proxies = {'http': '', 'https:': ''}
|
||||
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies)
|
||||
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None)
|
||||
|
||||
def test_load_cookies(self):
|
||||
dirname, filename = os.path.split(os.path.abspath(__file__))
|
||||
cookies = dirname + '/example_cookies.txt'
|
||||
session_cookies = YouTubeTranscriptApi._load_cookies(cookies, 'GJLlxj_dtq8')
|
||||
self.assertEqual({'TEST_FIELD': 'TEST_VALUE'}, requests.utils.dict_from_cookiejar(session_cookies))
|
||||
|
||||
def test_load_cookies__bad_file_path(self):
|
||||
bad_cookies = 'nonexistent_cookies.txt'
|
||||
with self.assertRaises(CookiePathInvalid):
|
||||
YouTubeTranscriptApi._load_cookies(bad_cookies, 'GJLlxj_dtq8')
|
||||
|
||||
def test_load_cookies__no_valid_cookies(self):
|
||||
dirname, filename = os.path.split(os.path.abspath(__file__))
|
||||
expired_cookies = dirname + '/expired_example_cookies.txt'
|
||||
with self.assertRaises(CookiesInvalid):
|
||||
YouTubeTranscriptApi._load_cookies(expired_cookies, 'GJLlxj_dtq8')
|
||||
|
|
|
@ -164,8 +164,8 @@ class TestYouTubeTranscriptCli(TestCase):
|
|||
def test_run(self):
|
||||
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
|
||||
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None)
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None)
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)
|
||||
|
||||
self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en'])
|
||||
|
||||
|
@ -200,8 +200,8 @@ class TestYouTubeTranscriptCli(TestCase):
|
|||
def test_run__list_transcripts(self):
|
||||
YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run()
|
||||
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None)
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None)
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)
|
||||
|
||||
def test_run__json_output(self):
|
||||
output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()
|
||||
|
@ -220,10 +220,23 @@ class TestYouTubeTranscriptCli(TestCase):
|
|||
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call(
|
||||
'v1',
|
||||
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
|
||||
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
|
||||
cookies= None
|
||||
)
|
||||
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call(
|
||||
'v2',
|
||||
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
|
||||
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
|
||||
cookies=None
|
||||
)
|
||||
|
||||
def test_run__cookies(self):
|
||||
YouTubeTranscriptCli(
|
||||
(
|
||||
'v1 v2 --languages de en '
|
||||
'--cookies blahblah.txt'
|
||||
).split()
|
||||
).run()
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies='blahblah.txt')
|
||||
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies='blahblah.txt')
|
||||
|
||||
|
|
Loading…
Reference in New Issue