added black formatter

This commit is contained in:
Jonas Depoix 2024-09-26 17:56:36 +02:00
parent 0b6cc5980f
commit 5f96588ada
13 changed files with 805 additions and 471 deletions

111
poetry.lock generated
View File

@ -1,5 +1,51 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "black"
version = "24.8.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">=3.8"
files = [
{file = "black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6"},
{file = "black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb"},
{file = "black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42"},
{file = "black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a"},
{file = "black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1"},
{file = "black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af"},
{file = "black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4"},
{file = "black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af"},
{file = "black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368"},
{file = "black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed"},
{file = "black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018"},
{file = "black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2"},
{file = "black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd"},
{file = "black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2"},
{file = "black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e"},
{file = "black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920"},
{file = "black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c"},
{file = "black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e"},
{file = "black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47"},
{file = "black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb"},
{file = "black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed"},
{file = "black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f"},
]
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "certifi"
version = "2024.8.30"
@ -110,6 +156,20 @@ files = [
{file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
]
[[package]]
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
files = [
{file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
{file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
@ -302,6 +362,17 @@ build = ["blurb", "twine", "wheel"]
docs = ["sphinx"]
test = ["pytest", "pytest-cov"]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
optional = false
python-versions = ">=3.5"
files = [
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]
[[package]]
name = "packaging"
version = "24.1"
@ -313,6 +384,33 @@ files = [
{file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]
[[package]]
name = "pathspec"
version = "0.12.1"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = ">=3.8"
files = [
{file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
{file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
]
[[package]]
name = "platformdirs"
version = "4.3.6"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.8"
files = [
{file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
{file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
]
[package.extras]
docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
type = ["mypy (>=1.11.2)"]
[[package]]
name = "pluggy"
version = "1.5.0"
@ -382,6 +480,17 @@ files = [
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
[[package]]
name = "typing-extensions"
version = "4.12.2"
description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false
python-versions = ">=3.8"
files = [
{file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]
[[package]]
name = "urllib3"
version = "2.2.3"
@ -402,4 +511,4 @@ zstd = ["zstandard (>=0.18.0)"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8,<3.13"
content-hash = "ae3ea36431a2a24e1d07e7c6e251fe7490b86edd928c22eda084e3cb974aaa99"
content-hash = "4c2e7d294773ea148b69f961053a9469630c48b88248903ead43e41a2838ff94"

View File

@ -40,6 +40,7 @@ youtube_transcript_api = "youtube_transcript_api.__main__:main"
[tool.poe.tasks]
test = "pytest youtube_transcript_api"
coverage.shell = "pytest youtube_transcript_api && coverage report -m"
format = "black youtube_transcript_api"
[tool.poetry.dependencies]
python = ">=3.8,<3.13"
@ -51,6 +52,7 @@ coverage = "^7.6.1"
mock = "^5.1.0"
httpretty = "^1.1.4"
coveralls = "^4.0.1"
black = "^24.8.0"
[tool.coverage.run]
source = ["youtube_transcript_api"]

View File

@ -11,5 +11,5 @@ def main():
print(YouTubeTranscriptCli(sys.argv[1:]).run())
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -1,17 +1,17 @@
import requests
try: # pragma: no cover
try: # pragma: no cover
import http.cookiejar as cookiejar
CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
except ImportError: # pragma: no cover
except ImportError: # pragma: no cover
import cookielib as cookiejar
CookieLoadError = IOError
from ._transcripts import TranscriptListFetcher
from ._errors import (
CookiePathInvalid,
CookiesInvalid
)
from ._errors import CookiePathInvalid, CookiesInvalid
class YouTubeTranscriptApi(object):
@ -71,8 +71,15 @@ class YouTubeTranscriptApi(object):
return TranscriptListFetcher(http_client).fetch(video_id)
@classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
cookies=None, preserve_formatting=False):
def get_transcripts(
cls,
video_ids,
languages=("en",),
continue_after_error=False,
proxies=None,
cookies=None,
preserve_formatting=False,
):
"""
Retrieves the transcripts for a list of videos.
@ -102,7 +109,9 @@ class YouTubeTranscriptApi(object):
for video_id in video_ids:
try:
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
data[video_id] = cls.get_transcript(
video_id, languages, proxies, cookies, preserve_formatting
)
except Exception as exception:
if not continue_after_error:
raise exception
@ -112,7 +121,14 @@ class YouTubeTranscriptApi(object):
return data, unretrievable_videos
@classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
def get_transcript(
cls,
video_id,
languages=("en",),
proxies=None,
cookies=None,
preserve_formatting=False,
):
"""
Retrieves the transcript for a single video. This is just a shortcut for calling::
@ -134,7 +150,11 @@ class YouTubeTranscriptApi(object):
:rtype [{'text': str, 'start': float, 'duration': float}]:
"""
assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
return (
cls.list_transcripts(video_id, proxies, cookies)
.find_transcript(languages)
.fetch(preserve_formatting=preserve_formatting)
)
@classmethod
def _load_cookies(cls, cookies, video_id):

View File

@ -13,10 +13,10 @@ class YouTubeTranscriptCli(object):
parsed_args = self._parse_args()
if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
return ''
return ""
proxies = None
if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
if parsed_args.http_proxy != "" or parsed_args.https_proxy != "":
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
cookies = parsed_args.cookies
@ -26,25 +26,41 @@ class YouTubeTranscriptCli(object):
for video_id in parsed_args.video_ids:
try:
transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
transcripts.append(
self._fetch_transcript(parsed_args, proxies, cookies, video_id)
)
except Exception as exception:
exceptions.append(exception)
return '\n\n'.join(
return "\n\n".join(
[str(exception) for exception in exceptions]
+ ([FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)] if transcripts else [])
+ (
[
FormatterLoader()
.load(parsed_args.format)
.format_transcripts(transcripts)
]
if transcripts
else []
)
)
def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)
transcript_list = YouTubeTranscriptApi.list_transcripts(
video_id, proxies=proxies, cookies=cookies
)
if parsed_args.list_transcripts:
return str(transcript_list)
if parsed_args.exclude_manually_created:
transcript = transcript_list.find_generated_transcript(parsed_args.languages)
transcript = transcript_list.find_generated_transcript(
parsed_args.languages
)
elif parsed_args.exclude_generated:
transcript = transcript_list.find_manually_created_transcript(parsed_args.languages)
transcript = transcript_list.find_manually_created_transcript(
parsed_args.languages
)
else:
transcript = transcript_list.find_transcript(parsed_args.languages)
@ -56,80 +72,84 @@ class YouTubeTranscriptCli(object):
def _parse_args(self):
parser = argparse.ArgumentParser(
description=(
'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
'It also works for automatically generated subtitles and it does not require a headless browser, like '
'other selenium based solutions do!'
"This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. "
"It also works for automatically generated subtitles and it does not require a headless browser, like "
"other selenium based solutions do!"
)
)
parser.add_argument(
'--list-transcripts',
action='store_const',
"--list-transcripts",
action="store_const",
const=True,
default=False,
help='This will list the languages in which the given videos are available in.',
help="This will list the languages in which the given videos are available in.",
)
parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
parser.add_argument(
'--languages',
nargs='*',
default=['en',],
"video_ids", nargs="+", type=str, help="List of YouTube video IDs."
)
parser.add_argument(
"--languages",
nargs="*",
default=[
"en",
],
type=str,
help=(
'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
'may have to play around with the language codes a bit, to find the one which is working for you!'
"first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails "
"to do so. As I can't provide a complete list of all working language codes with full certainty, you "
"may have to play around with the language codes a bit, to find the one which is working for you!"
),
)
parser.add_argument(
'--exclude-generated',
action='store_const',
"--exclude-generated",
action="store_const",
const=True,
default=False,
help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
help="If this flag is set transcripts which have been generated by YouTube will not be retrieved.",
)
parser.add_argument(
'--exclude-manually-created',
action='store_const',
"--exclude-manually-created",
action="store_const",
const=True,
default=False,
help='If this flag is set transcripts which have been manually created will not be retrieved.',
help="If this flag is set transcripts which have been manually created will not be retrieved.",
)
parser.add_argument(
'--format',
"--format",
type=str,
default='pretty',
default="pretty",
choices=tuple(FormatterLoader.TYPES.keys()),
)
parser.add_argument(
'--translate',
default='',
"--translate",
default="",
help=(
'The language code for the language you want this transcript to be translated to. Use the '
'--list-transcripts feature to find out which languages are translatable and which translation '
'languages are available.'
)
"The language code for the language you want this transcript to be translated to. Use the "
"--list-transcripts feature to find out which languages are translatable and which translation "
"languages are available."
),
)
parser.add_argument(
'--http-proxy',
default='',
metavar='URL',
help='Use the specified HTTP proxy.'
"--http-proxy",
default="",
metavar="URL",
help="Use the specified HTTP proxy.",
)
parser.add_argument(
'--https-proxy',
default='',
metavar='URL',
help='Use the specified HTTPS proxy.'
"--https-proxy",
default="",
metavar="URL",
help="Use the specified HTTPS proxy.",
)
parser.add_argument(
'--cookies',
"--cookies",
default=None,
help='The cookie file that will be used for authorization with youtube.'
help="The cookie file that will be used for authorization with youtube.",
)
return self._sanitize_video_ids(parser.parse_args(self._args))
def _sanitize_video_ids(self, args):
args.video_ids = [video_id.replace('\\', '') for video_id in args.video_ids]
args.video_ids = [video_id.replace("\\", "") for video_id in args.video_ids]
return args

View File

@ -5,16 +5,17 @@ class CouldNotRetrieveTranscript(Exception):
"""
Raised if a transcript could not be retrieved.
"""
ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
CAUSE_MESSAGE = ''
ERROR_MESSAGE = "\nCould not retrieve a transcript for the video {video_url}!"
CAUSE_MESSAGE_INTRO = " This is most likely caused by:\n\n{cause}"
CAUSE_MESSAGE = ""
GITHUB_REFERRAL = (
'\n\nIf you are sure that the described cause is not responsible for this error '
'and that a transcript should be retrievable, please create an issue at '
'https://github.com/jdepoix/youtube-transcript-api/issues. '
'Please add which version of youtube_transcript_api you are using '
'and provide the information needed to replicate the error. '
'Also make sure that there are no open issues which already describe your problem!'
"\n\nIf you are sure that the described cause is not responsible for this error "
"and that a transcript should be retrievable, please create an issue at "
"https://github.com/jdepoix/youtube-transcript-api/issues. "
"Please add which version of youtube_transcript_api you are using "
"and provide the information needed to replicate the error. "
"Also make sure that there are no open issues which already describe your problem!"
)
def __init__(self, video_id):
@ -23,10 +24,14 @@ class CouldNotRetrieveTranscript(Exception):
def _build_error_message(self):
cause = self.cause
error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
error_message = self.ERROR_MESSAGE.format(
video_url=WATCH_URL.format(video_id=self.video_id)
)
if cause:
error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL
error_message += (
self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL
)
return error_message
@ -36,7 +41,7 @@ class CouldNotRetrieveTranscript(Exception):
class YouTubeRequestFailed(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'
CAUSE_MESSAGE = "Request to YouTube failed: {reason}"
def __init__(self, video_id, http_error):
self.reason = str(http_error)
@ -50,12 +55,12 @@ class YouTubeRequestFailed(CouldNotRetrieveTranscript):
class VideoUnavailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The video is no longer available'
CAUSE_MESSAGE = "The video is no longer available"
class InvalidVideoId(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = (
'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n'
"You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n"
'Do NOT run: `YouTubeTranscriptApi.get_transcript("https://www.youtube.com/watch?v=1234")`\n'
'Instead run: `YouTubeTranscriptApi.get_transcript("1234")`'
)
@ -63,48 +68,48 @@ class InvalidVideoId(CouldNotRetrieveTranscript):
class TooManyRequests(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = (
'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
'One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. '
'Read here how to use that cookie with '
'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
"YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. "
"One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. "
"Read here how to use that cookie with "
"youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
- Use a different IP address\n\
- Wait until the ban on your IP has been lifted'
- Wait until the ban on your IP has been lifted"
)
class TranscriptsDisabled(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Subtitles are disabled for this video'
CAUSE_MESSAGE = "Subtitles are disabled for this video"
class NoTranscriptAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'No transcripts are available for this video'
CAUSE_MESSAGE = "No transcripts are available for this video"
class NotTranslatable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The requested language is not translatable'
CAUSE_MESSAGE = "The requested language is not translatable"
class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The requested translation language is not available'
CAUSE_MESSAGE = "The requested translation language is not available"
class CookiePathInvalid(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
CAUSE_MESSAGE = "The provided cookie file was unable to be loaded"
class CookiesInvalid(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
CAUSE_MESSAGE = "The cookies provided are not valid (may have expired)"
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
CAUSE_MESSAGE = "Failed to automatically give consent to saving cookies"
class NoTranscriptFound(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = (
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
'{transcript_data}'
"No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n"
"{transcript_data}"
)
def __init__(self, video_id, requested_language_codes, transcript_data):

View File

@ -2,10 +2,10 @@ import sys
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover
if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover
# Python 3.4+
from html import unescape
else: # pragma: no cover
else: # pragma: no cover
if sys.version_info.major <= 2:
# Python 2
import HTMLParser

View File

@ -1 +1 @@
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
WATCH_URL = "https://www.youtube.com/watch?v={video_id}"

View File

@ -3,7 +3,7 @@ import sys
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
if sys.version_info.major == 2: # pragma: no cover
reload(sys)
sys.setdefaultencoding('utf-8')
sys.setdefaultencoding("utf-8")
import json
@ -52,7 +52,7 @@ class TranscriptListFetcher(object):
splitted_html = html.split('"captions":')
if len(splitted_html) <= 1:
if video_id.startswith('http://') or video_id.startswith('https://'):
if video_id.startswith("http://") or video_id.startswith("https://"):
raise InvalidVideoId(video_id)
if 'class="g-recaptcha"' in html:
raise TooManyRequests(video_id)
@ -62,12 +62,12 @@ class TranscriptListFetcher(object):
raise TranscriptsDisabled(video_id)
captions_json = json.loads(
splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
).get('playerCaptionsTracklistRenderer')
splitted_html[1].split(',"videoDetails')[0].replace("\n", "")
).get("playerCaptionsTracklistRenderer")
if captions_json is None:
raise TranscriptsDisabled(video_id)
if 'captionTracks' not in captions_json:
if "captionTracks" not in captions_json:
raise NoTranscriptAvailable(video_id)
return captions_json
@ -76,7 +76,9 @@ class TranscriptListFetcher(object):
match = re.search('name="v" value="(.*?)"', html)
if match is None:
raise FailedToCreateConsentCookie(video_id)
self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
self._http_client.cookies.set(
"CONSENT", "YES+" + match.group(1), domain=".youtube.com"
)
def _fetch_video_html(self, video_id):
html = self._fetch_html(video_id)
@ -88,7 +90,9 @@ class TranscriptListFetcher(object):
return html
def _fetch_html(self, video_id):
response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
response = self._http_client.get(
WATCH_URL.format(video_id=video_id), headers={"Accept-Language": "en-US"}
)
return unescape(_raise_http_errors(response, video_id).text)
@ -98,7 +102,13 @@ class TranscriptList(object):
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
"""
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
def __init__(
self,
video_id,
manually_created_transcripts,
generated_transcripts,
translation_languages,
):
"""
The constructor is only for internal use. Use the static build method instead.
@ -132,28 +142,29 @@ class TranscriptList(object):
"""
translation_languages = [
{
'language': translation_language['languageName']['simpleText'],
'language_code': translation_language['languageCode'],
} for translation_language in captions_json.get('translationLanguages', [])
"language": translation_language["languageName"]["simpleText"],
"language_code": translation_language["languageCode"],
}
for translation_language in captions_json.get("translationLanguages", [])
]
manually_created_transcripts = {}
generated_transcripts = {}
for caption in captions_json['captionTracks']:
if caption.get('kind', '') == 'asr':
for caption in captions_json["captionTracks"]:
if caption.get("kind", "") == "asr":
transcript_dict = generated_transcripts
else:
transcript_dict = manually_created_transcripts
transcript_dict[caption['languageCode']] = Transcript(
transcript_dict[caption["languageCode"]] = Transcript(
http_client,
video_id,
caption['baseUrl'],
caption['name']['simpleText'],
caption['languageCode'],
caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [],
caption["baseUrl"],
caption["name"]["simpleText"],
caption["languageCode"],
caption.get("kind", "") == "asr",
translation_languages if caption.get("isTranslatable", False) else [],
)
return TranscriptList(
@ -164,7 +175,10 @@ class TranscriptList(object):
)
def __iter__(self):
return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
return iter(
list(self._manually_created_transcripts.values())
+ list(self._generated_transcripts.values())
)
def find_transcript(self, language_codes):
"""
@ -180,7 +194,10 @@ class TranscriptList(object):
:rtype Transcript:
:raises: NoTranscriptFound
"""
return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
return self._find_transcript(
language_codes,
[self._manually_created_transcripts, self._generated_transcripts],
)
def find_generated_transcript(self, language_codes):
"""
@ -208,7 +225,9 @@ class TranscriptList(object):
:rtype Transcript:
:raises: NoTranscriptFound
"""
return self._find_transcript(language_codes, [self._manually_created_transcripts])
return self._find_transcript(
language_codes, [self._manually_created_transcripts]
)
def _find_transcript(self, language_codes, transcript_dicts):
for language_code in language_codes:
@ -216,44 +235,54 @@ class TranscriptList(object):
if language_code in transcript_dict:
return transcript_dict[language_code]
raise NoTranscriptFound(
self.video_id,
language_codes,
self
)
raise NoTranscriptFound(self.video_id, language_codes, self)
def __str__(self):
return (
'For this video ({video_id}) transcripts are available in the following languages:\n\n'
'(MANUALLY CREATED)\n'
'{available_manually_created_transcript_languages}\n\n'
'(GENERATED)\n'
'{available_generated_transcripts}\n\n'
'(TRANSLATION LANGUAGES)\n'
'{available_translation_languages}'
"For this video ({video_id}) transcripts are available in the following languages:\n\n"
"(MANUALLY CREATED)\n"
"{available_manually_created_transcript_languages}\n\n"
"(GENERATED)\n"
"{available_generated_transcripts}\n\n"
"(TRANSLATION LANGUAGES)\n"
"{available_translation_languages}"
).format(
video_id=self.video_id,
available_manually_created_transcript_languages=self._get_language_description(
str(transcript) for transcript in self._manually_created_transcripts.values()
str(transcript)
for transcript in self._manually_created_transcripts.values()
),
available_generated_transcripts=self._get_language_description(
str(transcript) for transcript in self._generated_transcripts.values()
),
available_translation_languages=self._get_language_description(
'{language_code} ("{language}")'.format(
language=translation_language['language'],
language_code=translation_language['language_code'],
) for translation_language in self._translation_languages
)
language=translation_language["language"],
language_code=translation_language["language_code"],
)
for translation_language in self._translation_languages
),
)
def _get_language_description(self, transcript_strings):
description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
return description if description else 'None'
description = "\n".join(
" - {transcript}".format(transcript=transcript)
for transcript in transcript_strings
)
return description if description else "None"
class Transcript(object):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
def __init__(
self,
http_client,
video_id,
url,
language,
language_code,
is_generated,
translation_languages,
):
"""
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList.
@ -276,7 +305,7 @@ class Transcript(object):
self.is_generated = is_generated
self.translation_languages = translation_languages
self._translation_languages_dict = {
translation_language['language_code']: translation_language['language']
translation_language["language_code"]: translation_language["language"]
for translation_language in translation_languages
}
@ -288,7 +317,9 @@ class Transcript(object):
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype [{'text': str, 'start': float, 'duration': float}]:
"""
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
response = self._http_client.get(
self._url, headers={"Accept-Language": "en-US"}
)
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text,
)
@ -297,7 +328,7 @@ class Transcript(object):
return '{language_code} ("{language}"){translation_description}'.format(
language=self.language,
language_code=self.language_code,
translation_description='[TRANSLATABLE]' if self.is_translatable else ''
translation_description="[TRANSLATABLE]" if self.is_translatable else "",
)
@property
@ -314,7 +345,9 @@ class Transcript(object):
return Transcript(
self._http_client,
self.video_id,
'{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
"{url}&tlang={language_code}".format(
url=self._url, language_code=language_code
),
self._translation_languages_dict[language_code],
language_code,
True,
@ -324,16 +357,16 @@ class Transcript(object):
class _TranscriptParser(object):
_FORMATTING_TAGS = [
'strong', # important
'em', # emphasized
'b', # bold
'i', # italic
'mark', # marked
'small', # smaller
'del', # deleted
'ins', # inserted
'sub', # subscript
'sup', # superscript
"strong", # important
"em", # emphasized
"b", # bold
"i", # italic
"mark", # marked
"small", # smaller
"del", # deleted
"ins", # inserted
"sub", # subscript
"sup", # superscript
]
def __init__(self, preserve_formatting=False):
@ -341,19 +374,19 @@ class _TranscriptParser(object):
def _get_html_regex(self, preserve_formatting):
if preserve_formatting:
formats_regex = '|'.join(self._FORMATTING_TAGS)
formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
formats_regex = "|".join(self._FORMATTING_TAGS)
formats_regex = r"<\/?(?!\/?(" + formats_regex + r")\b).*?\b>"
html_regex = re.compile(formats_regex, re.IGNORECASE)
else:
html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
html_regex = re.compile(r"<[^>]*>", re.IGNORECASE)
return html_regex
def parse(self, plain_data):
return [
{
'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']),
'duration': float(xml_element.attrib.get('dur', '0.0')),
"text": re.sub(self._html_regex, "", unescape(xml_element.text)),
"start": float(xml_element.attrib["start"]),
"duration": float(xml_element.attrib.get("dur", "0.0")),
}
for xml_element in ElementTree.fromstring(plain_data)
if xml_element.text is not None

View File

@ -12,12 +12,16 @@ class Formatter(object):
"""
def format_transcript(self, transcript, **kwargs):
raise NotImplementedError('A subclass of Formatter must implement ' \
'their own .format_transcript() method.')
raise NotImplementedError(
"A subclass of Formatter must implement "
"their own .format_transcript() method."
)
def format_transcripts(self, transcripts, **kwargs):
raise NotImplementedError('A subclass of Formatter must implement ' \
'their own .format_transcripts() method.')
raise NotImplementedError(
"A subclass of Formatter must implement "
"their own .format_transcripts() method."
)
class PrettyPrintFormatter(Formatter):
@ -68,7 +72,7 @@ class TextFormatter(Formatter):
:return: all transcript text lines separated by newline breaks.
:rtype str
"""
return '\n'.join(line['text'] for line in transcript)
return "\n".join(line["text"] for line in transcript)
def format_transcripts(self, transcripts, **kwargs):
"""Converts a list of transcripts into plain text with no timestamps.
@ -77,21 +81,30 @@ class TextFormatter(Formatter):
:return: all transcript text lines separated by newline breaks.
:rtype str
"""
return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
return "\n\n\n".join(
[self.format_transcript(transcript, **kwargs) for transcript in transcripts]
)
class _TextBasedFormatter(TextFormatter):
def _format_timestamp(self, hours, mins, secs, ms):
raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \
'their own .format_timestamp() method.')
raise NotImplementedError(
"A subclass of _TextBasedFormatter must implement "
"their own .format_timestamp() method."
)
def _format_transcript_header(self, lines):
raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \
'their own _format_transcript_header method.')
raise NotImplementedError(
"A subclass of _TextBasedFormatter must implement "
"their own _format_transcript_header method."
)
def _format_transcript_helper(self, i, time_text, line):
raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \
'their own _format_transcript_helper method.')
raise NotImplementedError(
"A subclass of _TextBasedFormatter must implement "
"their own _format_transcript_helper method."
)
def _seconds_to_timestamp(self, time):
"""Helper that converts `time` into a transcript cue timestamp.
@ -109,26 +122,27 @@ class _TextBasedFormatter(TextFormatter):
hours_float, remainder = divmod(time, 3600)
mins_float, secs_float = divmod(remainder, 60)
hours, mins, secs = int(hours_float), int(mins_float), int(secs_float)
ms = int(round((time - int(time))*1000, 2))
ms = int(round((time - int(time)) * 1000, 2))
return self._format_timestamp(hours, mins, secs, ms)
def format_transcript(self, transcript, **kwargs):
"""A basic implementation of WEBVTT/SRT formatting.
:param transcript:
:reference:
:reference:
https://www.w3.org/TR/webvtt1/#introduction-caption
https://www.3playmedia.com/blog/create-srt-file/
"""
lines = []
for i, line in enumerate(transcript):
end = line['start'] + line['duration']
end = line["start"] + line["duration"]
time_text = "{} --> {}".format(
self._seconds_to_timestamp(line['start']),
self._seconds_to_timestamp(line["start"]),
self._seconds_to_timestamp(
transcript[i + 1]['start']
if i < len(transcript) - 1 and transcript[i + 1]['start'] < end else end
)
transcript[i + 1]["start"]
if i < len(transcript) - 1 and transcript[i + 1]["start"] < end
else end
),
)
lines.append(self._format_transcript_helper(i, time_text, line))
@ -138,12 +152,12 @@ class _TextBasedFormatter(TextFormatter):
class SRTFormatter(_TextBasedFormatter):
def _format_timestamp(self, hours, mins, secs, ms):
return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, mins, secs, ms)
def _format_transcript_header(self, lines):
return "\n\n".join(lines) + "\n"
def _format_transcript_helper(self, i, time_text, line):
return "{}\n{}\n{}".format(i + 1, time_text, line['text'])
return "{}\n{}\n{}".format(i + 1, time_text, line["text"])
class WebVTTFormatter(_TextBasedFormatter):
@ -154,29 +168,29 @@ class WebVTTFormatter(_TextBasedFormatter):
return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"
def _format_transcript_helper(self, i, time_text, line):
return "{}\n{}".format(time_text, line['text'])
return "{}\n{}".format(time_text, line["text"])
class FormatterLoader(object):
TYPES = {
'json': JSONFormatter,
'pretty': PrettyPrintFormatter,
'text': TextFormatter,
'webvtt': WebVTTFormatter,
'srt' : SRTFormatter,
"json": JSONFormatter,
"pretty": PrettyPrintFormatter,
"text": TextFormatter,
"webvtt": WebVTTFormatter,
"srt": SRTFormatter,
}
class UnknownFormatterType(Exception):
def __init__(self, formatter_type):
super(FormatterLoader.UnknownFormatterType, self).__init__(
'The format \'{formatter_type}\' is not supported. '
'Choose one of the following formats: {supported_formatter_types}'.format(
"The format '{formatter_type}' is not supported. "
"Choose one of the following formats: {supported_formatter_types}".format(
formatter_type=formatter_type,
supported_formatter_types=', '.join(FormatterLoader.TYPES.keys()),
supported_formatter_types=", ".join(FormatterLoader.TYPES.keys()),
)
)
def load(self, formatter_type='pretty'):
def load(self, formatter_type="pretty"):
"""
Loads the Formatter for the given formatter type.

View File

@ -25,8 +25,9 @@ from youtube_transcript_api import (
def load_asset(filename):
    """Read a test fixture from the ``assets`` directory next to this file.

    :param filename: name of the fixture file inside ``assets``
    :return: the raw content of the fixture
    :rtype: bytes
    """
    # os.path.join copes with an empty dirname (relative __file__); plain
    # string formatting would turn that into the root path "/assets/...".
    filepath = os.path.join(os.path.dirname(__file__), "assets", filename)
    with open(filepath, mode="rb") as file:
        return file.read()
@ -37,13 +38,13 @@ class TestYouTubeTranscriptApi(TestCase):
httpretty.enable()
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube.html.static"),
)
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/api/timedtext',
body=load_asset('transcript.xml.static')
"https://www.youtube.com/api/timedtext",
body=load_asset("transcript.xml.static"),
)
def tearDown(self):
@ -51,306 +52,362 @@ class TestYouTubeTranscriptApi(TestCase):
httpretty.disable()
def test_get_transcript(self):
transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')
transcript = YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8")
self.assertEqual(
transcript,
[
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
{'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
]
{"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54},
{
"text": "this is not the original transcript",
"start": 1.54,
"duration": 4.16,
},
{
"text": "just something shorter, I made up for testing",
"start": 5.7,
"duration": 3.239,
},
],
)
def test_get_transcript_formatted(self):
transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True)
transcript = YouTubeTranscriptApi.get_transcript(
"GJLlxj_dtq8", preserve_formatting=True
)
self.assertEqual(
transcript,
[
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
{'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
]
{"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54},
{
"text": "this is <i>not</i> the original transcript",
"start": 1.54,
"duration": 4.16,
},
{
"text": "just something shorter, I made up for testing",
"start": 5.7,
"duration": 3.239,
},
],
)
def test_list_transcripts(self):
    """Listing transcripts yields the language codes of the fixture video."""
    transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8")

    language_codes = {transcript.language_code for transcript in transcript_list}

    # "en" is listed once: the original set literal repeated it, which a set
    # silently collapses, so the expected value is unchanged.
    self.assertEqual(
        language_codes, {"zh", "de", "en", "hi", "ja", "ko", "es", "cs"}
    )
def test_list_transcripts__find_manually_created(self):
transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
transcript = transcript_list.find_manually_created_transcript(['cs'])
transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8")
transcript = transcript_list.find_manually_created_transcript(["cs"])
self.assertFalse(transcript.is_generated)
def test_list_transcripts__find_generated(self):
transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8")
with self.assertRaises(NoTranscriptFound):
transcript_list.find_generated_transcript(['cs'])
transcript_list.find_generated_transcript(["cs"])
transcript = transcript_list.find_generated_transcript(['en'])
transcript = transcript_list.find_generated_transcript(["en"])
self.assertTrue(transcript.is_generated)
def test_list_transcripts__url_as_video_id(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_transcripts_disabled.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_transcripts_disabled.html.static"),
)
with self.assertRaises(InvalidVideoId):
YouTubeTranscriptApi.list_transcripts('https://www.youtube.com/watch?v=GJLlxj_dtq8')
YouTubeTranscriptApi.list_transcripts(
"https://www.youtube.com/watch?v=GJLlxj_dtq8"
)
def test_list_transcripts__no_translation_languages_provided(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_no_translation_languages.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_no_translation_languages.html.static"),
)
transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8")
for transcript in transcript_list:
self.assertEqual(len(transcript.translation_languages), 0)
def test_translate_transcript(self):
transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en'])
transcript = YouTubeTranscriptApi.list_transcripts(
"GJLlxj_dtq8"
).find_transcript(["en"])
translated_transcript = transcript.translate('af')
translated_transcript = transcript.translate("af")
self.assertEqual(translated_transcript.language_code, 'af')
self.assertIn('&tlang=af', translated_transcript._url)
self.assertEqual(translated_transcript.language_code, "af")
self.assertIn("&tlang=af", translated_transcript._url)
def test_translate_transcript__translation_language_not_available(self):
transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en'])
transcript = YouTubeTranscriptApi.list_transcripts(
"GJLlxj_dtq8"
).find_transcript(["en"])
with self.assertRaises(TranslationLanguageNotAvailable):
transcript.translate('xyz')
transcript.translate("xyz")
def test_translate_transcript__not_translatable(self):
transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en'])
transcript = YouTubeTranscriptApi.list_transcripts(
"GJLlxj_dtq8"
).find_transcript(["en"])
transcript.translation_languages = []
with self.assertRaises(NotTranslatable):
transcript.translate('af')
transcript.translate("af")
def test_get_transcript__correct_language_is_used(self):
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", ["de", "en"])
query_string = httpretty.last_request().querystring
self.assertIn('lang', query_string)
self.assertEqual(len(query_string['lang']), 1)
self.assertEqual(query_string['lang'][0], 'de')
self.assertIn("lang", query_string)
self.assertEqual(len(query_string["lang"]), 1)
self.assertEqual(query_string["lang"][0], "de")
def test_get_transcript__fallback_language_is_used(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_ww1_nl_en.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_ww1_nl_en.html.static"),
)
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en'])
YouTubeTranscriptApi.get_transcript("F1xioXWb8CY", ["de", "en"])
query_string = httpretty.last_request().querystring
self.assertIn('lang', query_string)
self.assertEqual(len(query_string['lang']), 1)
self.assertEqual(query_string['lang'][0], 'en')
self.assertIn("lang", query_string)
self.assertEqual(len(query_string["lang"]), 1)
self.assertEqual(query_string["lang"][0], "en")
def test_get_transcript__create_consent_cookie_if_needed(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_consent_page.html.static"),
)
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
YouTubeTranscriptApi.get_transcript("F1xioXWb8CY")
self.assertEqual(len(httpretty.latest_requests()), 3)
for request in httpretty.latest_requests()[1:]:
self.assertEqual(request.headers['cookie'], 'CONSENT=YES+cb.20210328-17-p0.de+FX+119')
self.assertEqual(
request.headers["cookie"], "CONSENT=YES+cb.20210328-17-p0.de+FX+119"
)
def test_get_transcript__exception_if_create_consent_cookie_failed(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_consent_page.html.static"),
)
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_consent_page.html.static"),
)
with self.assertRaises(FailedToCreateConsentCookie):
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
YouTubeTranscriptApi.get_transcript("F1xioXWb8CY")
def test_get_transcript__exception_if_consent_cookie_age_invalid(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_consent_page_invalid.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_consent_page_invalid.html.static"),
)
with self.assertRaises(FailedToCreateConsentCookie):
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
YouTubeTranscriptApi.get_transcript("F1xioXWb8CY")
def test_get_transcript__exception_if_video_unavailable(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_video_unavailable.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_video_unavailable.html.static"),
)
with self.assertRaises(VideoUnavailable):
YouTubeTranscriptApi.get_transcript('abc')
YouTubeTranscriptApi.get_transcript("abc")
def test_get_transcript__exception_if_youtube_request_fails(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
status=500
httpretty.GET, "https://www.youtube.com/watch", status=500
)
with self.assertRaises(YouTubeRequestFailed):
YouTubeTranscriptApi.get_transcript('abc')
YouTubeTranscriptApi.get_transcript("abc")
def test_get_transcript__exception_if_youtube_request_limit_reached(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_too_many_requests.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_too_many_requests.html.static"),
)
with self.assertRaises(TooManyRequests):
YouTubeTranscriptApi.get_transcript('abc')
YouTubeTranscriptApi.get_transcript("abc")
def test_get_transcript__exception_if_transcripts_disabled(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_transcripts_disabled.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_transcripts_disabled.html.static"),
)
with self.assertRaises(TranscriptsDisabled):
YouTubeTranscriptApi.get_transcript('dsMFmonKDD4')
YouTubeTranscriptApi.get_transcript("dsMFmonKDD4")
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_transcripts_disabled2.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_transcripts_disabled2.html.static"),
)
with self.assertRaises(TranscriptsDisabled):
YouTubeTranscriptApi.get_transcript('Fjg5lYqvzUs')
YouTubeTranscriptApi.get_transcript("Fjg5lYqvzUs")
def test_get_transcript__exception_if_language_unavailable(self):
with self.assertRaises(NoTranscriptFound):
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz'])
YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", languages=["cz"])
def test_get_transcript__exception_if_no_transcript_available(self):
httpretty.register_uri(
httpretty.GET,
'https://www.youtube.com/watch',
body=load_asset('youtube_no_transcript_available.html.static')
"https://www.youtube.com/watch",
body=load_asset("youtube_no_transcript_available.html.static"),
)
with self.assertRaises(NoTranscriptAvailable):
YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E')
YouTubeTranscriptApi.get_transcript("MwBPvcYFY2E")
def test_get_transcript__with_proxy(self):
    """A transcript is still returned when a proxies mapping is supplied."""
    # Keys are bare URL schemes; the original "https:" (stray colon) could
    # never match the https scheme. Empty values mean "no proxy", so the
    # mocked requests are still served by httpretty.
    proxies = {"http": "", "https": ""}
    transcript = YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", proxies=proxies)
    self.assertEqual(
        transcript,
        [
            {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54},
            {
                "text": "this is not the original transcript",
                "start": 1.54,
                "duration": 4.16,
            },
            {
                "text": "just something shorter, I made up for testing",
                "start": 5.7,
                "duration": 3.239,
            },
        ],
    )
def test_get_transcript__with_cookies(self):
    """A transcript is still returned when a cookie file path is supplied."""
    # Build the fixture path with os.path.join instead of concatenating a
    # hard-coded "/"; the unused `filename` half of os.path.split is gone.
    cookies = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "example_cookies.txt"
    )
    transcript = YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", cookies=cookies)

    self.assertEqual(
        transcript,
        [
            {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54},
            {
                "text": "this is not the original transcript",
                "start": 1.54,
                "duration": 4.16,
            },
            {
                "text": "just something shorter, I made up for testing",
                "start": 5.7,
                "duration": 3.239,
            },
        ],
    )
def test_get_transcript__assertionerror_if_input_not_string(self):
with self.assertRaises(AssertionError):
YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2'])
YouTubeTranscriptApi.get_transcript(["video_id_1", "video_id_2"])
def test_get_transcripts__assertionerror_if_input_not_list(self):
with self.assertRaises(AssertionError):
YouTubeTranscriptApi.get_transcripts('video_id_1')
YouTubeTranscriptApi.get_transcripts("video_id_1")
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
@patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript")
def test_get_transcripts(self, mock_get_transcript):
video_id_1 = 'video_id_1'
video_id_2 = 'video_id_2'
languages = ['de', 'en']
video_id_1 = "video_id_1"
video_id_2 = "video_id_2"
languages = ["de", "en"]
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
YouTubeTranscriptApi.get_transcripts(
[video_id_1, video_id_2], languages=languages
)
mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False)
mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False)
self.assertEqual(mock_get_transcript.call_count, 2)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
@patch(
"youtube_transcript_api.YouTubeTranscriptApi.get_transcript",
side_effect=Exception("Error"),
)
def test_get_transcripts__stop_on_error(self, mock_get_transcript):
with self.assertRaises(Exception):
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])
YouTubeTranscriptApi.get_transcripts(["video_id_1", "video_id_2"])
@patch(
    "youtube_transcript_api.YouTubeTranscriptApi.get_transcript",
    side_effect=Exception("Error"),
)
def test_get_transcripts__continue_on_error(self, mock_get_transcript):
    """With continue_after_error=True every video id is still attempted."""
    video_id_1 = "video_id_1"
    video_id_2 = "video_id_2"

    # Pass the same variables that are asserted on below, so the request and
    # the expectation cannot drift apart (the original repeated the literals).
    YouTubeTranscriptApi.get_transcripts(
        [video_id_1, video_id_2], continue_after_error=True
    )

    mock_get_transcript.assert_any_call(video_id_1, ("en",), None, None, False)
    mock_get_transcript.assert_any_call(video_id_2, ("en",), None, None, False)
@patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript")
def test_get_transcripts__with_cookies(self, mock_get_transcript):
cookies = '/example_cookies.txt'
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False)
cookies = "/example_cookies.txt"
YouTubeTranscriptApi.get_transcripts(["GJLlxj_dtq8"], cookies=cookies)
mock_get_transcript.assert_any_call(
"GJLlxj_dtq8", ("en",), None, cookies, False
)
@patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript")
def test_get_transcripts__with_proxies(self, mock_get_transcript):
    """The proxies mapping is forwarded unchanged to get_transcript."""
    # Bare scheme keys; the original "https:" carried a stray colon.
    proxies = {"http": "", "https": ""}
    YouTubeTranscriptApi.get_transcripts(["GJLlxj_dtq8"], proxies=proxies)
    mock_get_transcript.assert_any_call(
        "GJLlxj_dtq8", ("en",), proxies, None, False
    )
def test_load_cookies(self):
    """Cookies from the example cookie file end up in the returned jar."""
    # os.path.join replaces the "/"-concatenated path and the unused
    # `filename` local from os.path.split.
    cookies = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "example_cookies.txt"
    )
    session_cookies = YouTubeTranscriptApi._load_cookies(cookies, "GJLlxj_dtq8")
    self.assertEqual(
        {"TEST_FIELD": "TEST_VALUE"},
        requests.utils.dict_from_cookiejar(session_cookies),
    )
def test_load_cookies__bad_file_path(self):
bad_cookies = 'nonexistent_cookies.txt'
bad_cookies = "nonexistent_cookies.txt"
with self.assertRaises(CookiePathInvalid):
YouTubeTranscriptApi._load_cookies(bad_cookies, 'GJLlxj_dtq8')
YouTubeTranscriptApi._load_cookies(bad_cookies, "GJLlxj_dtq8")
def test_load_cookies__no_valid_cookies(self):
    """The expired cookie fixture is rejected with CookiesInvalid."""
    # os.path.join replaces the "/"-concatenated path and the unused
    # `filename` local from os.path.split.
    expired_cookies = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "expired_example_cookies.txt"
    )
    with self.assertRaises(CookiesInvalid):
        YouTubeTranscriptApi._load_cookies(expired_cookies, "GJLlxj_dtq8")

View File

@ -10,211 +10,269 @@ from youtube_transcript_api._cli import YouTubeTranscriptCli
class TestYouTubeTranscriptCli(TestCase):
def setUp(self):
self.transcript_mock = MagicMock()
self.transcript_mock.fetch = MagicMock(return_value=[
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
{'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
])
self.transcript_mock.fetch = MagicMock(
return_value=[
{"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54},
{
"text": "this is <i>not</i> the original transcript",
"start": 1.54,
"duration": 4.16,
},
{
"text": "just something shorter, I made up for testing",
"start": 5.7,
"duration": 3.239,
},
]
)
self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock)
self.transcript_list_mock = MagicMock()
self.transcript_list_mock.find_generated_transcript = MagicMock(return_value=self.transcript_mock)
self.transcript_list_mock.find_manually_created_transcript = MagicMock(return_value=self.transcript_mock)
self.transcript_list_mock.find_transcript = MagicMock(return_value=self.transcript_mock)
self.transcript_list_mock.find_generated_transcript = MagicMock(
return_value=self.transcript_mock
)
self.transcript_list_mock.find_manually_created_transcript = MagicMock(
return_value=self.transcript_mock
)
self.transcript_list_mock.find_transcript = MagicMock(
return_value=self.transcript_mock
)
YouTubeTranscriptApi.list_transcripts = MagicMock(return_value=self.transcript_list_mock)
YouTubeTranscriptApi.list_transcripts = MagicMock(
return_value=self.transcript_list_mock
)
def test_argument_parsing(self):
parsed_args = YouTubeTranscriptCli('v1 v2 --format json --languages de en'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --format json'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli(' --format json v1 v2 --languages de en'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli(
"v1 v2 --format json --languages de en".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.http_proxy, "")
self.assertEqual(parsed_args.https_proxy, "")
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --format json '
'--http-proxy http://user:pass@domain:port '
'--https-proxy https://user:pass@domain:port'.split()
"v1 v2 --languages de en --format json".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.http_proxy, "")
self.assertEqual(parsed_args.https_proxy, "")
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --format json --http-proxy http://user:pass@domain:port'.split()
" --format json v1 v2 --languages de en".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, '')
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.http_proxy, "")
self.assertEqual(parsed_args.https_proxy, "")
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --format json --https-proxy https://user:pass@domain:port'.split()
"v1 v2 --languages de en --format json "
"--http-proxy http://user:pass@domain:port "
"--https-proxy https://user:pass@domain:port".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port")
self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port")
parsed_args = YouTubeTranscriptCli(
"v1 v2 --languages de en --format json --http-proxy http://user:pass@domain:port".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port")
self.assertEqual(parsed_args.https_proxy, "")
parsed_args = YouTubeTranscriptCli(
"v1 v2 --languages de en --format json --https-proxy https://user:pass@domain:port".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port")
self.assertEqual(parsed_args.http_proxy, "")
def test_argument_parsing__only_video_ids(self):
parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'pretty')
self.assertEqual(parsed_args.languages, ['en'])
parsed_args = YouTubeTranscriptCli("v1 v2".split())._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "pretty")
self.assertEqual(parsed_args.languages, ["en"])
def test_argument_parsing__video_ids_starting_with_dash(self):
    """Backslash-escaped leading dashes are parsed as part of the video id."""
    # Raw string: "\-" is not a valid escape sequence, so a normal literal
    # triggers an invalid-escape warning on recent Pythons. The resulting
    # argument values are exactly the same as before.
    parsed_args = YouTubeTranscriptCli(r"\-v1 \-\-v2 \--v3".split())._parse_args()
    self.assertEqual(parsed_args.video_ids, ["-v1", "--v2", "--v3"])
    self.assertEqual(parsed_args.format, "pretty")
    self.assertEqual(parsed_args.languages, ["en"])
def test_argument_parsing__fail_without_video_ids(self):
with self.assertRaises(SystemExit):
YouTubeTranscriptCli('--format json'.split())._parse_args()
YouTubeTranscriptCli("--format json".split())._parse_args()
def test_argument_parsing__json(self):
parsed_args = YouTubeTranscriptCli('v1 v2 --format json'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['en'])
parsed_args = YouTubeTranscriptCli("v1 v2 --format json".split())._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["en"])
parsed_args = YouTubeTranscriptCli('--format json v1 v2'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'json')
self.assertEqual(parsed_args.languages, ['en'])
parsed_args = YouTubeTranscriptCli("--format json v1 v2".split())._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "json")
self.assertEqual(parsed_args.languages, ["en"])
def test_argument_parsing__languages(self):
parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'pretty')
self.assertEqual(parsed_args.languages, ['de', 'en'])
parsed_args = YouTubeTranscriptCli(
"v1 v2 --languages de en".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "pretty")
self.assertEqual(parsed_args.languages, ["de", "en"])
def test_argument_parsing__proxies(self):
parsed_args = YouTubeTranscriptCli(
'v1 v2 --http-proxy http://user:pass@domain:port'.split()
"v1 v2 --http-proxy http://user:pass@domain:port".split()
)._parse_args()
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port")
parsed_args = YouTubeTranscriptCli(
'v1 v2 --https-proxy https://user:pass@domain:port'.split()
"v1 v2 --https-proxy https://user:pass@domain:port".split()
)._parse_args()
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port")
parsed_args = YouTubeTranscriptCli(
'v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()
"v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port".split()
)._parse_args()
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port")
self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port")
parsed_args = YouTubeTranscriptCli(
'v1 v2'.split()
)._parse_args()
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli("v1 v2".split())._parse_args()
self.assertEqual(parsed_args.http_proxy, "")
self.assertEqual(parsed_args.https_proxy, "")
def test_argument_parsing__list_transcripts(self):
parsed_args = YouTubeTranscriptCli('--list-transcripts v1 v2'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
parsed_args = YouTubeTranscriptCli(
"--list-transcripts v1 v2".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertTrue(parsed_args.list_transcripts)
parsed_args = YouTubeTranscriptCli('v1 v2 --list-transcripts'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
parsed_args = YouTubeTranscriptCli(
"v1 v2 --list-transcripts".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertTrue(parsed_args.list_transcripts)
def test_argument_parsing__translate(self):
parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'pretty')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.translate, 'cz')
parsed_args = YouTubeTranscriptCli(
"v1 v2 --languages de en --translate cz".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "pretty")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.translate, "cz")
parsed_args = YouTubeTranscriptCli('v1 v2 --translate cz --languages de en'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.format, 'pretty')
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.translate, 'cz')
parsed_args = YouTubeTranscriptCli(
"v1 v2 --translate cz --languages de en".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertEqual(parsed_args.format, "pretty")
self.assertEqual(parsed_args.languages, ["de", "en"])
self.assertEqual(parsed_args.translate, "cz")
def test_argument_parsing__manually_or_generated(self):
parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
parsed_args = YouTubeTranscriptCli(
"v1 v2 --exclude-manually-created".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertTrue(parsed_args.exclude_manually_created)
self.assertFalse(parsed_args.exclude_generated)
parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-generated'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
parsed_args = YouTubeTranscriptCli(
"v1 v2 --exclude-generated".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertFalse(parsed_args.exclude_manually_created)
self.assertTrue(parsed_args.exclude_generated)
parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created --exclude-generated'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
parsed_args = YouTubeTranscriptCli(
"v1 v2 --exclude-manually-created --exclude-generated".split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ["v1", "v2"])
self.assertTrue(parsed_args.exclude_manually_created)
self.assertTrue(parsed_args.exclude_generated)
def test_run(self):
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
YouTubeTranscriptCli("v1 v2 --languages de en".split()).run()
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)
YouTubeTranscriptApi.list_transcripts.assert_any_call(
"v1", proxies=None, cookies=None
)
YouTubeTranscriptApi.list_transcripts.assert_any_call(
"v2", proxies=None, cookies=None
)
self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en'])
self.transcript_list_mock.find_transcript.assert_any_call(["de", "en"])
def test_run__failing_transcripts(self):
YouTubeTranscriptApi.list_transcripts = MagicMock(side_effect=VideoUnavailable('video_id'))
YouTubeTranscriptApi.list_transcripts = MagicMock(
side_effect=VideoUnavailable("video_id")
)
output = YouTubeTranscriptCli('v1 --languages de en'.split()).run()
output = YouTubeTranscriptCli("v1 --languages de en".split()).run()
self.assertEqual(output, str(VideoUnavailable('video_id')))
self.assertEqual(output, str(VideoUnavailable("video_id")))
def test_run__exclude_generated(self):
YouTubeTranscriptCli('v1 v2 --languages de en --exclude-generated'.split()).run()
YouTubeTranscriptCli(
"v1 v2 --languages de en --exclude-generated".split()
).run()
self.transcript_list_mock.find_manually_created_transcript.assert_any_call(['de', 'en'])
self.transcript_list_mock.find_manually_created_transcript.assert_any_call(
["de", "en"]
)
def test_run__exclude_manually_created(self):
YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created'.split()).run()
YouTubeTranscriptCli(
"v1 v2 --languages de en --exclude-manually-created".split()
).run()
self.transcript_list_mock.find_generated_transcript.assert_any_call(['de', 'en'])
self.transcript_list_mock.find_generated_transcript.assert_any_call(
["de", "en"]
)
def test_run__exclude_manually_created_and_generated(self):
self.assertEqual(
YouTubeTranscriptCli(
'v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split()
"v1 v2 --languages de en --exclude-manually-created --exclude-generated".split()
).run(),
''
"",
)
def test_run__translate(self):
YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split()).run(),
YouTubeTranscriptCli("v1 v2 --languages de en --translate cz".split()).run(),
self.transcript_mock.translate.assert_any_call('cz')
self.transcript_mock.translate.assert_any_call("cz")
def test_run__list_transcripts(self):
YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run()
YouTubeTranscriptCli("--list-transcripts v1 v2".split()).run()
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)
YouTubeTranscriptApi.list_transcripts.assert_any_call(
"v1", proxies=None, cookies=None
)
YouTubeTranscriptApi.list_transcripts.assert_any_call(
"v2", proxies=None, cookies=None
)
def test_run__json_output(self):
output = YouTubeTranscriptCli('v1 v2 --languages de en --format json'.split()).run()
output = YouTubeTranscriptCli(
"v1 v2 --languages de en --format json".split()
).run()
# will fail if output is not valid json
json.loads(output)
@ -222,31 +280,37 @@ class TestYouTubeTranscriptCli(TestCase):
def test_run__proxies(self):
YouTubeTranscriptCli(
(
'v1 v2 --languages de en '
'--http-proxy http://user:pass@domain:port '
'--https-proxy https://user:pass@domain:port'
"v1 v2 --languages de en "
"--http-proxy http://user:pass@domain:port "
"--https-proxy https://user:pass@domain:port"
).split()
).run()
YouTubeTranscriptApi.list_transcripts.assert_any_call(
'v1',
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
cookies= None
"v1",
proxies={
"http": "http://user:pass@domain:port",
"https": "https://user:pass@domain:port",
},
cookies=None,
)
YouTubeTranscriptApi.list_transcripts.assert_any_call(
'v2',
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
cookies=None
"v2",
proxies={
"http": "http://user:pass@domain:port",
"https": "https://user:pass@domain:port",
},
cookies=None,
)
def test_run__cookies(self):
YouTubeTranscriptCli(
(
'v1 v2 --languages de en '
'--cookies blahblah.txt'
).split()
("v1 v2 --languages de en " "--cookies blahblah.txt").split()
).run()
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies='blahblah.txt')
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies='blahblah.txt')
YouTubeTranscriptApi.list_transcripts.assert_any_call(
"v1", proxies=None, cookies="blahblah.txt"
)
YouTubeTranscriptApi.list_transcripts.assert_any_call(
"v2", proxies=None, cookies="blahblah.txt"
)

View File

@ -10,16 +10,17 @@ from youtube_transcript_api.formatters import (
TextFormatter,
SRTFormatter,
WebVTTFormatter,
PrettyPrintFormatter, FormatterLoader
PrettyPrintFormatter,
FormatterLoader,
)
class TestFormatters(TestCase):
def setUp(self):
self.transcript = [
{'text': 'Test line 1', 'start': 0.0, 'duration': 1.50},
{'text': 'line between', 'start': 1.5, 'duration': 2.0},
{'text': 'testing the end line', 'start': 2.5, 'duration': 3.25}
{"text": "Test line 1", "start": 0.0, "duration": 1.50},
{"text": "line between", "start": 1.5, "duration": 2.0},
{"text": "testing the end line", "start": 2.5, "duration": 3.25},
]
self.transcripts = [self.transcript, self.transcript]
@ -31,27 +32,27 @@ class TestFormatters(TestCase):
def test_srt_formatter_starting(self):
content = SRTFormatter().format_transcript(self.transcript)
lines = content.split('\n')
lines = content.split("\n")
# test starting lines
self.assertEqual(lines[0], "1")
self.assertEqual(lines[1], "00:00:00,000 --> 00:00:01,500")
def test_srt_formatter_middle(self):
content = SRTFormatter().format_transcript(self.transcript)
lines = content.split('\n')
lines = content.split("\n")
# test middle lines
self.assertEqual(lines[4], "2")
self.assertEqual(lines[5], "00:00:01,500 --> 00:00:02,500")
self.assertEqual(lines[6], self.transcript[1]['text'])
self.assertEqual(lines[6], self.transcript[1]["text"])
def test_srt_formatter_ending(self):
content = SRTFormatter().format_transcript(self.transcript)
lines = content.split('\n')
lines = content.split("\n")
# test ending lines
self.assertEqual(lines[-2], self.transcript[-1]['text'])
self.assertEqual(lines[-2], self.transcript[-1]["text"])
self.assertEqual(lines[-1], "")
def test_srt_formatter_many(self):
@ -59,22 +60,25 @@ class TestFormatters(TestCase):
content = formatter.format_transcripts(self.transcripts)
formatted_single_transcript = formatter.format_transcript(self.transcript)
self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript)
self.assertEqual(
content,
formatted_single_transcript + "\n\n\n" + formatted_single_transcript,
)
def test_webvtt_formatter_starting(self):
content = WebVTTFormatter().format_transcript(self.transcript)
lines = content.split('\n')
lines = content.split("\n")
# test starting lines
self.assertEqual(lines[0], "WEBVTT")
self.assertEqual(lines[1], "")
def test_webvtt_formatter_ending(self):
content = WebVTTFormatter().format_transcript(self.transcript)
lines = content.split('\n')
lines = content.split("\n")
# test ending lines
self.assertEqual(lines[-2], self.transcript[-1]['text'])
self.assertEqual(lines[-2], self.transcript[-1]["text"])
self.assertEqual(lines[-1], "")
def test_webvtt_formatter_many(self):
@ -82,7 +86,10 @@ class TestFormatters(TestCase):
content = formatter.format_transcripts(self.transcripts)
formatted_single_transcript = formatter.format_transcript(self.transcript)
self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript)
self.assertEqual(
content,
formatted_single_transcript + "\n\n\n" + formatted_single_transcript,
)
def test_pretty_print_formatter(self):
content = PrettyPrintFormatter().format_transcript(self.transcript)
@ -106,7 +113,7 @@ class TestFormatters(TestCase):
def test_text_formatter(self):
content = TextFormatter().format_transcript(self.transcript)
lines = content.split('\n')
lines = content.split("\n")
self.assertEqual(lines[0], self.transcript[0]["text"])
self.assertEqual(lines[-1], self.transcript[-1]["text"])
@ -116,11 +123,14 @@ class TestFormatters(TestCase):
content = formatter.format_transcripts(self.transcripts)
formatted_single_transcript = formatter.format_transcript(self.transcript)
self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript)
self.assertEqual(
content,
formatted_single_transcript + "\n\n\n" + formatted_single_transcript,
)
def test_formatter_loader(self):
loader = FormatterLoader()
formatter = loader.load('json')
formatter = loader.load("json")
self.assertTrue(isinstance(formatter, JSONFormatter))
@ -132,4 +142,4 @@ class TestFormatters(TestCase):
def test_formatter_loader__unknown_format(self):
with self.assertRaises(FormatterLoader.UnknownFormatterType):
FormatterLoader().load('png')
FormatterLoader().load("png")