youtube-transcript-api/youtube_transcript_api/_cli.py

import argparse

from ._api import YouTubeTranscriptApi

from .formatters import FormatterLoader


class YouTubeTranscriptCli(object):
    def __init__(self, args):
        self._args = args

    def run(self):
        parsed_args = self._parse_args()

        if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
            return ""

        proxies = None
        if parsed_args.http_proxy != "" or parsed_args.https_proxy != "":
            proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}

        cookies = parsed_args.cookies

        transcripts = []
        exceptions = []

        for video_id in parsed_args.video_ids:
            try:
                transcripts.append(
                    self._fetch_transcript(parsed_args, proxies, cookies, video_id)
                )
            except Exception as exception:
                exceptions.append(exception)

        return "\n\n".join(
            [str(exception) for exception in exceptions]
            + (
                [
                    FormatterLoader()
                    .load(parsed_args.format)
                    .format_transcripts(transcripts)
                ]
                if transcripts
                else []
            )
        )

    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
        transcript_list = YouTubeTranscriptApi.list_transcripts(
            video_id, proxies=proxies, cookies=cookies
        )

        if parsed_args.list_transcripts:
            return str(transcript_list)

        if parsed_args.exclude_manually_created:
            transcript = transcript_list.find_generated_transcript(
                parsed_args.languages
            )
        elif parsed_args.exclude_generated:
            transcript = transcript_list.find_manually_created_transcript(
                parsed_args.languages
            )
        else:
            transcript = transcript_list.find_transcript(parsed_args.languages)

        if parsed_args.translate:
            transcript = transcript.translate(parsed_args.translate)

        return transcript.fetch()

    def _parse_args(self):
        parser = argparse.ArgumentParser(
            description=(
                "This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. "
                "It also works for automatically generated subtitles and it does not require a headless browser, like "
                "other selenium based solutions do!"
            )
        )
        parser.add_argument(
            "--list-transcripts",
            action="store_const",
            const=True,
            default=False,
            help="This will list the languages in which the given videos are available in.",
        )
        parser.add_argument(
            "video_ids", nargs="+", type=str, help="List of YouTube video IDs."
        )
        parser.add_argument(
            "--languages",
            nargs="*",
            default=[
                "en",
            ],
            type=str,
            help=(
                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
                "first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails "
                "to do so. As I can't provide a complete list of all working language codes with full certainty, you "
                "may have to play around with the language codes a bit, to find the one which is working for you!"
            ),
        )
        parser.add_argument(
            "--exclude-generated",
            action="store_const",
            const=True,
            default=False,
            help="If this flag is set transcripts which have been generated by YouTube will not be retrieved.",
        )
        parser.add_argument(
            "--exclude-manually-created",
            action="store_const",
            const=True,
            default=False,
            help="If this flag is set transcripts which have been manually created will not be retrieved.",
        )
        parser.add_argument(
            "--format",
            type=str,
            default="pretty",
            choices=tuple(FormatterLoader.TYPES.keys()),
        )
        parser.add_argument(
            "--translate",
            default="",
            help=(
                "The language code for the language you want this transcript to be translated to. Use the "
                "--list-transcripts feature to find out which languages are translatable and which translation "
                "languages are available."
            ),
        )
        parser.add_argument(
            "--http-proxy",
            default="",
            metavar="URL",
            help="Use the specified HTTP proxy.",
        )
        parser.add_argument(
            "--https-proxy",
            default="",
            metavar="URL",
            help="Use the specified HTTPS proxy.",
        )
        parser.add_argument(
            "--cookies",
            default=None,
            help="The cookie file that will be used for authorization with youtube.",
        )

        return self._sanitize_video_ids(parser.parse_args(self._args))

    def _sanitize_video_ids(self, args):
        args.video_ids = [video_id.replace("\\", "") for video_id in args.video_ids]
        return args