From ca93c48fa1684806a5cc887c70fcd589eb8ec9af Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Thu, 13 Apr 2023 12:46:24 -0700 Subject: [PATCH] move preserve_formatting from init to fetch() also remove from transcriptlist & transcriptlistfetcher --- youtube_transcript_api/_api.py | 11 ++++------- youtube_transcript_api/_transcripts.py | 23 ++++++++--------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index d57fa3c..24a1236 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -16,7 +16,7 @@ from ._errors import ( class YouTubeTranscriptApi(object): @classmethod - def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False): + def list_transcripts(cls, video_id, proxies=None, cookies=None): """ Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating @@ -61,8 +61,6 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str - :param preserve_formatting: whether to keep select HTML text formatting - :type preserve_formatting: bool :return: the list of available transcripts :rtype TranscriptList: """ @@ -70,8 +68,7 @@ class YouTubeTranscriptApi(object): if cookies: http_client.cookies = cls._load_cookies(cookies, video_id) http_client.proxies = proxies if proxies else {} - return TranscriptListFetcher(http_client).fetch(video_id, - preserve_formatting=preserve_formatting) + return TranscriptListFetcher(http_client).fetch(video_id) @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, @@ -137,8 +134,8 @@ class YouTubeTranscriptApi(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ assert isinstance(video_id, str), "`video_id` must be a string" - return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch() - + return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting) + @classmethod def _load_cookies(cls, cookies, video_id): try: diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 400da3c..2c79fd9 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -40,12 +40,12 @@ class TranscriptListFetcher(object): def __init__(self, http_client): self._http_client = http_client - def fetch(self, video_id, preserve_formatting=False): + def fetch(self, video_id): + return TranscriptList.build( self._http_client, video_id, self._extract_captions_json(self._fetch_video_html(video_id), video_id), - preserve_formatting=preserve_formatting, ) def _extract_captions_json(self, html, video_id): @@ -114,7 +114,7 @@ class TranscriptList(object): self._translation_languages = translation_languages @staticmethod - def build(http_client, video_id, captions_json, preserve_formatting=False): + def build(http_client, video_id, captions_json): """ Factory method for TranscriptList. @@ -124,8 +124,6 @@ class TranscriptList(object): :type video_id: str :param captions_json: the JSON parsed from the YouTube pages static HTML :type captions_json: dict - :param preserve_formatting: whether to keep select HTML text formatting - :type preserve_formatting: bool :return: the created TranscriptList :rtype TranscriptList: """ @@ -153,7 +151,6 @@ class TranscriptList(object): caption['languageCode'], caption.get('kind', '') == 'asr', translation_languages if caption.get('isTranslatable', False) else [], - preserve_formatting=preserve_formatting, ) return TranscriptList( @@ -253,8 +250,7 @@ class TranscriptList(object): class Transcript(object): - def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, - preserve_formatting=False): + def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): """ You probably don't want to initialize this directly. Usually you'll access Transcript objects using a TranscriptList. @@ -268,8 +264,6 @@ class Transcript(object): :param language_code: :param is_generated: :param translation_languages: - :param preserve_formatting: whether to keep select HTML text formatting - :type preserve_formatting: bool """ self._http_client = http_client self.video_id = video_id @@ -282,17 +276,17 @@ class Transcript(object): translation_language['language_code']: translation_language['language'] for translation_language in translation_languages } - self.preserve_formatting = preserve_formatting - def fetch(self): + def fetch(self, preserve_formatting=False): """ Loads the actual transcript data. - + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ response = self._http_client.get(self._url) - return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse( + return _TranscriptParser(preserve_formatting=preserve_formatting).parse( _raise_http_errors(response, self.video_id).text,) def __str__(self): @@ -321,7 +315,6 @@ class Transcript(object): language_code, True, [], - preserve_formatting=self.preserve_formatting, )