Move `preserve_formatting` from `Transcript.__init__()` to `Transcript.fetch()`

Also remove the `preserve_formatting` parameter from `TranscriptList.build()` and `TranscriptListFetcher.fetch()`.
This commit is contained in:
E. Seiver 2023-04-13 12:46:24 -07:00
parent 79fd63d585
commit ca93c48fa1
2 changed files with 12 additions and 22 deletions

View File

@ -16,7 +16,7 @@ from ._errors import (
class YouTubeTranscriptApi(object): class YouTubeTranscriptApi(object):
@classmethod @classmethod
def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False): def list_transcripts(cls, video_id, proxies=None, cookies=None):
""" """
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@ -61,8 +61,6 @@ class YouTubeTranscriptApi(object):
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies :param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str :type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: the list of available transcripts :return: the list of available transcripts
:rtype TranscriptList: :rtype TranscriptList:
""" """
@ -70,8 +68,7 @@ class YouTubeTranscriptApi(object):
if cookies: if cookies:
http_client.cookies = cls._load_cookies(cookies, video_id) http_client.cookies = cls._load_cookies(cookies, video_id)
http_client.proxies = proxies if proxies else {} http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id, return TranscriptListFetcher(http_client).fetch(video_id)
preserve_formatting=preserve_formatting)
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
@ -137,8 +134,8 @@ class YouTubeTranscriptApi(object):
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
assert isinstance(video_id, str), "`video_id` must be a string" assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch() return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
@classmethod @classmethod
def _load_cookies(cls, cookies, video_id): def _load_cookies(cls, cookies, video_id):
try: try:

View File

@ -40,12 +40,12 @@ class TranscriptListFetcher(object):
def __init__(self, http_client): def __init__(self, http_client):
self._http_client = http_client self._http_client = http_client
def fetch(self, video_id, preserve_formatting=False): def fetch(self, video_id):
return TranscriptList.build( return TranscriptList.build(
self._http_client, self._http_client,
video_id, video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id), self._extract_captions_json(self._fetch_video_html(video_id), video_id),
preserve_formatting=preserve_formatting,
) )
def _extract_captions_json(self, html, video_id): def _extract_captions_json(self, html, video_id):
@ -114,7 +114,7 @@ class TranscriptList(object):
self._translation_languages = translation_languages self._translation_languages = translation_languages
@staticmethod @staticmethod
def build(http_client, video_id, captions_json, preserve_formatting=False): def build(http_client, video_id, captions_json):
""" """
Factory method for TranscriptList. Factory method for TranscriptList.
@ -124,8 +124,6 @@ class TranscriptList(object):
:type video_id: str :type video_id: str
:param captions_json: the JSON parsed from the YouTube pages static HTML :param captions_json: the JSON parsed from the YouTube pages static HTML
:type captions_json: dict :type captions_json: dict
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: the created TranscriptList :return: the created TranscriptList
:rtype TranscriptList: :rtype TranscriptList:
""" """
@ -153,7 +151,6 @@ class TranscriptList(object):
caption['languageCode'], caption['languageCode'],
caption.get('kind', '') == 'asr', caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [], translation_languages if caption.get('isTranslatable', False) else [],
preserve_formatting=preserve_formatting,
) )
return TranscriptList( return TranscriptList(
@ -253,8 +250,7 @@ class TranscriptList(object):
class Transcript(object): class Transcript(object):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
preserve_formatting=False):
""" """
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList. TranscriptList.
@ -268,8 +264,6 @@ class Transcript(object):
:param language_code: :param language_code:
:param is_generated: :param is_generated:
:param translation_languages: :param translation_languages:
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
""" """
self._http_client = http_client self._http_client = http_client
self.video_id = video_id self.video_id = video_id
@ -282,17 +276,17 @@ class Transcript(object):
translation_language['language_code']: translation_language['language'] translation_language['language_code']: translation_language['language']
for translation_language in translation_languages for translation_language in translation_languages
} }
self.preserve_formatting = preserve_formatting
def fetch(self): def fetch(self, preserve_formatting=False):
""" """
Loads the actual transcript data. Loads the actual transcript data.
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
response = self._http_client.get(self._url) response = self._http_client.get(self._url)
return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse( return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text,) _raise_http_errors(response, self.video_id).text,)
def __str__(self): def __str__(self):
@ -321,7 +315,6 @@ class Transcript(object):
language_code, language_code,
True, True,
[], [],
preserve_formatting=self.preserve_formatting,
) )