Propagate formatting up to user level
This commit is contained in:
parent
1f1c8b249b
commit
c1a037c39c
|
@ -16,7 +16,7 @@ from ._errors import (
|
|||
|
||||
class YouTubeTranscriptApi(object):
|
||||
@classmethod
|
||||
def list_transcripts(cls, video_id, proxies=None, cookies=None):
|
||||
def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
|
||||
"""
|
||||
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
|
||||
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
|
||||
|
@ -68,7 +68,8 @@ class YouTubeTranscriptApi(object):
|
|||
if cookies:
|
||||
http_client.cookies = cls._load_cookies(cookies, video_id)
|
||||
http_client.proxies = proxies if proxies else {}
|
||||
return TranscriptListFetcher(http_client).fetch(video_id)
|
||||
return TranscriptListFetcher(http_client).fetch(video_id,
|
||||
preserve_formatting=preserve_formatting)
|
||||
|
||||
@classmethod
|
||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
|
||||
|
@ -109,7 +110,7 @@ class YouTubeTranscriptApi(object):
|
|||
return data, unretrievable_videos
|
||||
|
||||
@classmethod
|
||||
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
|
||||
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
|
||||
"""
|
||||
Retrieves the transcript for a single video. This is just a shortcut for calling::
|
||||
|
||||
|
@ -129,7 +130,7 @@ class YouTubeTranscriptApi(object):
|
|||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||
"""
|
||||
assert isinstance(video_id, str), "`video_id` must be a string"
|
||||
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
|
||||
return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
|
||||
|
||||
@classmethod
|
||||
def _load_cookies(cls, cookies, video_id):
|
||||
|
|
|
@ -53,11 +53,12 @@ class TranscriptListFetcher(object):
|
|||
def __init__(self, http_client):
|
||||
self._http_client = http_client
|
||||
|
||||
def fetch(self, video_id):
|
||||
def fetch(self, video_id, preserve_formatting=False):
|
||||
return TranscriptList.build(
|
||||
self._http_client,
|
||||
video_id,
|
||||
self._extract_captions_json(self._fetch_video_html(video_id), video_id)
|
||||
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
|
||||
preserve_formatting=preserve_formatting,
|
||||
)
|
||||
|
||||
def _extract_captions_json(self, html, video_id):
|
||||
|
@ -107,7 +108,8 @@ class TranscriptList(object):
|
|||
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
||||
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
||||
"""
|
||||
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
||||
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages,
|
||||
):
|
||||
"""
|
||||
The constructor is only for internal use. Use the static build method instead.
|
||||
|
||||
|
@ -126,7 +128,7 @@ class TranscriptList(object):
|
|||
self._translation_languages = translation_languages
|
||||
|
||||
@staticmethod
|
||||
def build(http_client, video_id, captions_json):
|
||||
def build(http_client, video_id, captions_json, preserve_formatting=False):
|
||||
"""
|
||||
Factory method for TranscriptList.
|
||||
|
||||
|
@ -162,7 +164,8 @@ class TranscriptList(object):
|
|||
caption['name']['simpleText'],
|
||||
caption['languageCode'],
|
||||
caption.get('kind', '') == 'asr',
|
||||
translation_languages if caption.get('isTranslatable', False) else []
|
||||
translation_languages if caption.get('isTranslatable', False) else [],
|
||||
preserve_formatting=preserve_formatting,
|
||||
)
|
||||
|
||||
return TranscriptList(
|
||||
|
@ -262,7 +265,8 @@ class TranscriptList(object):
|
|||
|
||||
|
||||
class Transcript(object):
|
||||
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
||||
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
|
||||
preserve_formatting=False):
|
||||
"""
|
||||
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
||||
TranscriptList.
|
||||
|
@ -276,6 +280,7 @@ class Transcript(object):
|
|||
:param language_code:
|
||||
:param is_generated:
|
||||
:param translation_languages:
|
||||
:param preserve_formatting: whether to keep select HTMl text formatting
|
||||
"""
|
||||
self._http_client = http_client
|
||||
self.video_id = video_id
|
||||
|
@ -288,6 +293,7 @@ class Transcript(object):
|
|||
translation_language['language_code']: translation_language['language']
|
||||
for translation_language in translation_languages
|
||||
}
|
||||
self.preserve_formatting = preserve_formatting
|
||||
|
||||
def fetch(self):
|
||||
"""
|
||||
|
@ -297,7 +303,7 @@ class Transcript(object):
|
|||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||
"""
|
||||
response = self._http_client.get(self._url)
|
||||
return _TranscriptParser().parse(
|
||||
return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
|
||||
_raise_http_errors(response, self.video_id).text,
|
||||
)
|
||||
|
||||
|
@ -327,11 +333,12 @@ class Transcript(object):
|
|||
language_code,
|
||||
True,
|
||||
[],
|
||||
preserve_formatting=self.preserve_formatting,
|
||||
)
|
||||
class _TranscriptParser(object):
|
||||
def __init__(self, preserve_formatting=False):
|
||||
self.preserve_formatting = preserve_formatting
|
||||
|
||||
|
||||
@property
|
||||
def html_regex(self):
|
||||
if self.preserve_formatting:
|
||||
|
|
Loading…
Reference in New Issue