diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index dfb790d..c5e835d 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -16,7 +16,7 @@ from ._errors import ( class YouTubeTranscriptApi(object): @classmethod - def list_transcripts(cls, video_id, proxies=None, cookies=None): + def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False): """ Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating @@ -68,7 +68,8 @@ class YouTubeTranscriptApi(object): if cookies: http_client.cookies = cls._load_cookies(cookies, video_id) http_client.proxies = proxies if proxies else {} - return TranscriptListFetcher(http_client).fetch(video_id) + return TranscriptListFetcher(http_client).fetch(video_id, + preserve_formatting=preserve_formatting) @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): @@ -109,7 +110,7 @@ class YouTubeTranscriptApi(object): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): + def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False): """ Retrieves the transcript for a single video. 
This is just a shortcut for calling:: @@ -129,7 +130,7 @@ class YouTubeTranscriptApi(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ assert isinstance(video_id, str), "`video_id` must be a string" - return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() + return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch() @classmethod def _load_cookies(cls, cookies, video_id): diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 64925f3..9107c04 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -53,11 +53,12 @@ class TranscriptListFetcher(object): def __init__(self, http_client): self._http_client = http_client - def fetch(self, video_id): + def fetch(self, video_id, preserve_formatting=False): return TranscriptList.build( self._http_client, video_id, - self._extract_captions_json(self._fetch_video_html(video_id), video_id) + self._extract_captions_json(self._fetch_video_html(video_id), video_id), + preserve_formatting=preserve_formatting, ) def _extract_captions_json(self, html, video_id): @@ -107,7 +108,8 @@ class TranscriptList(object): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ - def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): + def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages, +): """ The constructor is only for internal use. Use the static build method instead. 
@@ -126,7 +128,7 @@ class TranscriptList(object): self._translation_languages = translation_languages @staticmethod - def build(http_client, video_id, captions_json): + def build(http_client, video_id, captions_json, preserve_formatting=False): """ Factory method for TranscriptList. @@ -162,7 +164,8 @@ class TranscriptList(object): caption['name']['simpleText'], caption['languageCode'], caption.get('kind', '') == 'asr', - translation_languages if caption.get('isTranslatable', False) else [] + translation_languages if caption.get('isTranslatable', False) else [], + preserve_formatting=preserve_formatting, ) return TranscriptList( @@ -262,7 +265,8 @@ class TranscriptList(object): class Transcript(object): - def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): + def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, + preserve_formatting=False): """ You probably don't want to initialize this directly. Usually you'll access Transcript objects using a TranscriptList. 
@@ -276,6 +280,7 @@ class Transcript(object): :param language_code: :param is_generated: :param translation_languages: + :param preserve_formatting: whether to keep select HTML text formatting """ self._http_client = http_client self.video_id = video_id @@ -288,6 +293,7 @@ class Transcript(object): translation_language['language_code']: translation_language['language'] for translation_language in translation_languages } + self.preserve_formatting = preserve_formatting def fetch(self): """ @@ -297,7 +303,7 @@ class Transcript(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ response = self._http_client.get(self._url) - return _TranscriptParser().parse( + return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse( _raise_http_errors(response, self.video_id).text, ) @@ -327,11 +333,12 @@ class Transcript(object): language_code, True, [], + preserve_formatting=self.preserve_formatting, ) class _TranscriptParser(object): def __init__(self, preserve_formatting=False): self.preserve_formatting = preserve_formatting - + @property def html_regex(self): if self.preserve_formatting: