From 1f1c8b249b931a27fc78f2c0f18c65993be38ef3 Mon Sep 17 00:00:00 2001 From: "E. Seiver" Date: Wed, 15 Mar 2023 15:44:26 -0700 Subject: [PATCH 01/12] Add optional HTML formatting `_TranscriptParser` Text formats in `TEXT_FORMATS` global variable Defaults to False --- youtube_transcript_api/_transcripts.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index cea50c4..64925f3 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -27,6 +27,19 @@ from ._errors import ( ) from ._settings import WATCH_URL +TEXT_FORMATS = [ + 'strong', # important + 'em', # emphasized + 'b', # bold + 'i', # italic + 'mark', # marked + 'small', # smaller + 'del', # deleted + 'ins', # inserted + 'sub', # subscript + 'sup', # superscript +] + def _raise_http_errors(response, video_id): try: @@ -315,15 +328,24 @@ class Transcript(object): True, [], ) - - class _TranscriptParser(object): - HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) + def __init__(self, preserve_formatting=False): + self.preserve_formatting = preserve_formatting + + @property + def html_regex(self): + if self.preserve_formatting: + formats_regex = '|'.join(TEXT_FORMATS) + formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' + html_regex = re.compile(formats_regex, re.IGNORECASE) + else: + html_regex = re.compile(r'<[^>]*>', re.IGNORECASE) + return html_regex def parse(self, plain_data): return [ { - 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), + 'text': re.sub(self.html_regex, '', unescape(xml_element.text)), 'start': float(xml_element.attrib['start']), 'duration': float(xml_element.attrib.get('dur', '0.0')), } From c1a037c39ccd6a232e1f0c419cf5f3ea60422df0 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Wed, 15 Mar 2023 18:20:16 -0700 Subject: [PATCH 02/12] Propagate formatting up to user level --- youtube_transcript_api/_api.py | 9 +++++---- youtube_transcript_api/_transcripts.py | 23 +++++++++++++++-------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index dfb790d..c5e835d 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -16,7 +16,7 @@ from ._errors import ( class YouTubeTranscriptApi(object): @classmethod - def list_transcripts(cls, video_id, proxies=None, cookies=None): + def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False): """ Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating @@ -68,7 +68,8 @@ class YouTubeTranscriptApi(object): if cookies: http_client.cookies = cls._load_cookies(cookies, video_id) http_client.proxies = proxies if proxies else {} - return TranscriptListFetcher(http_client).fetch(video_id) + return TranscriptListFetcher(http_client).fetch(video_id, + preserve_formatting=preserve_formatting) @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): @@ -109,7 +110,7 @@ class YouTubeTranscriptApi(object): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): + def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False): """ Retrieves the transcript for a single video. This is just a shortcut for calling:: @@ -129,7 +130,7 @@ class YouTubeTranscriptApi(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ assert isinstance(video_id, str), "`video_id` must be a string" - return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() + return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch() @classmethod def _load_cookies(cls, cookies, video_id): diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 64925f3..9107c04 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -53,11 +53,12 @@ class TranscriptListFetcher(object): def __init__(self, http_client): self._http_client = http_client - def fetch(self, video_id): + def fetch(self, video_id, preserve_formatting=False): return TranscriptList.build( self._http_client, video_id, - self._extract_captions_json(self._fetch_video_html(video_id), video_id) + self._extract_captions_json(self._fetch_video_html(video_id), video_id), + preserve_formatting=preserve_formatting, ) def _extract_captions_json(self, html, video_id): @@ -107,7 +108,8 @@ class TranscriptList(object): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ - def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): + def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages, +): """ The constructor is only for internal use. Use the static build method instead. @@ -126,7 +128,7 @@ class TranscriptList(object): self._translation_languages = translation_languages @staticmethod - def build(http_client, video_id, captions_json): + def build(http_client, video_id, captions_json, preserve_formatting=False): """ Factory method for TranscriptList. @@ -162,7 +164,8 @@ class TranscriptList(object): caption['name']['simpleText'], caption['languageCode'], caption.get('kind', '') == 'asr', - translation_languages if caption.get('isTranslatable', False) else [] + translation_languages if caption.get('isTranslatable', False) else [], + preserve_formatting=preserve_formatting, ) return TranscriptList( @@ -262,7 +265,8 @@ class TranscriptList(object): class Transcript(object): - def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): + def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, + preserve_formatting=False): """ You probably don't want to initialize this directly. Usually you'll access Transcript objects using a TranscriptList. @@ -276,6 +280,7 @@ class Transcript(object): :param language_code: :param is_generated: :param translation_languages: + :param preserve_formatting: whether to keep select HTMl text formatting """ self._http_client = http_client self.video_id = video_id @@ -288,6 +293,7 @@ class Transcript(object): translation_language['language_code']: translation_language['language'] for translation_language in translation_languages } + self.preserve_formatting = preserve_formatting def fetch(self): """ @@ -297,7 +303,7 @@ class Transcript(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ response = self._http_client.get(self._url) - return _TranscriptParser().parse( + return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse( _raise_http_errors(response, self.video_id).text, ) @@ -327,11 +333,12 @@ class Transcript(object): language_code, True, [], + preserve_formatting=self.preserve_formatting, ) class _TranscriptParser(object): def __init__(self, preserve_formatting=False): self.preserve_formatting = preserve_formatting - + @property def html_regex(self): if self.preserve_formatting: From c1e5ce4ebb821fe02cbc80848305e5b75541da68 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Wed, 15 Mar 2023 18:21:34 -0700 Subject: [PATCH 03/12] update tests and test doc add (partially escaped) italics to test doc add new test for `preserve_formatting=True` --- .../test/assets/transcript.xml.static | 2 +- youtube_transcript_api/test/test_api.py | 16 ++++++++++++++-- youtube_transcript_api/test/test_cli.py | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_transcript_api/test/assets/transcript.xml.static b/youtube_transcript_api/test/assets/transcript.xml.static index ec777e7..64f9c3c 100644 --- a/youtube_transcript_api/test/assets/transcript.xml.static +++ b/youtube_transcript_api/test/assets/transcript.xml.static @@ -1,7 +1,7 @@ Hey, this is just a test - this is not the original transcript + this is <i>not</i> the original transcript just something shorter, I made up for testing \ No newline at end of file diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 3bda630..122ffd7 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -61,6 +61,18 @@ class TestYouTubeTranscriptApi(TestCase): ] ) + def test_get_transcript_formatted(self): + transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True) + + self.assertEqual( + transcript, + [ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ] + ) + def test_list_transcripts(self): transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') @@ -254,11 +266,11 @@ class TestYouTubeTranscriptApi(TestCase): {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} ] ) - + def test_get_transcript__assertionerror_if_input_not_string(self): with self.assertRaises(AssertionError): YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2']) - + def test_get_transcripts__assertionerror_if_input_not_list(self): with self.assertRaises(AssertionError): YouTubeTranscriptApi.get_transcripts('video_id_1') diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index d14f331..26ffabc 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -12,7 +12,7 @@ class TestYouTubeTranscriptCli(TestCase): self.transcript_mock = MagicMock() self.transcript_mock.fetch = MagicMock(return_value=[ {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} ]) self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) From 123763c9686f23d5c1694c6a835699de7ebcae15 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Wed, 15 Mar 2023 18:22:50 -0700 Subject: [PATCH 04/12] update readme with example --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index eb30c25..a25f5f4 100644 --- a/README.md +++ b/README.md @@ -49,10 +49,10 @@ This will return a list of dictionaries looking somewhat like this: ] ``` -You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). +You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `` (italics) and `` (bold). ```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True) ``` It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts) From e88783d69e8830b964f0589df66d966d17123147 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Wed, 15 Mar 2023 18:49:35 -0700 Subject: [PATCH 05/12] fix spacing --- youtube_transcript_api/_transcripts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 9107c04..a1b2b5c 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -108,8 +108,7 @@ class TranscriptList(object): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ - def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages, -): + def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): """ The constructor is only for internal use. Use the static build method instead. @@ -304,8 +303,7 @@ class Transcript(object): """ response = self._http_client.get(self._url) return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse( - _raise_http_errors(response, self.video_id).text, - ) + _raise_http_errors(response, self.video_id).text,) def __str__(self): return '{language_code} ("{language}"){translation_description}'.format( @@ -335,6 +333,8 @@ class Transcript(object): [], preserve_formatting=self.preserve_formatting, ) + + class _TranscriptParser(object): def __init__(self, preserve_formatting=False): self.preserve_formatting = preserve_formatting From 393a76ca6a122fb00fc9c69b7a92c201971e8e5b Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:37:06 -0700 Subject: [PATCH 06/12] add preserve_formatting docstrings --- youtube_transcript_api/_api.py | 4 ++++ youtube_transcript_api/_transcripts.py | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index c5e835d..99f7d53 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -61,6 +61,8 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: the list of available transcripts :rtype TranscriptList: """ @@ -126,6 +128,8 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index a1b2b5c..59d2f4c 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -137,6 +137,8 @@ class TranscriptList(object): :type video_id: str :param captions_json: the JSON parsed from the YouTube pages static HTML :type captions_json: dict + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: the created TranscriptList :rtype TranscriptList: """ @@ -279,7 +281,8 @@ class Transcript(object): :param language_code: :param is_generated: :param translation_languages: - :param preserve_formatting: whether to keep select HTMl text formatting + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool """ self._http_client = http_client self.video_id = video_id From fdedfff681321e162114283c588717acf94efe15 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:38:07 -0700 Subject: [PATCH 07/12] separate out format example in readme --- README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a25f5f4..b2ae4a8 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,12 @@ This will return a list of dictionaries looking somewhat like this: # ... ] ``` +### Translate transcript -You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `` (italics) and `` (bold). +You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). ```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True) +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) ``` It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts) @@ -65,6 +66,14 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) `languages` also is optional here. +### Preserve formatting + +You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `` (italics) and `` (bold). + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True) +``` + ### List available transcripts If you want to list all transcripts which are available for a given video you can call: From 72e97815289016e6296575f1ee03f175fadce870 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Wed, 12 Apr 2023 12:43:48 -0700 Subject: [PATCH 08/12] add preserve_formatting to get_transcripts() + tests also assertion tests for `get_transcripts()` updated to include `False` at the end for new `preserve_formatting` param --- youtube_transcript_api/_api.py | 7 +++++-- youtube_transcript_api/test/test_api.py | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 99f7d53..d57fa3c 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -74,7 +74,8 @@ class YouTubeTranscriptApi(object): preserve_formatting=preserve_formatting) @classmethod - def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): + def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, + cookies=None, preserve_formatting=False): """ Retrieves the transcripts for a list of videos. @@ -91,6 +92,8 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): @@ -102,7 +105,7 @@ class YouTubeTranscriptApi(object): for video_id in video_ids: try: - data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies) + data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting) except Exception as exception: if not continue_after_error: raise exception diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 122ffd7..36d60a5 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -283,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) - mock_get_transcript.assert_any_call(video_id_1, languages, None, None) - mock_get_transcript.assert_any_call(video_id_2, languages, None, None) + mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False) + mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False) self.assertEqual(mock_get_transcript.call_count, 2) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) @@ -299,20 +299,20 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None) - mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None) + mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False) + mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') def test_get_transcripts__with_cookies(self, mock_get_transcript): cookies = '/example_cookies.txt' YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) - mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) + mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') def test_get_transcripts__with_proxies(self, mock_get_transcript): proxies = {'http': '', 'https:': ''} YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) + mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False) def test_load_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) From eda8ddb38f229369447b463834d5d3c0b773536f Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Wed, 12 Apr 2023 14:29:19 -0700 Subject: [PATCH 09/12] _html_regex static property of _TranscriptParser() also rename TEXT_FORMATS -> FORMATTING TAGS --- youtube_transcript_api/_transcripts.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 59d2f4c..32e0fc4 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -27,7 +27,7 @@ from ._errors import ( ) from ._settings import WATCH_URL -TEXT_FORMATS = [ +_FORMATTING_TAGS = [ 'strong', # important 'em', # emphasized 'b', # bold @@ -341,11 +341,11 @@ class Transcript(object): class _TranscriptParser(object): def __init__(self, preserve_formatting=False): self.preserve_formatting = preserve_formatting + self._html_regex = self.get_html_regex() - @property - def html_regex(self): + def get_html_regex(self): if self.preserve_formatting: - formats_regex = '|'.join(TEXT_FORMATS) + formats_regex = '|'.join(_FORMATTING_TAGS) formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' html_regex = re.compile(formats_regex, re.IGNORECASE) else: @@ -355,7 +355,7 @@ class _TranscriptParser(object): def parse(self, plain_data): return [ { - 'text': re.sub(self.html_regex, '', unescape(xml_element.text)), + 'text': re.sub(self._html_regex, '', unescape(xml_element.text)), 'start': float(xml_element.attrib['start']), 'duration': float(xml_element.attrib.get('dur', '0.0')), } From 79fd63d585e225eba15cf6ec14edcb221d4475e9 Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Thu, 13 Apr 2023 12:27:34 -0700 Subject: [PATCH 10/12] move _FORMATTING_TAGS inside _TranscriptParser --- youtube_transcript_api/_transcripts.py | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 32e0fc4..400da3c 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -27,19 +27,6 @@ from ._errors import ( ) from ._settings import WATCH_URL -_FORMATTING_TAGS = [ - 'strong', # important - 'em', # emphasized - 'b', # bold - 'i', # italic - 'mark', # marked - 'small', # smaller - 'del', # deleted - 'ins', # inserted - 'sub', # subscript - 'sup', # superscript -] - def _raise_http_errors(response, video_id): try: @@ -341,11 +328,23 @@ class Transcript(object): class _TranscriptParser(object): def __init__(self, preserve_formatting=False): self.preserve_formatting = preserve_formatting + self._FORMATTING_TAGS = [ + 'strong', # important + 'em', # emphasized + 'b', # bold + 'i', # italic + 'mark', # marked + 'small', # smaller + 'del', # deleted + 'ins', # inserted + 'sub', # subscript + 'sup', # superscript + ] self._html_regex = self.get_html_regex() def get_html_regex(self): if self.preserve_formatting: - formats_regex = '|'.join(_FORMATTING_TAGS) + formats_regex = '|'.join(self._FORMATTING_TAGS) formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' html_regex = re.compile(formats_regex, re.IGNORECASE) else: From ca93c48fa1684806a5cc887c70fcd589eb8ec9af Mon Sep 17 00:00:00 2001 From: "E. Seiver" <5547078+eseiver@users.noreply.github.com> Date: Thu, 13 Apr 2023 12:46:24 -0700 Subject: [PATCH 11/12] move preserve_formatting from init to fetch() also remove from transcriptlist & transcriptlistfetcher --- youtube_transcript_api/_api.py | 11 ++++------- youtube_transcript_api/_transcripts.py | 23 ++++++++--------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index d57fa3c..24a1236 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -16,7 +16,7 @@ from ._errors import ( class YouTubeTranscriptApi(object): @classmethod - def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False): + def list_transcripts(cls, video_id, proxies=None, cookies=None): """ Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating @@ -61,8 +61,6 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str - :param preserve_formatting: whether to keep select HTML text formatting - :type preserve_formatting: bool :return: the list of available transcripts :rtype TranscriptList: """ @@ -70,8 +68,7 @@ class YouTubeTranscriptApi(object): if cookies: http_client.cookies = cls._load_cookies(cookies, video_id) http_client.proxies = proxies if proxies else {} - return TranscriptListFetcher(http_client).fetch(video_id, - preserve_formatting=preserve_formatting) + return TranscriptListFetcher(http_client).fetch(video_id) @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, @@ -137,8 +134,8 @@ class YouTubeTranscriptApi(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ assert isinstance(video_id, str), "`video_id` must be a string" - return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch() - + return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting) + @classmethod def _load_cookies(cls, cookies, video_id): try: diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 400da3c..2c79fd9 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -40,12 +40,12 @@ class TranscriptListFetcher(object): def __init__(self, http_client): self._http_client = http_client - def fetch(self, video_id, preserve_formatting=False): + def fetch(self, video_id): + return TranscriptList.build( self._http_client, video_id, self._extract_captions_json(self._fetch_video_html(video_id), video_id), - preserve_formatting=preserve_formatting, ) def _extract_captions_json(self, html, video_id): @@ -114,7 +114,7 @@ class TranscriptList(object): self._translation_languages = translation_languages @staticmethod - def build(http_client, video_id, captions_json, preserve_formatting=False): + def build(http_client, video_id, captions_json): """ Factory method for TranscriptList. @@ -124,8 +124,6 @@ class TranscriptList(object): :type video_id: str :param captions_json: the JSON parsed from the YouTube pages static HTML :type captions_json: dict - :param preserve_formatting: whether to keep select HTML text formatting - :type preserve_formatting: bool :return: the created TranscriptList :rtype TranscriptList: """ @@ -153,7 +151,6 @@ class TranscriptList(object): caption['languageCode'], caption.get('kind', '') == 'asr', translation_languages if caption.get('isTranslatable', False) else [], - preserve_formatting=preserve_formatting, ) return TranscriptList( @@ -253,8 +250,7 @@ class TranscriptList(object): class Transcript(object): - def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, - preserve_formatting=False): + def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): """ You probably don't want to initialize this directly. Usually you'll access Transcript objects using a TranscriptList. @@ -268,8 +264,6 @@ class Transcript(object): :param language_code: :param is_generated: :param translation_languages: - :param preserve_formatting: whether to keep select HTML text formatting - :type preserve_formatting: bool """ self._http_client = http_client self.video_id = video_id @@ -282,17 +276,17 @@ class Transcript(object): translation_language['language_code']: translation_language['language'] for translation_language in translation_languages } - self.preserve_formatting = preserve_formatting - def fetch(self): + def fetch(self, preserve_formatting=False): """ Loads the actual transcript data. - + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ response = self._http_client.get(self._url) - return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse( + return _TranscriptParser(preserve_formatting=preserve_formatting).parse( _raise_http_errors(response, self.video_id).text,) def __str__(self): @@ -321,7 +315,6 @@ class Transcript(object): language_code, True, [], - preserve_formatting=self.preserve_formatting, ) From 8c62e5e276d1b43ffbea9914cc0b9e92f85643a5 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 17 Apr 2023 15:07:10 +0200 Subject: [PATCH 12/12] _FORMATTING_TAGS is now a static property of _TranscriptParser; _get_html_regext is now private; removed preserve_formatting property of _TranscriptParser --- youtube_transcript_api/_transcripts.py | 44 ++++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 2c79fd9..1e0f8f1 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -1,7 +1,7 @@ import sys # This can only be tested by using different python versions, therefore it is not covered by coverage.py -if sys.version_info.major == 2: # pragma: no cover +if sys.version_info.major == 2: # pragma: no cover reload(sys) sys.setdefaultencoding('utf-8') @@ -95,6 +95,7 @@ class TranscriptList(object): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ + def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): """ The constructor is only for internal use. Use the static build method instead. @@ -191,7 +192,7 @@ class TranscriptList(object): :rtype Transcript: :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, [self._generated_transcripts,]) + return self._find_transcript(language_codes, [self._generated_transcripts]) def find_manually_created_transcript(self, language_codes): """ @@ -205,7 +206,7 @@ class TranscriptList(object): :rtype Transcript: :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, [self._manually_created_transcripts,]) + return self._find_transcript(language_codes, [self._manually_created_transcripts]) def _find_transcript(self, language_codes, transcript_dicts): for language_code in language_codes: @@ -287,7 +288,8 @@ class Transcript(object): """ response = self._http_client.get(self._url) return _TranscriptParser(preserve_formatting=preserve_formatting).parse( - _raise_http_errors(response, self.video_id).text,) + _raise_http_errors(response, self.video_id).text, + ) def __str__(self): return '{language_code} ("{language}"){translation_description}'.format( @@ -319,24 +321,24 @@ class Transcript(object): class _TranscriptParser(object): - def __init__(self, preserve_formatting=False): - self.preserve_formatting = preserve_formatting - self._FORMATTING_TAGS = [ - 'strong', # important - 'em', # emphasized - 'b', # bold - 'i', # italic - 'mark', # marked - 'small', # smaller - 'del', # deleted - 'ins', # inserted - 'sub', # subscript - 'sup', # superscript - ] - self._html_regex = self.get_html_regex() + _FORMATTING_TAGS = [ + 'strong', # important + 'em', # emphasized + 'b', # bold + 'i', # italic + 'mark', # marked + 'small', # smaller + 'del', # deleted + 'ins', # inserted + 'sub', # subscript + 'sup', # superscript + ] - def get_html_regex(self): - if self.preserve_formatting: + def __init__(self, preserve_formatting=False): + self._html_regex = self._get_html_regex(preserve_formatting) + + def _get_html_regex(self, preserve_formatting): + if preserve_formatting: formats_regex = '|'.join(self._FORMATTING_TAGS) formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' html_regex = re.compile(formats_regex, re.IGNORECASE)