From 1f1c8b249b931a27fc78f2c0f18c65993be38ef3 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <ellis.seiver@gmail.com>
Date: Wed, 15 Mar 2023 15:44:26 -0700
Subject: [PATCH 01/12] Add optional HTML formatting to `_TranscriptParser`

The tags that are kept when formatting is preserved are listed in the
`TEXT_FORMATS` global variable.
`preserve_formatting` defaults to False.
---
 youtube_transcript_api/_transcripts.py | 30 ++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index cea50c4..64925f3 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -27,6 +27,19 @@ from ._errors import (
 )
 from ._settings import WATCH_URL
 
+TEXT_FORMATS = [
+    'strong',  # important
+    'em',  # emphasized
+    'b',  # bold
+    'i',  # italic
+    'mark',  # marked
+    'small',  # smaller
+    'del',  # deleted
+    'ins',  # inserted
+    'sub',  # subscript
+    'sup',  # superscript
+]
+
 
 def _raise_http_errors(response, video_id):
     try:
@@ -315,15 +328,24 @@ class Transcript(object):
             True,
             [],
         )
-
-
 class _TranscriptParser(object):
-    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+    def __init__(self, preserve_formatting=False):
+        self.preserve_formatting = preserve_formatting
+    
+    @property
+    def html_regex(self):
+        if self.preserve_formatting:
+            formats_regex = '|'.join(TEXT_FORMATS)
+            formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
+            html_regex = re.compile(formats_regex, re.IGNORECASE)
+        else:
+            html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
+        return html_regex
 
     def parse(self, plain_data):
         return [
             {
-                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
+                'text': re.sub(self.html_regex, '', unescape(xml_element.text)),
                 'start': float(xml_element.attrib['start']),
                 'duration': float(xml_element.attrib.get('dur', '0.0')),
             }

From c1a037c39ccd6a232e1f0c419cf5f3ea60422df0 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Wed, 15 Mar 2023 18:20:16 -0700
Subject: [PATCH 02/12] Propagate formatting up to user level

---
 youtube_transcript_api/_api.py         |  9 +++++----
 youtube_transcript_api/_transcripts.py | 23 +++++++++++++++--------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py
index dfb790d..c5e835d 100644
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@@ -16,7 +16,7 @@ from ._errors import (
 
 class YouTubeTranscriptApi(object):
     @classmethod
-    def list_transcripts(cls, video_id, proxies=None, cookies=None):
+    def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
         """
         Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
         which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@@ -68,7 +68,8 @@ class YouTubeTranscriptApi(object):
             if cookies:
                 http_client.cookies = cls._load_cookies(cookies, video_id)
             http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id)
+            return TranscriptListFetcher(http_client).fetch(video_id,
+                                                            preserve_formatting=preserve_formatting)
 
     @classmethod
     def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
@@ -109,7 +110,7 @@ class YouTubeTranscriptApi(object):
         return data, unretrievable_videos
 
     @classmethod
-    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
+    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
         """
         Retrieves the transcript for a single video. This is just a shortcut for calling::
 
@@ -129,7 +130,7 @@ class YouTubeTranscriptApi(object):
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
         assert isinstance(video_id, str), "`video_id` must be a string"
-        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
+        return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
     
     @classmethod
     def _load_cookies(cls, cookies, video_id):
diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index 64925f3..9107c04 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -53,11 +53,12 @@ class TranscriptListFetcher(object):
     def __init__(self, http_client):
         self._http_client = http_client
 
-    def fetch(self, video_id):
+    def fetch(self, video_id, preserve_formatting=False):
         return TranscriptList.build(
             self._http_client,
             video_id,
-            self._extract_captions_json(self._fetch_video_html(video_id), video_id)
+            self._extract_captions_json(self._fetch_video_html(video_id), video_id),
+            preserve_formatting=preserve_formatting,
         )
 
     def _extract_captions_json(self, html, video_id):
@@ -107,7 +108,8 @@ class TranscriptList(object):
     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
     for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
     """
-    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
+    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages,
+):
         """
         The constructor is only for internal use. Use the static build method instead.
 
@@ -126,7 +128,7 @@ class TranscriptList(object):
         self._translation_languages = translation_languages
 
     @staticmethod
-    def build(http_client, video_id, captions_json):
+    def build(http_client, video_id, captions_json, preserve_formatting=False):
         """
         Factory method for TranscriptList.
 
@@ -162,7 +164,8 @@ class TranscriptList(object):
                 caption['name']['simpleText'],
                 caption['languageCode'],
                 caption.get('kind', '') == 'asr',
-                translation_languages if caption.get('isTranslatable', False) else []
+                translation_languages if caption.get('isTranslatable', False) else [],
+                preserve_formatting=preserve_formatting,
             )
 
         return TranscriptList(
@@ -262,7 +265,8 @@ class TranscriptList(object):
 
 
 class Transcript(object):
-    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
+    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
+                 preserve_formatting=False):
         """
         You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
         TranscriptList.
@@ -276,6 +280,7 @@ class Transcript(object):
         :param language_code:
         :param is_generated:
         :param translation_languages:
+        :param preserve_formatting: whether to keep select HTMl text formatting
         """
         self._http_client = http_client
         self.video_id = video_id
@@ -288,6 +293,7 @@ class Transcript(object):
             translation_language['language_code']: translation_language['language']
             for translation_language in translation_languages
         }
+        self.preserve_formatting = preserve_formatting
 
     def fetch(self):
         """
@@ -297,7 +303,7 @@ class Transcript(object):
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
         response = self._http_client.get(self._url)
-        return _TranscriptParser().parse(
+        return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
             _raise_http_errors(response, self.video_id).text,
         )
 
@@ -327,11 +333,12 @@ class Transcript(object):
             language_code,
             True,
             [],
+            preserve_formatting=self.preserve_formatting,
         )
 class _TranscriptParser(object):
     def __init__(self, preserve_formatting=False):
         self.preserve_formatting = preserve_formatting
-    
+
     @property
     def html_regex(self):
         if self.preserve_formatting:

From c1e5ce4ebb821fe02cbc80848305e5b75541da68 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Wed, 15 Mar 2023 18:21:34 -0700
Subject: [PATCH 03/12] update tests and test doc: add (partially escaped)
 italics to test doc; add new test for `preserve_formatting=True`

---
 .../test/assets/transcript.xml.static            |  2 +-
 youtube_transcript_api/test/test_api.py          | 16 ++++++++++++++--
 youtube_transcript_api/test/test_cli.py          |  2 +-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/youtube_transcript_api/test/assets/transcript.xml.static b/youtube_transcript_api/test/assets/transcript.xml.static
index ec777e7..64f9c3c 100644
--- a/youtube_transcript_api/test/assets/transcript.xml.static
+++ b/youtube_transcript_api/test/assets/transcript.xml.static
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="utf-8" ?>
 <transcript>
     <text start="0" dur="1.54">Hey, this is just a test</text>
-    <text start="1.54" dur="4.16">this is not the original transcript</text>
+    <text start="1.54" dur="4.16">this is &lt;i>not&lt;/i> the original transcript</text>
     <text start="5" dur="0.5"></text>
     <text start="5.7" dur="3.239">just something shorter, I made up for testing</text>
 </transcript>
\ No newline at end of file
diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py
index 3bda630..122ffd7 100644
--- a/youtube_transcript_api/test/test_api.py
+++ b/youtube_transcript_api/test/test_api.py
@@ -61,6 +61,18 @@ class TestYouTubeTranscriptApi(TestCase):
             ]
         )
 
+    def test_get_transcript_formatted(self):
+        transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True)
+
+        self.assertEqual(
+            transcript,
+            [
+                {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
+                {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
+                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
+            ]
+        )
+
     def test_list_transcripts(self):
         transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
 
@@ -254,11 +266,11 @@ class TestYouTubeTranscriptApi(TestCase):
                 {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
             ]
         )
-    
+
     def test_get_transcript__assertionerror_if_input_not_string(self):
         with self.assertRaises(AssertionError):
             YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2'])
-    
+
     def test_get_transcripts__assertionerror_if_input_not_list(self):
         with self.assertRaises(AssertionError):
             YouTubeTranscriptApi.get_transcripts('video_id_1')
diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py
index d14f331..26ffabc 100644
--- a/youtube_transcript_api/test/test_cli.py
+++ b/youtube_transcript_api/test/test_cli.py
@@ -12,7 +12,7 @@ class TestYouTubeTranscriptCli(TestCase):
         self.transcript_mock = MagicMock()
         self.transcript_mock.fetch = MagicMock(return_value=[
             {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
-            {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
+            {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
             {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
         ])
         self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock)

From 123763c9686f23d5c1694c6a835699de7ebcae15 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Wed, 15 Mar 2023 18:22:50 -0700
Subject: [PATCH 04/12] update readme with example

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index eb30c25..a25f5f4 100644
--- a/README.md
+++ b/README.md
@@ -49,10 +49,10 @@ This will return a list of dictionaries looking somewhat like this:
 ]
 ```
 
-You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english).
+You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold).
 
 ```python
-YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
+YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True)
 ```
 
 It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts)

From e88783d69e8830b964f0589df66d966d17123147 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Wed, 15 Mar 2023 18:49:35 -0700
Subject: [PATCH 05/12] fix spacing

---
 youtube_transcript_api/_transcripts.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index 9107c04..a1b2b5c 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -108,8 +108,7 @@ class TranscriptList(object):
     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
     for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
     """
-    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages,
-):
+    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
         """
         The constructor is only for internal use. Use the static build method instead.
 
@@ -304,8 +303,7 @@ class Transcript(object):
         """
         response = self._http_client.get(self._url)
         return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
-            _raise_http_errors(response, self.video_id).text,
-        )
+            _raise_http_errors(response, self.video_id).text,)
 
     def __str__(self):
         return '{language_code} ("{language}"){translation_description}'.format(
@@ -335,6 +333,8 @@ class Transcript(object):
             [],
             preserve_formatting=self.preserve_formatting,
         )
+
+
 class _TranscriptParser(object):
     def __init__(self, preserve_formatting=False):
         self.preserve_formatting = preserve_formatting

From 393a76ca6a122fb00fc9c69b7a92c201971e8e5b Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Tue, 11 Apr 2023 16:37:06 -0700
Subject: [PATCH 06/12] add preserve_formatting docstrings

---
 youtube_transcript_api/_api.py         | 4 ++++
 youtube_transcript_api/_transcripts.py | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py
index c5e835d..99f7d53 100644
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@@ -61,6 +61,8 @@ class YouTubeTranscriptApi(object):
         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
         :param cookies: a string of the path to a text file containing youtube authorization cookies
         :type cookies: str
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
         :return: the list of available transcripts
         :rtype TranscriptList:
         """
@@ -126,6 +128,8 @@ class YouTubeTranscriptApi(object):
         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
         :param cookies: a string of the path to a text file containing youtube authorization cookies
         :type cookies: str
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index a1b2b5c..59d2f4c 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -137,6 +137,8 @@ class TranscriptList(object):
         :type video_id: str
         :param captions_json: the JSON parsed from the YouTube pages static HTML
         :type captions_json: dict
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
         :return: the created TranscriptList
         :rtype TranscriptList:
         """
@@ -279,7 +281,8 @@ class Transcript(object):
         :param language_code:
         :param is_generated:
         :param translation_languages:
-        :param preserve_formatting: whether to keep select HTMl text formatting
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
         """
         self._http_client = http_client
         self.video_id = video_id

From fdedfff681321e162114283c588717acf94efe15 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Tue, 11 Apr 2023 16:38:07 -0700
Subject: [PATCH 07/12] separate out format example in readme

---
 README.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a25f5f4..b2ae4a8 100644
--- a/README.md
+++ b/README.md
@@ -48,11 +48,12 @@ This will return a list of dictionaries looking somewhat like this:
     # ...
 ]
 ```
+### Translate transcript
 
-You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold).
+You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english).
 
 ```python
-YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True)
+YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
 ```
 
 It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts)
@@ -65,6 +66,14 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
 
 `languages` also is optional here.
 
+### Preserve formatting
+
+You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold).
+
+```python
+YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True)
+```
+
 ### List available transcripts
 
 If you want to list all transcripts which are available for a given video you can call:

From 72e97815289016e6296575f1ee03f175fadce870 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Wed, 12 Apr 2023 12:43:48 -0700
Subject: [PATCH 08/12] add preserve_formatting to get_transcripts() + tests

Also update the call assertions in the `get_transcripts()` tests to include `False` at the end for the new `preserve_formatting` param.
---
 youtube_transcript_api/_api.py          |  7 +++++--
 youtube_transcript_api/test/test_api.py | 12 ++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py
index 99f7d53..d57fa3c 100644
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@@ -74,7 +74,8 @@ class YouTubeTranscriptApi(object):
                                                             preserve_formatting=preserve_formatting)
 
     @classmethod
-    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
+    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
+                        cookies=None, preserve_formatting=False):
         """
         Retrieves the transcripts for a list of videos.
 
@@ -91,6 +92,8 @@ class YouTubeTranscriptApi(object):
         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
         :param cookies: a string of the path to a text file containing youtube authorization cookies
         :type cookies: str
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
         video ids, which could not be retrieved
         :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
@@ -102,7 +105,7 @@ class YouTubeTranscriptApi(object):
 
         for video_id in video_ids:
             try:
-                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
+                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
             except Exception as exception:
                 if not continue_after_error:
                     raise exception
diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py
index 122ffd7..36d60a5 100644
--- a/youtube_transcript_api/test/test_api.py
+++ b/youtube_transcript_api/test/test_api.py
@@ -283,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase):
 
         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
 
-        mock_get_transcript.assert_any_call(video_id_1, languages, None, None)
-        mock_get_transcript.assert_any_call(video_id_2, languages, None, None)
+        mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False)
+        mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False)
         self.assertEqual(mock_get_transcript.call_count, 2)
 
     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
@@ -299,20 +299,20 @@ class TestYouTubeTranscriptApi(TestCase):
 
         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
 
-        mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None)
-        mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None)
+        mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False)
+        mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False)
     
     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
     def test_get_transcripts__with_cookies(self, mock_get_transcript):
         cookies = '/example_cookies.txt'
         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
-        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies)
+        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False)
 
     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
     def test_get_transcripts__with_proxies(self, mock_get_transcript):
         proxies = {'http': '', 'https:': ''}
         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
-        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None)
+        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False)
 
     def test_load_cookies(self):
         dirname, filename = os.path.split(os.path.abspath(__file__))

From eda8ddb38f229369447b463834d5d3c0b773536f Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Wed, 12 Apr 2023 14:29:19 -0700
Subject: [PATCH 09/12] _html_regex static property of _TranscriptParser()

also rename TEXT_FORMATS -> _FORMATTING_TAGS
---
 youtube_transcript_api/_transcripts.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index 59d2f4c..32e0fc4 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -27,7 +27,7 @@ from ._errors import (
 )
 from ._settings import WATCH_URL
 
-TEXT_FORMATS = [
+_FORMATTING_TAGS = [
     'strong',  # important
     'em',  # emphasized
     'b',  # bold
@@ -341,11 +341,11 @@ class Transcript(object):
 class _TranscriptParser(object):
     def __init__(self, preserve_formatting=False):
         self.preserve_formatting = preserve_formatting
+        self._html_regex = self.get_html_regex()
 
-    @property
-    def html_regex(self):
+    def get_html_regex(self):
         if self.preserve_formatting:
-            formats_regex = '|'.join(TEXT_FORMATS)
+            formats_regex = '|'.join(_FORMATTING_TAGS)
             formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
             html_regex = re.compile(formats_regex, re.IGNORECASE)
         else:
@@ -355,7 +355,7 @@ class _TranscriptParser(object):
     def parse(self, plain_data):
         return [
             {
-                'text': re.sub(self.html_regex, '', unescape(xml_element.text)),
+                'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
                 'start': float(xml_element.attrib['start']),
                 'duration': float(xml_element.attrib.get('dur', '0.0')),
             }

From 79fd63d585e225eba15cf6ec14edcb221d4475e9 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Thu, 13 Apr 2023 12:27:34 -0700
Subject: [PATCH 10/12] move _FORMATTING_TAGS inside _TranscriptParser

---
 youtube_transcript_api/_transcripts.py | 27 +++++++++++++-------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index 32e0fc4..400da3c 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -27,19 +27,6 @@ from ._errors import (
 )
 from ._settings import WATCH_URL
 
-_FORMATTING_TAGS = [
-    'strong',  # important
-    'em',  # emphasized
-    'b',  # bold
-    'i',  # italic
-    'mark',  # marked
-    'small',  # smaller
-    'del',  # deleted
-    'ins',  # inserted
-    'sub',  # subscript
-    'sup',  # superscript
-]
-
 
 def _raise_http_errors(response, video_id):
     try:
@@ -341,11 +328,23 @@ class Transcript(object):
 class _TranscriptParser(object):
     def __init__(self, preserve_formatting=False):
         self.preserve_formatting = preserve_formatting
+        self._FORMATTING_TAGS = [
+            'strong',  # important
+            'em',  # emphasized
+            'b',  # bold
+            'i',  # italic
+            'mark',  # marked
+            'small',  # smaller
+            'del',  # deleted
+            'ins',  # inserted
+            'sub',  # subscript
+            'sup',  # superscript
+            ]
         self._html_regex = self.get_html_regex()
 
     def get_html_regex(self):
         if self.preserve_formatting:
-            formats_regex = '|'.join(_FORMATTING_TAGS)
+            formats_regex = '|'.join(self._FORMATTING_TAGS)
             formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
             html_regex = re.compile(formats_regex, re.IGNORECASE)
         else:

From ca93c48fa1684806a5cc887c70fcd589eb8ec9af Mon Sep 17 00:00:00 2001
From: "E. Seiver" <5547078+eseiver@users.noreply.github.com>
Date: Thu, 13 Apr 2023 12:46:24 -0700
Subject: [PATCH 11/12] move preserve_formatting from init to fetch()

also remove it from TranscriptList & TranscriptListFetcher
---
 youtube_transcript_api/_api.py         | 11 ++++-------
 youtube_transcript_api/_transcripts.py | 23 ++++++++---------------
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py
index d57fa3c..24a1236 100644
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@@ -16,7 +16,7 @@ from ._errors import (
 
 class YouTubeTranscriptApi(object):
     @classmethod
-    def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
+    def list_transcripts(cls, video_id, proxies=None, cookies=None):
         """
         Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
         which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@@ -61,8 +61,6 @@ class YouTubeTranscriptApi(object):
         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
         :param cookies: a string of the path to a text file containing youtube authorization cookies
         :type cookies: str
-        :param preserve_formatting: whether to keep select HTML text formatting
-        :type preserve_formatting: bool
         :return: the list of available transcripts
         :rtype TranscriptList:
         """
@@ -70,8 +68,7 @@ class YouTubeTranscriptApi(object):
             if cookies:
                 http_client.cookies = cls._load_cookies(cookies, video_id)
             http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id,
-                                                            preserve_formatting=preserve_formatting)
+            return TranscriptListFetcher(http_client).fetch(video_id)
 
     @classmethod
     def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
@@ -137,8 +134,8 @@ class YouTubeTranscriptApi(object):
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
         assert isinstance(video_id, str), "`video_id` must be a string"
-        return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
-    
+        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
+
     @classmethod
     def _load_cookies(cls, cookies, video_id):
         try:
diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index 400da3c..2c79fd9 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -40,12 +40,12 @@ class TranscriptListFetcher(object):
     def __init__(self, http_client):
         self._http_client = http_client
 
-    def fetch(self, video_id, preserve_formatting=False):
+    def fetch(self, video_id):
+
         return TranscriptList.build(
             self._http_client,
             video_id,
             self._extract_captions_json(self._fetch_video_html(video_id), video_id),
-            preserve_formatting=preserve_formatting,
         )
 
     def _extract_captions_json(self, html, video_id):
@@ -114,7 +114,7 @@ class TranscriptList(object):
         self._translation_languages = translation_languages
 
     @staticmethod
-    def build(http_client, video_id, captions_json, preserve_formatting=False):
+    def build(http_client, video_id, captions_json):
         """
         Factory method for TranscriptList.
 
@@ -124,8 +124,6 @@ class TranscriptList(object):
         :type video_id: str
         :param captions_json: the JSON parsed from the YouTube pages static HTML
         :type captions_json: dict
-        :param preserve_formatting: whether to keep select HTML text formatting
-        :type preserve_formatting: bool
         :return: the created TranscriptList
         :rtype TranscriptList:
         """
@@ -153,7 +151,6 @@ class TranscriptList(object):
                 caption['languageCode'],
                 caption.get('kind', '') == 'asr',
                 translation_languages if caption.get('isTranslatable', False) else [],
-                preserve_formatting=preserve_formatting,
             )
 
         return TranscriptList(
@@ -253,8 +250,7 @@ class TranscriptList(object):
 
 
 class Transcript(object):
-    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
-                 preserve_formatting=False):
+    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
         """
         You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
         TranscriptList.
@@ -268,8 +264,6 @@ class Transcript(object):
         :param language_code:
         :param is_generated:
         :param translation_languages:
-        :param preserve_formatting: whether to keep select HTML text formatting
-        :type preserve_formatting: bool
         """
         self._http_client = http_client
         self.video_id = video_id
@@ -282,17 +276,17 @@ class Transcript(object):
             translation_language['language_code']: translation_language['language']
             for translation_language in translation_languages
         }
-        self.preserve_formatting = preserve_formatting
 
-    def fetch(self):
+    def fetch(self, preserve_formatting=False):
         """
         Loads the actual transcript data.
-
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
         response = self._http_client.get(self._url)
-        return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
+        return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
             _raise_http_errors(response, self.video_id).text,)
 
     def __str__(self):
@@ -321,7 +315,6 @@ class Transcript(object):
             language_code,
             True,
             [],
-            preserve_formatting=self.preserve_formatting,
         )
 
 

From 8c62e5e276d1b43ffbea9914cc0b9e92f85643a5 Mon Sep 17 00:00:00 2001
From: Jonas Depoix <jonas.depoix@web.de>
Date: Mon, 17 Apr 2023 15:07:10 +0200
Subject: [PATCH 12/12] _FORMATTING_TAGS is now a static property of
 _TranscriptParser; _get_html_regex is now private; removed
 preserve_formatting property of _TranscriptParser

---
 youtube_transcript_api/_transcripts.py | 44 ++++++++++++++------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index 2c79fd9..1e0f8f1 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -1,7 +1,7 @@
 import sys
 
 # This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 2: # pragma: no cover
+if sys.version_info.major == 2:  # pragma: no cover
     reload(sys)
     sys.setdefaultencoding('utf-8')
 
@@ -95,6 +95,7 @@ class TranscriptList(object):
     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
     for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
     """
+
     def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
         """
         The constructor is only for internal use. Use the static build method instead.
@@ -191,7 +192,7 @@ class TranscriptList(object):
         :rtype Transcript:
         :raises: NoTranscriptFound
         """
-        return self._find_transcript(language_codes, [self._generated_transcripts,])
+        return self._find_transcript(language_codes, [self._generated_transcripts])
 
     def find_manually_created_transcript(self, language_codes):
         """
@@ -205,7 +206,7 @@ class TranscriptList(object):
         :rtype Transcript:
         :raises: NoTranscriptFound
         """
-        return self._find_transcript(language_codes, [self._manually_created_transcripts,])
+        return self._find_transcript(language_codes, [self._manually_created_transcripts])
 
     def _find_transcript(self, language_codes, transcript_dicts):
         for language_code in language_codes:
@@ -287,7 +288,8 @@ class Transcript(object):
         """
         response = self._http_client.get(self._url)
         return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
-            _raise_http_errors(response, self.video_id).text,)
+            _raise_http_errors(response, self.video_id).text,
+        )
 
     def __str__(self):
         return '{language_code} ("{language}"){translation_description}'.format(
@@ -319,24 +321,24 @@ class Transcript(object):
 
 
 class _TranscriptParser(object):
-    def __init__(self, preserve_formatting=False):
-        self.preserve_formatting = preserve_formatting
-        self._FORMATTING_TAGS = [
-            'strong',  # important
-            'em',  # emphasized
-            'b',  # bold
-            'i',  # italic
-            'mark',  # marked
-            'small',  # smaller
-            'del',  # deleted
-            'ins',  # inserted
-            'sub',  # subscript
-            'sup',  # superscript
-            ]
-        self._html_regex = self.get_html_regex()
+    _FORMATTING_TAGS = [
+        'strong',  # important
+        'em',  # emphasized
+        'b',  # bold
+        'i',  # italic
+        'mark',  # marked
+        'small',  # smaller
+        'del',  # deleted
+        'ins',  # inserted
+        'sub',  # subscript
+        'sup',  # superscript
+    ]
 
-    def get_html_regex(self):
-        if self.preserve_formatting:
+    def __init__(self, preserve_formatting=False):
+        self._html_regex = self._get_html_regex(preserve_formatting)
+
+    def _get_html_regex(self, preserve_formatting):
+        if preserve_formatting:
             formats_regex = '|'.join(self._FORMATTING_TAGS)
             formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
             html_regex = re.compile(formats_regex, re.IGNORECASE)