Merge pull request #192 from eseiver/html_formatting

Add HTML text formatting option
2023-04-17 15:15:46 +02:00 · 2023-04-17 15:15:46 +02:00 · e0a9f0d3e5
parent f910971066 8c62e5e276
commit e0a9f0d3e5
6 changed files with 78 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -48,8 +48,9 @@ This will return a list of dictionaries looking somewhat like this:
    # ...
 ]
 ```
+### Select transcript languages

-You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english).
+You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to English).

 ```python
 YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
@ -65,6 +66,14 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])

 `languages` is also optional here.

+### Preserve formatting
+
+You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold).
+
+```python
+YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True)
+```
+
 ### List available transcripts

 If you want to list all transcripts which are available for a given video you can call:
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@ -71,7 +71,8 @@ class YouTubeTranscriptApi(object):
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
-    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
+    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
+                        cookies=None, preserve_formatting=False):
        """
        Retrieves the transcripts for a list of videos.

@ -88,6 +89,8 @@ class YouTubeTranscriptApi(object):
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
        :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
@ -99,7 +102,7 @@ class YouTubeTranscriptApi(object):

        for video_id in video_ids:
            try:
-                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
+                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
            except Exception as exception:
                if not continue_after_error:
                    raise exception
@ -109,7 +112,7 @@ class YouTubeTranscriptApi(object):
        return data, unretrievable_videos

    @classmethod
-    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
+    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
        """
        Retrieves the transcript for a single video. This is just a shortcut for calling::

@ -125,12 +128,14 @@ class YouTubeTranscriptApi(object):
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        assert isinstance(video_id, str), "`video_id` must be a string"
-        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
-    
+        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
+
    @classmethod
    def _load_cookies(cls, cookies, video_id):
        try:
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@ -1,7 +1,7 @@
 import sys

 # This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 2: # pragma: no cover
+if sys.version_info.major == 2:  # pragma: no cover
    reload(sys)
    sys.setdefaultencoding('utf-8')

@ -41,10 +41,11 @@ class TranscriptListFetcher(object):
        self._http_client = http_client

    def fetch(self, video_id):
+
        return TranscriptList.build(
            self._http_client,
            video_id,
-            self._extract_captions_json(self._fetch_video_html(video_id), video_id)
+            self._extract_captions_json(self._fetch_video_html(video_id), video_id),
        )

    def _extract_captions_json(self, html, video_id):
@ -94,6 +95,7 @@ class TranscriptList(object):
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
    """
+
    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        The constructor is only for internal use. Use the static build method instead.
@ -149,7 +151,7 @@ class TranscriptList(object):
                caption['name']['simpleText'],
                caption['languageCode'],
                caption.get('kind', '') == 'asr',
-                translation_languages if caption.get('isTranslatable', False) else []
+                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
@ -190,7 +192,7 @@ class TranscriptList(object):
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
-        return self._find_transcript(language_codes, [self._generated_transcripts,])
+        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
@ -204,7 +206,7 @@ class TranscriptList(object):
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
-        return self._find_transcript(language_codes, [self._manually_created_transcripts,])
+        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        for language_code in language_codes:
@ -276,15 +278,16 @@ class Transcript(object):
            for translation_language in translation_languages
        }

-    def fetch(self):
+    def fetch(self, preserve_formatting=False):
        """
        Loads the actual transcript data.
-
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        response = self._http_client.get(self._url)
-        return _TranscriptParser().parse(
+        return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
            _raise_http_errors(response, self.video_id).text,
        )

@ -318,12 +321,35 @@ class Transcript(object):


 class _TranscriptParser(object):
-    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+    _FORMATTING_TAGS = [
+        'strong',  # important
+        'em',  # emphasized
+        'b',  # bold
+        'i',  # italic
+        'mark',  # marked
+        'small',  # smaller
+        'del',  # deleted
+        'ins',  # inserted
+        'sub',  # subscript
+        'sup',  # superscript
+    ]
+
+    def __init__(self, preserve_formatting=False):
+        self._html_regex = self._get_html_regex(preserve_formatting)
+
+    def _get_html_regex(self, preserve_formatting):
+        if preserve_formatting:
+            formats_regex = '|'.join(self._FORMATTING_TAGS)
+            formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
+            html_regex = re.compile(formats_regex, re.IGNORECASE)
+        else:
+            html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
+        return html_regex

    def parse(self, plain_data):
        return [
            {
-                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
+                'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
                'start': float(xml_element.attrib['start']),
                'duration': float(xml_element.attrib.get('dur', '0.0')),
            }
--- a/youtube_transcript_api/test/assets/transcript.xml.static
+++ b/youtube_transcript_api/test/assets/transcript.xml.static
@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="utf-8" ?>
 <transcript>
    <text start="0" dur="1.54">Hey, this is just a test</text>
-    <text start="1.54" dur="4.16">this is not the original transcript</text>
+    <text start="1.54" dur="4.16">this is &lt;i>not&lt;/i> the original transcript</text>
    <text start="5" dur="0.5"></text>
    <text start="5.7" dur="3.239">just something shorter, I made up for testing</text>
 </transcript>
--- a/youtube_transcript_api/test/test_api.py
+++ b/youtube_transcript_api/test/test_api.py
@ -61,6 +61,18 @@ class TestYouTubeTranscriptApi(TestCase):
            ]
        )

+    def test_get_transcript_formatted(self):
+        transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True)
+
+        self.assertEqual(
+            transcript,
+            [
+                {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
+                {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
+                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
+            ]
+        )
+
    def test_list_transcripts(self):
        transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')

@ -254,11 +266,11 @@ class TestYouTubeTranscriptApi(TestCase):
                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
            ]
        )
-    
+
    def test_get_transcript__assertionerror_if_input_not_string(self):
        with self.assertRaises(AssertionError):
            YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2'])
-    
+
    def test_get_transcripts__assertionerror_if_input_not_list(self):
        with self.assertRaises(AssertionError):
            YouTubeTranscriptApi.get_transcripts('video_id_1')
@ -271,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase):

        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)

-        mock_get_transcript.assert_any_call(video_id_1, languages, None, None)
-        mock_get_transcript.assert_any_call(video_id_2, languages, None, None)
+        mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False)
+        mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False)
        self.assertEqual(mock_get_transcript.call_count, 2)

    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
@ -287,20 +299,20 @@ class TestYouTubeTranscriptApi(TestCase):

        YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)

-        mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None)
-        mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None)
+        mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False)
+        mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False)
    
    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
    def test_get_transcripts__with_cookies(self, mock_get_transcript):
        cookies = '/example_cookies.txt'
        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
-        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies)
+        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False)

    @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
    def test_get_transcripts__with_proxies(self, mock_get_transcript):
        proxies = {'http': '', 'https:': ''}
        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
-        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None)
+        mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False)

    def test_load_cookies(self):
        dirname, filename = os.path.split(os.path.abspath(__file__))
--- a/youtube_transcript_api/test/test_cli.py
+++ b/youtube_transcript_api/test/test_cli.py
@ -12,7 +12,7 @@ class TestYouTubeTranscriptCli(TestCase):
        self.transcript_mock = MagicMock()
        self.transcript_mock.fetch = MagicMock(return_value=[
            {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
-            {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
+            {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
            {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
        ])
        self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock)