Merge pull request #192 from eseiver/html_formatting
Add HTML text formatting option
This commit is contained in:
		
						commit
						e0a9f0d3e5
					
				
							
								
								
									
										11
									
								
								README.md
								
								
								
								
							
							
						
						
									
										11
									
								
								README.md
								
								
								
								
							|  | @ -48,8 +48,9 @@ This will return a list of dictionaries looking somewhat like this: | ||||||
|     # ... |     # ... | ||||||
| ] | ] | ||||||
| ``` | ``` | ||||||
|  | ### Translate transcript | ||||||
| 
 | 
 | ||||||
| You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to English). | You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to English). | ||||||
| 
 | 
 | ||||||
| ```python | ```python | ||||||
| YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) | YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) | ||||||
|  | @ -65,6 +66,14 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) | ||||||
| 
 | 
 | ||||||
| `languages` is also optional here. | `languages` is also optional here. | ||||||
| 
 | 
 | ||||||
|  | ### Preserve formatting | ||||||
|  | 
 | ||||||
|  | You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold). | ||||||
|  | 
 | ||||||
|  | ```python | ||||||
|  | YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True) | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
| ### List available transcripts | ### List available transcripts | ||||||
| 
 | 
 | ||||||
| If you want to list all transcripts which are available for a given video you can call: | If you want to list all transcripts which are available for a given video you can call: | ||||||
|  |  | ||||||
|  | @ -71,7 +71,8 @@ class YouTubeTranscriptApi(object): | ||||||
|             return TranscriptListFetcher(http_client).fetch(video_id) |             return TranscriptListFetcher(http_client).fetch(video_id) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): |     def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, | ||||||
|  |                         cookies=None, preserve_formatting=False): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcripts for a list of videos. |         Retrieves the transcripts for a list of videos. | ||||||
| 
 | 
 | ||||||
|  | @ -88,6 +89,8 @@ class YouTubeTranscriptApi(object): | ||||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies |         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|         :param cookies: a string of the path to a text file containing youtube authorization cookies |         :param cookies: a string of the path to a text file containing youtube authorization cookies | ||||||
|         :type cookies: str |         :type cookies: str | ||||||
|  |         :param preserve_formatting: whether to keep select HTML text formatting | ||||||
|  |         :type preserve_formatting: bool | ||||||
|         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of |         :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of | ||||||
|         video ids, which could not be retrieved |         video ids, which could not be retrieved | ||||||
|         :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): |         :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): | ||||||
|  | @ -99,7 +102,7 @@ class YouTubeTranscriptApi(object): | ||||||
| 
 | 
 | ||||||
|         for video_id in video_ids: |         for video_id in video_ids: | ||||||
|             try: |             try: | ||||||
|                 data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies) |                 data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting) | ||||||
|             except Exception as exception: |             except Exception as exception: | ||||||
|                 if not continue_after_error: |                 if not continue_after_error: | ||||||
|                     raise exception |                     raise exception | ||||||
|  | @ -109,7 +112,7 @@ class YouTubeTranscriptApi(object): | ||||||
|         return data, unretrievable_videos |         return data, unretrievable_videos | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): |     def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcript for a single video. This is just a shortcut for calling:: |         Retrieves the transcript for a single video. This is just a shortcut for calling:: | ||||||
| 
 | 
 | ||||||
|  | @ -125,11 +128,13 @@ class YouTubeTranscriptApi(object): | ||||||
|         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies |         :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies | ||||||
|         :param cookies: a string of the path to a text file containing youtube authorization cookies |         :param cookies: a string of the path to a text file containing youtube authorization cookies | ||||||
|         :type cookies: str |         :type cookies: str | ||||||
|  |         :param preserve_formatting: whether to keep select HTML text formatting | ||||||
|  |         :type preserve_formatting: bool | ||||||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys |         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||||
|         :rtype [{'text': str, 'start': float, 'end': float}]: |         :rtype [{'text': str, 'start': float, 'end': float}]: | ||||||
|         """ |         """ | ||||||
|         assert isinstance(video_id, str), "`video_id` must be a string" |         assert isinstance(video_id, str), "`video_id` must be a string" | ||||||
|         return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() |         return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def _load_cookies(cls, cookies, video_id): |     def _load_cookies(cls, cookies, video_id): | ||||||
|  |  | ||||||
|  | @ -41,10 +41,11 @@ class TranscriptListFetcher(object): | ||||||
|         self._http_client = http_client |         self._http_client = http_client | ||||||
| 
 | 
 | ||||||
|     def fetch(self, video_id): |     def fetch(self, video_id): | ||||||
|  | 
 | ||||||
|         return TranscriptList.build( |         return TranscriptList.build( | ||||||
|             self._http_client, |             self._http_client, | ||||||
|             video_id, |             video_id, | ||||||
|             self._extract_captions_json(self._fetch_video_html(video_id), video_id) |             self._extract_captions_json(self._fetch_video_html(video_id), video_id), | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     def _extract_captions_json(self, html, video_id): |     def _extract_captions_json(self, html, video_id): | ||||||
|  | @ -94,6 +95,7 @@ class TranscriptList(object): | ||||||
|     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available |     This object represents a list of transcripts. It can be iterated over to list all transcripts which are available | ||||||
|     for a given YouTube video. Also it provides functionality to search for a transcript in a given language. |     for a given YouTube video. Also it provides functionality to search for a transcript in a given language. | ||||||
|     """ |     """ | ||||||
|  | 
 | ||||||
|     def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): |     def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): | ||||||
|         """ |         """ | ||||||
|         The constructor is only for internal use. Use the static build method instead. |         The constructor is only for internal use. Use the static build method instead. | ||||||
|  | @ -149,7 +151,7 @@ class TranscriptList(object): | ||||||
|                 caption['name']['simpleText'], |                 caption['name']['simpleText'], | ||||||
|                 caption['languageCode'], |                 caption['languageCode'], | ||||||
|                 caption.get('kind', '') == 'asr', |                 caption.get('kind', '') == 'asr', | ||||||
|                 translation_languages if caption.get('isTranslatable', False) else [] |                 translation_languages if caption.get('isTranslatable', False) else [], | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|         return TranscriptList( |         return TranscriptList( | ||||||
|  | @ -190,7 +192,7 @@ class TranscriptList(object): | ||||||
|         :rtype Transcript: |         :rtype Transcript: | ||||||
|         :raises: NoTranscriptFound |         :raises: NoTranscriptFound | ||||||
|         """ |         """ | ||||||
|         return self._find_transcript(language_codes, [self._generated_transcripts,]) |         return self._find_transcript(language_codes, [self._generated_transcripts]) | ||||||
| 
 | 
 | ||||||
|     def find_manually_created_transcript(self, language_codes): |     def find_manually_created_transcript(self, language_codes): | ||||||
|         """ |         """ | ||||||
|  | @ -204,7 +206,7 @@ class TranscriptList(object): | ||||||
|         :rtype Transcript: |         :rtype Transcript: | ||||||
|         :raises: NoTranscriptFound |         :raises: NoTranscriptFound | ||||||
|         """ |         """ | ||||||
|         return self._find_transcript(language_codes, [self._manually_created_transcripts,]) |         return self._find_transcript(language_codes, [self._manually_created_transcripts]) | ||||||
| 
 | 
 | ||||||
|     def _find_transcript(self, language_codes, transcript_dicts): |     def _find_transcript(self, language_codes, transcript_dicts): | ||||||
|         for language_code in language_codes: |         for language_code in language_codes: | ||||||
|  | @ -276,15 +278,16 @@ class Transcript(object): | ||||||
|             for translation_language in translation_languages |             for translation_language in translation_languages | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|     def fetch(self): |     def fetch(self, preserve_formatting=False): | ||||||
|         """ |         """ | ||||||
|         Loads the actual transcript data. |         Loads the actual transcript data. | ||||||
| 
 |         :param preserve_formatting: whether to keep select HTML text formatting | ||||||
|  |         :type preserve_formatting: bool | ||||||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys |         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||||
|         :rtype [{'text': str, 'start': float, 'end': float}]: |         :rtype [{'text': str, 'start': float, 'end': float}]: | ||||||
|         """ |         """ | ||||||
|         response = self._http_client.get(self._url) |         response = self._http_client.get(self._url) | ||||||
|         return _TranscriptParser().parse( |         return _TranscriptParser(preserve_formatting=preserve_formatting).parse( | ||||||
|             _raise_http_errors(response, self.video_id).text, |             _raise_http_errors(response, self.video_id).text, | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|  | @ -318,12 +321,35 @@ class Transcript(object): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class _TranscriptParser(object): | class _TranscriptParser(object): | ||||||
|     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) |     _FORMATTING_TAGS = [ | ||||||
|  |         'strong',  # important | ||||||
|  |         'em',  # emphasized | ||||||
|  |         'b',  # bold | ||||||
|  |         'i',  # italic | ||||||
|  |         'mark',  # marked | ||||||
|  |         'small',  # smaller | ||||||
|  |         'del',  # deleted | ||||||
|  |         'ins',  # inserted | ||||||
|  |         'sub',  # subscript | ||||||
|  |         'sup',  # superscript | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     def __init__(self, preserve_formatting=False): | ||||||
|  |         self._html_regex = self._get_html_regex(preserve_formatting) | ||||||
|  | 
 | ||||||
|  |     def _get_html_regex(self, preserve_formatting): | ||||||
|  |         if preserve_formatting: | ||||||
|  |             formats_regex = '|'.join(self._FORMATTING_TAGS) | ||||||
|  |             formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' | ||||||
|  |             html_regex = re.compile(formats_regex, re.IGNORECASE) | ||||||
|  |         else: | ||||||
|  |             html_regex = re.compile(r'<[^>]*>', re.IGNORECASE) | ||||||
|  |         return html_regex | ||||||
| 
 | 
 | ||||||
|     def parse(self, plain_data): |     def parse(self, plain_data): | ||||||
|         return [ |         return [ | ||||||
|             { |             { | ||||||
|                 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), |                 'text': re.sub(self._html_regex, '', unescape(xml_element.text)), | ||||||
|                 'start': float(xml_element.attrib['start']), |                 'start': float(xml_element.attrib['start']), | ||||||
|                 'duration': float(xml_element.attrib.get('dur', '0.0')), |                 'duration': float(xml_element.attrib.get('dur', '0.0')), | ||||||
|             } |             } | ||||||
|  |  | ||||||
|  | @ -1,7 +1,7 @@ | ||||||
| <?xml version="1.0" encoding="utf-8" ?> | <?xml version="1.0" encoding="utf-8" ?> | ||||||
| <transcript> | <transcript> | ||||||
|     <text start="0" dur="1.54">Hey, this is just a test</text> |     <text start="0" dur="1.54">Hey, this is just a test</text> | ||||||
|     <text start="1.54" dur="4.16">this is not the original transcript</text> |     <text start="1.54" dur="4.16">this is <i>not</i> the original transcript</text> | ||||||
|     <text start="5" dur="0.5"></text> |     <text start="5" dur="0.5"></text> | ||||||
|     <text start="5.7" dur="3.239">just something shorter, I made up for testing</text> |     <text start="5.7" dur="3.239">just something shorter, I made up for testing</text> | ||||||
| </transcript> | </transcript> | ||||||
|  | @ -61,6 +61,18 @@ class TestYouTubeTranscriptApi(TestCase): | ||||||
|             ] |             ] | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|  |     def test_get_transcript_formatted(self): | ||||||
|  |         transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True) | ||||||
|  | 
 | ||||||
|  |         self.assertEqual( | ||||||
|  |             transcript, | ||||||
|  |             [ | ||||||
|  |                 {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, | ||||||
|  |                 {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16}, | ||||||
|  |                 {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} | ||||||
|  |             ] | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|     def test_list_transcripts(self): |     def test_list_transcripts(self): | ||||||
|         transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') |         transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') | ||||||
| 
 | 
 | ||||||
|  | @ -271,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase): | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) |         YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) | ||||||
| 
 | 
 | ||||||
|         mock_get_transcript.assert_any_call(video_id_1, languages, None, None) |         mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False) | ||||||
|         mock_get_transcript.assert_any_call(video_id_2, languages, None, None) |         mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False) | ||||||
|         self.assertEqual(mock_get_transcript.call_count, 2) |         self.assertEqual(mock_get_transcript.call_count, 2) | ||||||
| 
 | 
 | ||||||
|     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) |     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) | ||||||
|  | @ -287,20 +299,20 @@ class TestYouTubeTranscriptApi(TestCase): | ||||||
| 
 | 
 | ||||||
|         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) |         YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) | ||||||
| 
 | 
 | ||||||
|         mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None) |         mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False) | ||||||
|         mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None) |         mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False) | ||||||
|      |      | ||||||
|     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') |     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') | ||||||
|     def test_get_transcripts__with_cookies(self, mock_get_transcript): |     def test_get_transcripts__with_cookies(self, mock_get_transcript): | ||||||
|         cookies = '/example_cookies.txt' |         cookies = '/example_cookies.txt' | ||||||
|         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) |         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) | ||||||
|         mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) |         mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False) | ||||||
| 
 | 
 | ||||||
|     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') |     @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') | ||||||
|     def test_get_transcripts__with_proxies(self, mock_get_transcript): |     def test_get_transcripts__with_proxies(self, mock_get_transcript): | ||||||
|         proxies = {'http': '', 'https:': ''} |         proxies = {'http': '', 'https:': ''} | ||||||
|         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) |         YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) | ||||||
|         mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) |         mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False) | ||||||
| 
 | 
 | ||||||
|     def test_load_cookies(self): |     def test_load_cookies(self): | ||||||
|         dirname, filename = os.path.split(os.path.abspath(__file__)) |         dirname, filename = os.path.split(os.path.abspath(__file__)) | ||||||
|  |  | ||||||
|  | @ -12,7 +12,7 @@ class TestYouTubeTranscriptCli(TestCase): | ||||||
|         self.transcript_mock = MagicMock() |         self.transcript_mock = MagicMock() | ||||||
|         self.transcript_mock.fetch = MagicMock(return_value=[ |         self.transcript_mock.fetch = MagicMock(return_value=[ | ||||||
|             {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, |             {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, | ||||||
|             {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, |             {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16}, | ||||||
|             {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} |             {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} | ||||||
|         ]) |         ]) | ||||||
|         self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) |         self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue