add preserve_formatting to get_transcripts() + tests

Also updated the assertion tests for `get_transcripts()` to include `False` as the final argument, matching the new `preserve_formatting` param.
This commit is contained in:
E. Seiver 2023-04-12 12:43:48 -07:00
parent fdedfff681
commit 72e9781528
2 changed files with 11 additions and 8 deletions

View File

@ -74,7 +74,8 @@ class YouTubeTranscriptApi(object):
preserve_formatting=preserve_formatting) preserve_formatting=preserve_formatting)
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
cookies=None, preserve_formatting=False):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
@ -91,6 +92,8 @@ class YouTubeTranscriptApi(object):
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies :param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str :type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved video ids, which could not be retrieved
:rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
@ -102,7 +105,7 @@ class YouTubeTranscriptApi(object):
for video_id in video_ids: for video_id in video_ids:
try: try:
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies) data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
except Exception as exception: except Exception as exception:
if not continue_after_error: if not continue_after_error:
raise exception raise exception

View File

@ -283,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
mock_get_transcript.assert_any_call(video_id_1, languages, None, None) mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False)
mock_get_transcript.assert_any_call(video_id_2, languages, None, None) mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False)
self.assertEqual(mock_get_transcript.call_count, 2) self.assertEqual(mock_get_transcript.call_count, 2)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
@ -299,20 +299,20 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None) mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False)
mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None) mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts__with_cookies(self, mock_get_transcript): def test_get_transcripts__with_cookies(self, mock_get_transcript):
cookies = '/example_cookies.txt' cookies = '/example_cookies.txt'
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts__with_proxies(self, mock_get_transcript): def test_get_transcripts__with_proxies(self, mock_get_transcript):
proxies = {'http': '', 'https:': ''} proxies = {'http': '', 'https:': ''}
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False)
def test_load_cookies(self): def test_load_cookies(self):
dirname, filename = os.path.split(os.path.abspath(__file__)) dirname, filename = os.path.split(os.path.abspath(__file__))