Merge pull request #164 from liamrs222/master
added new Formatter for SRT files
This commit is contained in:
		
						commit
						f92fe21a25
					
				|  | @ -79,8 +79,19 @@ class TextFormatter(Formatter): | ||||||
|         """ |         """ | ||||||
|         return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) |         return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) | ||||||
| 
 | 
 | ||||||
|  | class _TextBasedFormatter(TextFormatter): | ||||||
|  |     def _format_timestamp(self, hours, mins, secs, ms): | ||||||
|  |         raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ | ||||||
|  |             'their own .format_timestamp() method.') | ||||||
|  | 
 | ||||||
|  |     def _format_transcript_header(self, lines): | ||||||
|  |         raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ | ||||||
|  |             'their own _format_transcript_header method.') | ||||||
|  | 
 | ||||||
|  |     def _format_transcript_helper(self, i, time_text, line): | ||||||
|  |         raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ | ||||||
|  |             'their own _format_transcript_helper method.')     | ||||||
|              |              | ||||||
| class WebVTTFormatter(Formatter): |  | ||||||
|     def _seconds_to_timestamp(self, time): |     def _seconds_to_timestamp(self, time): | ||||||
|         """Helper that converts `time` into a transcript cue timestamp. |         """Helper that converts `time` into a transcript cue timestamp. | ||||||
| 
 | 
 | ||||||
|  | @ -95,44 +106,55 @@ class WebVTTFormatter(Formatter): | ||||||
|         '00:00:06.930' |         '00:00:06.930' | ||||||
|         """ |         """ | ||||||
|         time = float(time) |         time = float(time) | ||||||
|         hours, remainder = divmod(time, 3600) |         hours_float, remainder = divmod(time, 3600) | ||||||
|         mins, secs = divmod(remainder, 60) |         mins_float, secs_float = divmod(remainder, 60) | ||||||
|  |         hours, mins, secs = int(hours_float), int(mins_float), int(secs_float) | ||||||
|         ms = int(round((time - int(time))*1000, 2)) |         ms = int(round((time - int(time))*1000, 2)) | ||||||
|         return "{:02.0f}:{:02.0f}:{:02.0f}.{:03d}".format(hours, mins, secs, ms) |         return self._format_timestamp(hours, mins, secs, ms) | ||||||
| 
 | 
 | ||||||
|     def format_transcript(self, transcript, **kwargs): |     def format_transcript(self, transcript, **kwargs): | ||||||
|         """A basic implementation of WEBVTT formatting. |         """A basic implementation of WEBVTT/SRT formatting. | ||||||
| 
 | 
 | ||||||
|         :param transcript: |         :param transcript: | ||||||
|         :reference: https://www.w3.org/TR/webvtt1/#introduction-caption |         :reference:  | ||||||
|  |         https://www.w3.org/TR/webvtt1/#introduction-caption | ||||||
|  |         https://www.3playmedia.com/blog/create-srt-file/ | ||||||
|         """ |         """ | ||||||
|         lines = [] |         lines = [] | ||||||
|         for i, line in enumerate(transcript): |         for i, line in enumerate(transcript): | ||||||
|             if i < len(transcript) - 1: |             end = line['start'] + line['duration'] | ||||||
|                 # Looks ahead, use next start time since duration value |  | ||||||
|                 # would create an overlap between start times. |  | ||||||
|             time_text = "{} --> {}".format( |             time_text = "{} --> {}".format( | ||||||
|                 self._seconds_to_timestamp(line['start']), |                 self._seconds_to_timestamp(line['start']), | ||||||
|                     self._seconds_to_timestamp(transcript[i + 1]['start']) |                 self._seconds_to_timestamp( | ||||||
|  |                     transcript[i + 1]['start']  | ||||||
|  |                     if i < len(transcript) - 1 and transcript[i + 1]['start'] < end else end | ||||||
|                 ) |                 ) | ||||||
|             else: |  | ||||||
|                 # Reached the end, cannot look ahead, use duration now. |  | ||||||
|                 duration = line['start'] + line['duration'] |  | ||||||
|                 time_text = "{} --> {}".format( |  | ||||||
|                     self._seconds_to_timestamp(line['start']), |  | ||||||
|                     self._seconds_to_timestamp(duration) |  | ||||||
|             ) |             ) | ||||||
|             lines.append("{}\n{}".format(time_text, line['text'])) |             lines.append(self._format_transcript_helper(i, time_text, line)) | ||||||
| 
 | 
 | ||||||
|  |         return self._format_transcript_header(lines) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class SRTFormatter(_TextBasedFormatter): | ||||||
|  |     def _format_timestamp(self, hours, mins, secs, ms): | ||||||
|  |         return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, mins, secs, ms) | ||||||
|  |      | ||||||
|  |     def _format_transcript_header(self, lines): | ||||||
|  |         return "\n\n".join(lines) + "\n" | ||||||
|  | 
 | ||||||
|  |     def _format_transcript_helper(self, i, time_text, line): | ||||||
|  |         return "{}\n{}\n{}".format(i + 1, time_text, line['text']) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class WebVTTFormatter(_TextBasedFormatter): | ||||||
|  |     def _format_timestamp(self, hours, mins, secs, ms): | ||||||
|  |         return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms) | ||||||
|  | 
 | ||||||
|  |     def _format_transcript_header(self, lines): | ||||||
|         return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" |         return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" | ||||||
| 
 | 
 | ||||||
|     def format_transcripts(self, transcripts, **kwargs): |     def _format_transcript_helper(self, i, time_text, line): | ||||||
|         """A basic implementation of WEBVTT formatting for a list of transcripts. |         return "{}\n{}".format(time_text, line['text']) | ||||||
| 
 |  | ||||||
|         :param transcripts: |  | ||||||
|         :reference: https://www.w3.org/TR/webvtt1/#introduction-caption |  | ||||||
|         """ |  | ||||||
|         return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class FormatterLoader(object): | class FormatterLoader(object): | ||||||
|  | @ -141,6 +163,7 @@ class FormatterLoader(object): | ||||||
|         'pretty': PrettyPrintFormatter, |         'pretty': PrettyPrintFormatter, | ||||||
|         'text': TextFormatter, |         'text': TextFormatter, | ||||||
|         'webvtt': WebVTTFormatter, |         'webvtt': WebVTTFormatter, | ||||||
|  |         'srt' : SRTFormatter, | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     class UnknownFormatterType(Exception): |     class UnknownFormatterType(Exception): | ||||||
|  |  | ||||||
|  | @ -8,6 +8,7 @@ from youtube_transcript_api.formatters import ( | ||||||
|     Formatter, |     Formatter, | ||||||
|     JSONFormatter, |     JSONFormatter, | ||||||
|     TextFormatter, |     TextFormatter, | ||||||
|  |     SRTFormatter, | ||||||
|     WebVTTFormatter, |     WebVTTFormatter, | ||||||
|     PrettyPrintFormatter, FormatterLoader |     PrettyPrintFormatter, FormatterLoader | ||||||
| ) | ) | ||||||
|  | @ -28,6 +29,38 @@ class TestFormatters(TestCase): | ||||||
|         with self.assertRaises(NotImplementedError): |         with self.assertRaises(NotImplementedError): | ||||||
|             Formatter().format_transcripts([self.transcript]) |             Formatter().format_transcripts([self.transcript]) | ||||||
| 
 | 
 | ||||||
|  |     def test_srt_formatter_starting(self): | ||||||
|  |         content = SRTFormatter().format_transcript(self.transcript) | ||||||
|  |         lines = content.split('\n') | ||||||
|  | 
 | ||||||
|  |         # test starting lines | ||||||
|  |         self.assertEqual(lines[0], "1") | ||||||
|  |         self.assertEqual(lines[1], "00:00:00,000 --> 00:00:01,500") | ||||||
|  |          | ||||||
|  |     def test_srt_formatter_middle(self): | ||||||
|  |         content = SRTFormatter().format_transcript(self.transcript) | ||||||
|  |         lines = content.split('\n') | ||||||
|  | 
 | ||||||
|  |         # test middle lines | ||||||
|  |         self.assertEqual(lines[4], "2") | ||||||
|  |         self.assertEqual(lines[5], "00:00:01,500 --> 00:00:02,500") | ||||||
|  |         self.assertEqual(lines[6], self.transcript[1]['text']) | ||||||
|  | 
 | ||||||
|  |     def test_srt_formatter_ending(self): | ||||||
|  |         content = SRTFormatter().format_transcript(self.transcript) | ||||||
|  |         lines = content.split('\n') | ||||||
|  | 
 | ||||||
|  |         # test ending lines | ||||||
|  |         self.assertEqual(lines[-2], self.transcript[-1]['text']) | ||||||
|  |         self.assertEqual(lines[-1], "") | ||||||
|  | 
 | ||||||
|  |     def test_srt_formatter_many(self): | ||||||
|  |         formatter = SRTFormatter() | ||||||
|  |         content = formatter.format_transcripts(self.transcripts) | ||||||
|  |         formatted_single_transcript = formatter.format_transcript(self.transcript) | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript) | ||||||
|  | 
 | ||||||
|     def test_webvtt_formatter_starting(self): |     def test_webvtt_formatter_starting(self): | ||||||
|         content = WebVTTFormatter().format_transcript(self.transcript) |         content = WebVTTFormatter().format_transcript(self.transcript) | ||||||
|         lines = content.split('\n') |         lines = content.split('\n') | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue