Add more functionality to TranscriptFormatter base class
Due to the behavior of the CLI and API, more flexibility was needed for combining one or many transcripts for a given formatter. - A formatter can now specify a DELIMITER on which multiple transcripts are joined. - A formatter can also control how those items are combined by overriding the `combine` class method. Remove unused imports. Adjust some lines to meet PEP 8.
This commit is contained in:
		
							parent
							
								
									2c79bd563c
								
							
						
					
					
						commit
						1c0d584959
					
				|  | @ -1,13 +1,9 @@ | |||
| from abc import ABCMeta | ||||
| from abc import ABC | ||||
| from abc import abstractclassmethod | ||||
| from collections import defaultdict | ||||
| import json | ||||
| import re | ||||
| 
 | ||||
| from xml.etree import ElementTree | ||||
| 
 | ||||
| from ._html_unescaping import unescape | ||||
| 
 | ||||
| 
 | ||||
| def parse_timecode(time): | ||||
|     """Converts a `time` into a formatted transcript timecode. | ||||
|  | @ -31,15 +27,29 @@ def parse_timecode(time): | |||
|     return f"{hours}:{mins}:{secs},{ms}" | ||||
| 
 | ||||
| 
 | ||||
| class TranscriptFormatter(metaclass=ABCMeta): | ||||
|     """ | ||||
|     Abstract Base TranscriptFormatter class | ||||
| class TranscriptFormatter(ABC): | ||||
|     """Abstract Base TranscriptFormatter class | ||||
| 
 | ||||
|     This class should be inherited from to create additional | ||||
|      custom transcript formatters. | ||||
|      | ||||
|     """ | ||||
|     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) | ||||
|     DELIMITER = '' | ||||
| 
 | ||||
|     @classmethod | ||||
|     def combine(cls, transcripts): | ||||
|         """Subclass may override this class method. | ||||
| 
 | ||||
|         Default behavior of this method will ''.join() the str()  | ||||
|          of each transcript in transcripts. | ||||
| 
 | ||||
|         :param transcripts: a list of many transcripts | ||||
|         :type transcript_data: list[<formatted transcript>, ...] | ||||
|         :return: A string joined on the `cls.DELIMITER` to combine transcripts | ||||
|         :rtype: str | ||||
|         """ | ||||
|         return cls.DELIMITER.join( | ||||
|                 str(transcript) for transcript in transcripts) | ||||
| 
 | ||||
|     @abstractclassmethod | ||||
|     def format(cls, transcript_data): | ||||
|  | @ -56,9 +66,15 @@ class TranscriptFormatter(metaclass=ABCMeta): | |||
| 
 | ||||
| class JSONTranscriptFormatter(TranscriptFormatter): | ||||
|     """Formatter for outputting JSON data""" | ||||
|     DELIMITER = ',' | ||||
| 
 | ||||
|     @classmethod | ||||
|     def combine(cls, transcripts): | ||||
|         return json.dumps(transcripts) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def format(cls, transcript_data): | ||||
|         return [json.dumps(transcript_data)] if transcript_data else [] | ||||
|         return transcript_data | ||||
| 
 | ||||
| 
 | ||||
| class TextTranscriptFormatter(TranscriptFormatter): | ||||
|  | @ -66,39 +82,40 @@ class TextTranscriptFormatter(TranscriptFormatter): | |||
| 
 | ||||
|     Converts the fetched transcript data into separated lines of | ||||
|      plain text separated by newline breaks (\n) with no timecodes. | ||||
|      | ||||
|     """ | ||||
|     DELIMITER = '\n\n' | ||||
| 
 | ||||
|     @classmethod | ||||
|     def format(cls, transcript_data): | ||||
|         return ['\n'.join(line['text'] for transcript in transcript_data | ||||
|                             for line in transcript)] | ||||
|         return '{}\n'.format('\n'.join( | ||||
|                     line['text']for line in transcript_data)) | ||||
| 
 | ||||
| 
 | ||||
| class SRTTranscriptFormatter(TranscriptFormatter): | ||||
|     """Formatter for outputting the SRT Format | ||||
| 
 | ||||
|     Converts the fetched transcript data into a simple .srt file format. | ||||
| 
 | ||||
|     """ | ||||
|     DELIMITER = '\n\n' | ||||
| 
 | ||||
|     @classmethod | ||||
|     def format(cls, transcript_data): | ||||
|         contents = [] | ||||
|         for transcript in transcript_data: | ||||
|             content = [] | ||||
|             for frame, item in enumerate(transcript, start=1): | ||||
|         output = [] | ||||
|         for frame, item in enumerate(transcript_data, start=1): | ||||
|             start_time = float(item.get('start')) | ||||
|             duration = float(item.get('dur', '0.0')) | ||||
| 
 | ||||
|             end_time = parse_timecode(start_time + duration) | ||||
|             start_time = parse_timecode(start_time) | ||||
| 
 | ||||
|                 content.append("{frame}\n".format(frame=frame)) | ||||
|                 content.append("{start_time} --> {end_time}\n".format( | ||||
|             output.append("{frame}\n".format(frame=frame)) | ||||
|             output.append("{start_time} --> {end_time}\n".format( | ||||
|                 start_time=start_time, end_time=end_time)) | ||||
|                 content.append("{text}\n\n".format(text=item.get('text'))) | ||||
|             output.append("{text}".format(text=item.get('text'))) | ||||
|             if frame < len(transcript_data): | ||||
|                 output.append('\n\n') | ||||
| 
 | ||||
|             contents.append(''.join(content)) | ||||
|         return ['\n\n'.join(contents)] | ||||
|         return '{}\n'.format(''.join(output)) | ||||
| 
 | ||||
| 
 | ||||
| class TranscriptFormatterFactory: | ||||
|  | @ -124,7 +141,7 @@ class TranscriptFormatterFactory: | |||
|         if not issubclass(formatter_class, TranscriptFormatter): | ||||
|             raise TypeError( | ||||
|                 f'{formatter_class} must be a subclass of TranscriptFormatter') | ||||
|         self._formatters.update({name:formatter_class}) | ||||
|         self._formatters.update({name: formatter_class}) | ||||
| 
 | ||||
|     def add_formatters(self, formatters_dict): | ||||
|         """Allow creation of multiple transcript formatters at a time. | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue