Add more functionality to TranscriptFormatter base class
Due to the behavior of the CLI and API, more flexibility was needed for combining one or many transcripts for a given formatter. - A DELIMITER can now be specified to separate multiple transcripts. - How those items are combined can also be customized by overriding the `combine` class method. Remove unused imports. Adjust some lines to meet PEP 8.
This commit is contained in:
		
							parent
							
								
									2c79bd563c
								
							
						
					
					
						commit
						1c0d584959
					
				|  | @ -1,13 +1,9 @@ | ||||||
| from abc import ABCMeta | from abc import ABC | ||||||
| from abc import abstractclassmethod | from abc import abstractclassmethod | ||||||
| from collections import defaultdict | from collections import defaultdict | ||||||
| import json | import json | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| from xml.etree import ElementTree |  | ||||||
| 
 |  | ||||||
| from ._html_unescaping import unescape |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| def parse_timecode(time): | def parse_timecode(time): | ||||||
|     """Converts a `time` into a formatted transcript timecode. |     """Converts a `time` into a formatted transcript timecode. | ||||||
|  | @ -31,15 +27,29 @@ def parse_timecode(time): | ||||||
|     return f"{hours}:{mins}:{secs},{ms}" |     return f"{hours}:{mins}:{secs},{ms}" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TranscriptFormatter(metaclass=ABCMeta): | class TranscriptFormatter(ABC): | ||||||
|     """ |     """Abstract Base TranscriptFormatter class | ||||||
|     Abstract Base TranscriptFormatter class |  | ||||||
| 
 | 
 | ||||||
|     This class should be inherited from to create additional |     This class should be inherited from to create additional | ||||||
|      custom transcript formatters. |      custom transcript formatters. | ||||||
|      |  | ||||||
|     """ |     """ | ||||||
|     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) |     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) | ||||||
|  |     DELIMITER = '' | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def combine(cls, transcripts): | ||||||
|  |         """Subclass may override this class method. | ||||||
|  | 
 | ||||||
|  |         Default behavior of this method will ''.join() the str()  | ||||||
|  |          of each transcript in transcripts. | ||||||
|  | 
 | ||||||
|  |         :param transcripts: a list of many transcripts | ||||||
|  |         :type transcripts: list[<formatted transcript>, ...] | ||||||
|  |         :return: A string joined on the `cls.DELIMITER` to combine transcripts | ||||||
|  |         :rtype: str | ||||||
|  |         """ | ||||||
|  |         return cls.DELIMITER.join( | ||||||
|  |                 str(transcript) for transcript in transcripts) | ||||||
| 
 | 
 | ||||||
|     @abstractclassmethod |     @abstractclassmethod | ||||||
|     def format(cls, transcript_data): |     def format(cls, transcript_data): | ||||||
|  | @ -56,9 +66,15 @@ class TranscriptFormatter(metaclass=ABCMeta): | ||||||
| 
 | 
 | ||||||
| class JSONTranscriptFormatter(TranscriptFormatter): | class JSONTranscriptFormatter(TranscriptFormatter): | ||||||
|     """Formatter for outputting JSON data""" |     """Formatter for outputting JSON data""" | ||||||
|  |     DELIMITER = ',' | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def combine(cls, transcripts): | ||||||
|  |         return json.dumps(transcripts) | ||||||
|  | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def format(cls, transcript_data): |     def format(cls, transcript_data): | ||||||
|         return [json.dumps(transcript_data)] if transcript_data else [] |         return transcript_data | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TextTranscriptFormatter(TranscriptFormatter): | class TextTranscriptFormatter(TranscriptFormatter): | ||||||
|  | @ -66,39 +82,40 @@ class TextTranscriptFormatter(TranscriptFormatter): | ||||||
| 
 | 
 | ||||||
|     Converts the fetched transcript data into separated lines of |     Converts the fetched transcript data into separated lines of | ||||||
|      plain text separated by newline breaks (\n) with no timecodes. |      plain text separated by newline breaks (\n) with no timecodes. | ||||||
|      |  | ||||||
|     """ |     """ | ||||||
|  |     DELIMITER = '\n\n' | ||||||
|  | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def format(cls, transcript_data): |     def format(cls, transcript_data): | ||||||
|         return ['\n'.join(line['text'] for transcript in transcript_data |         return '{}\n'.format('\n'.join( | ||||||
|                             for line in transcript)] |                     line['text']for line in transcript_data)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class SRTTranscriptFormatter(TranscriptFormatter): | class SRTTranscriptFormatter(TranscriptFormatter): | ||||||
|     """Formatter for outputting the SRT Format |     """Formatter for outputting the SRT Format | ||||||
| 
 | 
 | ||||||
|     Converts the fetched transcript data into a simple .srt file format. |     Converts the fetched transcript data into a simple .srt file format. | ||||||
| 
 |  | ||||||
|     """ |     """ | ||||||
|  |     DELIMITER = '\n\n' | ||||||
|  | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def format(cls, transcript_data): |     def format(cls, transcript_data): | ||||||
|         contents = [] |         output = [] | ||||||
|         for transcript in transcript_data: |         for frame, item in enumerate(transcript_data, start=1): | ||||||
|             content = [] |  | ||||||
|             for frame, item in enumerate(transcript, start=1): |  | ||||||
|             start_time = float(item.get('start')) |             start_time = float(item.get('start')) | ||||||
|             duration = float(item.get('dur', '0.0')) |             duration = float(item.get('dur', '0.0')) | ||||||
| 
 | 
 | ||||||
|             end_time = parse_timecode(start_time + duration) |             end_time = parse_timecode(start_time + duration) | ||||||
|             start_time = parse_timecode(start_time) |             start_time = parse_timecode(start_time) | ||||||
| 
 | 
 | ||||||
|                 content.append("{frame}\n".format(frame=frame)) |             output.append("{frame}\n".format(frame=frame)) | ||||||
|                 content.append("{start_time} --> {end_time}\n".format( |             output.append("{start_time} --> {end_time}\n".format( | ||||||
|                 start_time=start_time, end_time=end_time)) |                 start_time=start_time, end_time=end_time)) | ||||||
|                 content.append("{text}\n\n".format(text=item.get('text'))) |             output.append("{text}".format(text=item.get('text'))) | ||||||
|  |             if frame < len(transcript_data): | ||||||
|  |                 output.append('\n\n') | ||||||
| 
 | 
 | ||||||
|             contents.append(''.join(content)) |         return '{}\n'.format(''.join(output)) | ||||||
|         return ['\n\n'.join(contents)] |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TranscriptFormatterFactory: | class TranscriptFormatterFactory: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue