Add more functionality to TranscriptFormatter base class
Due to the behavior of the CLI and API, more flexibility was needed for combining one or many transcripts for a given formatter. - A DELIMITER can now be specified on which to separate multiple transcripts. - How those items are combined can also be customized by overriding the combine class method. Remove unused imports. Adjust some lines to meet PEP 8.
This commit is contained in:
parent
2c79bd563c
commit
1c0d584959
|
@ -1,13 +1,9 @@
|
|||
from abc import ABCMeta
|
||||
from abc import ABC
|
||||
from abc import abstractclassmethod
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import re
|
||||
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from ._html_unescaping import unescape
|
||||
|
||||
|
||||
def parse_timecode(time):
|
||||
"""Converts a `time` into a formatted transcript timecode.
|
||||
|
@ -31,15 +27,29 @@ def parse_timecode(time):
|
|||
return f"{hours}:{mins}:{secs},{ms}"
|
||||
|
||||
|
||||
class TranscriptFormatter(metaclass=ABCMeta):
|
||||
"""
|
||||
Abstract Base TranscriptFormatter class
|
||||
class TranscriptFormatter(ABC):
|
||||
"""Abstract Base TranscriptFormatter class
|
||||
|
||||
This class should be inherited from to create additional
|
||||
custom transcript formatters.
|
||||
|
||||
"""
|
||||
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
|
||||
DELIMITER = ''
|
||||
|
||||
@classmethod
|
||||
def combine(cls, transcripts):
|
||||
"""Subclass may override this class method.
|
||||
|
||||
Default behavior of this method will ''.join() the str()
|
||||
of each transcript in transcripts.
|
||||
|
||||
:param transcripts: a list of many transcripts
|
||||
:type transcript_data: list[<formatted transcript>, ...]
|
||||
:return: A string joined on the `cls.DELIMITER` to combine transcripts
|
||||
:rtype: str
|
||||
"""
|
||||
return cls.DELIMITER.join(
|
||||
str(transcript) for transcript in transcripts)
|
||||
|
||||
@abstractclassmethod
|
||||
def format(cls, transcript_data):
|
||||
|
@ -56,9 +66,15 @@ class TranscriptFormatter(metaclass=ABCMeta):
|
|||
|
||||
class JSONTranscriptFormatter(TranscriptFormatter):
|
||||
"""Formatter for outputting JSON data"""
|
||||
DELIMITER = ','
|
||||
|
||||
@classmethod
|
||||
def combine(cls, transcripts):
|
||||
return json.dumps(transcripts)
|
||||
|
||||
@classmethod
|
||||
def format(cls, transcript_data):
|
||||
return [json.dumps(transcript_data)] if transcript_data else []
|
||||
return transcript_data
|
||||
|
||||
|
||||
class TextTranscriptFormatter(TranscriptFormatter):
|
||||
|
@ -66,39 +82,40 @@ class TextTranscriptFormatter(TranscriptFormatter):
|
|||
|
||||
Converts the fetched transcript data into separated lines of
|
||||
plain text separated by newline breaks (\n) with no timecodes.
|
||||
|
||||
"""
|
||||
DELIMITER = '\n\n'
|
||||
|
||||
@classmethod
|
||||
def format(cls, transcript_data):
|
||||
return ['\n'.join(line['text'] for transcript in transcript_data
|
||||
for line in transcript)]
|
||||
return '{}\n'.format('\n'.join(
|
||||
line['text']for line in transcript_data))
|
||||
|
||||
|
||||
class SRTTranscriptFormatter(TranscriptFormatter):
|
||||
"""Formatter for outputting the SRT Format
|
||||
|
||||
Converts the fetched transcript data into a simple .srt file format.
|
||||
|
||||
"""
|
||||
DELIMITER = '\n\n'
|
||||
|
||||
@classmethod
|
||||
def format(cls, transcript_data):
|
||||
contents = []
|
||||
for transcript in transcript_data:
|
||||
content = []
|
||||
for frame, item in enumerate(transcript, start=1):
|
||||
output = []
|
||||
for frame, item in enumerate(transcript_data, start=1):
|
||||
start_time = float(item.get('start'))
|
||||
duration = float(item.get('dur', '0.0'))
|
||||
|
||||
end_time = parse_timecode(start_time + duration)
|
||||
start_time = parse_timecode(start_time)
|
||||
|
||||
content.append("{frame}\n".format(frame=frame))
|
||||
content.append("{start_time} --> {end_time}\n".format(
|
||||
output.append("{frame}\n".format(frame=frame))
|
||||
output.append("{start_time} --> {end_time}\n".format(
|
||||
start_time=start_time, end_time=end_time))
|
||||
content.append("{text}\n\n".format(text=item.get('text')))
|
||||
output.append("{text}".format(text=item.get('text')))
|
||||
if frame < len(transcript_data):
|
||||
output.append('\n\n')
|
||||
|
||||
contents.append(''.join(content))
|
||||
return ['\n\n'.join(contents)]
|
||||
return '{}\n'.format(''.join(output))
|
||||
|
||||
|
||||
class TranscriptFormatterFactory:
|
||||
|
@ -124,7 +141,7 @@ class TranscriptFormatterFactory:
|
|||
if not issubclass(formatter_class, TranscriptFormatter):
|
||||
raise TypeError(
|
||||
f'{formatter_class} must be a subclass of TranscriptFormatter')
|
||||
self._formatters.update({name:formatter_class})
|
||||
self._formatters.update({name: formatter_class})
|
||||
|
||||
def add_formatters(self, formatters_dict):
|
||||
"""Allow creation of multiple transcript formatters at a time.
|
||||
|
|
Loading…
Reference in New Issue