Add more functionality to TranscriptFormatter base class

Due to the behavior of the CLI and API, more flexibility was needed for combining one or many transcripts for a given formatter.

- Can now specify a DELIMITER on which to join multiple transcripts.
- Can also specify how those items are combined by overriding the `combine` class method.

Remove unused imports
Adjust some lines to meet PEP 8
This commit is contained in:
Chris Howell 2020-07-09 00:04:08 -07:00
parent 2c79bd563c
commit 1c0d584959
1 changed files with 56 additions and 39 deletions

View File

@ -1,13 +1,9 @@
from abc import ABCMeta
from abc import ABC
from abc import abstractclassmethod
from collections import defaultdict
import json
import re
from xml.etree import ElementTree
from ._html_unescaping import unescape
def parse_timecode(time):
"""Converts a `time` into a formatted transcript timecode.
@ -31,15 +27,29 @@ def parse_timecode(time):
return f"{hours}:{mins}:{secs},{ms}"
class TranscriptFormatter(metaclass=ABCMeta):
"""
Abstract Base TranscriptFormatter class
class TranscriptFormatter(ABC):
"""Abstract Base TranscriptFormatter class
This class should be inherited from to create additional
custom transcript formatters.
"""
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
DELIMITER = ''
@classmethod
def combine(cls, transcripts):
"""Subclass may override this class method.
Default behavior of this method is to join the str() of each
transcript in transcripts on `cls.DELIMITER` (an empty string in
this base class).
:param transcripts: a list of many transcripts
:type transcripts: list[<formatted transcript>, ...]
:return: A string joined on the `cls.DELIMITER` to combine transcripts
:rtype: str
"""
return cls.DELIMITER.join(
str(transcript) for transcript in transcripts)
@abstractclassmethod
def format(cls, transcript_data):
@ -56,9 +66,15 @@ class TranscriptFormatter(metaclass=ABCMeta):
class JSONTranscriptFormatter(TranscriptFormatter):
"""Formatter for outputting JSON data"""
DELIMITER = ','
@classmethod
def combine(cls, transcripts):
return json.dumps(transcripts)
@classmethod
def format(cls, transcript_data):
return [json.dumps(transcript_data)] if transcript_data else []
return transcript_data
class TextTranscriptFormatter(TranscriptFormatter):
@ -66,39 +82,40 @@ class TextTranscriptFormatter(TranscriptFormatter):
Converts the fetched transcript data into separated lines of
plain text separated by newline breaks (\n) with no timecodes.
"""
DELIMITER = '\n\n'
@classmethod
def format(cls, transcript_data):
return ['\n'.join(line['text'] for transcript in transcript_data
for line in transcript)]
return '{}\n'.format('\n'.join(
line['text']for line in transcript_data))
class SRTTranscriptFormatter(TranscriptFormatter):
"""Formatter for outputting the SRT Format
Converts the fetched transcript data into a simple .srt file format.
"""
DELIMITER = '\n\n'
@classmethod
def format(cls, transcript_data):
contents = []
for transcript in transcript_data:
content = []
for frame, item in enumerate(transcript, start=1):
output = []
for frame, item in enumerate(transcript_data, start=1):
start_time = float(item.get('start'))
duration = float(item.get('dur', '0.0'))
end_time = parse_timecode(start_time + duration)
start_time = parse_timecode(start_time)
content.append("{frame}\n".format(frame=frame))
content.append("{start_time} --> {end_time}\n".format(
output.append("{frame}\n".format(frame=frame))
output.append("{start_time} --> {end_time}\n".format(
start_time=start_time, end_time=end_time))
content.append("{text}\n\n".format(text=item.get('text')))
output.append("{text}".format(text=item.get('text')))
if frame < len(transcript_data):
output.append('\n\n')
contents.append(''.join(content))
return ['\n\n'.join(contents)]
return '{}\n'.format(''.join(output))
class TranscriptFormatterFactory: