175 lines
5.4 KiB
Python
175 lines
5.4 KiB
Python
from collections import defaultdict
|
|
import json
|
|
import re
|
|
|
|
|
|
def parse_timecode(time):
|
|
"""Converts a `time` into a formatted transcript timecode.
|
|
|
|
:param time: a float representing time in seconds.
|
|
:type time: float
|
|
:return: a string formatted as a timecode, 'HH:MM:SS,MS'
|
|
:rtype str
|
|
|
|
:example:
|
|
>>> parse_timecode(6.93)
|
|
'00:00:06,930'
|
|
"""
|
|
|
|
time = float(time)
|
|
times = {
|
|
'hours': str(int(time) // 3600).rjust(2, '0'),
|
|
'mins': str(int(time) // 60).rjust(2, '0'),
|
|
'secs': str(int(time) % 60).rjust(2, '0'),
|
|
'ms': str(int(round((time - int(time))*1000, 2))).rjust(3, '0')
|
|
}
|
|
return "{hours}:{mins}:{secs},{ms}".format(**times)
|
|
|
|
|
|
class TranscriptFormatter(object):
|
|
"""Abstract Base TranscriptFormatter class
|
|
|
|
This class should be inherited from to create additional
|
|
custom transcript formatters.
|
|
"""
|
|
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
|
|
DELIMITER = ''
|
|
|
|
@classmethod
|
|
def combine(cls, transcripts):
|
|
"""Subclass may override this class method.
|
|
|
|
Default behavior of this method will ''.join() the str()
|
|
of each transcript in transcripts.
|
|
|
|
:param transcripts: a list of many transcripts
|
|
:type transcript_data: list[<formatted transcript>, ...]
|
|
:return: A string joined on the `cls.DELIMITER` to combine transcripts
|
|
:rtype: str
|
|
"""
|
|
return cls.DELIMITER.join(
|
|
str(transcript) for transcript in transcripts)
|
|
|
|
@classmethod
|
|
def format(cls, transcript_data):
|
|
"""Any subclass must implement this format class method.
|
|
|
|
:param transcript_data: a list of transcripts, 1 or more.
|
|
:type transcript_data: list[list[dict], list[dict]]
|
|
:return: A list where each item is an individual transcript
|
|
as a string.
|
|
:rtype: list[str]
|
|
"""
|
|
raise NotImplementedError(
|
|
cls.__name__ + '.format'
|
|
)
|
|
|
|
|
|
class JSONTranscriptFormatter(TranscriptFormatter):
|
|
"""Formatter for outputting JSON data"""
|
|
DELIMITER = ','
|
|
|
|
@classmethod
|
|
def combine(cls, transcripts):
|
|
return json.dumps(transcripts)
|
|
|
|
@classmethod
|
|
def format(cls, transcript_data):
|
|
return transcript_data
|
|
|
|
|
|
class TextTranscriptFormatter(TranscriptFormatter):
|
|
"""Formatter for outputting a Plain Text Format
|
|
|
|
Converts the fetched transcript data into separated lines of
|
|
plain text separated by newline breaks (\n) with no timecodes.
|
|
"""
|
|
DELIMITER = '\n\n'
|
|
|
|
@classmethod
|
|
def format(cls, transcript_data):
|
|
return '{}\n'.format('\n'.join(
|
|
line['text']for line in transcript_data))
|
|
|
|
|
|
class SRTTranscriptFormatter(TranscriptFormatter):
|
|
"""Formatter for outputting the SRT Format
|
|
|
|
Converts the fetched transcript data into a simple .srt file format.
|
|
"""
|
|
DELIMITER = '\n\n'
|
|
|
|
@classmethod
|
|
def format(cls, transcript_data):
|
|
output = []
|
|
for frame, item in enumerate(transcript_data, start=1):
|
|
start_time = float(item.get('start'))
|
|
duration = float(item.get('duration', '0.0'))
|
|
|
|
output.append("{frame}\n".format(frame=frame))
|
|
output.append("{start_time} --> {end_time}\n".format(
|
|
start_time=parse_timecode(start_time),
|
|
end_time=parse_timecode(start_time + duration)
|
|
))
|
|
output.append("{text}".format(text=item.get('text')))
|
|
if frame < len(transcript_data):
|
|
output.append('\n\n')
|
|
return '{}\n'.format(''.join(output))
|
|
|
|
|
|
class TranscriptFormatterFactory(object):
|
|
"""A Transcript Class Factory
|
|
|
|
Allows for adding additional custom Transcript classes for the API
|
|
to use. Custom Transcript classes must inherit from the
|
|
TranscriptFormatter abstract base class.
|
|
"""
|
|
def __init__(self):
|
|
self._formatters = defaultdict(JSONTranscriptFormatter)
|
|
|
|
def add_formatter(self, name, formatter_class):
|
|
"""Allows for creating additional transcript formatters.
|
|
|
|
|
|
:param name: a name given to the `formatter_class`
|
|
:type name: str
|
|
:param formatter_class: a subclass of TranscriptFormatter
|
|
:type formatter_class: class
|
|
:rtype None
|
|
"""
|
|
if not issubclass(formatter_class, TranscriptFormatter):
|
|
raise TypeError((
|
|
'{0} must be a subclass of TranscriptFormatter'
|
|
).format(formatter_class)
|
|
)
|
|
self._formatters.update({name: formatter_class})
|
|
|
|
def add_formatters(self, formatters_dict):
|
|
"""Allow creation of multiple transcript formatters at a time.
|
|
|
|
:param formatters_dict: key(s) are the string name to be given
|
|
to the formatter class, value for each key should be a subclass
|
|
of TranscriptFormatter.
|
|
:type formatters_dict: dict
|
|
:rtype None
|
|
"""
|
|
for name, formatter_class in formatters_dict.items():
|
|
self.add_formatter(name, formatter_class)
|
|
|
|
def get_formatter(self, name):
|
|
"""Retrieve a formatter class by its assigned name.
|
|
|
|
:param name: the string name given to the formatter class.
|
|
:type name: str
|
|
:return: a subclass of `TranscriptFormatter`
|
|
"""
|
|
return self._formatters[name]
|
|
|
|
|
|
formats = TranscriptFormatterFactory()
|
|
formats.add_formatters({
|
|
'json': JSONTranscriptFormatter,
|
|
'srt': SRTTranscriptFormatter,
|
|
'text': TextTranscriptFormatter
|
|
})
|