_FORMATTING_TAGS is now a static property of _TranscriptParser; _get_html_regex is now private; removed preserve_formatting property of _TranscriptParser
This commit is contained in:
parent
ca93c48fa1
commit
8c62e5e276
|
@ -1,7 +1,7 @@
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
||||||
if sys.version_info.major == 2: # pragma: no cover
|
if sys.version_info.major == 2: # pragma: no cover
|
||||||
reload(sys)
|
reload(sys)
|
||||||
sys.setdefaultencoding('utf-8')
|
sys.setdefaultencoding('utf-8')
|
||||||
|
|
||||||
|
@ -95,6 +95,7 @@ class TranscriptList(object):
|
||||||
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
||||||
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
||||||
"""
|
"""
|
||||||
The constructor is only for internal use. Use the static build method instead.
|
The constructor is only for internal use. Use the static build method instead.
|
||||||
|
@ -191,7 +192,7 @@ class TranscriptList(object):
|
||||||
:rtype Transcript:
|
:rtype Transcript:
|
||||||
:raises: NoTranscriptFound
|
:raises: NoTranscriptFound
|
||||||
"""
|
"""
|
||||||
return self._find_transcript(language_codes, [self._generated_transcripts,])
|
return self._find_transcript(language_codes, [self._generated_transcripts])
|
||||||
|
|
||||||
def find_manually_created_transcript(self, language_codes):
|
def find_manually_created_transcript(self, language_codes):
|
||||||
"""
|
"""
|
||||||
|
@ -205,7 +206,7 @@ class TranscriptList(object):
|
||||||
:rtype Transcript:
|
:rtype Transcript:
|
||||||
:raises: NoTranscriptFound
|
:raises: NoTranscriptFound
|
||||||
"""
|
"""
|
||||||
return self._find_transcript(language_codes, [self._manually_created_transcripts,])
|
return self._find_transcript(language_codes, [self._manually_created_transcripts])
|
||||||
|
|
||||||
def _find_transcript(self, language_codes, transcript_dicts):
|
def _find_transcript(self, language_codes, transcript_dicts):
|
||||||
for language_code in language_codes:
|
for language_code in language_codes:
|
||||||
|
@ -287,7 +288,8 @@ class Transcript(object):
|
||||||
"""
|
"""
|
||||||
response = self._http_client.get(self._url)
|
response = self._http_client.get(self._url)
|
||||||
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
|
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
|
||||||
_raise_http_errors(response, self.video_id).text,)
|
_raise_http_errors(response, self.video_id).text,
|
||||||
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '{language_code} ("{language}"){translation_description}'.format(
|
return '{language_code} ("{language}"){translation_description}'.format(
|
||||||
|
@ -319,24 +321,24 @@ class Transcript(object):
|
||||||
|
|
||||||
|
|
||||||
class _TranscriptParser(object):
|
class _TranscriptParser(object):
|
||||||
def __init__(self, preserve_formatting=False):
|
_FORMATTING_TAGS = [
|
||||||
self.preserve_formatting = preserve_formatting
|
'strong', # important
|
||||||
self._FORMATTING_TAGS = [
|
'em', # emphasized
|
||||||
'strong', # important
|
'b', # bold
|
||||||
'em', # emphasized
|
'i', # italic
|
||||||
'b', # bold
|
'mark', # marked
|
||||||
'i', # italic
|
'small', # smaller
|
||||||
'mark', # marked
|
'del', # deleted
|
||||||
'small', # smaller
|
'ins', # inserted
|
||||||
'del', # deleted
|
'sub', # subscript
|
||||||
'ins', # inserted
|
'sup', # superscript
|
||||||
'sub', # subscript
|
]
|
||||||
'sup', # superscript
|
|
||||||
]
|
|
||||||
self._html_regex = self.get_html_regex()
|
|
||||||
|
|
||||||
def get_html_regex(self):
|
def __init__(self, preserve_formatting=False):
|
||||||
if self.preserve_formatting:
|
self._html_regex = self._get_html_regex(preserve_formatting)
|
||||||
|
|
||||||
|
def _get_html_regex(self, preserve_formatting):
|
||||||
|
if preserve_formatting:
|
||||||
formats_regex = '|'.join(self._FORMATTING_TAGS)
|
formats_regex = '|'.join(self._FORMATTING_TAGS)
|
||||||
formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
|
formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
|
||||||
html_regex = re.compile(formats_regex, re.IGNORECASE)
|
html_regex = re.compile(formats_regex, re.IGNORECASE)
|
||||||
|
|
Loading…
Reference in New Issue