diff --git a/src/html_unescaping.py b/src/html_unescaping.py
new file mode 100644
index 0000000..eb88b33
--- /dev/null
+++ b/src/html_unescaping.py
@@ -0,0 +1,28 @@
+"""Expose a single ``unescape`` function on every supported Python version.
+
+Python 3.4+ provides ``html.unescape``; older interpreters fall back to the
+``unescape`` method of an ``HTMLParser`` instance.
+"""
+import sys
+
+# Compare version tuples instead of the ``major``/``minor`` attributes: the
+# named attributes only exist since Python 2.7, and a tuple comparison also
+# keeps working for any version newer than 3.x.
+if sys.version_info >= (3, 4):
+    # Python 3.4+
+    from html import unescape
+else:
+    if sys.version_info < (3,):
+        # Python 2
+        import HTMLParser
+
+        html_parser = HTMLParser.HTMLParser()
+    else:
+        # Python 3.0 - 3.3
+        import html.parser
+
+        html_parser = html.parser.HTMLParser()
+
+    def unescape(string):
+        """Return ``string`` with HTML character references decoded."""
+        return html_parser.unescape(string)
diff --git a/src/transcript_api.py b/src/transcript_api.py
index 837792b..b426003 100644
--- a/src/transcript_api.py
+++ b/src/transcript_api.py
@@ -6,6 +6,8 @@ import logging
 
 import requests
 
+from .html_unescaping import unescape
+
 
 logger = logging.getLogger(__name__)
 
@@ -112,7 +114,7 @@ class _TranscriptParser():
     def parse(self):
         return [
             {
-                'text': re.sub(self.HTML_TAG_REGEX, '', xml_element.text),
+                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
                 'start': float(xml_element.attrib['start']),
                 'duration': float(xml_element.attrib['dur']),
             }