From 2b3a6f3a716e4adc17cab3e7aff0a776e11a16a7 Mon Sep 17 00:00:00 2001
From: Jonas Depoix
Date: Thu, 26 Apr 2018 13:36:14 +0200
Subject: [PATCH] support for html unescaping for all python versions added

---
Review notes (free-text commentary placed after the separator; not part of
the change itself):

* src/html_unescaping.py (new file) binds a module-level `unescape` once,
  at import time. On Python 3 with minor >= 4 it is `html.unescape`;
  otherwise it is a small wrapper around `HTMLParser().unescape`, taken
  from the `HTMLParser` module when major <= 2 and from `html.parser` for
  the remaining versions.
* src/transcript_api.py imports that `unescape` and passes
  `xml_element.text` through it before HTML_TAG_REGEX is applied, so
  entities are decoded first and tags are stripped from the decoded text.
* NOTE(review): the implementation is selected by comparing
  `sys.version_info.major` / `.minor`. A major version above 3 would fall
  into the "Python 3.0 - 3.3" branch. Feature detection (try
  `from html import unescape`, fall back on ImportError) may be more
  robust; confirm which interpreters must be supported.
* NOTE(review): because decoding happens before tag stripping, text that
  was escaped in the transcript (e.g. `&lt;b&gt;`) becomes tag-like and may
  then be removed by HTML_TAG_REGEX, whose definition is not shown in this
  patch. Confirm that this ordering is intended.
* NOTE(review): this copy of the patch had its line breaks collapsed. The
  indentation and the placement of blank context lines below were restored
  from the hunk headers and should be checked against the repository
  before the patch is applied.

 src/html_unescaping.py | 19 +++++++++++++++++++
 src/transcript_api.py  |  4 +++-
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 src/html_unescaping.py

diff --git a/src/html_unescaping.py b/src/html_unescaping.py
new file mode 100644
index 0000000..eb88b33
--- /dev/null
+++ b/src/html_unescaping.py
@@ -0,0 +1,19 @@
+import sys
+
+if sys.version_info.major == 3 and sys.version_info.minor >= 4:
+    # Python 3.4+
+    from html import unescape
+else:
+    if sys.version_info.major <= 2:
+        # Python 2
+        import HTMLParser
+
+        html_parser = HTMLParser.HTMLParser()
+    else:
+        # Python 3.0 - 3.3
+        import html.parser
+
+        html_parser = html.parser.HTMLParser()
+
+    def unescape(string):
+        return html_parser.unescape(string)
diff --git a/src/transcript_api.py b/src/transcript_api.py
index 837792b..b426003 100644
--- a/src/transcript_api.py
+++ b/src/transcript_api.py
@@ -6,6 +6,8 @@ import logging
 
 import requests
 
+from .html_unescaping import unescape
+
 
 logger = logging.getLogger(__name__)
 
@@ -112,7 +114,7 @@ class _TranscriptParser():
     def parse(self):
         return [
             {
-                'text': re.sub(self.HTML_TAG_REGEX, '', xml_element.text),
+                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
                 'start': float(xml_element.attrib['start']),
                 'duration': float(xml_element.attrib['dur']),
             }