diff --git a/src/html_unescaping.py b/src/html_unescaping.py
new file mode 100644
index 0000000..eb88b33
--- /dev/null
+++ b/src/html_unescaping.py
@@ -0,0 +1,32 @@
+"""Version-independent ``unescape`` for HTML character references.
+
+Exposes a single callable, ``unescape(string)``, chosen at import time
+according to the running interpreter.
+"""
+import sys
+
+# Compare version tuples rather than the ``major``/``minor`` attributes: the
+# named fields only exist from Python 2.7 on, and an ``== 3`` test on the major
+# version would send any later major release down the legacy branch.
+if sys.version_info >= (3, 4):
+    # Python 3.4+
+    from html import unescape
+else:
+    if sys.version_info < (3,):
+        # Python 2
+        import HTMLParser
+
+        html_parser = HTMLParser.HTMLParser()
+    else:
+        # Python 3.0 - 3.3
+        import html.parser
+
+        html_parser = html.parser.HTMLParser()
+
+    def unescape(string):
+        """Return ``string`` as converted by ``html_parser.unescape``.
+
+        Fallback for interpreters where ``html.unescape`` is unavailable; it
+        delegates to the module-level ``HTMLParser`` instance.
+        """
+        return html_parser.unescape(string)
diff --git a/src/transcript_api.py b/src/transcript_api.py
index 837792b..b426003 100644
--- a/src/transcript_api.py
+++ b/src/transcript_api.py
@@ -6,6 +6,8 @@ import logging
import requests
+from .html_unescaping import unescape
+
logger = logging.getLogger(__name__)
@@ -112,7 +114,7 @@ class _TranscriptParser():
def parse(self):
return [
{
- 'text': re.sub(self.HTML_TAG_REGEX, '', xml_element.text),
+ 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']),
'duration': float(xml_element.attrib['dur']),
}