From a862ff2be6c4af21aa0fc9a5f0943fabbe888f23 Mon Sep 17 00:00:00 2001
From: MDW
Date: Fri, 19 May 2023 01:04:42 +0200
Subject: [PATCH] Add fallback for plain elm #294 #290

---
 ingest.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/ingest.py b/ingest.py
index 47b5192..f226102 100644
--- a/ingest.py
+++ b/ingest.py
@@ -47,6 +47,36 @@ LOADER_MAPPING = {
 load_dotenv()
 
 
+class MyElmLoader(UnstructuredEmailLoader):
+    """Email loader that falls back to text/plain when text/html is missing."""
+
+    def load(self) -> List[Document]:
+        """Load the email, retrying with content_source="text/plain" if needed.
+
+        Raises:
+            ValueError: re-raised unchanged when the failure is anything other
+                than the missing text/html part.
+        """
+        try:
+            # super().load() binds self; calling UnstructuredEmailLoader.load()
+            # on the class without an instance raises TypeError.
+            doc = super().load()
+        except ValueError as e:
+            if 'text/html content not found in email' in str(e):
+                # Try plain text
+                self.unstructured_kwargs["content_source"] = "text/plain"
+                doc = super().load()
+            else:
+                raise
+
+        return doc
+
+
+# Registered here instead of inside the LOADER_MAPPING literal: that dict is
+# built before this class exists, so naming MyElmLoader there is a NameError.
+LOADER_MAPPING[".eml"] = (MyElmLoader, {})
+
+
 def load_single_document(file_path: str) -> Document:
     ext = "." + file_path.rsplit(".", 1)[-1]
     if ext in LOADER_MAPPING: