diff --git a/ingest.py b/ingest.py
index 47b5192..f226102 100644
--- a/ingest.py
+++ b/ingest.py
@@ -31,7 +31,7 @@ LOADER_MAPPING = {
     ".doc": (UnstructuredWordDocumentLoader, {}),
     ".docx": (UnstructuredWordDocumentLoader, {}),
     ".enex": (EverNoteLoader, {}),
-    ".eml": (UnstructuredEmailLoader, {}),
+    ".eml": (MyElmLoader, {}),
     ".epub": (UnstructuredEPubLoader, {}),
     ".html": (UnstructuredHTMLLoader, {}),
     ".md": (UnstructuredMarkdownLoader, {}),
@@ -47,6 +47,38 @@ LOADER_MAPPING = {
 load_dotenv()
 
 
+# NOTE(review): LOADER_MAPPING is built earlier in this module and names
+# MyElmLoader, so this class has to be defined before that dict literal is
+# evaluated; as placed here, importing the module raises NameError.
+class MyElmLoader(UnstructuredEmailLoader):
+    """Email loader that falls back to the text/plain part.
+
+    When the parent loader fails with "text/html content not found in
+    email", the same file is loaded again with content_source set to
+    "text/plain".
+    """
+
+    def load(self) -> List[Document]:
+        """Load the email, retrying as plain text when it has no HTML part.
+
+        Returns:
+            The documents produced by the parent loader.
+
+        Raises:
+            ValueError: Re-raised unchanged when the parent loader fails for
+                any reason other than the missing text/html content.
+        """
+        try:
+            # super() binds self; the bare class call passed no instance.
+            return super().load()
+        except ValueError as e:
+            if "text/html content not found in email" not in str(e):
+                raise
+            # Try plain text
+            self.unstructured_kwargs["content_source"] = "text/plain"
+            return super().load()
+
+
 def load_single_document(file_path: str) -> Document:
     ext = "." + file_path.rsplit(".", 1)[-1]
     if ext in LOADER_MAPPING: