diff --git a/ingest.py b/ingest.py
index 47b5192..12049ab 100644
--- a/ingest.py
+++ b/ingest.py
@@ -24,6 +24,36 @@ from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
 
 
+class MyElmLoader(UnstructuredEmailLoader):
+    """Email loader that falls back to text/plain when no HTML part exists."""
+
+    def load(self) -> List[Document]:
+        """Load the .eml file, retrying as plain text if HTML is missing.
+
+        Raises:
+            Exception: any loader failure, re-raised with ``file_path``
+                prepended to the message and chained to the original error.
+        """
+        try:
+            try:
+                return super().load()
+            except ValueError as e:
+                if "text/html content not found in email" not in str(e):
+                    raise
+                # No HTML part: retry using the plain-text body instead.
+                self.unstructured_kwargs["content_source"] = "text/plain"
+                return super().load()
+        except Exception as e:
+            message = f"{self.file_path}: {e}"
+            try:
+                wrapped = type(e)(message)
+            except Exception:
+                # Some exception types (e.g. UnicodeDecodeError) cannot be
+                # built from a single message; still report the file path.
+                wrapped = RuntimeError(message)
+            raise wrapped from e
+
+
 # Map file extensions to document loaders and their arguments
 LOADER_MAPPING = {
     ".csv": (CSVLoader, {}),
@@ -31,7 +61,7 @@ LOADER_MAPPING = {
     ".doc": (UnstructuredWordDocumentLoader, {}),
     ".docx": (UnstructuredWordDocumentLoader, {}),
     ".enex": (EverNoteLoader, {}),
-    ".eml": (UnstructuredEmailLoader, {}),
+    ".eml": (MyElmLoader, {}),
     ".epub": (UnstructuredEPubLoader, {}),
     ".html": (UnstructuredHTMLLoader, {}),
     ".md": (UnstructuredMarkdownLoader, {}),