From a862ff2be6c4af21aa0fc9a5f0943fabbe888f23 Mon Sep 17 00:00:00 2001
From: MDW
Date: Fri, 19 May 2023 01:04:42 +0200
Subject: [PATCH 1/2] Add fallback for plain elm #294 #290

---
 ingest.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/ingest.py b/ingest.py
index 47b5192..f226102 100644
--- a/ingest.py
+++ b/ingest.py
@@ -31,7 +31,7 @@ LOADER_MAPPING = {
     ".doc": (UnstructuredWordDocumentLoader, {}),
     ".docx": (UnstructuredWordDocumentLoader, {}),
     ".enex": (EverNoteLoader, {}),
-    ".eml": (UnstructuredEmailLoader, {}),
+    ".eml": (MyElmLoader, {}),
     ".epub": (UnstructuredEPubLoader, {}),
     ".html": (UnstructuredHTMLLoader, {}),
     ".md": (UnstructuredMarkdownLoader, {}),
@@ -47,6 +47,24 @@ LOADER_MAPPING = {
 load_dotenv()
 
 
+class MyElmLoader(UnstructuredEmailLoader):
+    """Wrapper to fallback to text/plain when default does not work"""
+
+    def load(self) -> List[Document]:
+        """Wrapper adding fallback for elm without html"""
+        try:
+            doc = UnstructuredEmailLoader.load()
+        except ValueError as e:
+            if 'text/html content not found in email' in str(e):
+                # Try plain text
+                self.unstructured_kwargs["content_source"]="text/plain"
+                doc = UnstructuredEmailLoader.load()
+            else:
+                raise
+
+        return doc
+
+
 def load_single_document(file_path: str) -> Document:
     ext = "." + file_path.rsplit(".", 1)[-1]
     if ext in LOADER_MAPPING:

From 4cda348cf87f56ff237e376b03732b1b47a99215 Mon Sep 17 00:00:00 2001
From: MDW
Date: Fri, 19 May 2023 16:23:09 +0200
Subject: [PATCH 2/2] Fix #294 (tested)

---
 ingest.py | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/ingest.py b/ingest.py
index f226102..12049ab 100644
--- a/ingest.py
+++ b/ingest.py
@@ -24,6 +24,28 @@ from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
 
 
+class MyElmLoader(UnstructuredEmailLoader):
+    """Wrapper to fallback to text/plain when default does not work"""
+
+    def load(self) -> List[Document]:
+        """Wrapper adding fallback for elm without html"""
+        try:
+            try:
+                doc = UnstructuredEmailLoader.load(self)
+            except ValueError as e:
+                if 'text/html content not found in email' in str(e):
+                    # Try plain text
+                    self.unstructured_kwargs["content_source"]="text/plain"
+                    doc = UnstructuredEmailLoader.load(self)
+                else:
+                    raise
+        except Exception as e:
+            # Add file_path to exception message
+            raise type(e)(f"{self.file_path}: {e}") from e
+
+        return doc
+
+
 # Map file extensions to document loaders and their arguments
 LOADER_MAPPING = {
     ".csv": (CSVLoader, {}),
@@ -47,24 +69,6 @@ LOADER_MAPPING = {
 load_dotenv()
 
 
-class MyElmLoader(UnstructuredEmailLoader):
-    """Wrapper to fallback to text/plain when default does not work"""
-
-    def load(self) -> List[Document]:
-        """Wrapper adding fallback for elm without html"""
-        try:
-            doc = UnstructuredEmailLoader.load()
-        except ValueError as e:
-            if 'text/html content not found in email' in str(e):
-                # Try plain text
-                self.unstructured_kwargs["content_source"]="text/plain"
-                doc = UnstructuredEmailLoader.load()
-            else:
-                raise
-
-        return doc
-
-
 def load_single_document(file_path: str) -> Document:
     ext = "." + file_path.rsplit(".", 1)[-1]
     if ext in LOADER_MAPPING: