Add fallback for plain-text eml #294 #290

This commit is contained in:
MDW 2023-05-19 01:04:42 +02:00
parent ad64589c8f
commit a862ff2be6
1 changed files with 19 additions and 1 deletions

View File

@ -31,7 +31,7 @@ LOADER_MAPPING = {
".doc": (UnstructuredWordDocumentLoader, {}),
".docx": (UnstructuredWordDocumentLoader, {}),
".enex": (EverNoteLoader, {}),
".eml": (UnstructuredEmailLoader, {}),
".eml": (MyElmLoader, {}),
".epub": (UnstructuredEPubLoader, {}),
".html": (UnstructuredHTMLLoader, {}),
".md": (UnstructuredMarkdownLoader, {}),
@ -47,6 +47,24 @@ LOADER_MAPPING = {
# NOTE(review): presumably python-dotenv's load_dotenv, which reads settings
# from a .env file into the environment — confirm against the file's imports.
load_dotenv()
class MyElmLoader(UnstructuredEmailLoader):
    """Email (.eml) loader that falls back to text/plain when text/html is absent."""

    def load(self) -> List[Document]:
        """Load the email, retrying with plain-text content if HTML is missing.

        The parent loader is tried first with its configured settings. If it
        raises a ``ValueError`` reporting that no text/html content was found,
        ``content_source`` is switched to ``"text/plain"`` on this instance's
        ``unstructured_kwargs`` and the load is attempted once more.

        Returns:
            The documents produced by the parent loader's ``load``.

        Raises:
            ValueError: Re-raised unchanged when the parent loader fails for
                any reason other than missing text/html content, or when the
                plain-text retry fails as well.
        """
        try:
            # Go through super() so the call is bound to this instance; calling
            # UnstructuredEmailLoader.load() on the class without ``self``
            # raises TypeError before any email is parsed.
            return super().load()
        except ValueError as e:
            if "text/html content not found in email" not in str(e):
                raise
            # Try plain text
            self.unstructured_kwargs["content_source"] = "text/plain"
            return super().load()
def load_single_document(file_path: str) -> Document:
ext = "." + file_path.rsplit(".", 1)[-1]
if ext in LOADER_MAPPING: