Merge branch 'imartinez:main' into main

This commit is contained in:
Abhiruka 2023-05-20 07:42:26 +08:00 committed by GitHub
commit be1bcbca37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 23 additions and 1 deletion

View File

@ -24,6 +24,28 @@ from langchain.docstore.document import Document
from constants import CHROMA_SETTINGS from constants import CHROMA_SETTINGS
class MyElmLoader(UnstructuredEmailLoader):
    """Email (.eml) loader that falls back to text/plain when HTML is missing.

    The default ``UnstructuredEmailLoader.load`` is tried first. If it fails
    with the "text/html content not found in email" ``ValueError``, the load
    is retried with ``content_source`` set to ``"text/plain"``.
    """

    def load(self) -> List[Document]:
        """Load the email, retrying as plain text when no HTML part exists.

        Returns:
            The documents returned by ``UnstructuredEmailLoader.load``.

        Raises:
            Exception: Any failure during loading is re-raised with
                ``self.file_path`` prefixed to the message and the original
                error chained as ``__cause__``. The original exception type
                is kept when it can be constructed from a single message;
                otherwise a ``RuntimeError`` carrying the same message is
                raised.
        """
        try:
            try:
                doc = UnstructuredEmailLoader.load(self)
            except ValueError as e:
                if "text/html content not found in email" not in str(e):
                    raise
                # Try plain text. Note this updates the loader's kwargs in
                # place, so later calls on this instance also use text/plain.
                self.unstructured_kwargs["content_source"] = "text/plain"
                doc = UnstructuredEmailLoader.load(self)
        except Exception as e:
            # Add file_path to exception message.
            message = f"{self.file_path}: {e}"
            try:
                wrapped = type(e)(message)
            except Exception:
                # Some exception types (e.g. UnicodeDecodeError) cannot be
                # built from a single string; rebuilding them would raise a
                # confusing TypeError and drop the file path. Fall back to a
                # generic error that still carries the path and the cause.
                wrapped = RuntimeError(message)
            raise wrapped from e
        return doc
# Map file extensions to document loaders and their arguments # Map file extensions to document loaders and their arguments
LOADER_MAPPING = { LOADER_MAPPING = {
".csv": (CSVLoader, {}), ".csv": (CSVLoader, {}),
@ -31,7 +53,7 @@ LOADER_MAPPING = {
".doc": (UnstructuredWordDocumentLoader, {}), ".doc": (UnstructuredWordDocumentLoader, {}),
".docx": (UnstructuredWordDocumentLoader, {}), ".docx": (UnstructuredWordDocumentLoader, {}),
".enex": (EverNoteLoader, {}), ".enex": (EverNoteLoader, {}),
".eml": (UnstructuredEmailLoader, {}), ".eml": (MyElmLoader, {}),
".epub": (UnstructuredEPubLoader, {}), ".epub": (UnstructuredEPubLoader, {}),
".html": (UnstructuredHTMLLoader, {}), ".html": (UnstructuredHTMLLoader, {}),
".md": (UnstructuredMarkdownLoader, {}), ".md": (UnstructuredMarkdownLoader, {}),