Merge branch 'imartinez:main' into main
This commit is contained in:
commit
be1bcbca37
24
ingest.py
24
ingest.py
|
@ -24,6 +24,28 @@ from langchain.docstore.document import Document
|
|||
from constants import CHROMA_SETTINGS
|
||||
|
||||
|
||||
class MyElmLoader(UnstructuredEmailLoader):
|
||||
"""Wrapper to fallback to text/plain when default does not work"""
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Wrapper adding fallback for elm without html"""
|
||||
try:
|
||||
try:
|
||||
doc = UnstructuredEmailLoader.load(self)
|
||||
except ValueError as e:
|
||||
if 'text/html content not found in email' in str(e):
|
||||
# Try plain text
|
||||
self.unstructured_kwargs["content_source"]="text/plain"
|
||||
doc = UnstructuredEmailLoader.load(self)
|
||||
else:
|
||||
raise
|
||||
except Exception as e:
|
||||
# Add file_path to exception message
|
||||
raise type(e)(f"{self.file_path}: {e}") from e
|
||||
|
||||
return doc
|
||||
|
||||
|
||||
# Map file extensions to document loaders and their arguments
|
||||
LOADER_MAPPING = {
|
||||
".csv": (CSVLoader, {}),
|
||||
|
@ -31,7 +53,7 @@ LOADER_MAPPING = {
|
|||
".doc": (UnstructuredWordDocumentLoader, {}),
|
||||
".docx": (UnstructuredWordDocumentLoader, {}),
|
||||
".enex": (EverNoteLoader, {}),
|
||||
".eml": (UnstructuredEmailLoader, {}),
|
||||
".eml": (MyElmLoader, {}),
|
||||
".epub": (UnstructuredEPubLoader, {}),
|
||||
".html": (UnstructuredHTMLLoader, {}),
|
||||
".md": (UnstructuredMarkdownLoader, {}),
|
||||
|
|
Loading…
Reference in New Issue