Fix #294 (tested)
This commit is contained in:
parent
a862ff2be6
commit
4cda348cf8
40
ingest.py
40
ingest.py
|
@ -24,6 +24,28 @@ from langchain.docstore.document import Document
|
||||||
from constants import CHROMA_SETTINGS
|
from constants import CHROMA_SETTINGS
|
||||||
|
|
||||||
|
|
||||||
|
class MyElmLoader(UnstructuredEmailLoader):
    """Email loader that falls back to text/plain when the default load fails.

    The parent loader raises ``ValueError`` with the message
    'text/html content not found in email' for some emails; this wrapper
    retries those with ``content_source`` set to "text/plain".
    """

    def load(self) -> List[Document]:
        """Load the email, retrying as plain text if no HTML part is found.

        Returns:
            The documents returned by the parent loader's ``load``.

        Raises:
            Exception: Any error raised by the parent loader. When the
                exception type can be rebuilt from a single message, a new
                exception of the same type is raised with ``self.file_path``
                prefixed to the message (chained to the original). Otherwise
                the original exception is re-raised unchanged, annotated with
                the file path where ``add_note`` is available.
        """
        try:
            try:
                doc = super().load()
            except ValueError as e:
                if "text/html content not found in email" not in str(e):
                    raise
                # Try plain text. NOTE: this mutates the loader's kwargs, so
                # the setting persists for later calls on this instance.
                self.unstructured_kwargs["content_source"] = "text/plain"
                doc = super().load()
        except Exception as e:
            # Add file_path to exception message
            message = f"{self.file_path}: {e}"
            try:
                wrapped = type(e)(message)
            except Exception:
                # Some exception classes (e.g. UnicodeDecodeError) cannot be
                # constructed from a single message; building them would raise
                # TypeError here and hide the real failure.
                wrapped = None
            if wrapped is None:
                if hasattr(e, "add_note"):  # Python 3.11+
                    e.add_note(f"while loading {self.file_path}")
                raise
            raise wrapped from e

        return doc
|
||||||
|
|
||||||
|
|
||||||
# Map file extensions to document loaders and their arguments
|
# Map file extensions to document loaders and their arguments
|
||||||
LOADER_MAPPING = {
|
LOADER_MAPPING = {
|
||||||
".csv": (CSVLoader, {}),
|
".csv": (CSVLoader, {}),
|
||||||
|
@ -47,24 +69,6 @@ LOADER_MAPPING = {
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
class MyElmLoader(UnstructuredEmailLoader):
    """Wrapper to fallback to text/plain when default does not work"""

    def load(self) -> List[Document]:
        """Load the email, retrying as plain text if no HTML part is found.

        Returns:
            The documents returned by ``UnstructuredEmailLoader.load``.

        Raises:
            ValueError: Re-raised unchanged when the parent loader fails for
                any reason other than missing text/html content.
        """
        try:
            # The parent method must be called with this instance; calling it
            # on the class with no arguments raises TypeError (missing self).
            doc = UnstructuredEmailLoader.load(self)
        except ValueError as e:
            if 'text/html content not found in email' in str(e):
                # Try plain text
                self.unstructured_kwargs["content_source"]="text/plain"
                doc = UnstructuredEmailLoader.load(self)
            else:
                raise

        return doc
|
|
||||||
|
|
||||||
|
|
||||||
def load_single_document(file_path: str) -> Document:
|
def load_single_document(file_path: str) -> Document:
|
||||||
ext = "." + file_path.rsplit(".", 1)[-1]
|
ext = "." + file_path.rsplit(".", 1)[-1]
|
||||||
if ext in LOADER_MAPPING:
|
if ext in LOADER_MAPPING:
|
||||||
|
|
Loading…
Reference in New Issue