Merge pull request #299 from mdeweerd/elm_extended
Add fallback for plain elm #294 #290
This commit is contained in:
		
						commit
						22945bc91d
					
				
							
								
								
									
										24
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										24
									
								
								ingest.py
								
								
								
								
							|  | @ -24,6 +24,28 @@ from langchain.docstore.document import Document | ||||||
| from constants import CHROMA_SETTINGS | from constants import CHROMA_SETTINGS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
class MyElmLoader(UnstructuredEmailLoader):
    """Email (.eml) loader that falls back to text/plain when HTML is missing.

    Delegates to ``UnstructuredEmailLoader.load``; if that fails because the
    message has no text/html part, the load is retried using the plain-text
    body instead.
    """

    def load(self) -> List[Document]:
        """Load the email, retrying as plain text when no HTML part exists.

        Returns:
            The documents produced by ``UnstructuredEmailLoader.load``.

        Raises:
            Exception: Any error raised while loading. Where the exception
                type can be rebuilt from a single message, a new exception
                of the same type is raised with ``self.file_path`` prefixed
                to the message (chained to the original). Otherwise the
                original exception is re-raised unchanged, with the file
                path attached as a note when the interpreter supports it.
        """
        try:
            try:
                doc = super().load()
            except ValueError as e:
                if "text/html content not found in email" not in str(e):
                    raise
                # No HTML part: switch this loader instance to the plain-text
                # body and retry. The setting stays in place afterwards.
                self.unstructured_kwargs["content_source"] = "text/plain"
                doc = super().load()
        except Exception as e:
            # Add file_path to the exception message so the failing file can
            # be identified by the caller.
            message = f"{self.file_path}: {e}"
            try:
                wrapped = type(e)(message)
            except Exception:
                # Some exception types (e.g. UnicodeDecodeError) cannot be
                # constructed from a single string; do not let that mask the
                # real error.
                wrapped = None
            if wrapped is None:
                if hasattr(e, "add_note"):  # Python 3.11+
                    e.add_note(f"while loading {self.file_path}")
                raise
            raise wrapped from e

        return doc
|  | 
 | ||||||
|  | 
 | ||||||
| # Map file extensions to document loaders and their arguments | # Map file extensions to document loaders and their arguments | ||||||
| LOADER_MAPPING = { | LOADER_MAPPING = { | ||||||
|     ".csv": (CSVLoader, {}), |     ".csv": (CSVLoader, {}), | ||||||
|  | @ -31,7 +53,7 @@ LOADER_MAPPING = { | ||||||
|     ".doc": (UnstructuredWordDocumentLoader, {}), |     ".doc": (UnstructuredWordDocumentLoader, {}), | ||||||
|     ".docx": (UnstructuredWordDocumentLoader, {}), |     ".docx": (UnstructuredWordDocumentLoader, {}), | ||||||
|     ".enex": (EverNoteLoader, {}), |     ".enex": (EverNoteLoader, {}), | ||||||
|     ".eml": (UnstructuredEmailLoader, {}), |     ".eml": (MyElmLoader, {}), | ||||||
|     ".epub": (UnstructuredEPubLoader, {}), |     ".epub": (UnstructuredEPubLoader, {}), | ||||||
|     ".html": (UnstructuredHTMLLoader, {}), |     ".html": (UnstructuredHTMLLoader, {}), | ||||||
|     ".md": (UnstructuredMarkdownLoader, {}), |     ".md": (UnstructuredMarkdownLoader, {}), | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue