Merge pull request #299 from mdeweerd/elm_extended
Add fallback for plain elm #294 #290
This commit is contained in:
		
						commit
						22945bc91d
					
				
							
								
								
									
										24
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										24
									
								
								ingest.py
								
								
								
								
							|  | @ -24,6 +24,28 @@ from langchain.docstore.document import Document | |||
| from constants import CHROMA_SETTINGS | ||||
| 
 | ||||
| 
 | ||||
class MyElmLoader(UnstructuredEmailLoader):
    """Email loader that falls back to text/plain when the default load fails."""

    def load(self) -> List[Document]:
        """Load the email, retrying as plain text when no HTML part is found.

        The parent loader is tried first. If it raises a ``ValueError`` whose
        message contains ``'text/html content not found in email'``, the
        ``content_source`` entry of ``self.unstructured_kwargs`` is set to
        ``"text/plain"`` and the parent loader is tried once more.

        Returns:
            Whatever the parent ``load`` returns.

        Raises:
            Exception: Any failure is re-raised with ``self.file_path``
                prefixed to the message and the original exception chained
                as ``__cause__``. The original exception type is kept when
                it can be built from a single message string; otherwise a
                ``RuntimeError`` is raised instead.
        """
        try:
            try:
                doc = super().load()
            except ValueError as e:
                if "text/html content not found in email" not in str(e):
                    raise
                # No HTML part: retry with the plain-text body.
                # NOTE: this setting stays on the instance, so later load()
                # calls on the same loader also use text/plain.
                self.unstructured_kwargs["content_source"] = "text/plain"
                doc = super().load()
        except Exception as e:
            # Add file_path to the exception message so the failing file
            # can be identified by the caller.
            message = f"{self.file_path}: {e}"
            try:
                wrapped = type(e)(message)
            except Exception:
                # Some exception types (e.g. UnicodeDecodeError) cannot be
                # constructed from a single string; without this fallback the
                # constructor's TypeError would replace the real error.
                wrapped = RuntimeError(message)
            raise wrapped from e

        return doc
| 
 | ||||
| 
 | ||||
| # Map file extensions to document loaders and their arguments | ||||
| LOADER_MAPPING = { | ||||
|     ".csv": (CSVLoader, {}), | ||||
|  | @ -31,7 +53,7 @@ LOADER_MAPPING = { | |||
|     ".doc": (UnstructuredWordDocumentLoader, {}), | ||||
|     ".docx": (UnstructuredWordDocumentLoader, {}), | ||||
|     ".enex": (EverNoteLoader, {}), | ||||
|     ".eml": (UnstructuredEmailLoader, {}), | ||||
|     ".eml": (MyElmLoader, {}), | ||||
|     ".epub": (UnstructuredEPubLoader, {}), | ||||
|     ".html": (UnstructuredHTMLLoader, {}), | ||||
|     ".md": (UnstructuredMarkdownLoader, {}), | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue