parent
							
								
									ad64589c8f
								
							
						
					
					
						commit
						a862ff2be6
					
				
							
								
								
									
										20
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										20
									
								
								ingest.py
								
								
								
								
							|  | @ -31,7 +31,7 @@ LOADER_MAPPING = { | |||
|     ".doc": (UnstructuredWordDocumentLoader, {}), | ||||
|     ".docx": (UnstructuredWordDocumentLoader, {}), | ||||
|     ".enex": (EverNoteLoader, {}), | ||||
|     ".eml": (UnstructuredEmailLoader, {}), | ||||
|     ".eml": (MyElmLoader, {}), | ||||
|     ".epub": (UnstructuredEPubLoader, {}), | ||||
|     ".html": (UnstructuredHTMLLoader, {}), | ||||
|     ".md": (UnstructuredMarkdownLoader, {}), | ||||
|  | @ -47,6 +47,24 @@ LOADER_MAPPING = { | |||
| load_dotenv() | ||||
| 
 | ||||
| 
 | ||||
class MyElmLoader(UnstructuredEmailLoader):
    """Email (.eml) loader that falls back to text/plain when needed.

    Loads exactly like ``UnstructuredEmailLoader``, except that when the
    parent loader fails because the message has no text/html content, the
    load is retried using the plain-text part as the content source.
    """

    def load(self) -> List[Document]:
        """Load the email, retrying as text/plain if it has no HTML part.

        Returns:
            The documents produced by the parent loader.

        Raises:
            ValueError: Propagated unchanged when the parent loader fails
                for any reason other than missing text/html content, or
                when the text/plain retry fails as well.
        """
        try:
            # super() binds ``self``. Calling the method on the class with
            # no instance raises TypeError before any parsing happens, which
            # would make the fallback below unreachable.
            return super().load()
        except ValueError as e:
            if 'text/html content not found in email' not in str(e):
                raise
            # No HTML part: switch the content source and try once more.
            # NOTE: this mutates the instance's kwargs, so any later load()
            # on this loader also reads text/plain.
            # Assumes the base loader exposes ``unstructured_kwargs`` as a
            # dict forwarded to the partitioner -- confirm for the installed
            # langchain version.
            self.unstructured_kwargs["content_source"] = "text/plain"
            return super().load()
| 
 | ||||
| 
 | ||||
| def load_single_document(file_path: str) -> Document: | ||||
|     ext = "." + file_path.rsplit(".", 1)[-1] | ||||
|     if ext in LOADER_MAPPING: | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue