parent
							
								
									ad64589c8f
								
							
						
					
					
						commit
						a862ff2be6
					
				
							
								
								
									
										20
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										20
									
								
								ingest.py
								
								
								
								
							|  | @ -31,7 +31,7 @@ LOADER_MAPPING = { | ||||||
|     ".doc": (UnstructuredWordDocumentLoader, {}), |     ".doc": (UnstructuredWordDocumentLoader, {}), | ||||||
|     ".docx": (UnstructuredWordDocumentLoader, {}), |     ".docx": (UnstructuredWordDocumentLoader, {}), | ||||||
|     ".enex": (EverNoteLoader, {}), |     ".enex": (EverNoteLoader, {}), | ||||||
|     ".eml": (UnstructuredEmailLoader, {}), |     ".eml": (MyElmLoader, {}), | ||||||
|     ".epub": (UnstructuredEPubLoader, {}), |     ".epub": (UnstructuredEPubLoader, {}), | ||||||
|     ".html": (UnstructuredHTMLLoader, {}), |     ".html": (UnstructuredHTMLLoader, {}), | ||||||
|     ".md": (UnstructuredMarkdownLoader, {}), |     ".md": (UnstructuredMarkdownLoader, {}), | ||||||
|  | @ -47,6 +47,24 @@ LOADER_MAPPING = { | ||||||
| load_dotenv() | load_dotenv() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
class MyElmLoader(UnstructuredEmailLoader):
    """Email (.eml) loader that retries with the text/plain part on failure.

    Delegates to ``UnstructuredEmailLoader.load`` and, if that raises the
    "text/html content not found in email" ``ValueError``, switches the
    loader's ``content_source`` to ``"text/plain"`` and tries once more.
    """

    def load(self) -> List[Document]:
        """Load the email, falling back to plain text when no HTML is found.

        Returns:
            The documents produced by the parent loader's ``load``.

        Raises:
            ValueError: re-raised unchanged when the parent fails for any
                reason other than the missing text/html content.
        """
        try:
            # Bound call via super(); calling the parent's load() without an
            # instance raises TypeError before any email is parsed.
            return super().load()
        except ValueError as e:
            if "text/html content not found in email" not in str(e):
                raise
            # No HTML part was found: ask the parent to read the plain-text
            # part instead and retry once. The setting is stored on the
            # instance, so it persists for later load() calls as well.
            self.unstructured_kwargs["content_source"] = "text/plain"
            return super().load()
|  | 
 | ||||||
|  | 
 | ||||||
| def load_single_document(file_path: str) -> Document: | def load_single_document(file_path: str) -> Document: | ||||||
|     ext = "." + file_path.rsplit(".", 1)[-1] |     ext = "." + file_path.rsplit(".", 1)[-1] | ||||||
|     if ext in LOADER_MAPPING: |     if ext in LOADER_MAPPING: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue