Fix #294 (tested)
This commit is contained in:
		
							parent
							
								
									a862ff2be6
								
							
						
					
					
						commit
						4cda348cf8
					
				
							
								
								
									
										40
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										40
									
								
								ingest.py
								
								
								
								
							|  | @ -24,6 +24,28 @@ from langchain.docstore.document import Document | ||||||
| from constants import CHROMA_SETTINGS | from constants import CHROMA_SETTINGS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
class MyElmLoader(UnstructuredEmailLoader):
    """Email loader that retries as text/plain when no HTML part is found."""

    def load(self) -> List[Document]:
        """Load the email, falling back to the plain-text body if needed.

        The parent loader is tried first. If it raises a ``ValueError`` whose
        message contains ``'text/html content not found in email'``, the
        ``content_source`` entry of ``self.unstructured_kwargs`` is set to
        ``"text/plain"`` and the parent loader is called once more.

        Returns:
            The documents returned by ``UnstructuredEmailLoader.load``.

        Raises:
            Exception: Any error raised while loading, re-raised with
                ``self.file_path`` prefixed to its message and the original
                error chained as ``__cause__``. The original exception type
                is kept whenever it can be constructed from a single message
                string; otherwise a ``RuntimeError`` is raised instead.
        """
        try:
            try:
                doc = super().load()
            except ValueError as e:
                if "text/html content not found in email" not in str(e):
                    raise
                # No HTML part in this message: retry using the plain-text body.
                self.unstructured_kwargs["content_source"] = "text/plain"
                doc = super().load()
        except Exception as e:
            # Prefix the file path so the failing file can be identified.
            message = f"{self.file_path}: {e}"
            try:
                wrapped = type(e)(message)
            except Exception:
                # Some exception types (e.g. UnicodeDecodeError) cannot be
                # built from a single string; do not let that constructor
                # failure mask the real error.
                wrapped = RuntimeError(message)
            raise wrapped from e

        return doc
|  | 
 | ||||||
|  | 
 | ||||||
| # Map file extensions to document loaders and their arguments | # Map file extensions to document loaders and their arguments | ||||||
| LOADER_MAPPING = { | LOADER_MAPPING = { | ||||||
|     ".csv": (CSVLoader, {}), |     ".csv": (CSVLoader, {}), | ||||||
|  | @ -47,24 +69,6 @@ LOADER_MAPPING = { | ||||||
| load_dotenv() | load_dotenv() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MyElmLoader(UnstructuredEmailLoader): |  | ||||||
|     """Wrapper to fallback to text/plain when default does not work""" |  | ||||||
| 
 |  | ||||||
|     def load(self) -> List[Document]: |  | ||||||
|         """Wrapper adding fallback for elm without html""" |  | ||||||
|         try: |  | ||||||
|             doc = UnstructuredEmailLoader.load() |  | ||||||
|         except ValueError as e: |  | ||||||
|             if 'text/html content not found in email' in str(e): |  | ||||||
|                 # Try plain text |  | ||||||
|                 self.unstructured_kwargs["content_source"]="text/plain" |  | ||||||
|                 doc = UnstructuredEmailLoader.load() |  | ||||||
|             else: |  | ||||||
|                 raise |  | ||||||
| 
 |  | ||||||
|         return doc |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_single_document(file_path: str) -> Document: | def load_single_document(file_path: str) -> Document: | ||||||
|     ext = "." + file_path.rsplit(".", 1)[-1] |     ext = "." + file_path.rsplit(".", 1)[-1] | ||||||
|     if ext in LOADER_MAPPING: |     if ext in LOADER_MAPPING: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue