Fixed the CSV file reading issue
This commit is contained in:
		
							parent
							
								
									60e6bd25eb
								
							
						
					
					
						commit
						db341e2a40
					
				
							
								
								
									
										11
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										11
									
								
								ingest.py
								
								
								
								
							|  | @ -81,16 +81,15 @@ LOADER_MAPPING = { | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load_single_document(file_path: str) -> Document: | def load_single_document(file_path: str) -> List[Document]: | ||||||
|     ext = "." + file_path.rsplit(".", 1)[-1] |     ext = "." + file_path.rsplit(".", 1)[-1] | ||||||
|     if ext in LOADER_MAPPING: |     if ext in LOADER_MAPPING: | ||||||
|         loader_class, loader_args = LOADER_MAPPING[ext] |         loader_class, loader_args = LOADER_MAPPING[ext] | ||||||
|         loader = loader_class(file_path, **loader_args) |         loader = loader_class(file_path, **loader_args) | ||||||
|         return loader.load()[0] |         return loader.load() | ||||||
| 
 | 
 | ||||||
|     raise ValueError(f"Unsupported file extension '{ext}'") |     raise ValueError(f"Unsupported file extension '{ext}'") | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]: | def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]: | ||||||
|     """ |     """ | ||||||
|     Loads all documents from the source documents directory, ignoring specified files |     Loads all documents from the source documents directory, ignoring specified files | ||||||
|  | @ -98,15 +97,15 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum | ||||||
|     all_files = [] |     all_files = [] | ||||||
|     for ext in LOADER_MAPPING: |     for ext in LOADER_MAPPING: | ||||||
|         all_files.extend( |         all_files.extend( | ||||||
|             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) |         glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) | ||||||
|         ) |         ) | ||||||
|     filtered_files = [file_path for file_path in all_files if file_path not in ignored_files] |     filtered_files = [file_path for file_path in all_files if file_path not in ignored_files] | ||||||
| 
 | 
 | ||||||
|     with Pool(processes=os.cpu_count()) as pool: |     with Pool(processes=os.cpu_count()) as pool: | ||||||
|         results = [] |         results = [] | ||||||
|         with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar: |         with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar: | ||||||
|             for i, doc in enumerate(pool.imap_unordered(load_single_document, filtered_files)): |             for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)): | ||||||
|                 results.append(doc) |                 results.extend(docs) | ||||||
|                 pbar.update() |                 pbar.update() | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue