Add progress bar to load_documents function
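Enhanced the load_documents() function with a progress bar from the tqdm library, giving users real-time feedback while documents are loaded; this is especially helpful when loading a large number of documents. To advance the bar as each file finishes, loading now uses pool.imap_unordered() instead of pool.map(), so the returned documents are no longer guaranteed to be in the same order as the input files. tqdm==4.65.0 is also added to the pinned dependency list.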
This commit is contained in:
		
							parent
							
								
									e3b769d33a
								
							
						
					
					
						commit
						cb7c96b31d
					
				
							
								
								
									
										12
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										12
									
								
								ingest.py
								
								
								
								
							|  | @ -4,6 +4,7 @@ import glob | ||||||
| from typing import List | from typing import List | ||||||
| from dotenv import load_dotenv | from dotenv import load_dotenv | ||||||
| from multiprocessing import Pool | from multiprocessing import Pool | ||||||
|  | from tqdm import tqdm | ||||||
| 
 | 
 | ||||||
| from langchain.document_loaders import ( | from langchain.document_loaders import ( | ||||||
|     CSVLoader, |     CSVLoader, | ||||||
|  | @ -80,7 +81,6 @@ def load_single_document(file_path: str) -> Document: | ||||||
| 
 | 
 | ||||||
|     raise ValueError(f"Unsupported file extension '{ext}'") |     raise ValueError(f"Unsupported file extension '{ext}'") | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| def load_documents(source_dir: str) -> List[Document]: | def load_documents(source_dir: str) -> List[Document]: | ||||||
|     # Loads all documents from source documents directory |     # Loads all documents from source documents directory | ||||||
|     all_files = [] |     all_files = [] | ||||||
|  | @ -88,9 +88,15 @@ def load_documents(source_dir: str) -> List[Document]: | ||||||
|         all_files.extend( |         all_files.extend( | ||||||
|             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) |             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) | ||||||
|         ) |         ) | ||||||
|  | 
 | ||||||
|     with Pool(processes=os.cpu_count()) as pool: |     with Pool(processes=os.cpu_count()) as pool: | ||||||
|         documents = pool.map(load_single_document, all_files) |         results = [] | ||||||
|     return documents |         with tqdm(total=len(all_files), desc='Loading documents', ncols=80) as pbar: | ||||||
|  |             for i, doc in enumerate(pool.imap_unordered(load_single_document, all_files)): | ||||||
|  |                 results.append(doc) | ||||||
|  |                 pbar.update() | ||||||
|  | 
 | ||||||
|  |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def main(): | def main(): | ||||||
|  |  | ||||||
|  | @ -9,4 +9,5 @@ unstructured==0.6.6 | ||||||
| extract-msg==0.41.1 | extract-msg==0.41.1 | ||||||
| tabulate==0.9.0 | tabulate==0.9.0 | ||||||
| pandoc==2.3 | pandoc==2.3 | ||||||
| pypandoc==1.11 | pypandoc==1.11 | ||||||
|  | tqdm==4.65.0 | ||||||
		Loading…
	
		Reference in New Issue