Improve logging and error handling when ingesting an entire folder (#1132)
This commit is contained in:
		
							parent
							
								
									5d1be6e94c
								
							
						
					
					
						commit
						b0e258265f
					
				|  | @ -416,6 +416,12 @@ and optionally watch changes on it with the command: | |||
| make ingest /path/to/folder -- --watch | ||||
| ``` | ||||
| 
 | ||||
| To log the processed and failed files to an additional file, use: | ||||
| 
 | ||||
| ```bash | ||||
| make ingest /path/to/folder -- --watch --log-file /path/to/log/file.log | ||||
| ``` | ||||
| 
 | ||||
| After ingestion is complete, you should be able to chat with your documents | ||||
| by navigating to http://localhost:8001 and using the option `Query documents`, | ||||
| or using the completions / chat API. | ||||
|  |  | |||
|  | @ -1,13 +1,14 @@ | |||
| import argparse | ||||
| import sys | ||||
| import logging | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from private_gpt.di import root_injector | ||||
| from private_gpt.server.ingest.ingest_service import IngestService | ||||
| from private_gpt.server.ingest.ingest_watcher import IngestWatcher | ||||
| 
 | ||||
# Module-level logger; a file handler is attached further down when
# --log-file is given.
logger = logging.getLogger(__name__)

# Resolve the ingestion service from the dependency injector exactly once.
ingest_service = root_injector.get(IngestService)
| 
 | ||||
| parser = argparse.ArgumentParser(prog="ingest_folder.py") | ||||
| parser.add_argument("folder", help="Folder to ingest") | ||||
|  | @ -17,29 +18,70 @@ parser.add_argument( | |||
|     action=argparse.BooleanOptionalAction, | ||||
|     default=False, | ||||
| ) | ||||
parser.add_argument(
    "--log-file",
    help="Optional path to a log file. If provided, logs will be written to this file.",
    type=str,
    default=None,
)
args = parser.parse_args()

# Set up logging to a file if a path is provided
if args.log_file:
    # Append so that repeated runs accumulate in the same file. The encoding
    # is pinned to UTF-8 because logged document paths may contain characters
    # that the platform's default encoding cannot represent.
    file_handler = logging.FileHandler(args.log_file, mode="a", encoding="utf-8")
    file_handler.setFormatter(
        logging.Formatter(
            "[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    # NOTE(review): INFO records only reach this handler if the logger's
    # effective level is INFO or lower -- confirm that logging is configured
    # accordingly elsewhere in the package.
    logger.addHandler(file_handler)
| 
 | ||||
| 
 | ||||
| total_documents = 0 | ||||
| current_document_count = 0 | ||||
| 
 | ||||
| 
 | ||||
def count_documents(folder_path: Path) -> None:
    """Add the number of files found under ``folder_path`` to ``total_documents``.

    Sub-directories are descended into recursively. Entries that are neither
    files nor directories are ignored. The result is accumulated in the
    module-level ``total_documents`` counter; nothing is returned.
    """
    global total_documents
    for entry in folder_path.iterdir():
        if entry.is_dir():
            # Recurse first; a directory can never also be a regular file.
            count_documents(entry)
            continue
        if entry.is_file():
            total_documents += 1
| 
 | ||||
| 
 | ||||
def _recursive_ingest_folder(folder_path: Path) -> None:
    """Ingest every file under ``folder_path``, descending into sub-directories.

    For each file the module-level ``current_document_count`` is incremented,
    a progress line is logged, and the file is handed to ``_do_ingest``.
    """
    global current_document_count, total_documents
    for file_path in folder_path.iterdir():
        if file_path.is_file():
            current_document_count += 1
            # The total was counted in a separate pass, so it can be stale (or
            # still 0) if files appeared afterwards. Progress reporting must
            # never abort ingestion with a ZeroDivisionError.
            if total_documents:
                percentage = (current_document_count / total_documents) * 100
                progress_msg = f"Document {current_document_count} of {total_documents} ({percentage:.2f}%)"
            else:
                progress_msg = f"Document {current_document_count} (total unknown)"
            logger.info(progress_msg)
            _do_ingest(file_path)
        elif file_path.is_dir():
            _recursive_ingest_folder(file_path)
| 
 | ||||
| 
 | ||||
def _do_ingest(changed_path: Path) -> None:
    """Ingest a single document, logging start, completion and failure.

    Used both for the initial folder walk and as the callback handed to
    ``IngestWatcher``. A path that no longer exists is silently skipped.

    Any exception raised while ingesting is logged and swallowed on purpose:
    one broken document must not abort the rest of the batch.
    """
    try:
        if changed_path.exists():
            logger.info(f"Started ingesting {changed_path}")
            ingest_service.ingest(changed_path.name, changed_path)
            logger.info(f"Completed ingesting {changed_path}")
    except Exception as e:
        logger.error(f"Failed to ingest document: {changed_path}. Error: {e}")
| 
 | ||||
| 
 | ||||
path = Path(args.folder)
if not path.exists():
    raise ValueError(f"Path {args.folder} does not exist")
# The walk below relies on iterdir(), which fails on a regular file; reject
# that case up front with a clear message instead of a NotADirectoryError.
if not path.is_dir():
    raise ValueError(f"Path {args.folder} is not a folder")

# Count total documents before ingestion so progress can be reported
count_documents(path)

_recursive_ingest_folder(path)
if args.watch:
    logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
    # Re-ingest files as the watcher reports them, reusing the same
    # error-tolerant single-document routine as the initial pass.
    watcher = IngestWatcher(args.folder, _do_ingest)
    watcher.start()
|  |  | |||
		Loading…
	
		Reference in New Issue