From 81b221bccbe1ce3a332e73658847452ed70cf71c Mon Sep 17 00:00:00 2001
From: jiangzhuo
Date: Fri, 19 May 2023 02:35:20 +0900
Subject: [PATCH 1/2] Optimize load_documents function with multiprocessing

---
 ingest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ingest.py b/ingest.py
index 47b5192..2566289 100644
--- a/ingest.py
+++ b/ingest.py
@@ -2,6 +2,7 @@ import os
 import glob
 from typing import List
 from dotenv import load_dotenv
+from multiprocessing import Pool
 
 from langchain.document_loaders import (
     CSVLoader,
@@ -64,7 +65,9 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
-    return [load_single_document(file_path) for file_path in all_files]
+    with Pool(processes=os.cpu_count()) as pool:
+        documents = pool.map(load_single_document, all_files)
+    return documents
 
 
 def main():

From ba0dbe8d1c456c44925387616c87168a6223d802 Mon Sep 17 00:00:00 2001
From: jiangzhuo
Date: Fri, 19 May 2023 03:18:41 +0900
Subject: [PATCH 2/2] Add progress bar to load_documents function

Enhanced the load_documents() function by adding a progress bar using
the tqdm library. This change improves user experience by providing
real-time feedback on the progress of document loading. Now, users can
easily track the progress of this operation, especially when loading a
large number of documents.
---
 ingest.py        | 12 +++++++++---
 requirements.txt |  3 ++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/ingest.py b/ingest.py
index 2566289..5138d28 100644
--- a/ingest.py
+++ b/ingest.py
@@ -3,6 +3,7 @@ import glob
 from typing import List
 from dotenv import load_dotenv
 from multiprocessing import Pool
+from tqdm import tqdm
 
 from langchain.document_loaders import (
     CSVLoader,
@@ -57,7 +58,6 @@ def load_single_document(file_path: str) -> Document:
     raise ValueError(f"Unsupported file extension '{ext}'")
 
 
-
 def load_documents(source_dir: str) -> List[Document]:
     # Loads all documents from source documents directory
     all_files = []
@@ -65,9 +65,15 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
+
     with Pool(processes=os.cpu_count()) as pool:
-        documents = pool.map(load_single_document, all_files)
-    return documents
+        results = []
+        with tqdm(total=len(all_files), desc='Loading documents', ncols=80) as pbar:
+            for i, doc in enumerate(pool.imap_unordered(load_single_document, all_files)):
+                results.append(doc)
+                pbar.update()
+
+    return results
 
 
 def main():
diff --git a/requirements.txt b/requirements.txt
index 21740bc..204b77c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ unstructured==0.6.6
 extract-msg==0.41.1
 tabulate==0.9.0
 pandoc==2.3
-pypandoc==1.11
\ No newline at end of file
+pypandoc==1.11
+tqdm==4.65.0
\ No newline at end of file