Merge pull request #292 from jiangzhuo/feature/multiprocessing-for-document-loading
Optimize load_documents function with multiprocessing
This commit is contained in:
commit
20554a7c9d
13
ingest.py
13
ingest.py
|
@@ -3,6 +3,8 @@ import os
|
|||
import glob
|
||||
from typing import List
|
||||
from dotenv import load_dotenv
|
||||
from multiprocessing import Pool
|
||||
from tqdm import tqdm
|
||||
|
||||
from langchain.document_loaders import (
|
||||
CSVLoader,
|
||||
|
@@ -79,7 +81,6 @@ def load_single_document(file_path: str) -> Document:
|
|||
|
||||
raise ValueError(f"Unsupported file extension '{ext}'")
|
||||
|
||||
|
||||
def load_documents(source_dir: str) -> List[Document]:
|
||||
# Loads all documents from source documents directory
|
||||
all_files = []
|
||||
|
@@ -87,7 +88,15 @@ def load_documents(source_dir: str) -> List[Document]:
|
|||
all_files.extend(
|
||||
glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
|
||||
)
|
||||
return [load_single_document(file_path) for file_path in all_files]
|
||||
|
||||
with Pool(processes=os.cpu_count()) as pool:
|
||||
results = []
|
||||
with tqdm(total=len(all_files), desc='Loading documents', ncols=80) as pbar:
|
||||
for i, doc in enumerate(pool.imap_unordered(load_single_document, all_files)):
|
||||
results.append(doc)
|
||||
pbar.update()
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
@@ -9,4 +9,5 @@ unstructured==0.6.6
|
|||
extract-msg==0.41.1
|
||||
tabulate==0.9.0
|
||||
pandoc==2.3
|
||||
pypandoc==1.11
|
||||
pypandoc==1.11
|
||||
tqdm==4.65.0
|
Loading…
Reference in New Issue