From db341e2a406c0918e9200e7b74381451c53c3d70 Mon Sep 17 00:00:00 2001 From: Ravindra Prasad Date: Wed, 31 May 2023 00:04:56 +0530 Subject: [PATCH] fixed the csv file reading issue --- ingest.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ingest.py b/ingest.py index 43f7ec2..620a652 100755 --- a/ingest.py +++ b/ingest.py @@ -81,16 +81,15 @@ LOADER_MAPPING = { } -def load_single_document(file_path: str) -> Document: +def load_single_document(file_path: str) -> List[Document]: ext = "." + file_path.rsplit(".", 1)[-1] if ext in LOADER_MAPPING: loader_class, loader_args = LOADER_MAPPING[ext] loader = loader_class(file_path, **loader_args) - return loader.load()[0] + return loader.load() raise ValueError(f"Unsupported file extension '{ext}'") - def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]: """ Loads all documents from the source documents directory, ignoring specified files @@ -98,15 +97,15 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum all_files = [] for ext in LOADER_MAPPING: all_files.extend( - glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) + glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) ) filtered_files = [file_path for file_path in all_files if file_path not in ignored_files] with Pool(processes=os.cpu_count()) as pool: results = [] with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar: - for i, doc in enumerate(pool.imap_unordered(load_single_document, filtered_files)): - results.append(doc) + for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)): + results.extend(docs) pbar.update() return results