From 8f369dd2b97bd34b5c042e4265d993977de9dac3 Mon Sep 17 00:00:00 2001 From: parampavar Date: Wed, 16 Aug 2023 16:03:56 -0700 Subject: [PATCH] Adding support to ingest files with extensions in uppercase Files in the source_directory were ignored if their extensions were in uppercase (e.g. *.PDF). This change supports ingestion of files that match either lowercase or uppercase extensions, like *.pdf or *.PDF. This can be enhanced further to support mixed-case extensions like *.Pdf at a later stage. The assumption is that this scenario probably accounts for less than 5% of cases. --- ingest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ingest.py b/ingest.py index 0ca8074..62e3963 100755 --- a/ingest.py +++ b/ingest.py @@ -82,7 +82,7 @@ LOADER_MAPPING = { def load_single_document(file_path: str) -> List[Document]: - ext = "." + file_path.rsplit(".", 1)[-1] + ext = "." + file_path.rsplit(".", 1)[-1].lower() if ext in LOADER_MAPPING: loader_class, loader_args = LOADER_MAPPING[ext] loader = loader_class(file_path, **loader_args) @@ -97,7 +97,10 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum all_files = [] for ext in LOADER_MAPPING: all_files.extend( - glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True) + glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True) + ) + all_files.extend( + glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True) ) filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]