diff --git a/ingest.py b/ingest.py index 2930138..0ca8074 100755 --- a/ingest.py +++ b/ingest.py @@ -9,7 +9,7 @@ from tqdm import tqdm from langchain.document_loaders import ( CSVLoader, EverNoteLoader, - PDFMinerLoader, + PyMuPDFLoader, TextLoader, UnstructuredEmailLoader, UnstructuredEPubLoader, @@ -73,7 +73,7 @@ LOADER_MAPPING = { ".html": (UnstructuredHTMLLoader, {}), ".md": (UnstructuredMarkdownLoader, {}), ".odt": (UnstructuredODTLoader, {}), - ".pdf": (PDFMinerLoader, {}), + ".pdf": (PyMuPDFLoader, {}), ".ppt": (UnstructuredPowerPointLoader, {}), ".pptx": (UnstructuredPowerPointLoader, {}), ".txt": (TextLoader, {"encoding": "utf8"}), diff --git a/requirements.txt b/requirements.txt index 8c43e53..acfb584 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ gpt4all==0.2.3 chromadb==0.3.23 llama-cpp-python==0.1.50 urllib3==2.0.2 -pdfminer.six==20221105 +PyMuPDF==1.22.3 python-dotenv==1.0.0 unstructured==0.6.6 extract-msg==0.41.1