Merge pull request #660 from doskoi/master

Improving performance for PDF loader
This commit is contained in:
Iván Martínez 2023-06-11 19:10:08 +02:00 committed by GitHub
commit 51fa989679
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 3 additions and 3 deletions

View File

@@ -9,7 +9,7 @@ from tqdm import tqdm
from langchain.document_loaders import ( from langchain.document_loaders import (
CSVLoader, CSVLoader,
EverNoteLoader, EverNoteLoader,
PDFMinerLoader, PyMuPDFLoader,
TextLoader, TextLoader,
UnstructuredEmailLoader, UnstructuredEmailLoader,
UnstructuredEPubLoader, UnstructuredEPubLoader,
@@ -73,7 +73,7 @@ LOADER_MAPPING = {
".html": (UnstructuredHTMLLoader, {}), ".html": (UnstructuredHTMLLoader, {}),
".md": (UnstructuredMarkdownLoader, {}), ".md": (UnstructuredMarkdownLoader, {}),
".odt": (UnstructuredODTLoader, {}), ".odt": (UnstructuredODTLoader, {}),
".pdf": (PDFMinerLoader, {}), ".pdf": (PyMuPDFLoader, {}),
".ppt": (UnstructuredPowerPointLoader, {}), ".ppt": (UnstructuredPowerPointLoader, {}),
".pptx": (UnstructuredPowerPointLoader, {}), ".pptx": (UnstructuredPowerPointLoader, {}),
".txt": (TextLoader, {"encoding": "utf8"}), ".txt": (TextLoader, {"encoding": "utf8"}),

View File

@@ -3,7 +3,7 @@ gpt4all==0.2.3
chromadb==0.3.23 chromadb==0.3.23
llama-cpp-python==0.1.50 llama-cpp-python==0.1.50
urllib3==2.0.2 urllib3==2.0.2
pdfminer.six==20221105 PyMuPDF==1.22.3
python-dotenv==1.0.0 python-dotenv==1.0.0
unstructured==0.6.6 unstructured==0.6.6
extract-msg==0.41.1 extract-msg==0.41.1