From 7b294ed31f71fe3401dc9d544123470bccf61280 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20Mart=C3=ADnez?=
Date: Mon, 28 Aug 2023 17:32:56 +0200
Subject: [PATCH] Update dependencies. Upgrade chromadb integration.

---
 .gitignore       |  3 +++
 constants.py     |  1 -
 ingest.py        | 23 +++++++++++------------
 privateGPT.py    |  6 ++++--
 requirements.txt | 18 +++++++++---------
 5 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index 240b29e..1f7d3a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -168,3 +168,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.vscode/launch.json
+persist_directory/chroma.sqlite3
diff --git a/constants.py b/constants.py
index ca3b8a1..cae3c10 100644
--- a/constants.py
+++ b/constants.py
@@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
 
 # Define the Chroma settings
 CHROMA_SETTINGS = Settings(
-        chroma_db_impl='duckdb+parquet',
         persist_directory=PERSIST_DIRECTORY,
         anonymized_telemetry=False
 )
diff --git a/ingest.py b/ingest.py
index 62e3963..b24b302 100755
--- a/ingest.py
+++ b/ingest.py
@@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
+import chromadb
 
 load_dotenv()
 
@@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts
 
-def does_vectorstore_exist(persist_directory: str) -> bool:
+def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
     """
     Checks if vectorstore exists
     """
-    if os.path.exists(os.path.join(persist_directory, 'index')):
-        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
-            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
-            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
-            # At least 3 documents are needed in a working vectorstore
-            if len(list_index_files) > 3:
-                return True
-    return False
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    if not db.get()['documents']:
+        return False
+    return True
 
 def main():
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    # Chroma client
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
 
-    if does_vectorstore_exist(persist_directory):
+    if does_vectorstore_exist(persist_directory, embeddings):
         # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {persist_directory}")
-        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
         collection = db.get()
         texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
         print(f"Creating embeddings. May take some minutes...")
@@ -158,7 +157,7 @@ def main():
         print("Creating new vectorstore")
         texts = process_documents()
         print(f"Creating embeddings. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
     db.persist()
     db = None
 
diff --git a/privateGPT.py b/privateGPT.py
index a11fe24..e8bd587 100755
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
+import chromadb
 import os
 import argparse
 import time
@@ -26,7 +27,8 @@ def main():
     # Parse the command line arguments
     args = parse_arguments()
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
     retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
     # activate/deactivate the streaming StdOut callback for LLMs
     callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
@@ -39,7 +41,7 @@ def main():
         case _default:
             # raise exception if model_type is not supported
             raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")
-            
+
     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
     # Interactive questions and answers
     while True:
diff --git a/requirements.txt b/requirements.txt
index 808f872..1e3e71e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
-langchain==0.0.228
-gpt4all==1.0.3
-chromadb==0.3.26
-llama-cpp-python==0.1.68
-urllib3==2.0.3
-PyMuPDF==1.22.5
+langchain==0.0.274
+gpt4all==1.0.8
+chromadb==0.4.7
+llama-cpp-python==0.1.81
+urllib3==2.0.4
+PyMuPDF==1.23.1
 python-dotenv==1.0.0
-unstructured==0.8.0
-extract-msg==0.41.5
+unstructured==0.10.8
+extract-msg==0.45.0
 tabulate==0.9.0
 pandoc==2.3
 pypandoc==1.11
-tqdm==4.65.0
+tqdm==4.66.1
 sentence_transformers==2.2.2
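
For reference, the chromadb 0.4.x wiring that this patch introduces boils down to the following minimal sketch, assuming the versions pinned above (chromadb==0.4.7, langchain==0.0.274, sentence_transformers==2.2.2). The "db" path and the embeddings model name are illustrative placeholders, not values taken from this patch; privateGPT reads them from the environment.

import chromadb
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

persist_directory = "db"  # placeholder; the real value comes from PERSIST_DIRECTORY
settings = Settings(persist_directory=persist_directory, anonymized_telemetry=False)

# chromadb 0.4.x drops chroma_db_impl='duckdb+parquet'; PersistentClient stores
# the whole index in a single chroma.sqlite3 file under `path` (hence the new
# .gitignore entry for persist_directory/chroma.sqlite3).
chroma_client = chromadb.PersistentClient(path=persist_directory, settings=settings)

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # placeholder model
db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    client_settings=settings,
    client=chroma_client,
)

# The rewritten does_vectorstore_exist() reduces to this membership check:
print("vectorstore populated:", bool(db.get()["documents"]))

Note that the 0.4.x client does not read the old duckdb+parquet layout, so previously ingested documents must be re-ingested; accordingly, does_vectorstore_exist() now asks the store itself for documents instead of probing for parquet and index files on disk.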