From 7b294ed31f71fe3401dc9d544123470bccf61280 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20Mart=C3=ADnez?=
Date: Mon, 28 Aug 2023 17:32:56 +0200
Subject: [PATCH] Update dependencies. Upgrade chromadb integration.

---
 .gitignore       |  3 +++
 constants.py     |  1 -
 ingest.py        | 23 +++++++++++------------
 privateGPT.py    |  6 ++++--
 requirements.txt | 18 +++++++++---------
 5 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index 240b29e..1f7d3a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -168,3 +168,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.vscode/launch.json
+persist_directory/chroma.sqlite3
diff --git a/constants.py b/constants.py
index ca3b8a1..cae3c10 100644
--- a/constants.py
+++ b/constants.py
@@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
 
 # Define the Chroma settings
 CHROMA_SETTINGS = Settings(
-        chroma_db_impl='duckdb+parquet',
         persist_directory=PERSIST_DIRECTORY,
         anonymized_telemetry=False
 )
diff --git a/ingest.py b/ingest.py
index 62e3963..b24b302 100755
--- a/ingest.py
+++ b/ingest.py
@@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
+import chromadb
 
 load_dotenv()
 
@@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts
 
-def does_vectorstore_exist(persist_directory: str) -> bool:
+def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
     """
     Checks if vectorstore exists
     """
-    if os.path.exists(os.path.join(persist_directory, 'index')):
-        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
-            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
-            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
-            # At least 3 documents are needed in a working vectorstore
-            if len(list_index_files) > 3:
-                return True
-    return False
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    if not db.get()['documents']:
+        return False
+    return True
 
 def main():
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    # Chroma client
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
 
-    if does_vectorstore_exist(persist_directory):
+    if does_vectorstore_exist(persist_directory, embeddings):
         # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {persist_directory}")
-        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
         collection = db.get()
         texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
         print(f"Creating embeddings. May take some minutes...")
@@ -158,7 +157,7 @@ def main():
         print("Creating new vectorstore")
         texts = process_documents()
         print(f"Creating embeddings. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
     db.persist()
     db = None
 
diff --git a/privateGPT.py b/privateGPT.py
index a11fe24..e8bd587 100755
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
+import chromadb
 import os
 import argparse
 import time
@@ -26,7 +27,8 @@ def main():
     # Parse the command line arguments
     args = parse_arguments()
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
     retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
     # activate/deactivate the streaming StdOut callback for LLMs
     callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
@@ -39,7 +41,7 @@ def main():
         case _default:
             # raise exception if model_type is not supported
             raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")
-            
+
     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
     # Interactive questions and answers
     while True:
diff --git a/requirements.txt b/requirements.txt
index 808f872..1e3e71e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
-langchain==0.0.228
-gpt4all==1.0.3
-chromadb==0.3.26
-llama-cpp-python==0.1.68
-urllib3==2.0.3
-PyMuPDF==1.22.5
+langchain==0.0.274
+gpt4all==1.0.8
+chromadb==0.4.7
+llama-cpp-python==0.1.81
+urllib3==2.0.4
+PyMuPDF==1.23.1
 python-dotenv==1.0.0
-unstructured==0.8.0
-extract-msg==0.41.5
+unstructured==0.10.8
+extract-msg==0.45.0
 tabulate==0.9.0
 pandoc==2.3
 pypandoc==1.11
-tqdm==4.65.0
+tqdm==4.66.1
 sentence_transformers==2.2.2
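
For reference, the chromadb 0.4.x wiring that this patch introduces boils down to the following minimal sketch, assuming the versions pinned above (chromadb==0.4.7, langchain==0.0.274, sentence_transformers==2.2.2). The "db" path and the embeddings model name are illustrative placeholders, not values taken from this patch; privateGPT reads them from the environment.

import chromadb
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

persist_directory = "db"  # placeholder; the real value comes from PERSIST_DIRECTORY
settings = Settings(persist_directory=persist_directory, anonymized_telemetry=False)

# chromadb 0.4.x drops chroma_db_impl='duckdb+parquet'; PersistentClient stores
# the whole index in a single chroma.sqlite3 file under `path` (hence the new
# .gitignore entry for persist_directory/chroma.sqlite3).
chroma_client = chromadb.PersistentClient(path=persist_directory, settings=settings)

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # placeholder model
db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    client_settings=settings,
    client=chroma_client,
)

# The rewritten does_vectorstore_exist() reduces to this membership check:
print("vectorstore populated:", bool(db.get()["documents"]))

Note that the 0.4.x client does not read the old duckdb+parquet layout, so previously ingested documents must be re-ingested; accordingly, does_vectorstore_exist() now asks the store itself for documents instead of probing for parquet and index files on disk.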