Update dependencies. Upgrade chromadb integration.

2023-08-28 17:32:56 +02:00 · 2023-08-28 17:32:56 +02:00 · 7b294ed31f
parent 54d3eee657
commit 7b294ed31f
5 changed files with 27 additions and 24 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -168,3 +168,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.vscode/launch.json
+persist_directory/chroma.sqlite3
--- a/constants.py
+++ b/constants.py
@@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')

 # Define the Chroma settings
 CHROMA_SETTINGS = Settings(
-        chroma_db_impl='duckdb+parquet',
        persist_directory=PERSIST_DIRECTORY,
        anonymized_telemetry=False
 )
--- a/ingest.py
+++ b/ingest.py
@@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
+import chromadb


 load_dotenv()
@@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

-def does_vectorstore_exist(persist_directory: str) -> bool:
+def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
    """
    Checks if vectorstore exists
    """
-    if os.path.exists(os.path.join(persist_directory, 'index')):
-        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
-            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
-            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
-            # At least 3 documents are needed in a working vectorstore
-            if len(list_index_files) > 3:
-                return True
-    return False
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    if not db.get()['documents']:
+        return False
+    return True

 def main():
    # Create embeddings
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    # Chroma client
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)

-    if does_vectorstore_exist(persist_directory):
+    if does_vectorstore_exist(persist_directory, embeddings):
        # Update and store locally vectorstore
        print(f"Appending to existing vectorstore at {persist_directory}")
-        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
        collection = db.get()
        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
        print(f"Creating embeddings. May take some minutes...")
@@ -158,7 +157,7 @@ def main():
        print("Creating new vectorstore")
        texts = process_documents()
        print(f"Creating embeddings. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
    db.persist()
    db = None

--- a/privateGPT.py
+++ b/privateGPT.py
@@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
+import chromadb
 import os
 import argparse
 import time
@@ -26,7 +27,8 @@ def main():
    # Parse the command line arguments
    args = parse_arguments()
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
    # activate/deactivate the streaming StdOut callback for LLMs
    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
@@ -39,7 +41,7 @@ def main():
        case _default:
            # raise exception if model_type is not supported
            raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")
-        
+
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
    # Interactive questions and answers
    while True:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
-langchain==0.0.228
-gpt4all==1.0.3
-chromadb==0.3.26
-llama-cpp-python==0.1.68
-urllib3==2.0.3
-PyMuPDF==1.22.5
+langchain==0.0.274
+gpt4all==1.0.8
+chromadb==0.4.7
+llama-cpp-python==0.1.81
+urllib3==2.0.4
+PyMuPDF==1.23.1
 python-dotenv==1.0.0
-unstructured==0.8.0
-extract-msg==0.41.5
+unstructured==0.10.8
+extract-msg==0.45.0
 tabulate==0.9.0
 pandoc==2.3
 pypandoc==1.11
-tqdm==4.65.0
+tqdm==4.66.1
 sentence_transformers==2.2.2