Update dependencies. Upgrade chromadb integration.

Iván Martínez 2023-08-28 17:32:56 +02:00
parent 54d3eee657
commit 7b294ed31f
5 changed files with 27 additions and 24 deletions

.gitignore

@@ -168,3 +168,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+.vscode/launch.json
+persist_directory/chroma.sqlite3

constants.py

@@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')

 # Define the Chroma settings
 CHROMA_SETTINGS = Settings(
-        chroma_db_impl='duckdb+parquet',
         persist_directory=PERSIST_DIRECTORY,
         anonymized_telemetry=False
 )
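
Context for this hunk: chromadb 0.4 removed the chroma_db_impl setting entirely; the duckdb+parquet backend is gone, and on-disk persistence now follows from the client class rather than a Settings flag. A minimal sketch of how the trimmed settings pair with the new client (the 'db' path is an illustrative stand-in for the env-driven PERSIST_DIRECTORY):

import chromadb
from chromadb.config import Settings

# chromadb>=0.4: no 'chroma_db_impl' flag; persistence is selected by
# instantiating a PersistentClient (sqlite-backed).
CHROMA_SETTINGS = Settings(
    persist_directory='db',  # illustrative stand-in for PERSIST_DIRECTORY
    anonymized_telemetry=False,
)
client = chromadb.PersistentClient(path='db', settings=CHROMA_SETTINGS)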

ingest.py

@@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
+import chromadb

 load_dotenv()

@@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts

-def does_vectorstore_exist(persist_directory: str) -> bool:
+def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
     """
     Checks if vectorstore exists
     """
-    if os.path.exists(os.path.join(persist_directory, 'index')):
-        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
-            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
-            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
-            # At least 3 documents are needed in a working vectorstore
-            if len(list_index_files) > 3:
-                return True
-    return False
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    if not db.get()['documents']:
+        return False
+    return True

 def main():
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    # Chroma client
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)

-    if does_vectorstore_exist(persist_directory):
+    if does_vectorstore_exist(persist_directory, embeddings):
         # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {persist_directory}")
-        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
         collection = db.get()
         texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
         print(f"Creating embeddings. May take some minutes...")
@@ -158,7 +157,7 @@ def main():
         print("Creating new vectorstore")
         texts = process_documents()
         print(f"Creating embeddings. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
     db.persist()
     db = None
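
The rewritten does_vectorstore_exist no longer probes for parquet and index files, because chromadb 0.4 keeps everything in a single sqlite file; instead it asks the store itself whether any documents are present. A self-contained sketch of that check, assuming the pins in requirements.txt below; the path and model name are illustrative stand-ins for the env-driven values:

import chromadb
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

settings = Settings(persist_directory='db', anonymized_telemetry=False)
client = chromadb.PersistentClient(path='db', settings=settings)
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Opening the wrapper is cheap; get() returns the stored records, so an
# empty 'documents' list means nothing has been ingested yet.
db = Chroma(persist_directory='db', embedding_function=embeddings,
            client_settings=settings, client=client)
if db.get()['documents']:
    print("Vectorstore found; new chunks will be appended")
else:
    print("No vectorstore yet; one will be created from scratch")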

privateGPT.py

@@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
+import chromadb
 import os
 import argparse
 import time

@@ -26,7 +27,8 @@ def main():
     # Parse the command line arguments
     args = parse_arguments()
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
     retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
     # activate/deactivate the streaming StdOut callback for LLMs
     callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]

@@ -39,7 +41,7 @@ def main():
         case _default:
             # raise exception if model_type is not supported
             raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")

     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
     # Interactive questions and answers
     while True:
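
On the query side the same pattern applies: one PersistentClient is built from CHROMA_SETTINGS and handed to the langchain wrapper, which then serves as the retriever. A hedged sketch of that wiring in isolation (k=4 and the query string are illustrative, standing in for target_source_chunks and user input):

import chromadb
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

settings = Settings(persist_directory='db', anonymized_telemetry=False)
client = chromadb.PersistentClient(path='db', settings=settings)
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
db = Chroma(persist_directory='db', embedding_function=embeddings,
            client_settings=settings, client=client)

# Same retriever construction as the diff; k mirrors target_source_chunks.
retriever = db.as_retriever(search_kwargs={"k": 4})
for doc in retriever.get_relevant_documents("example question"):
    print(doc.metadata.get('source'), doc.page_content[:80])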

requirements.txt

@@ -1,14 +1,14 @@
-langchain==0.0.228
-gpt4all==1.0.3
-chromadb==0.3.26
-llama-cpp-python==0.1.68
-urllib3==2.0.3
-PyMuPDF==1.22.5
+langchain==0.0.274
+gpt4all==1.0.8
+chromadb==0.4.7
+llama-cpp-python==0.1.81
+urllib3==2.0.4
+PyMuPDF==1.23.1
 python-dotenv==1.0.0
-unstructured==0.8.0
-extract-msg==0.41.5
+unstructured==0.10.8
+extract-msg==0.45.0
 tabulate==0.9.0
 pandoc==2.3
 pypandoc==1.11
-tqdm==4.65.0
+tqdm==4.66.1
 sentence_transformers==2.2.2