Update dependencies. Upgrade chromadb integration.

This commit is contained in:
Iván Martínez 2023-08-28 17:32:56 +02:00
parent 54d3eee657
commit 7b294ed31f
5 changed files with 27 additions and 24 deletions

3
.gitignore vendored
View File

@ -168,3 +168,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.vscode/launch.json
persist_directory/chroma.sqlite3

View File

@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
# Define the Chroma settings
CHROMA_SETTINGS = Settings(
chroma_db_impl='duckdb+parquet',
persist_directory=PERSIST_DIRECTORY,
anonymized_telemetry=False
)

View File

@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from constants import CHROMA_SETTINGS
import chromadb
load_dotenv()
@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
return texts
def does_vectorstore_exist(persist_directory: str) -> bool:
def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
"""
Checks if vectorstore exists
"""
if os.path.exists(os.path.join(persist_directory, 'index')):
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
# At least 3 documents are needed in a working vectorstore
if len(list_index_files) > 3:
return True
return False
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
if not db.get()['documents']:
return False
return True
def main():
# Create embeddings
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
# Chroma client
chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
if does_vectorstore_exist(persist_directory):
if does_vectorstore_exist(persist_directory, embeddings):
# Update and store locally vectorstore
print(f"Appending to existing vectorstore at {persist_directory}")
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
collection = db.get()
texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
print(f"Creating embeddings. May take some minutes...")
@ -158,7 +157,7 @@ def main():
print("Creating new vectorstore")
texts = process_documents()
print(f"Creating embeddings. May take some minutes...")
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
db.persist()
db = None

View File

@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All, LlamaCpp
import chromadb
import os
import argparse
import time
@ -26,7 +27,8 @@ def main():
# Parse the command line arguments
args = parse_arguments()
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
# activate/deactivate the streaming StdOut callback for LLMs
callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
@ -39,7 +41,7 @@ def main():
case _default:
# raise exception if model_type is not supported
raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
# Interactive questions and answers
while True:

View File

@ -1,14 +1,14 @@
langchain==0.0.228
gpt4all==1.0.3
chromadb==0.3.26
llama-cpp-python==0.1.68
urllib3==2.0.3
PyMuPDF==1.22.5
langchain==0.0.274
gpt4all==1.0.8
chromadb==0.4.7
llama-cpp-python==0.1.81
urllib3==2.0.4
PyMuPDF==1.23.1
python-dotenv==1.0.0
unstructured==0.8.0
extract-msg==0.41.5
unstructured==0.10.8
extract-msg==0.45.0
tabulate==0.9.0
pandoc==2.3
pypandoc==1.11
tqdm==4.65.0
tqdm==4.66.1
sentence_transformers==2.2.2