Update dependencies. Upgrade chromadb integration.
parent 54d3eee657
commit 7b294ed31f
.gitignore

@@ -168,3 +168,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.vscode/launch.json
+persist_directory/chroma.sqlite3
constants.py

@@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')

 # Define the Chroma settings
 CHROMA_SETTINGS = Settings(
-        chroma_db_impl='duckdb+parquet',
         persist_directory=PERSIST_DIRECTORY,
         anonymized_telemetry=False
 )
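Context for the constants.py change: chromadb 0.4 dropped the duckdb+parquet backend, so the chroma_db_impl setting no longer exists; persistence now goes through a sqlite-backed client. A minimal sketch of the 0.4-style setup (the path and collection name below are illustrative, not from this commit):

    import chromadb
    from chromadb.config import Settings

    # chromadb >= 0.4: no chroma_db_impl; the client itself handles persistence
    settings = Settings(anonymized_telemetry=False)
    client = chromadb.PersistentClient(path="db", settings=settings)  # example path

    collection = client.get_or_create_collection("docs")  # example name
    print(client.heartbeat())   # smoke test: returns a nanosecond timestamp
    print(collection.count())   # 0 for a fresh store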

ingest.py
@@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
+import chromadb


 load_dotenv()
@@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts

-def does_vectorstore_exist(persist_directory: str) -> bool:
+def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
     """
     Checks if vectorstore exists
     """
-    if os.path.exists(os.path.join(persist_directory, 'index')):
-        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
-            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
-            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
-            # At least 3 documents are needed in a working vectorstore
-            if len(list_index_files) > 3:
-                return True
-    return False
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    if not db.get()['documents']:
+        return False
+    return True

 def main():
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    # Chroma client
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)

-    if does_vectorstore_exist(persist_directory):
+    if does_vectorstore_exist(persist_directory, embeddings):
         # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {persist_directory}")
-        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
         collection = db.get()
         texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
         print(f"Creating embeddings. May take some minutes...")
@@ -158,7 +157,7 @@ def main():
         print("Creating new vectorstore")
         texts = process_documents()
         print(f"Creating embeddings. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
     db.persist()
     db = None
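A note on the rewritten does_vectorstore_exist: the old parquet/index-file probing has no on-disk equivalent in chromadb 0.4 (data lives in chroma.sqlite3, hence the new .gitignore entry), so the check now opens the store and asks whether db.get() returns any documents. The same question can be answered from the raw client without loading an embedding model; a hedged sketch, not from this commit:

    import chromadb

    def store_has_data(path: str) -> bool:
        # A fresh PersistentClient creates an empty store; an ingested one
        # has at least one collection with a nonzero count.
        client = chromadb.PersistentClient(path=path)
        return any(collection.count() > 0 for collection in client.list_collections())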

privateGPT.py

@@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
+import chromadb
 import os
 import argparse
 import time
@@ -26,7 +27,8 @@ def main():
     # Parse the command line arguments
     args = parse_arguments()
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
     retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
     # activate/deactivate the streaming StdOut callback for LLMs
     callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
@@ -39,7 +41,7 @@ def main():
         case _default:
             # raise exception if model_type is not supported
             raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")
-
+
     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
     # Interactive questions and answers
     while True:
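Both scripts now build the same PersistentClient from CHROMA_SETTINGS and hand it to langchain's Chroma wrapper, so ingestion and querying share one on-disk store instead of each wrapper opening its own. A sketch of the query side under that pattern (model name and path are illustrative):

    import chromadb
    from langchain.vectorstores import Chroma
    from langchain.embeddings import HuggingFaceEmbeddings
    from constants import CHROMA_SETTINGS

    client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path="db")  # example path
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")        # example model
    db = Chroma(client=client, embedding_function=embeddings)
    docs = db.similarity_search("example query", k=4)  # hits the store ingest.py wrote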
requirements.txt

@@ -1,14 +1,14 @@
-langchain==0.0.228
-gpt4all==1.0.3
-chromadb==0.3.26
-llama-cpp-python==0.1.68
-urllib3==2.0.3
-PyMuPDF==1.22.5
+langchain==0.0.274
+gpt4all==1.0.8
+chromadb==0.4.7
+llama-cpp-python==0.1.81
+urllib3==2.0.4
+PyMuPDF==1.23.1
 python-dotenv==1.0.0
-unstructured==0.8.0
-extract-msg==0.41.5
+unstructured==0.10.8
+extract-msg==0.45.0
 tabulate==0.9.0
 pandoc==2.3
 pypandoc==1.11
-tqdm==4.65.0
+tqdm==4.66.1
 sentence_transformers==2.2.2
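Upgrade note: chromadb 0.4.x cannot read stores created with the 0.3.x duckdb+parquet format, so after installing the new pins either re-ingest the documents or migrate the old store; Chroma publishes a separate chroma-migrate package for the latter (assumed usage below; check its docs):

    pip install -r requirements.txt
    # only if you need to keep an existing 0.3.x store:
    pip install chroma-migrate
    chroma-migrate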