Update dependencies. Upgrade chromadb integration.
This commit is contained in:
		
							parent
							
								
									54d3eee657
								
							
						
					
					
						commit
						7b294ed31f
					
				|  | @@ -168,3 +168,6 @@ cython_debug/ | |||
| #  and can be added to the global gitignore or merged into this file.  For a more nuclear | ||||
| #  option (not recommended) you can uncomment the following to ignore the entire idea folder. | ||||
| #.idea/ | ||||
| 
 | ||||
| .vscode/launch.json | ||||
| persist_directory/chroma.sqlite3 | ||||
|  |  | |||
|  | @@ -9,7 +9,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY') | |||
| 
 | ||||
| # Define the Chroma settings | ||||
| CHROMA_SETTINGS = Settings( | ||||
|         chroma_db_impl='duckdb+parquet', | ||||
|         persist_directory=PERSIST_DIRECTORY, | ||||
|         anonymized_telemetry=False | ||||
| ) | ||||
|  |  | |||
							
								
								
									
										23
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										23
									
								
								ingest.py
								
								
								
								
							|  | @@ -25,6 +25,7 @@ from langchain.vectorstores import Chroma | |||
| from langchain.embeddings import HuggingFaceEmbeddings | ||||
| from langchain.docstore.document import Document | ||||
| from constants import CHROMA_SETTINGS | ||||
| import chromadb | ||||
| 
 | ||||
| 
 | ||||
| load_dotenv() | ||||
|  | @@ -128,27 +129,25 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]: | |||
|     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)") | ||||
|     return texts | ||||
| 
 | ||||
| def does_vectorstore_exist(persist_directory: str) -> bool: | ||||
| def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool: | ||||
|     """ | ||||
|     Checks if vectorstore exists | ||||
|     """ | ||||
|     if os.path.exists(os.path.join(persist_directory, 'index')): | ||||
|         if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')): | ||||
|             list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin')) | ||||
|             list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl')) | ||||
|             # At least 3 documents are needed in a working vectorstore | ||||
|             if len(list_index_files) > 3: | ||||
|                 return True | ||||
|     return False | ||||
|     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings) | ||||
|     if not db.get()['documents']: | ||||
|         return False | ||||
|     return True | ||||
| 
 | ||||
| def main(): | ||||
|     # Create embeddings | ||||
|     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name) | ||||
|     # Chroma client | ||||
|     chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory) | ||||
| 
 | ||||
|     if does_vectorstore_exist(persist_directory): | ||||
|     if does_vectorstore_exist(persist_directory, embeddings): | ||||
|         # Update and store locally vectorstore | ||||
|         print(f"Appending to existing vectorstore at {persist_directory}") | ||||
|         db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS) | ||||
|         db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client) | ||||
|         collection = db.get() | ||||
|         texts = process_documents([metadata['source'] for metadata in collection['metadatas']]) | ||||
|         print(f"Creating embeddings. May take some minutes...") | ||||
|  | @@ -158,7 +157,7 @@ def main(): | |||
|         print("Creating new vectorstore") | ||||
|         texts = process_documents() | ||||
|         print(f"Creating embeddings. May take some minutes...") | ||||
|         db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS) | ||||
|         db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client) | ||||
|     db.persist() | ||||
|     db = None | ||||
| 
 | ||||
|  |  | |||
|  | @@ -5,6 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings | |||
| from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler | ||||
| from langchain.vectorstores import Chroma | ||||
| from langchain.llms import GPT4All, LlamaCpp | ||||
| import chromadb | ||||
| import os | ||||
| import argparse | ||||
| import time | ||||
|  | @@ -26,7 +27,8 @@ def main(): | |||
|     # Parse the command line arguments | ||||
|     args = parse_arguments() | ||||
|     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name) | ||||
|     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS) | ||||
|     chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory) | ||||
|     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client) | ||||
|     retriever = db.as_retriever(search_kwargs={"k": target_source_chunks}) | ||||
|     # activate/deactivate the streaming StdOut callback for LLMs | ||||
|     callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()] | ||||
|  | @@ -39,7 +41,7 @@ def main(): | |||
|         case _default: | ||||
|             # raise exception if model_type is not supported | ||||
|             raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All") | ||||
|          | ||||
| 
 | ||||
|     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source) | ||||
|     # Interactive questions and answers | ||||
|     while True: | ||||
|  |  | |||
|  | @@ -1,14 +1,14 @@ | |||
| langchain==0.0.228 | ||||
| gpt4all==1.0.3 | ||||
| chromadb==0.3.26 | ||||
| llama-cpp-python==0.1.68 | ||||
| urllib3==2.0.3 | ||||
| PyMuPDF==1.22.5 | ||||
| langchain==0.0.274 | ||||
| gpt4all==1.0.8 | ||||
| chromadb==0.4.7 | ||||
| llama-cpp-python==0.1.81 | ||||
| urllib3==2.0.4 | ||||
| PyMuPDF==1.23.1 | ||||
| python-dotenv==1.0.0 | ||||
| unstructured==0.8.0 | ||||
| extract-msg==0.41.5 | ||||
| unstructured==0.10.8 | ||||
| extract-msg==0.45.0 | ||||
| tabulate==0.9.0 | ||||
| pandoc==2.3 | ||||
| pypandoc==1.11 | ||||
| tqdm==4.65.0 | ||||
| tqdm==4.66.1 | ||||
| sentence_transformers==2.2.2 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue