Merge pull request #999 from imartinez/990-cannot-submit-more-than-166-embeddings-at-once-while-ingesting
Batch embeddings to be processed by chromadb
This commit is contained in:
		
						commit
						0b5a6687e3
					
				|  | @ -7,6 +7,7 @@ models/ | ||||||
| # Local Chroma db | # Local Chroma db | ||||||
| .chroma/ | .chroma/ | ||||||
| db/ | db/ | ||||||
|  | persist_directory/chroma.sqlite | ||||||
| 
 | 
 | ||||||
| # Byte-compiled / optimized / DLL files | # Byte-compiled / optimized / DLL files | ||||||
| __pycache__/ | __pycache__/ | ||||||
|  | @ -169,5 +170,5 @@ cython_debug/ | ||||||
| #  option (not recommended) you can uncomment the following to ignore the entire idea folder. | #  option (not recommended) you can uncomment the following to ignore the entire idea folder. | ||||||
| #.idea/ | #.idea/ | ||||||
| 
 | 
 | ||||||
|  | # vscode | ||||||
| .vscode/launch.json | .vscode/launch.json | ||||||
| persist_directory/chroma.sqlite3 |  | ||||||
|  |  | ||||||
							
								
								
									
										34
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										34
									
								
								ingest.py
								
								
								
								
							|  | @ -31,6 +31,7 @@ if not load_dotenv(): | ||||||
| 
 | 
 | ||||||
| from constants import CHROMA_SETTINGS | from constants import CHROMA_SETTINGS | ||||||
| import chromadb | import chromadb | ||||||
|  | from chromadb.api.segment import API | ||||||
| 
 | 
 | ||||||
| # Load environment variables | # Load environment variables | ||||||
| persist_directory = os.environ.get('PERSIST_DIRECTORY') | persist_directory = os.environ.get('PERSIST_DIRECTORY') | ||||||
|  | @ -126,9 +127,19 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]: | ||||||
|         exit(0) |         exit(0) | ||||||
|     print(f"Loaded {len(documents)} new documents from {source_directory}") |     print(f"Loaded {len(documents)} new documents from {source_directory}") | ||||||
|     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) | ||||||
|     texts = text_splitter.split_documents(documents) |     documents = text_splitter.split_documents(documents) | ||||||
|     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)") |     print(f"Split into {len(documents)} chunks of text (max. {chunk_size} tokens each)") | ||||||
|     return texts |     return documents | ||||||
|  | 
 | ||||||
|  | def batch_chromadb_insertions(chroma_client: API, documents: List[Document]) -> List[Document]: | ||||||
|  |     """ | ||||||
|  |     Split the total documents to be inserted into batches of documents that the local chroma client can process. | ||||||
|  |     This is a generator: it yields consecutive slices of `documents`, each holding at most | ||||||
|  |     `chroma_client.max_batch_size` items (the final slice may be shorter). An empty `documents` | ||||||
|  |     list yields nothing. | ||||||
|  |     NOTE(review): the declared return type is List[Document], but the function yields | ||||||
|  |     List[Document] batches; Iterator[List[Document]] would describe it more accurately. | ||||||
|  |     """ | ||||||
|  |     # Upper bound on the length of each batch, read from the client. | ||||||
|  |     max_batch_size = chroma_client.max_batch_size | ||||||
|  |     # Step through the list one batch at a time; slicing clamps the last batch to the end of the list. | ||||||
|  |     for i in range(0, len(documents), max_batch_size): | ||||||
|  |         yield documents[i:i + max_batch_size] | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool: | def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool: | ||||||
|     """ |     """ | ||||||
|  | @ -150,17 +161,22 @@ def main(): | ||||||
|         print(f"Appending to existing vectorstore at {persist_directory}") |         print(f"Appending to existing vectorstore at {persist_directory}") | ||||||
|         db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client) |         db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client) | ||||||
|         collection = db.get() |         collection = db.get() | ||||||
|         texts = process_documents([metadata['source'] for metadata in collection['metadatas']]) |         documents = process_documents([metadata['source'] for metadata in collection['metadatas']]) | ||||||
|         print(f"Creating embeddings. May take some minutes...") |         print(f"Creating embeddings. May take some minutes...") | ||||||
|         db.add_documents(texts) |         for batched_chromadb_insertion in batch_chromadb_insertions(chroma_client, documents): | ||||||
|  |             db.add_documents(batched_chromadb_insertion) | ||||||
|     else: |     else: | ||||||
|         # Create and store locally vectorstore |         # Create and store locally vectorstore | ||||||
|         print("Creating new vectorstore") |         print("Creating new vectorstore") | ||||||
|         texts = process_documents() |         documents = process_documents() | ||||||
|         print(f"Creating embeddings. May take some minutes...") |         print(f"Creating embeddings. May take some minutes...") | ||||||
|         db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client) |         # Create the db with the first batch of documents to insert | ||||||
|     db.persist() |         batched_chromadb_insertions = batch_chromadb_insertions(chroma_client, documents) | ||||||
|     db = None |         first_insertion = next(batched_chromadb_insertions) | ||||||
|  |         db = Chroma.from_documents(first_insertion, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client) | ||||||
|  |         # Add the rest of batches of documents | ||||||
|  |         for batched_chromadb_insertion in batched_chromadb_insertions: | ||||||
|  |             db.add_documents(batched_chromadb_insertion) | ||||||
| 
 | 
 | ||||||
|     print(f"Ingestion complete! You can now run privateGPT.py to query your documents") |     print(f"Ingestion complete! You can now run privateGPT.py to query your documents") | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -419,13 +419,36 @@ files = [ | ||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "chroma-hnswlib" | name = "chroma-hnswlib" | ||||||
| version = "0.7.2" | version = "0.7.3" | ||||||
| description = "Chromas fork of hnswlib" | description = "Chromas fork of hnswlib" | ||||||
| optional = false | optional = false | ||||||
| python-versions = "*" | python-versions = "*" | ||||||
| files = [ | files = [ | ||||||
|     {file = "chroma-hnswlib-0.7.2.tar.gz", hash = "sha256:87c6a0ced9e52ac7c8ca01ded25bb70c4a7f63f5871181eb18bea9111ce786c4"}, |     {file = "chroma-hnswlib-0.7.3.tar.gz", hash = "sha256:b6137bedde49fffda6af93b0297fe00429fc61e5a072b1ed9377f909ed95a932"}, | ||||||
|     {file = "chroma_hnswlib-0.7.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:3ffbb542cada959771ae4b8394f8cee1ef76bd17950adb592531433e912377db"}, |     {file = "chroma_hnswlib-0.7.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:59d6a7c6f863c67aeb23e79a64001d537060b6995c3eca9a06e349ff7b0998ca"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d71a3f4f232f537b6152947006bd32bc1629a8686df22fd97777b70f416c127a"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c92dc1ebe062188e53970ba13f6b07e0ae32e64c9770eb7f7ffa83f149d4210"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49da700a6656fed8753f68d44b8cc8ae46efc99fc8a22a6d970dc1697f49b403"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:108bc4c293d819b56476d8f7865803cb03afd6ca128a2a04d678fffc139af029"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:11e7ca93fb8192214ac2b9c0943641ac0daf8f9d4591bb7b73be808a83835667"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6f552e4d23edc06cdeb553cdc757d2fe190cdeb10d43093d6a3319f8d4bf1c6b"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f96f4d5699e486eb1fb95849fe35ab79ab0901265805be7e60f4eaa83ce263ec"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:368e57fe9ebae05ee5844840fa588028a023d1182b0cfdb1d13f607c9ea05756"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:b7dca27b8896b494456db0fd705b689ac6b73af78e186eb6a42fea2de4f71c6f"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:70f897dc6218afa1d99f43a9ad5eb82f392df31f57ff514ccf4eeadecd62f544"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aef10b4952708f5a1381c124a29aead0c356f8d7d6e0b520b778aaa62a356f4"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee2d8d1529fca3898d512079144ec3e28a81d9c17e15e0ea4665697a7923253"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp37-cp37m-win_amd64.whl", hash = "sha256:a4021a70e898783cd6f26e00008b494c6249a7babe8774e90ce4766dd288c8ba"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a8f61fa1d417fda848e3ba06c07671f14806a2585272b175ba47501b066fe6b1"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d7563be58bc98e8f0866907368e22ae218d6060601b79c42f59af4eccbbd2e0a"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51b8d411486ee70d7b66ec08cc8b9b6620116b650df9c19076d2d8b6ce2ae914"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d706782b628e4f43f1b8a81e9120ac486837fbd9bcb8ced70fe0d9b95c72d77"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:54f053dedc0e3ba657f05fec6e73dd541bc5db5b09aa8bc146466ffb734bdc86"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e607c5a71c610a73167a517062d302c0827ccdd6e259af6e4869a5c1306ffb5d"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c2358a795870156af6761890f9eb5ca8cade57eb10c5f046fe94dae1faa04b9e"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cea425df2e6b8a5e201fff0d922a1cc1d165b3cfe762b1408075723c8892218"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:454df3dd3e97aa784fba7cf888ad191e0087eef0fd8c70daf28b753b3b591170"}, | ||||||
|  |     {file = "chroma_hnswlib-0.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:df587d15007ca701c6de0ee7d5585dd5e976b7edd2b30ac72bc376b3c3f85882"}, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| [package.dependencies] | [package.dependencies] | ||||||
|  | @ -433,21 +456,21 @@ numpy = "*" | ||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "chromadb" | name = "chromadb" | ||||||
| version = "0.4.7" | version = "0.4.12" | ||||||
| description = "Chroma." | description = "Chroma." | ||||||
| optional = false | optional = false | ||||||
| python-versions = ">=3.7" | python-versions = ">=3.7" | ||||||
| files = [ | files = [ | ||||||
|     {file = "chromadb-0.4.7-py3-none-any.whl", hash = "sha256:e928406410efdd1e5550cb456a3f4c40774aec1efcd95011389483fa0ae3c472"}, |     {file = "chromadb-0.4.12-py3-none-any.whl", hash = "sha256:2a9d99945c25049ce8b8d2896ef296909f42ba2f5dca983a496adae0a0deb64a"}, | ||||||
|     {file = "chromadb-0.4.7.tar.gz", hash = "sha256:7282aab8fd7cf81f0bf55f5a056bdc3aca15bf56a37b711ec53fab1440b5e6f7"}, |     {file = "chromadb-0.4.12.tar.gz", hash = "sha256:430585725e1f2f43f51ef3d0d7a41d99d0cdc4635264e75aaf1e303ab48ae616"}, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| [package.dependencies] | [package.dependencies] | ||||||
| bcrypt = ">=4.0.1" | bcrypt = ">=4.0.1" | ||||||
| chroma-hnswlib = "0.7.2" | chroma-hnswlib = "0.7.3" | ||||||
| fastapi = ">=0.95.2,<0.100.0" | fastapi = ">=0.95.2,<0.100.0" | ||||||
| importlib-resources = "*" | importlib-resources = "*" | ||||||
| numpy = ">=1.21.6" | numpy = {version = ">=1.22.5", markers = "python_version >= \"3.8\""} | ||||||
| onnxruntime = ">=1.14.1" | onnxruntime = ">=1.14.1" | ||||||
| overrides = ">=7.3.1" | overrides = ">=7.3.1" | ||||||
| posthog = ">=2.4.0" | posthog = ">=2.4.0" | ||||||
|  | @ -457,6 +480,7 @@ pypika = ">=0.48.9" | ||||||
| requests = ">=2.28" | requests = ">=2.28" | ||||||
| tokenizers = ">=0.13.2" | tokenizers = ">=0.13.2" | ||||||
| tqdm = ">=4.65.0" | tqdm = ">=4.65.0" | ||||||
|  | typer = ">=0.9.0" | ||||||
| typing-extensions = ">=4.5.0" | typing-extensions = ">=4.5.0" | ||||||
| uvicorn = {version = ">=0.18.3", extras = ["standard"]} | uvicorn = {version = ">=0.18.3", extras = ["standard"]} | ||||||
| 
 | 
 | ||||||
|  | @ -3038,6 +3062,27 @@ torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", | ||||||
| video = ["av (==9.2.0)", "decord (==0.6.0)"] | video = ["av (==9.2.0)", "decord (==0.6.0)"] | ||||||
| vision = ["Pillow (<10.0.0)"] | vision = ["Pillow (<10.0.0)"] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "typer" | ||||||
|  | version = "0.9.0" | ||||||
|  | description = "Typer, build great CLIs. Easy to code. Based on Python type hints." | ||||||
|  | optional = false | ||||||
|  | python-versions = ">=3.6" | ||||||
|  | files = [ | ||||||
|  |     {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"}, | ||||||
|  |     {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"}, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | [package.dependencies] | ||||||
|  | click = ">=7.1.1,<9.0.0" | ||||||
|  | typing-extensions = ">=3.7.4.3" | ||||||
|  | 
 | ||||||
|  | [package.extras] | ||||||
|  | all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] | ||||||
|  | dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] | ||||||
|  | doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] | ||||||
|  | test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "typing-extensions" | name = "typing-extensions" | ||||||
| version = "4.7.1" | version = "4.7.1" | ||||||
|  | @ -3455,4 +3500,4 @@ multidict = ">=4.0" | ||||||
| [metadata] | [metadata] | ||||||
| lock-version = "2.0" | lock-version = "2.0" | ||||||
| python-versions = "^3.10" | python-versions = "^3.10" | ||||||
| content-hash = "9772f4040d3a2152ec06db1ec709509e0f05815c2ddc3cba9ed974ce183e2691" | content-hash = "111b08c8b4a98f2efb0ad223dab9777c171cea626211aa7efef03a4e4605bc08" | ||||||
|  |  | ||||||
|  | @ -10,7 +10,7 @@ readme = "README.md" | ||||||
| python = "^3.10" | python = "^3.10" | ||||||
| langchain = "0.0.274" | langchain = "0.0.274" | ||||||
| gpt4all = "1.0.8" | gpt4all = "1.0.8" | ||||||
| chromadb = "0.4.7" | chromadb = "0.4.12" | ||||||
| llama-cpp-python = "0.1.81" | llama-cpp-python = "0.1.81" | ||||||
| urllib3 = "2.0.4" | urllib3 = "2.0.4" | ||||||
| PyMuPDF = "1.23.1" | PyMuPDF = "1.23.1" | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| langchain==0.0.274 | langchain==0.0.274 | ||||||
| gpt4all==1.0.8 | gpt4all==1.0.8 | ||||||
| chromadb==0.4.7 | chromadb==0.4.12 | ||||||
| llama-cpp-python==0.1.81 | llama-cpp-python==0.1.81 | ||||||
| urllib3==2.0.4 | urllib3==2.0.4 | ||||||
| PyMuPDF==1.23.1 | PyMuPDF==1.23.1 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue