feat(ingest): add a faster ingestion mode - `pipeline` (#1750)
* Unify pgvector and postgres connection settings
* Remove local changes
* Update file references pgvector -> postgres
* postgresql should be postgres
* Add `pipeline` ingestion mode
* Disable Hugging Face tokenizer parallelism; continue on file-to-doc transform failure
* Add semaphore to limit doc_q async workers; add ETA reporting
parent 1efac6a3fe
commit 134fc54d7d
@@ -62,6 +62,7 @@ The following ingestion modes exist:
 * `simple`: historic behavior, ingest one document at a time, sequentially
 * `batch`: read, parse, and embed multiple documents using batches (batch read, then batch parse, then batch embed)
 * `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setups.
+* `pipeline`: alternative to `parallel`; keeps the embedding worker pool as busy as possible by overlapping file parsing with embedding and index writes.
 To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.

 To configure the number of workers used for parallel or batched ingestion, you can use
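To enable the new mode, the settings change is small. A minimal sketch: the `embedding.ingest_mode` and `embedding.count_workers` keys are the ones patched in `settings.py` below; the worker count shown here is only an example.

```yaml
embedding:
  ingest_mode: pipeline
  count_workers: 4  # embedding workers; keep at or below your CPU thread count
```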
--- a/private_gpt/components/ingest/ingest_component.py
+++ b/private_gpt/components/ingest/ingest_component.py
@@ -6,6 +6,7 @@ import multiprocessing.pool
 import os
 import threading
 from pathlib import Path
+from queue import Queue
 from typing import Any

 from llama_index.core.data_structs import IndexDict
@@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
 from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
 from llama_index.core.indices.base import BaseIndex
 from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
 from llama_index.core.storage import StorageContext

 from private_gpt.components.ingest.ingest_helper import IngestionHelper
 from private_gpt.paths import local_data_path
 from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta

 logger = logging.getLogger(__name__)

@@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
         self._file_to_documents_work_pool.terminate()


+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+    """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+    This class implements a threaded ingestion pipeline, which comprises two threads
+    and two queues. The primary thread is responsible for reading and parsing files
+    into documents. These documents are then placed into a queue, which is
+    distributed to a pool of worker threads for embedding computation. After
+    embedding, the documents are transferred to another queue where they are
+    accumulated until a threshold is reached. Upon reaching this threshold, the
+    accumulated documents are flushed to the document store, index, and vector
+    store.
+
+    Exception handling ensures robustness against erroneous files. However, in the
+    pipelined design, one error can lead to the discarding of multiple files. Any
+    discarded files will be reported.
+    """
+
+    NODE_FLUSH_COUNT = 5000  # Save the index every # nodes.
+
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        embed_model: EmbedType,
+        transformations: list[TransformComponent],
+        count_workers: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
+        self.count_workers = count_workers
+        assert (
+            len(self.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        # We do our own multiprocessing; disable Hugging Face's tokenizer
+        # parallelism so the two do not collide.
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        # doc_q stores parsed files as Document chunks.
+        # Using a shallow queue causes the filesystem parser to block
+        # when it reaches capacity. This ensures it doesn't outpace the
+        # computationally intensive embedding phase, avoiding unnecessary
+        # memory consumption. The semaphore bounds the number of in-flight
+        # async embedding computations so the doc queue fills up and blocks.
+        self.doc_semaphore = multiprocessing.Semaphore(
+            self.count_workers
+        )  # limit in-flight embedding work to the worker count.
+        self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
+        # node_q stores documents parsed into nodes (embeddings).
+        # Larger queue size so we don't block the embedding workers during a slow
+        # index update.
+        self.node_q: Queue[
+            tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
+        ] = Queue(40)
+        threading.Thread(target=self._doc_to_node, daemon=True).start()
+        threading.Thread(target=self._write_nodes, daemon=True).start()
+
+    def _doc_to_node(self) -> None:
+        # Parse documents into nodes.
+        with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
+            while True:
+                try:
+                    cmd, file_name, documents = self.doc_q.get(
+                        block=True
+                    )  # Documents for a file.
+                    if cmd == "process":
+                        # Push CPU/GPU embedding work to the worker pool.
+                        # Acquire the semaphore to bound in-flight work.
+                        self.doc_semaphore.acquire()
+                        pool.apply_async(
+                            self._doc_to_node_worker, (file_name, documents)
+                        )
+                    elif cmd == "quit":
+                        break
+                finally:
+                    if cmd != "process":
+                        self.doc_q.task_done()  # unblock Q joins
+
+    def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
+        # CPU/GPU intensive work, run in a pool thread.
+        try:
+            nodes = run_transformations(
+                documents,  # type: ignore[arg-type]
+                self.transformations,
+                show_progress=self.show_progress,
+            )
+            self.node_q.put(("process", file_name, documents, nodes))
+        finally:
+            self.doc_semaphore.release()
+            self.doc_q.task_done()  # unblock Q joins
+
+    def _save_docs(
+        self, files: list[str], documents: list[Document], nodes: list[BaseNode]
+    ) -> None:
+        try:
+            logger.info(
+                f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
+            )
+            self._index.insert_nodes(nodes)
+            for document in documents:
+                self._index.docstore.set_document_hash(
+                    document.get_doc_id(), document.hash
+                )
+            self._save_index()
+        except Exception:
+            # Tell the user so they can investigate these files.
+            logger.exception(f"Processing files {files}")
+        finally:
+            # Clearing work, even on exception, maintains a clean state.
+            nodes.clear()
+            documents.clear()
+            files.clear()
+
+    def _write_nodes(self) -> None:
+        # Save nodes to the index. I/O intensive.
+        node_stack: list[BaseNode] = []
+        doc_stack: list[Document] = []
+        file_stack: list[str] = []
+        while True:
+            try:
+                cmd, file_name, documents, nodes = self.node_q.get(block=True)
+                if cmd in ("flush", "quit"):
+                    if file_stack:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+                    if cmd == "quit":
+                        break
+                elif cmd == "process":
+                    node_stack.extend(nodes)  # type: ignore[arg-type]
+                    doc_stack.extend(documents)  # type: ignore[arg-type]
+                    file_stack.append(file_name)  # type: ignore[arg-type]
+                    # Constant saving is heavy on I/O - accumulate to a threshold.
+                    if len(node_stack) >= self.NODE_FLUSH_COUNT:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+            finally:
+                self.node_q.task_done()
+
+    def _flush(self) -> None:
+        self.doc_q.put(("flush", None, None))
+        self.doc_q.join()
+        self.node_q.put(("flush", None, None, None))
+        self.node_q.join()
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+        self.doc_q.put(("process", file_name, documents))
+        self._flush()
+        return documents
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        docs = []
+        for file_name, file_data in eta(files):
+            try:
+                documents = IngestionHelper.transform_file_into_documents(
+                    file_name, file_data
+                )
+                self.doc_q.put(("process", file_name, documents))
+                docs.extend(documents)
+            except Exception:
+                logger.exception(f"Skipping {file_data.name}")
+        self._flush()
+        return docs
+
+
 def get_ingestion_component(
     storage_context: StorageContext,
     embed_model: EmbedType,
@@ -336,6 +502,13 @@ def get_ingestion_component(
             transformations=transformations,
             count_workers=settings.embedding.count_workers,
         )
+    elif ingest_mode == "pipeline":
+        return PipelineIngestComponent(
+            storage_context=storage_context,
+            embed_model=embed_model,
+            transformations=transformations,
+            count_workers=settings.embedding.count_workers,
+        )
     else:
         return SimpleIngestComponent(
             storage_context=storage_context,
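The moving parts of `PipelineIngestComponent` are easier to see in a standalone toy version of the same backpressure pattern. This is a hypothetical sketch, not code from this PR: every name is made up, `time.sleep` stands in for embedding work, and `print` stands in for the index save. It mirrors `doc_q` (shallow, blocks the cheap parsing stage), `doc_semaphore` (caps in-flight embedding jobs), `node_q` (deeper, absorbs slow writes), and `NODE_FLUSH_COUNT` (batched flushes).

```python
import multiprocessing.pool
import threading
import time
from queue import Queue

WORKERS = 4
doc_q: Queue[tuple[str, str | None]] = Queue(maxsize=20)   # shallow: blocks the producer
node_q: Queue[tuple[str, str | None]] = Queue(maxsize=40)  # deeper: absorbs slow writes
embed_slots = threading.Semaphore(WORKERS)

def embed_worker(item: str) -> None:
    # Stand-in for CPU/GPU embedding work; hands results to the writer stage.
    try:
        time.sleep(0.01)
        node_q.put(("process", item))
    finally:
        embed_slots.release()
        doc_q.task_done()  # unblock doc_q.join()

def dispatcher() -> None:
    # Single dispatch thread: pulls parsed items and fans them out to the pool.
    with multiprocessing.pool.ThreadPool(processes=WORKERS) as pool:
        while True:
            cmd, item = doc_q.get(block=True)
            if cmd == "process":
                embed_slots.acquire()  # cap in-flight embedding jobs
                pool.apply_async(embed_worker, (item,))
            else:  # "quit"
                doc_q.task_done()
                break
        pool.close()
        pool.join()

def writer() -> None:
    # Accumulate embedded items and flush in batches (I/O heavy in the real code).
    batch: list[str] = []
    while True:
        cmd, item = node_q.get(block=True)
        if cmd == "process" and item is not None:
            batch.append(item)
        if len(batch) >= 10 or (cmd == "quit" and batch):
            print(f"flushing {len(batch)} items")  # stand-in for index save
            batch.clear()
        node_q.task_done()
        if cmd == "quit":
            break

if __name__ == "__main__":
    threading.Thread(target=dispatcher, daemon=True).start()
    threading.Thread(target=writer, daemon=True).start()
    for i in range(25):
        doc_q.put(("process", f"file-{i}"))  # blocks if the pipeline lags behind
    doc_q.join()                 # every parsed item has been embedded
    doc_q.put(("quit", None))
    node_q.put(("quit", None))   # writer flushes any remainder, then exits
    node_q.join()
```

The design point is that no stage is ever throttled by a slower downstream stage except through bounded buffers, so memory stays flat while the embedding workers stay saturated.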
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):

 class EmbeddingSettings(BaseModel):
     mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
-    ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
         "simple",
         description=(
             "The ingest mode to use for the embedding engine:\n"
             "If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
             "If `batch` - if multiple files, parse all the files in parallel, "
             "and send them in batch to the embedding model.\n"
+            "If `pipeline` - keep the embedding engine as busy as possible.\n"
             "If `parallel` - parse the files in parallel using multiple cores, and embed them in parallel.\n"
             "`parallel` is the fastest mode for local setup, as it parallelizes I/O reads and writes in the index.\n"
             "For modes that leverage parallelization, you can specify the number of "
@@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
             "The number of workers to use for file ingestion.\n"
             "In `batch` mode, this is the number of workers used to parse the files.\n"
             "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
+            "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
             "This is only used if `ingest_mode` is not `simple`.\n"
             "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
             "Do not set it higher than your number of threads of your CPU."
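As a sanity check on the widened `Literal`, here is a minimal self-contained sketch. The field set is trimmed to the two touched by this PR, and the `count_workers` default is illustrative, not the project's; it shows that pydantic now accepts `pipeline` and rejects unknown modes:

```python
from typing import Literal

from pydantic import BaseModel, Field, ValidationError

class EmbeddingSettings(BaseModel):
    # Trimmed to the two fields touched by this PR; defaults are illustrative.
    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field("simple")
    count_workers: int = Field(2)

print(EmbeddingSettings(ingest_mode="pipeline", count_workers=4).ingest_mode)  # pipeline

try:
    EmbeddingSettings(ingest_mode="turbo")
except ValidationError as e:
    print("rejected:", e.errors()[0]["type"])  # "literal_error" under pydantic v2
```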
--- /dev/null
+++ b/private_gpt/utils/eta.py
@@ -0,0 +1,122 @@
+import datetime
+import logging
+import math
+import time
+from collections import deque
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def human_time(*args: Any, **kwargs: Any) -> str:
+    def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
+        return (
+            timedelta.microseconds
+            + 0.0
+            + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
+        ) / 10**6
+
+    secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
+    # We want (ms) precision below 2 seconds.
+    if secs < 2:
+        return f"{secs * 1000:.0f}ms"
+    units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
+    parts = []
+    for unit, mul in units:
+        if secs / mul >= 1 or mul == 1:
+            if mul > 1:
+                n = int(math.floor(secs / mul))
+                secs -= n * mul
+            else:
+                # Above 2s we drop the (ms) component.
+                n = int(secs)
+            if n:
+                parts.append(f"{n}{unit}")
+    return " ".join(parts)
+
+
+def eta(iterator: list[Any]) -> Any:
+    """Report an ETA after 30s and every 60s thereafter."""
+    total = len(iterator)
+    _eta = ETA(total)
+    _eta.needReport(30)
+    for processed, data in enumerate(iterator, start=1):
+        yield data
+        _eta.update(processed)
+        if _eta.needReport(60):
+            logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
+
+
+class ETA:
+    """Predict how long something will take to complete."""
+
+    def __init__(self, total: int):
+        self.total: int = total  # Total expected records.
+        self.rate: float = 0.0  # Records per second.
+        self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
+        self.secondsLeft: float = 0.0
+        self.nexttime: float = 0.0
+
+    def human_time(self) -> str:
+        if self._calc():
+            return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
+        return "(computing)"
+
+    def update(self, count: int) -> None:
+        # count should be in the range 1 to self.total.
+        assert count > 0
+        assert count <= self.total
+        self._timing_data.append((time.time(), count))  # (x, y) points for the regression
+
+    def needReport(self, whenSecs: int) -> bool:
+        now = time.time()
+        if now > self.nexttime:
+            self.nexttime = now + whenSecs
+            return True
+        return False
+
+    def _calc(self) -> bool:
+        # Need samples before a prediction: two points define a slope, but
+        # require three for a minimally stable estimate.
+        if len(self._timing_data) < 3:
+            return False
+
+        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
+        # Calculate means and standard deviations.
+        samples = len(self._timing_data)
+        # Column-wise sum of the timing tuples to compute their means.
+        mean_x, mean_y = (
+            sum(i) / samples for i in zip(*self._timing_data, strict=False)
+        )
+        std_x = math.sqrt(
+            sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
+        )
+        std_y = math.sqrt(
+            sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
+        )
+
+        # Calculate the Pearson correlation coefficient.
+        sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0.0
+        for x, y in self._timing_data:
+            x -= mean_x
+            y -= mean_y
+            sum_xy += x * y
+            sum_sq_v_x += pow(x, 2)
+            sum_sq_v_y += pow(y, 2)
+        pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
+
+        # Calculate the regression line: y = mx + b, where m is the slope
+        # and b is the y-intercept.
+        m = self.rate = pearson_r * (std_y / std_x)
+        y = self.total
+        b = mean_y - m * mean_x
+        x = (y - b) / m
+
+        # Calculate the fitted line (the regression line shifted horizontally
+        # through the most recent data point).
+        fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
+        fitted_x = (y - fitted_b) / m
+        _, count = self._timing_data[-1]  # adjust by the last reported progress count
+        adjusted_x = ((fitted_x - x) * (count / self.total)) + x
+        eta_epoch = adjusted_x
+
+        self.secondsLeft = max([eta_epoch - time.time(), 0])
+        return True
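A quick usage sketch of the helpers in this new file (hypothetical values; the printed forms follow `human_time`'s logic, and the log format is the one used by `eta`). Internally, `ETA._calc` fits a regression line y = mx + b over the (timestamp, count) samples, taking the slope m = pearson_r * (std_y / std_x), then solves x = (y - b) / m at y = total to estimate the finish time.

```python
import logging
from private_gpt.utils.eta import eta, human_time  # module path per this PR

logging.basicConfig(level=logging.INFO)

print(human_time(seconds=1.5))   # "1500ms" - sub-2s values keep ms precision
print(human_time(seconds=3725))  # "1h 2m 5s"

# Wrap any list to get periodic progress logging. For long runs, an ETA line
# is logged after ~30s, then every 60s, e.g. "1234/10000 - ETA 5m 12s @ 240/min".
for item in eta(list(range(10_000))):
    pass  # per-item work goes here
```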
@@ -1,3 +1,4 @@
+# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
 server:
   env_name: ${APP_ENV:local}
