feat(ingest): Created a faster ingestion mode - pipeline (#1750)

* Unify pgvector and postgres connection settings * Remove local changes * Update file pgvector->postgres * postgresql should be postgres * Adding pipeline ingestion mode * disable hugging face parallelism. Continue on file to doc transform failure * Semaphore to limit docq async workers. ETA reporting
2024-03-19 16:24:46 -04:00 · 2024-03-19 16:24:46 -04:00 · 134fc54d7d
parent 1efac6a3fe
commit 134fc54d7d
5 changed files with 301 additions and 2 deletions
--- a/fern/docs/pages/manual/ingestion.mdx
+++ b/fern/docs/pages/manual/ingestion.mdx
@ -62,6 +62,7 @@ The following ingestion mode exist:
 * `simple`: historic behavior, ingest one document at a time, sequentially
 * `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
 * `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
 * `pipeline`: Alternative to parallel.
 To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
 To configure the number of workers used for parallel or batched ingestion, you can use
--- a/private_gpt/components/ingest/ingest_component.py
+++ b/private_gpt/components/ingest/ingest_component.py
@ -6,6 +6,7 @@ import multiprocessing.pool
 import os
 import threading
 from pathlib import Path
 from queue import Queue
 from typing import Any
 from llama_index.core.data_structs import IndexDict
@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
 from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
 from llama_index.core.indices.base import BaseIndex
 from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
 from llama_index.core.storage import StorageContext
 from private_gpt.components.ingest.ingest_helper import IngestionHelper
 from private_gpt.paths import local_data_path
 from private_gpt.settings.settings import Settings
 from private_gpt.utils.eta import eta
 logger = logging.getLogger(__name__)
@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
        self._file_to_documents_work_pool.terminate()
 class PipelineIngestComponent(BaseIngestComponentWithIndex):
    """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
    This class implements a threaded ingestion pipeline, which comprises two threads
    and two queues. The primary thread is responsible for reading and parsing files
    into documents. These documents are then placed into a queue, which is
    distributed to a pool of worker processes for embedding computation. After
    embedding, the documents are transferred to another queue where they are
    accumulated until a threshold is reached. Upon reaching this threshold, the
    accumulated documents are flushed to the document store, index, and vector
    store.
    Exception handling ensures robustness against erroneous files. However, in the
    pipelined design, one error can lead to the discarding of multiple files. Any
    discarded files will be reported.
    """
    NODE_FLUSH_COUNT = 5000  # Save the index every # nodes.
    def __init__(
        self,
        storage_context: StorageContext,
        embed_model: EmbedType,
        transformations: list[TransformComponent],
        count_workers: int,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
        self.count_workers = count_workers
        assert (
            len(self.transformations) >= 2
        ), "Embeddings must be in the transformations"
        assert count_workers > 0, "count_workers must be > 0"
        self.count_workers = count_workers
        # We are doing our own multiprocessing
        # To do not collide with the multiprocessing of huggingface, we disable it
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        # doc_q stores parsed files as Document chunks.
        # Using a shallow queue causes the filesystem parser to block
        # when it reaches capacity. This ensures it doesn't outpace the
        # computationally intensive embeddings phase, avoiding unnecessary
        # memory consumption.  The semaphore is used to bound the async worker
        # embedding computations to cause the doc Q to fill and block.
        self.doc_semaphore = multiprocessing.Semaphore(
            self.count_workers
        )  # limit the doc queue to # items.
        self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
        # node_q stores documents parsed into nodes (embeddings).
        # Larger queue size so we don't block the embedding workers during a slow
        # index update.
        self.node_q: Queue[
            tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
        ] = Queue(40)
        threading.Thread(target=self._doc_to_node, daemon=True).start()
        threading.Thread(target=self._write_nodes, daemon=True).start()
    def _doc_to_node(self) -> None:
        # Parse documents into nodes
        with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
            while True:
                try:
                    cmd, file_name, documents = self.doc_q.get(
                        block=True
                    )  # Documents for a file
                    if cmd == "process":
                        # Push CPU/GPU embedding work to the worker pool
                        # Acquire semaphore to control access to worker pool
                        self.doc_semaphore.acquire()
                        pool.apply_async(
                            self._doc_to_node_worker, (file_name, documents)
                        )
                    elif cmd == "quit":
                        break
                finally:
                    if cmd != "process":
                        self.doc_q.task_done()  # unblock Q joins
    def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
        # CPU/GPU intensive work in its own process
        try:
            nodes = run_transformations(
                documents,  # type: ignore[arg-type]
                self.transformations,
                show_progress=self.show_progress,
            )
            self.node_q.put(("process", file_name, documents, nodes))
        finally:
            self.doc_semaphore.release()
            self.doc_q.task_done()  # unblock Q joins
    def _save_docs(
        self, files: list[str], documents: list[Document], nodes: list[BaseNode]
    ) -> None:
        try:
            logger.info(
                f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
            )
            self._index.insert_nodes(nodes)
            for document in documents:
                self._index.docstore.set_document_hash(
                    document.get_doc_id(), document.hash
                )
            self._save_index()
        except Exception:
            # Tell the user so they can investigate these files
            logger.exception(f"Processing files {files}")
        finally:
            # Clearing work, even on exception, maintains a clean state.
            nodes.clear()
            documents.clear()
            files.clear()
    def _write_nodes(self) -> None:
        # Save nodes to index.  I/O intensive.
        node_stack: list[BaseNode] = []
        doc_stack: list[Document] = []
        file_stack: list[str] = []
        while True:
            try:
                cmd, file_name, documents, nodes = self.node_q.get(block=True)
                if cmd in ("flush", "quit"):
                    if file_stack:
                        self._save_docs(file_stack, doc_stack, node_stack)
                    if cmd == "quit":
                        break
                elif cmd == "process":
                    node_stack.extend(nodes)  # type: ignore[arg-type]
                    doc_stack.extend(documents)  # type: ignore[arg-type]
                    file_stack.append(file_name)  # type: ignore[arg-type]
                    # Constant saving is heavy on I/O - accumulate to a threshold
                    if len(node_stack) >= self.NODE_FLUSH_COUNT:
                        self._save_docs(file_stack, doc_stack, node_stack)
            finally:
                self.node_q.task_done()
    def _flush(self) -> None:
        self.doc_q.put(("flush", None, None))
        self.doc_q.join()
        self.node_q.put(("flush", None, None, None))
        self.node_q.join()
    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
        self.doc_q.put(("process", file_name, documents))
        self._flush()
        return documents
    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
        docs = []
        for file_name, file_data in eta(files):
            try:
                documents = IngestionHelper.transform_file_into_documents(
                    file_name, file_data
                )
                self.doc_q.put(("process", file_name, documents))
                docs.extend(documents)
            except Exception:
                logger.exception(f"Skipping {file_data.name}")
        self._flush()
        return docs
 def get_ingestion_component(
    storage_context: StorageContext,
    embed_model: EmbedType,
@ -336,6 +502,13 @@ def get_ingestion_component(
            transformations=transformations,
            count_workers=settings.embedding.count_workers,
        )
    elif ingest_mode == "pipeline":
        return PipelineIngestComponent(
            storage_context=storage_context,
            embed_model=embed_model,
            transformations=transformations,
            count_workers=settings.embedding.count_workers,
        )
    else:
        return SimpleIngestComponent(
            storage_context=storage_context,
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
 class EmbeddingSettings(BaseModel):
    mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
-    ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
        "simple",
        description=(
            "The ingest mode to use for the embedding engine:\n"
            "If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
            "If `batch` - if multiple files, parse all the files in parallel, "
            "and send them in batch to the embedding model.\n"
            "In `pipeline` - The Embedding engine is kept as busy as possible\n"
            "If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
            "`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
            "For modes that leverage parallelization, you can specify the number of "
@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
            "The number of workers to use for file ingestion.\n"
            "In `batch` mode, this is the number of workers used to parse the files.\n"
            "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
            "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
            "This is only used if `ingest_mode` is not `simple`.\n"
            "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
            "Do not set it higher than your number of threads of your CPU."
--- a/private_gpt/utils/eta.py
+++ b/private_gpt/utils/eta.py
@ -0,0 +1,122 @@
 import datetime
 import logging
 import math
 import time
 from collections import deque
 from typing import Any
 logger = logging.getLogger(__name__)
 def human_time(*args: Any, **kwargs: Any) -> str:
    def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
        return (
            timedelta.microseconds
            + 0.0
            + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
        ) / 10**6
    secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
    # We want (ms) precision below 2 seconds
    if secs < 2:
        return f"{secs * 1000}ms"
    units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
    parts = []
    for unit, mul in units:
        if secs / mul >= 1 or mul == 1:
            if mul > 1:
                n = int(math.floor(secs / mul))
                secs -= n * mul
            else:
                # >2s we drop the (ms) component.
                n = int(secs)
            if n:
                parts.append(f"{n}{unit}")
    return " ".join(parts)
 def eta(iterator: list[Any]) -> Any:
    """Report an ETA after 30s and every 60s thereafter."""
    total = len(iterator)
    _eta = ETA(total)
    _eta.needReport(30)
    for processed, data in enumerate(iterator, start=1):
        yield data
        _eta.update(processed)
        if _eta.needReport(60):
            logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
 class ETA:
    """Predict how long something will take to complete."""
    def __init__(self, total: int):
        self.total: int = total  # Total expected records.
        self.rate: float = 0.0  # per second
        self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
        self.secondsLeft: float = 0.0
        self.nexttime: float = 0.0
    def human_time(self) -> str:
        if self._calc():
            return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
        return "(computing)"
    def update(self, count: int) -> None:
        # count should be in the range 0 to self.total
        assert count > 0
        assert count <= self.total
        self._timing_data.append((time.time(), count))  # (X,Y) for pearson
    def needReport(self, whenSecs: int) -> bool:
        now = time.time()
        if now > self.nexttime:
            self.nexttime = now + whenSecs
            return True
        return False
    def _calc(self) -> bool:
        # A sample before a prediction.   Need two points to compute slope!
        if len(self._timing_data) < 3:
            return False
        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
        # Calculate means and standard deviations.
        samples = len(self._timing_data)
        # column wise sum of the timing tuples to compute their mean.
        mean_x, mean_y = (
            sum(i) / samples for i in zip(*self._timing_data, strict=False)
        )
        std_x = math.sqrt(
            sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
        )
        std_y = math.sqrt(
            sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
        )
        # Calculate coefficient.
        sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
        for x, y in self._timing_data:
            x -= mean_x
            y -= mean_y
            sum_xy += x * y
            sum_sq_v_x += pow(x, 2)
            sum_sq_v_y += pow(y, 2)
        pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
        # Calculate regression line.
        # y = mx + b where m is the slope and b is the y-intercept.
        m = self.rate = pearson_r * (std_y / std_x)
        y = self.total
        b = mean_y - m * mean_x
        x = (y - b) / m
        # Calculate fitted line (transformed/shifted regression line horizontally).
        fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
        fitted_x = (y - fitted_b) / m
        _, count = self._timing_data[-1]  # adjust last data point progress count
        adjusted_x = ((fitted_x - x) * (count / self.total)) + x
        eta_epoch = adjusted_x
        self.secondsLeft = max([eta_epoch - time.time(), 0])
        return True
--- a/settings-local.yaml
+++ b/settings-local.yaml
@ -1,3 +1,4 @@
 # poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
 server:
  env_name: ${APP_ENV:local}