From 68b3a34b032a08ca073a687d2058f926032495b3 Mon Sep 17 00:00:00 2001
From: Brett England <brett@dbzoo.com>
Date: Thu, 14 Mar 2024 12:12:33 -0400
Subject: [PATCH] feat(nodestore): add Postgres for the doc and index store
 (#1706)

* Adding Postgres for the doc and index store

* Adding documentation.  Rename postgres database local->simple.  Postgres storage dependencies

* Update documentation for postgres storage

* Renaming feature to nodestore

* update docstore -> nodestore in doc

* missed some docstore changes in doc

* Updated poetry.lock

* Formatting updates to pass ruff/black checks

* Correction to unreachable code!

* Format adjustment to pass black test

* Adjust extra inclusion name for vector pg

* extra dep change for pg vector

* storage-postgres -> storage-nodestore-postgres

* Hash change on poetry lock
---
 fern/docs.yml                                 |  2 +
 fern/docs/pages/manual/nodestore.mdx          | 66 +++++++++++++++++++
 fern/docs/pages/manual/vectordb.mdx           |  4 +-
 poetry.lock                                   | 34 +++++++++-
 .../node_store/node_store_component.py        | 63 +++++++++++++-----
 private_gpt/settings/settings.py              | 19 ++++--
 pyproject.toml                                |  8 ++-
 settings-ollama-pg.yaml                       | 43 ++++++++++++
 settings.yaml                                 | 11 ++++
 9 files changed, 225 insertions(+), 25 deletions(-)
 create mode 100644 fern/docs/pages/manual/nodestore.mdx
 create mode 100644 settings-ollama-pg.yaml

diff --git a/fern/docs.yml b/fern/docs.yml
index c22784d..2611dac 100644
--- a/fern/docs.yml
+++ b/fern/docs.yml
@@ -58,6 +58,8 @@ navigation:
         contents:
           - page: Vector Stores
             path: ./docs/pages/manual/vectordb.mdx
+          - page: Node Stores
+            path: ./docs/pages/manual/nodestore.mdx
       - section: Advanced Setup
         contents:
           - page: LLM Backends
diff --git a/fern/docs/pages/manual/nodestore.mdx b/fern/docs/pages/manual/nodestore.mdx
new file mode 100644
index 0000000..cec1d36
--- /dev/null
+++ b/fern/docs/pages/manual/nodestore.mdx
@@ -0,0 +1,66 @@
+## NodeStores
+PrivateGPT supports **Simple** and [Postgres](https://www.postgresql.org/) providers, with Simple being the default.
+
+In order to select one or the other, set the `nodestore.database` property in the `settings.yaml` file to `simple` or `postgres`.
+
+```yaml
+nodestore:
+  database: simple
+```
+
+### Simple Document Store
+
+Setting up simple document store: Persist data with in-memory and disk storage.
+
+Enabling the simple document store is an excellent choice for small projects or proofs of concept where you need to persist data while maintaining minimal setup complexity. To get started, set the `nodestore.database` property in your `settings.yaml` file as follows:
+
+```yaml
+nodestore:
+  database: simple
+```
+The beauty of the simple document store is its flexibility and ease of implementation. It provides a solid foundation for managing and retrieving data without the need for complex setup or configuration. The combination of in-memory processing and disk persistence ensures that you can efficiently handle small to medium-sized datasets while maintaining data consistency across runs.
+
+### Postgres Document Store
+
+To enable Postgres, set the `nodestore.database` property in the `settings.yaml` file to `postgres` and install the `storage-nodestore-postgres` extra. Note: vector embedding storage in Postgres is configured separately.
+
+```bash
+poetry install --extras storage-nodestore-postgres
+```
+
+The available configuration options are:
+| Field         | Description                                               |
+|---------------|-----------------------------------------------------------|
+| **host**      | The server hosting the Postgres database. Default is `localhost` |
+| **port**      | The port on which the Postgres database is accessible. Default is `5432` |
+| **database**  | The specific database to connect to. Default is `postgres` |
+| **user**      | The username for database access. Default is `postgres` |
+| **password**  | The password for database access. (Required)            |
+| **schema_name** | The database schema to use. Default is `public`; the bundled `settings.yaml` sets it to `private_gpt` |
+
+For example:
+```yaml
+nodestore:
+  database: postgres
+
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: <PASSWORD>
+  schema_name: private_gpt
+```
+
+Given the above configuration, two PostgreSQL tables will be created upon successful connection: one for storing metadata related to the index and another for the document data itself.
+
+```
+postgres=# \dt private_gpt.*
+                  List of relations
+   Schema    |      Name       | Type  |    Owner     
+-------------+-----------------+-------+--------------
+ private_gpt | data_docstore   | table | postgres
+ private_gpt | data_indexstore | table | postgres
+
+postgres=# 
+```
diff --git a/fern/docs/pages/manual/vectordb.mdx b/fern/docs/pages/manual/vectordb.mdx
index b738d4d..db28c1f 100644
--- a/fern/docs/pages/manual/vectordb.mdx
+++ b/fern/docs/pages/manual/vectordb.mdx
@@ -51,10 +51,10 @@ By default `chroma` will use a disk-based database stored in local_data_path / "
 
 ### PGVector
 
-To enable PGVector, set the `vectorstore.database` property in the `settings.yaml` file to `pgvector` and install the `pgvector` extra.
+To enable PGVector, set the `vectorstore.database` property in the `settings.yaml` file to `pgvector` and install the `vector-stores-postgres` extra.
 
 ```bash
-poetry install --extras pgvector
+poetry install --extras vector-stores-postgres
 ```
 
 PGVector settings can be configured by setting values to the `pgvector` property in the `settings.yaml` file.
diff --git a/poetry.lock b/poetry.lock
index c9d0057..6da365e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
 
 [[package]]
 name = "aiofiles"
@@ -2202,6 +2202,34 @@ llama-index-core = ">=0.10.1,<0.11.0"
 pymupdf = ">=1.23.21,<2.0.0"
 pypdf = ">=4.0.1,<5.0.0"
 
+[[package]]
+name = "llama-index-storage-docstore-postgres"
+version = "0.1.2"
+description = "llama-index docstore postgres integration"
+optional = true
+python-versions = ">=3.8.1,<4.0"
+files = [
+    {file = "llama_index_storage_docstore_postgres-0.1.2-py3-none-any.whl", hash = "sha256:54c9534d26a641af85857452ce09279eddec27ca14c3a50c4481e95f394daa08"},
+    {file = "llama_index_storage_docstore_postgres-0.1.2.tar.gz", hash = "sha256:40f5ebd9b461023110343c478caf9ef96c30317dd077e8b156460dff1568dba7"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.10.1,<0.11.0"
+
+[[package]]
+name = "llama-index-storage-index-store-postgres"
+version = "0.1.2"
+description = "llama-index index_store postgres integration"
+optional = true
+python-versions = ">=3.8.1,<4.0"
+files = [
+    {file = "llama_index_storage_index_store_postgres-0.1.2-py3-none-any.whl", hash = "sha256:8728c9cc5ce9312cf364e1cb1b65e0aba24321e20a16463d8f27f5a883b51b72"},
+    {file = "llama_index_storage_index_store_postgres-0.1.2.tar.gz", hash = "sha256:6a6af1ea6110b2b34de87acaf97c9615bbb738eb504fe89482fb6b973b07eb47"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.10.1,<0.11.0"
+
 [[package]]
 name = "llama-index-vector-stores-chroma"
 version = "0.1.4"
@@ -4153,6 +4181,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -5918,6 +5947,7 @@ llms-ollama = ["llama-index-llms-ollama"]
 llms-openai = ["llama-index-llms-openai"]
 llms-openai-like = ["llama-index-llms-openai-like"]
 llms-sagemaker = ["boto3"]
+storage-nodestore-postgres = ["asyncpg", "llama-index-storage-docstore-postgres", "llama-index-storage-index-store-postgres", "psycopg2-binary"]
 ui = ["gradio"]
 vector-stores-chroma = ["llama-index-vector-stores-chroma"]
 vector-stores-postgres = ["llama-index-vector-stores-postgres"]
@@ -5926,4 +5956,4 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.12"
-content-hash = "41849a9d15848a354fd4cc0ca9d752148e76fee64d8bb5b881210c2290fc8072"
+content-hash = "689df29f4f2209e7ae6638563f4bb25700d1454098d0c728a164a708d42fa377"
diff --git a/private_gpt/components/node_store/node_store_component.py b/private_gpt/components/node_store/node_store_component.py
index 4383cf9..f81ce70 100644
--- a/private_gpt/components/node_store/node_store_component.py
+++ b/private_gpt/components/node_store/node_store_component.py
@@ -6,6 +6,7 @@ from llama_index.core.storage.index_store import SimpleIndexStore
 from llama_index.core.storage.index_store.types import BaseIndexStore
 
 from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import Settings
 
 logger = logging.getLogger(__name__)
 
@@ -16,19 +17,51 @@ class NodeStoreComponent:
     doc_store: BaseDocumentStore
 
     @inject
-    def __init__(self) -> None:
-        try:
-            self.index_store = SimpleIndexStore.from_persist_dir(
-                persist_dir=str(local_data_path)
-            )
-        except FileNotFoundError:
-            logger.debug("Local index store not found, creating a new one")
-            self.index_store = SimpleIndexStore()
+    def __init__(self, settings: Settings) -> None:
+        match settings.nodestore.database:
+            case "simple":
+                try:
+                    self.index_store = SimpleIndexStore.from_persist_dir(
+                        persist_dir=str(local_data_path)
+                    )
+                except FileNotFoundError:
+                    logger.debug("Local index store not found, creating a new one")
+                    self.index_store = SimpleIndexStore()
 
-        try:
-            self.doc_store = SimpleDocumentStore.from_persist_dir(
-                persist_dir=str(local_data_path)
-            )
-        except FileNotFoundError:
-            logger.debug("Local document store not found, creating a new one")
-            self.doc_store = SimpleDocumentStore()
+                try:
+                    self.doc_store = SimpleDocumentStore.from_persist_dir(
+                        persist_dir=str(local_data_path)
+                    )
+                except FileNotFoundError:
+                    logger.debug("Local document store not found, creating a new one")
+                    self.doc_store = SimpleDocumentStore()
+
+            case "postgres":
+                try:
+                    from llama_index.core.storage.docstore.postgres_docstore import (
+                        PostgresDocumentStore,
+                    )
+                    from llama_index.core.storage.index_store.postgres_index_store import (
+                        PostgresIndexStore,
+                    )
+                except ImportError:
+                    raise ImportError(
+                        "Postgres dependencies not found, install with `poetry install --extras storage-nodestore-postgres`"
+                    ) from None
+
+                if settings.postgres is None:
+                    raise ValueError("Postgres index/doc store settings not found.")
+
+                self.index_store = PostgresIndexStore.from_params(
+                    **settings.postgres.model_dump(exclude_none=True)
+                )
+                self.doc_store = PostgresDocumentStore.from_params(
+                    **settings.postgres.model_dump(exclude_none=True)
+                )
+
+            case _:
+                # Should be unreachable
+                # The settings validator should have caught this
+                raise ValueError(
+                    f"Database {settings.nodestore.database} not supported"
+                )
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 62af3f3..3fe6750 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -108,6 +108,10 @@ class VectorstoreSettings(BaseModel):
     database: Literal["chroma", "qdrant", "pgvector"]
 
 
+class NodeStoreSettings(BaseModel):
+    database: Literal["simple", "postgres"]
+
+
 class LlamaCPPSettings(BaseModel):
     llm_hf_repo_id: str
     llm_hf_model_file: str
@@ -249,7 +253,7 @@ class UISettings(BaseModel):
     )
 
 
-class PGVectorSettings(BaseModel):
+class PostgresSettings(BaseModel):
     host: str = Field(
         "localhost",
         description="The server hosting the Postgres database",
@@ -270,14 +274,17 @@ class PGVectorSettings(BaseModel):
         "postgres",
         description="The database to use to connect to the Postgres database",
     )
+    schema_name: str = Field(
+        "public",
+        description="The name of the schema in the Postgres database to use",
+    )
+
+
+class PGVectorSettings(PostgresSettings):
     embed_dim: int = Field(
         384,
         description="The dimension of the embeddings stored in the Postgres database",
     )
-    schema_name: str = Field(
-        "public",
-        description="The name of the schema in the Postgres database where the embeddings are stored",
-    )
     table_name: str = Field(
         "embeddings",
         description="The name of the table in the Postgres database where the embeddings are stored",
@@ -350,7 +357,9 @@ class Settings(BaseModel):
     openai: OpenAISettings
     ollama: OllamaSettings
     vectorstore: VectorstoreSettings
+    nodestore: NodeStoreSettings
     qdrant: QdrantSettings | None = None
+    postgres: PostgresSettings | None = None
     pgvector: PGVectorSettings | None = None
 
 
diff --git a/pyproject.toml b/pyproject.toml
index c65afbf..1391fb2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,12 @@ llama-index-embeddings-openai = {version ="^0.1.6", optional = true}
 llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true}
 llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true}
 llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true}
+llama-index-storage-docstore-postgres = {version ="^0.1.2", optional = true}
+llama-index-storage-index-store-postgres = {version ="^0.1.2", optional = true}
+# Postgres
+psycopg2-binary = {version ="^2.9.9", optional = true}
+asyncpg = {version="^0.29.0", optional = true}
+
 # Optional Sagemaker dependency
 boto3 = {version ="^1.34.51", optional = true}
 # Optional UI
@@ -46,7 +52,7 @@ embeddings-sagemaker = ["boto3"]
 vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 vector-stores-chroma = ["llama-index-vector-stores-chroma"]
 vector-stores-postgres = ["llama-index-vector-stores-postgres"]
-
+storage-nodestore-postgres = ["llama-index-storage-docstore-postgres","llama-index-storage-index-store-postgres","psycopg2-binary","asyncpg"]
 
 [tool.poetry.group.dev.dependencies]
 black = "^22"
diff --git a/settings-ollama-pg.yaml b/settings-ollama-pg.yaml
new file mode 100644
index 0000000..2bef97d
--- /dev/null
+++ b/settings-ollama-pg.yaml
@@ -0,0 +1,43 @@
+# Using ollama and postgres for the vector, doc and index store. Ollama is also used for embeddings.
+# To use this configuration, install these extras:
+# poetry install --extras "llms-ollama ui vector-stores-postgres embeddings-ollama storage-nodestore-postgres"
+server:
+  env_name: ${APP_ENV:ollama}
+
+llm:
+  mode: ollama
+  max_new_tokens: 512
+  context_window: 3900
+
+embedding:
+  mode: ollama
+
+ollama:
+  llm_model: mistral
+  embedding_model: nomic-embed-text
+  api_base: http://localhost:11434
+
+nodestore:
+  database: postgres
+
+vectorstore:
+  database: pgvector
+
+pgvector:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: admin
+  embed_dim: 768
+  schema_name: private_gpt
+  table_name: embeddings
+
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: admin
+  schema_name: private_gpt
+
diff --git a/settings.yaml b/settings.yaml
index 0a3121f..ab04843 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -62,6 +62,9 @@ huggingface:
 vectorstore:
   database: qdrant
 
+nodestore:
+  database: simple
+
 qdrant:
   path: local_data/private_gpt/qdrant
 
@@ -75,6 +78,14 @@ pgvector:
   schema_name: private_gpt
   table_name: embeddings
 
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: postgres
+  schema_name: private_gpt
+
 sagemaker:
   llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
   embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479