Parse JSON files using llama_index JSONReader (#1176)

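Extend a copy of llama_index's default file-reader mapping so that `.json` files are loaded with JSONReader. Previously JSON files had no dedicated reader and fell back to being ingested as plain text; parsing them as JSON should improve the model's comprehension of their contents.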
2023-11-07 15:39:40 +01:00 · 2023-11-07 15:39:40 +01:00 · 23cd3fea10
parent 0c40cfb115
commit 23cd3fea10
1 changed files with 15 additions and 1 deletions
--- a/private_gpt/server/ingest/ingest_service.py
+++ b/private_gpt/server/ingest/ingest_service.py
@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, AnyStr
 from injector import inject, singleton
 from llama_index import (
    Document,
    JSONReader,
    ServiceContext,
    StorageContext,
    StringIterableReader,
@ -27,6 +28,14 @@ from private_gpt.paths import local_data_path
 if TYPE_CHECKING:
    from llama_index.readers.base import BaseReader
 # Patching the default file reader to support other file types
 FILE_READER_CLS = DEFAULT_FILE_READER_CLS.copy()
 FILE_READER_CLS.update(
    {
        ".json": JSONReader,
    }
 )
 logger = logging.getLogger(__name__)
@ -76,9 +85,13 @@ class IngestService:
    def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
        logger.info("Ingesting file_name=%s", file_name)
        extension = Path(file_name).suffix
-        reader_cls = DEFAULT_FILE_READER_CLS.get(extension)
+        reader_cls = FILE_READER_CLS.get(extension)
        documents: list[Document]
        if reader_cls is None:
            logger.debug(
                "No reader found for extension=%s, using default string reader",
                extension,
            )
            # Read as a plain text
            string_reader = StringIterableReader()
            if isinstance(file_data, Path):
@ -91,6 +104,7 @@ class IngestService:
            else:
                raise ValueError(f"Unsupported data type {type(file_data)}")
        else:
            logger.debug("Specific reader found for extension=%s", extension)
            reader: BaseReader = reader_cls()
            if isinstance(file_data, Path):
                # Already a path, nothing to do