Parse JSON files using llama_index JSONReader (#1176)

Patch llama_index's default file-reader mapping to support JSON files.
Parsing JSON files with JSONReader, instead of ingesting them as plain
text, should improve comprehension of their content.
This commit is contained in:
lopagela 2023-11-07 15:39:40 +01:00 committed by GitHub
parent 0c40cfb115
commit 23cd3fea10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 15 additions and 1 deletion

View File

@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, AnyStr
from injector import inject, singleton
from llama_index import (
Document,
JSONReader,
ServiceContext,
StorageContext,
StringIterableReader,
@ -27,6 +28,14 @@ from private_gpt.paths import local_data_path
if TYPE_CHECKING:
from llama_index.readers.base import BaseReader
# Patching the default file reader to support other file types
FILE_READER_CLS = DEFAULT_FILE_READER_CLS.copy()
FILE_READER_CLS.update(
{
".json": JSONReader,
}
)
logger = logging.getLogger(__name__)
@ -76,9 +85,13 @@ class IngestService:
def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
logger.info("Ingesting file_name=%s", file_name)
extension = Path(file_name).suffix
reader_cls = DEFAULT_FILE_READER_CLS.get(extension)
reader_cls = FILE_READER_CLS.get(extension)
documents: list[Document]
if reader_cls is None:
logger.debug(
"No reader found for extension=%s, using default string reader",
extension,
)
# Read as a plain text
string_reader = StringIterableReader()
if isinstance(file_data, Path):
@ -91,6 +104,7 @@ class IngestService:
else:
raise ValueError(f"Unsupported data type {type(file_data)}")
else:
logger.debug("Specific reader found for extension=%s", extension)
reader: BaseReader = reader_cls()
if isinstance(file_data, Path):
# Already a path, nothing to do