Endpoint to delete documents ingested (#1163)
A file that is ingested is transformed into several documents (organized into nodes). This endpoint deletes individual documents (pieces of a file). The IDs of these documents can be retrieved via the endpoint that lists all ingested documents.
This commit is contained in:
		
							parent
							
								
									6583dc84c0
								
							
						
					
					
						commit
						0c40cfb115
					
				|  | @ -1,3 +1,5 @@ | ||||||
|  | import logging | ||||||
|  | 
 | ||||||
| from injector import inject, singleton | from injector import inject, singleton | ||||||
| from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore | from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore | ||||||
| from llama_index.storage.index_store import SimpleIndexStore | from llama_index.storage.index_store import SimpleIndexStore | ||||||
|  | @ -5,6 +7,8 @@ from llama_index.storage.index_store.types import BaseIndexStore | ||||||
| 
 | 
 | ||||||
| from private_gpt.paths import local_data_path | from private_gpt.paths import local_data_path | ||||||
| 
 | 
 | ||||||
|  | logger = logging.getLogger(__name__) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| @singleton | @singleton | ||||||
| class NodeStoreComponent: | class NodeStoreComponent: | ||||||
|  | @ -18,6 +22,7 @@ class NodeStoreComponent: | ||||||
|                 persist_dir=str(local_data_path) |                 persist_dir=str(local_data_path) | ||||||
|             ) |             ) | ||||||
|         except FileNotFoundError: |         except FileNotFoundError: | ||||||
|  |             logger.debug("Local index store not found, creating a new one") | ||||||
|             self.index_store = SimpleIndexStore() |             self.index_store = SimpleIndexStore() | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|  | @ -25,4 +30,5 @@ class NodeStoreComponent: | ||||||
|                 persist_dir=str(local_data_path) |                 persist_dir=str(local_data_path) | ||||||
|             ) |             ) | ||||||
|         except FileNotFoundError: |         except FileNotFoundError: | ||||||
|  |             logger.debug("Local document store not found, creating a new one") | ||||||
|             self.doc_store = SimpleDocumentStore() |             self.doc_store = SimpleDocumentStore() | ||||||
|  |  | ||||||
|  | @ -47,3 +47,14 @@ def list_ingested() -> IngestResponse: | ||||||
|     service = root_injector.get(IngestService) |     service = root_injector.get(IngestService) | ||||||
|     ingested_documents = service.list_ingested() |     ingested_documents = service.list_ingested() | ||||||
|     return IngestResponse(object="list", model="private-gpt", data=ingested_documents) |     return IngestResponse(object="list", model="private-gpt", data=ingested_documents) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"])
def delete_ingested(doc_id: str) -> None:
    """Delete a single ingested Document from the storage context.

    Obtain valid `doc_id` values from the `GET /ingest/list` endpoint;
    deletion removes the document from the underlying stores.
    """
    # Resolve the ingestion service from the DI container and delegate.
    ingest_service = root_injector.get(IngestService)
    ingest_service.delete(doc_id)
|  |  | ||||||
|  | @ -1,3 +1,4 @@ | ||||||
|  | import logging | ||||||
| import tempfile | import tempfile | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typing import TYPE_CHECKING, Any, AnyStr | from typing import TYPE_CHECKING, Any, AnyStr | ||||||
|  | @ -9,6 +10,7 @@ from llama_index import ( | ||||||
|     StorageContext, |     StorageContext, | ||||||
|     StringIterableReader, |     StringIterableReader, | ||||||
|     VectorStoreIndex, |     VectorStoreIndex, | ||||||
|  |     load_index_from_storage, | ||||||
| ) | ) | ||||||
| from llama_index.node_parser import SentenceWindowNodeParser | from llama_index.node_parser import SentenceWindowNodeParser | ||||||
| from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS | from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS | ||||||
|  | @ -25,6 +27,8 @@ from private_gpt.paths import local_data_path | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     from llama_index.readers.base import BaseReader |     from llama_index.readers.base import BaseReader | ||||||
| 
 | 
 | ||||||
|  | logger = logging.getLogger(__name__) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class IngestedDoc(BaseModel): | class IngestedDoc(BaseModel): | ||||||
|     object: str = Field(enum=["ingest.document"]) |     object: str = Field(enum=["ingest.document"]) | ||||||
|  | @ -70,6 +74,7 @@ class IngestService: | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: |     def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: | ||||||
|  |         logger.info("Ingesting file_name=%s", file_name) | ||||||
|         extension = Path(file_name).suffix |         extension = Path(file_name).suffix | ||||||
|         reader_cls = DEFAULT_FILE_READER_CLS.get(extension) |         reader_cls = DEFAULT_FILE_READER_CLS.get(extension) | ||||||
|         documents: list[Document] |         documents: list[Document] | ||||||
|  | @ -100,7 +105,9 @@ class IngestService: | ||||||
|                     else: |                     else: | ||||||
|                         path_to_tmp.write_text(str(file_data)) |                         path_to_tmp.write_text(str(file_data)) | ||||||
|                     documents = reader.load_data(path_to_tmp) |                     documents = reader.load_data(path_to_tmp) | ||||||
| 
 |         logger.info( | ||||||
|  |             "Transformed file=%s into count=%s documents", file_name, len(documents) | ||||||
|  |         ) | ||||||
|         for document in documents: |         for document in documents: | ||||||
|             document.metadata["file_name"] = file_name |             document.metadata["file_name"] = file_name | ||||||
|         return self._save_docs(documents) |         return self._save_docs(documents) | ||||||
|  | @ -153,7 +160,26 @@ class IngestService: | ||||||
|                         doc_metadata=doc_metadata, |                         doc_metadata=doc_metadata, | ||||||
|                     ) |                     ) | ||||||
|                 ) |                 ) | ||||||
|             return ingested_docs |  | ||||||
|         except ValueError: |         except ValueError: | ||||||
|  |             logger.warning("Got an exception when getting list of docs", exc_info=True) | ||||||
|             pass |             pass | ||||||
|  |         logger.debug("Found count=%s ingested documents", len(ingested_docs)) | ||||||
|         return ingested_docs |         return ingested_docs | ||||||
|  | 
 | ||||||
def delete(self, doc_id: str) -> None:
    """Delete an ingested document.

    :raises ValueError: if the document does not exist
    """
    logger.info(
        "Deleting the ingested document=%s in the doc and index store", doc_id
    )

    # store_nodes_override=True is required so the loaded index exposes its
    # nodes and allows them to be removed.
    loaded_index = load_index_from_storage(
        self.storage_context, store_nodes_override=True
    )

    # Remove the document's nodes from the index and the docstore together.
    loaded_index.delete_ref_doc(doc_id, delete_from_docstore=True)

    # Persist the updated storage context back to local disk.
    self.storage_context.persist(persist_dir=local_data_path)
|  |  | ||||||
		Loading…
	
		Reference in New Issue