diff --git a/README.md b/README.md
index 21efe29..58d0db1 100644
--- a/README.md
+++ b/README.md
@@ -20,13 +20,12 @@ This repo uses a [state of the union transcript](https://github.com/imartinez/pr
 
 ## Instructions for ingesting your own dataset
 
-Place your .txt file in `source_documents` folder.
-Edit `ingest.py` loader to point it to your document.
+Get your .txt file ready.
 
 Run the following command to ingest the data.
 
 ```shell
-python ingest.py
+python ingest.py <path_to_your_txt_file>
 ```
 
 It will create a `db` folder containing the local vectorstore. Will take time, depending on the size of your document.
diff --git a/ingest.py b/ingest.py
index e5d27b3..72b35e4 100644
--- a/ingest.py
+++ b/ingest.py
@@ -1,13 +1,14 @@
 from langchain.document_loaders import TextLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings
+from sys import argv
 
 def main():
     # Load document and split in chunks
-    loader = TextLoader('./source_documents/state_of_the_union.txt', encoding='utf8')
+    loader = TextLoader(argv[1], encoding="utf8")
     documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_documents(documents)
     # Create embeddings
     llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
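
For reference, here is how the updated `ingest.py` would read end to end after this change. The hunk above cuts off at the embeddings line, so the tail of `main()` (building the Chroma vectorstore and persisting it to the `db` folder the README mentions) is a sketch inferred from that description, not part of the diff; the `if __name__` guard is likewise assumed.

```python
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings
from sys import argv


def main():
    # Load the document given on the command line and split it into
    # 500-character chunks with 50 characters of overlap
    loader = TextLoader(argv[1], encoding="utf8")
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Embed each chunk with the local llama.cpp model
    llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")

    # Assumed from the README, not shown in the hunk: build the Chroma
    # vectorstore from the chunks and persist it to a local `db` folder
    db = Chroma.from_documents(texts, llama, persist_directory="db")
    db.persist()


if __name__ == "__main__":
    main()
```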
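
One thing worth noting about the `argv` switch: the script now takes the document path as its first positional argument, e.g. `python ingest.py my_document.txt` (filename hypothetical), and will raise an `IndexError` when run with no argument, since `argv[1]` is read unconditionally.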