diff --git a/ingest.py b/ingest.py index 72b35e4..e8b08e6 100644 --- a/ingest.py +++ b/ingest.py @@ -1,5 +1,5 @@ from langchain.document_loaders import TextLoader -from langchain.text_splitter import CharacterTextSplitter +from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import Chroma from langchain.embeddings import LlamaCppEmbeddings from sys import argv @@ -8,7 +8,7 @@ def main(): # Load document and split in chunks loader = TextLoader(argv[1], encoding="utf8") documents = loader.load() - text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50) + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) texts = text_splitter.split_documents(documents) # Create embeddings llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")