From 026b9f895cfb727da523a20c59773146801236ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20Mart=C3=ADnez?=
Date: Tue, 9 May 2023 00:20:42 +0200
Subject: [PATCH] Use RecursiveCharacterTextSplitter to avoid llama_tokenize:
 too many tokens error during ingestion

---
 ingest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ingest.py b/ingest.py
index 72b35e4..e8b08e6 100644
--- a/ingest.py
+++ b/ingest.py
@@ -1,5 +1,5 @@
 from langchain.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings
 from sys import argv
@@ -8,7 +8,7 @@ def main():
     # Load document and split in chunks
     loader = TextLoader(argv[1], encoding="utf8")
     documents = loader.load()
-    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_documents(documents)
     # Create embeddings
     llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
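
A minimal sketch of why this change fixes the error, assuming the same langchain version the patch targets (the `long_paragraph` input below is a made-up example, not part of the repo): CharacterTextSplitter splits only on its single default separator ("\n\n"), so a long passage with no blank lines comes back as one oversized chunk that later overflows llama_tokenize. RecursiveCharacterTextSplitter falls back through "\n\n", "\n", " ", and "" until every chunk fits within chunk_size.

# sketch.py -- illustrative only, not part of the patch
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# One long paragraph with no "\n\n" separators (~10,000 characters).
long_paragraph = "word " * 2000

flat = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
recursive = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# CharacterTextSplitter finds no "\n\n" to split on, so it emits a single
# oversized chunk (it only warns that the chunk exceeds chunk_size).
print(max(len(c) for c in flat.split_text(long_paragraph)))       # ~10000

# RecursiveCharacterTextSplitter falls back to finer separators until
# every chunk fits, so nothing exceeds the embedding model's window.
print(max(len(c) for c in recursive.split_text(long_paragraph)))  # <= 500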