Use RecursiveCharacterTextSplitter to avoid the "llama_tokenize: too many tokens" error during ingestion

Iván Martínez 2023-05-09 00:20:42 +02:00
parent 75a1141743
commit 026b9f895c
1 changed file with 2 additions and 2 deletions

@@ -1,5 +1,5 @@
 from langchain.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings
 from sys import argv
@@ -8,7 +8,7 @@ def main():
     # Load document and split in chunks
     loader = TextLoader(argv[1], encoding="utf8")
     documents = loader.load()
-    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_documents(documents)
     # Create embeddings
     llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
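
Why this fixes the error: CharacterTextSplitter splits on a single separator ("\n\n" by default) and will emit a chunk larger than chunk_size whenever one paragraph alone exceeds it; embedding such an oversized chunk is what triggers "llama_tokenize: too many tokens". RecursiveCharacterTextSplitter instead tries a prioritized list of separators ("\n\n", "\n", " ", "") and recurses into any piece that is still too large, so every chunk it emits stays within chunk_size.

Below is a minimal sketch of the full ingestion script after this change. The import paths match the LangChain layout in use at the time of this commit (newer releases moved these classes to langchain_community and langchain_text_splitters), and the Chroma persistence step at the end is an assumption, since it falls outside the lines shown in the diff.

from sys import argv

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings


def main():
    # Load the document passed on the command line and split it into chunks.
    loader = TextLoader(argv[1], encoding="utf8")
    documents = loader.load()

    # Recursively split on "\n\n", "\n", " ", then "" until every chunk
    # fits within chunk_size, so no chunk can overflow llama_tokenize.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Create embeddings with llama.cpp.
    llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")

    # Assumed persistence step (not shown in the diff): embed the chunks
    # and store them in a local Chroma database.
    db = Chroma.from_documents(texts, llama, persist_directory="db")
    db.persist()


if __name__ == "__main__":
    main()

Run it as, e.g., python ingest.py source_document.txt (script name assumed; the diff does not show the file name).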