Use RecursiveCharacterTextSplitter to avoid llama_tokenize: too many tokens error during ingestion
parent 75a1141743
commit 026b9f895c
@@ -1,5 +1,5 @@
 from langchain.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings
 from sys import argv
@@ -8,7 +8,7 @@ def main():
     # Load document and split in chunks
     loader = TextLoader(argv[1], encoding="utf8")
     documents = loader.load()
-    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_documents(documents)
     # Create embeddings
     llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
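Why the change resolves the error: CharacterTextSplitter splits only on a single separator ("\n\n" by default), so any passage longer than chunk_size with no blank line in it passes through as one oversized chunk with nothing but a warning, and embedding that chunk can exceed the llama.cpp context window, triggering llama_tokenize: too many tokens. RecursiveCharacterTextSplitter instead falls back through progressively finer separators (paragraphs, lines, words, characters) until every chunk fits. A minimal sketch of that behavior, assuming the same legacy langchain.text_splitter API used in the diff; sample_text is a hypothetical stand-in for a loaded document:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# One long run of text with no blank lines: CharacterTextSplitter would
# emit this as a single ~2000-character chunk and only log a warning.
sample_text = "word " * 400

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # max characters per chunk, matching the diff
    chunk_overlap=50,  # characters shared between consecutive chunks
    # Default separator hierarchy: paragraphs, lines, words, characters.
    separators=["\n\n", "\n", " ", ""],
)

chunks = splitter.split_text(sample_text)

# Every chunk now respects chunk_size, so each embedding call stays
# within the model's context window.
assert all(len(chunk) <= 500 for chunk in chunks)
print(len(chunks), max(len(chunk) for chunk in chunks))

Note that chunk_size here counts characters, not tokens; 500 characters typically maps to well under llama.cpp's context limit, and lowering chunk_size further is the natural knob if the error ever recurs with denser text.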