Use `RecursiveCharacterTextSplitter` to avoid the `llama_tokenize: too many tokens` error during ingestion
This commit is contained in:
		
							parent
							
								
									75a1141743
								
							
						
					
					
						commit
						026b9f895c
					
				|  | @ -1,5 +1,5 @@ | ||||||
| from langchain.document_loaders import TextLoader | from langchain.document_loaders import TextLoader | ||||||
| from langchain.text_splitter import CharacterTextSplitter | from langchain.text_splitter import RecursiveCharacterTextSplitter | ||||||
| from langchain.vectorstores import Chroma | from langchain.vectorstores import Chroma | ||||||
| from langchain.embeddings import LlamaCppEmbeddings | from langchain.embeddings import LlamaCppEmbeddings | ||||||
| from sys import argv | from sys import argv | ||||||
|  | @ -8,7 +8,7 @@ def main(): | ||||||
|     # Load document and split in chunks |     # Load document and split in chunks | ||||||
|     loader = TextLoader(argv[1], encoding="utf8") |     loader = TextLoader(argv[1], encoding="utf8") | ||||||
|     documents = loader.load() |     documents = loader.load() | ||||||
|     text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50) |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) | ||||||
|     texts = text_splitter.split_documents(documents) |     texts = text_splitter.split_documents(documents) | ||||||
|     # Create embeddings |     # Create embeddings | ||||||
|     llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin") |     llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin") | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue