Use chunk_size variable in logs. Make vectorstore check more flexible
This commit is contained in:
		
							parent
							
								
									fca1128fba
								
							
						
					
					
						commit
						4a0e0d2e70
					
				
							
								
								
									
										11
									
								
								ingest.py
								
								
								
								
							
							
						
						
									
										11
									
								
								ingest.py
								
								
								
								
							|  | @ -37,6 +37,8 @@ embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME') | ||||||
| chunk_size = 500 | chunk_size = 500 | ||||||
| chunk_overlap = 50 | chunk_overlap = 50 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | # Custom document loaders | ||||||
| class MyElmLoader(UnstructuredEmailLoader): | class MyElmLoader(UnstructuredEmailLoader): | ||||||
|     """Wrapper to fallback to text/plain when default does not work""" |     """Wrapper to fallback to text/plain when default does not work""" | ||||||
| 
 | 
 | ||||||
|  | @ -79,8 +81,6 @@ LOADER_MAPPING = { | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| load_dotenv() |  | ||||||
| 
 |  | ||||||
| def load_single_document(file_path: str) -> Document: | def load_single_document(file_path: str) -> Document: | ||||||
|     ext = "." + file_path.rsplit(".", 1)[-1] |     ext = "." + file_path.rsplit(".", 1)[-1] | ||||||
|     if ext in LOADER_MAPPING: |     if ext in LOADER_MAPPING: | ||||||
|  | @ -123,7 +123,7 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]: | ||||||
|     print(f"Loaded {len(documents)} new documents from {source_directory}") |     print(f"Loaded {len(documents)} new documents from {source_directory}") | ||||||
|     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) | ||||||
|     texts = text_splitter.split_documents(documents) |     texts = text_splitter.split_documents(documents) | ||||||
|     print(f"Split into {len(texts)} chunks of text (max. 500 tokens each)") |     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)") | ||||||
|     return texts |     return texts | ||||||
| 
 | 
 | ||||||
| def does_vectorstore_exist(persist_directory: str) -> bool: | def does_vectorstore_exist(persist_directory: str) -> bool: | ||||||
|  | @ -131,10 +131,11 @@ def does_vectorstore_exist(persist_directory: str) -> bool: | ||||||
|     Checks if vectorstore exists |     Checks if vectorstore exists | ||||||
|     """ |     """ | ||||||
|     if os.path.exists(os.path.join(persist_directory, 'index')): |     if os.path.exists(os.path.join(persist_directory, 'index')): | ||||||
|          if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')): |         if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')): | ||||||
|             list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin')) |             list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin')) | ||||||
|             list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl')) |             list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl')) | ||||||
|             if len(list_index_files) == 4: |             # A working vectorstore has more than 3 index files (.bin/.pkl) | ||||||
|  |             if len(list_index_files) > 3: | ||||||
|                 return True |                 return True | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue