feat(llm): adds several settings for llamacpp and ollama (#1703)

icsy7867 2024-03-11 17:51:05 -04:00 committed by GitHub
parent 410bf7a71f
commit 02dc83e8e9
10 changed files with 91 additions and 8 deletions

View File

@@ -1,4 +1,5 @@
 """private-gpt."""
 import logging
 import os

View File

@@ -39,16 +39,23 @@ class LLMComponent:
                     ) from e
                 prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
+                settings_kwargs = {
+                    "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
+                    "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
+                    "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
+                    "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama llama-cpp
+                    "n_gpu_layers": -1,
+                    "offload_kqv": True,
+                }
                 self.llm = LlamaCPP(
                     model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
-                    temperature=0.1,
+                    temperature=settings.llm.temperature,
                     max_new_tokens=settings.llm.max_new_tokens,
                     context_window=settings.llm.context_window,
                     generate_kwargs={},
                     callback_manager=LlamaIndexSettings.callback_manager,
                     # All to GPU
-                    model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
+                    model_kwargs=settings_kwargs,
                     # transform inputs into Llama2 format
                     messages_to_prompt=prompt_style.messages_to_prompt,
                     completion_to_prompt=prompt_style.completion_to_prompt,
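
For reference, the same options can be exercised with llama-cpp-python directly. A minimal sketch, assuming a 2024-era llama-cpp-python API and a local GGUF file (not part of this commit): the GPU options are load-time arguments, while the sampling options are per-request.

# Sketch only, not project code: standalone llama-cpp-python call using the same options.
# The model path, n_ctx value and exact parameter names are assumptions.
from llama_cpp import Llama

llm = Llama(
    model_path="models/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=3900,
    n_gpu_layers=-1,   # offload every layer to the GPU
    offload_kqv=True,  # keep the KV cache on the GPU as well
)
completion = llm(
    "Q: What does tail free sampling do? A:",
    max_tokens=512,
    temperature=0.1,
    tfs_z=1.0,
    top_k=40,
    top_p=0.9,
    repeat_penalty=1.1,
)
print(completion["choices"][0]["text"])
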
@@ -108,8 +115,22 @@ class LLMComponent:
                     ) from e
                 ollama_settings = settings.ollama
+                settings_kwargs = {
+                    "tfs_z": ollama_settings.tfs_z,  # ollama and llama-cpp
+                    "num_predict": ollama_settings.num_predict,  # ollama only
+                    "top_k": ollama_settings.top_k,  # ollama and llama-cpp
+                    "top_p": ollama_settings.top_p,  # ollama and llama-cpp
+                    "repeat_last_n": ollama_settings.repeat_last_n,  # ollama
+                    "repeat_penalty": ollama_settings.repeat_penalty,  # ollama llama-cpp
+                }
                 self.llm = Ollama(
-                    model=ollama_settings.llm_model, base_url=ollama_settings.api_base
+                    model=ollama_settings.llm_model,
+                    base_url=ollama_settings.api_base,
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    additional_kwargs=settings_kwargs,
                 )
             case "mock":
                 self.llm = MockLLM()
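
The additional_kwargs above are expected to be forwarded as the Ollama request options. A minimal sketch of the equivalent direct call to Ollama's /api/generate endpoint (assuming a local Ollama server with the mistral model pulled; not project code):

# Sketch only: the same options sent straight to Ollama's REST API.
import requests

options = {
    "temperature": 0.1,
    "tfs_z": 1.0,           # shared with llama-cpp
    "num_predict": 128,     # ollama only: max tokens to generate
    "top_k": 40,            # shared with llama-cpp
    "top_p": 0.9,           # shared with llama-cpp
    "repeat_last_n": 64,    # ollama only: lookback window for the repeat penalty
    "repeat_penalty": 1.1,  # shared with llama-cpp
}
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "mistral", "prompt": "Why is the sky blue?", "stream": False, "options": options},
    timeout=120,
)
print(resp.json()["response"])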

View File

@@ -137,9 +137,11 @@ class VectorStoreComponent:
             index=index,
             similarity_top_k=similarity_top_k,
             doc_ids=context_filter.docs_ids if context_filter else None,
-            filters=_doc_id_metadata_filter(context_filter)
-            if self.settings.vectorstore.database != "qdrant"
-            else None,
+            filters=(
+                _doc_id_metadata_filter(context_filter)
+                if self.settings.vectorstore.database != "qdrant"
+                else None
+            ),
         )

     def close(self) -> None:

View File

@@ -1,4 +1,5 @@
 """FastAPI app creation, logger configuration and main API routes."""
 import logging
 from fastapi import Depends, FastAPI, Request

View File

@@ -12,6 +12,7 @@ Authorization can be done by following fastapi's guides:
 * https://fastapi.tiangolo.com/tutorial/security/
 * https://fastapi.tiangolo.com/tutorial/dependencies/dependencies-in-path-operation-decorators/
 """
 # mypy: ignore-errors
 # Disabled mypy error: All conditional function variants must have identical signatures
 # We are changing the implementation of the authenticated method, based on

View File

@@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
         "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
         "gpt-3.5-turbo LLM.",
     )
+    temperature: float = Field(
+        0.1,
+        description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
+    )
 class VectorstoreSettings(BaseModel):
@@ -119,6 +123,23 @@ class LlamaCPPSettings(BaseModel):
         ),
     )
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )
 class HuggingFaceSettings(BaseModel):
     embedding_hf_model_name: str = Field(
@@ -184,6 +205,30 @@ class OllamaSettings(BaseModel):
         None,
         description="Model to use. Example: 'nomic-embed-text'.",
     )
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    num_predict: int = Field(
+        None,
+        description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_last_n: int = Field(
+        64,
+        description="Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )
 class UISettings(BaseModel):
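
The top_k and top_p descriptions above refer to standard truncation sampling. A small self-contained sketch of how the two filters interact (illustrative only; it is not how llama.cpp or Ollama implement sampling internally):

# Sketch only: top-k keeps the k most likely tokens, then top-p (nucleus) keeps the
# smallest subset of those whose cumulative probability reaches p.
import math

def filter_top_k_top_p(logits: dict[str, float], top_k: int = 40, top_p: float = 0.9) -> dict[str, float]:
    # 1) keep only the top_k highest-scoring tokens
    kept = sorted(logits.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
    # 2) renormalize the survivors into probabilities
    total = sum(math.exp(score) for _, score in kept)
    probs = [(token, math.exp(score) / total) for token, score in kept]
    # 3) keep the smallest prefix whose cumulative probability reaches top_p
    nucleus: dict[str, float] = {}
    cumulative = 0.0
    for token, p in probs:
        nucleus[token] = p
        cumulative += p
        if cumulative >= top_p:
            break
    return nucleus

# lower top_p -> fewer candidates survive -> more focused, conservative text
print(filter_top_k_top_p({"the": 2.0, "a": 1.5, "cat": 0.5, "dog": -1.0}, top_k=3, top_p=0.9))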

View File

@@ -1,4 +1,5 @@
 """This file should be imported only and only if you want to run the UI locally."""
 import itertools
 import logging
 import time

View File

@@ -5,6 +5,7 @@ llm:
   mode: ollama
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1 #The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
 embedding:
   mode: ollama
@@ -13,10 +14,14 @@ ollama:
   llm_model: mistral
   embedding_model: nomic-embed-text
   api_base: http://localhost:11434
+  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
+  repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
 vectorstore:
   database: qdrant
 qdrant:
   path: local_data/private_gpt/qdrant

View File

@@ -39,11 +39,16 @@ llm:
   # Should be matching the selected model
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
 llamacpp:
   prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
+  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
+  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
 embedding:
   # Should be matching the value above in most cases

View File

@@ -5,6 +5,7 @@ NOTE: We are not testing the switch based on the config in
 is currently architecture (it is hard to patch the `settings` and the app while
 the tests are directly importing them).
 """
 from typing import Annotated
 import pytest