feat: prompt_style applied to all LLMs + extra LLM params. (#1835)
* Moved prompt_style up to the main LLM settings, since every LLM backend wired through llama_index can make use of it. Also passed temperature, context window size, max_tokens and max_new_tokens into the openailike implementation so its settings stay consistent with the other implementations.
* Removed prompt_style from llamacpp entirely.
* Fixed settings-local.yaml so that prompt_style sits under the llm settings instead of llamacpp (see the settings sketch below).
parent c1802e7cf0
commit e21bf20c10
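In practice the change just moves the prompt_style key from the llamacpp section of the settings files up to the shared llm section. A minimal sketch of the resulting layout, using the Mistral values already present in settings-local.yaml (any other supported style or model works the same way):

llm:
  mode: llamacpp
  prompt_style: "mistral"        # moved here from the llamacpp section
  max_new_tokens: 512
  context_window: 3900
  tokenizer: mistralai/Mistral-7B-Instruct-v0.2

llamacpp:
  llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
  llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf

Because prompt_style now lives on LLMSettings, the same value also drives the messages_to_prompt / completion_to_prompt hooks passed to OpenAILike in the LLMComponent hunk below.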
@@ -51,7 +51,7 @@ class LLMComponent:
                         "Local dependencies not found, install with `poetry install --extras llms-llama-cpp`"
                     ) from e

-                prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
+                prompt_style = get_prompt_style(settings.llm.prompt_style)
                 settings_kwargs = {
                     "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
                     "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
@@ -109,15 +109,20 @@ class LLMComponent:
                     raise ImportError(
                         "OpenAILike dependencies not found, install with `poetry install --extras llms-openai-like`"
                     ) from e
-
+                prompt_style = get_prompt_style(settings.llm.prompt_style)
                 openai_settings = settings.openai
                 self.llm = OpenAILike(
                     api_base=openai_settings.api_base,
                     api_key=openai_settings.api_key,
                     model=openai_settings.model,
                     is_chat_model=True,
-                    max_tokens=None,
+                    max_tokens=settings.llm.max_new_tokens,
                     api_version="",
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    max_new_tokens=settings.llm.max_new_tokens,
+                    messages_to_prompt=prompt_style.messages_to_prompt,
+                    completion_to_prompt=prompt_style.completion_to_prompt,
                 )
             case "ollama":
                 try:
@@ -104,6 +104,17 @@ class LLMSettings(BaseModel):
         0.1,
         description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
     )
+    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
+        "llama2",
+        description=(
+            "The prompt style to use for the chat engine. "
+            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
+            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
+            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
+            "If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]\n"
+            "`llama2` is the historic behaviour. `default` might work better with your custom models."
+        ),
+    )


 class VectorstoreSettings(BaseModel):
@@ -117,18 +128,6 @@ class NodeStoreSettings(BaseModel):
 class LlamaCPPSettings(BaseModel):
     llm_hf_repo_id: str
     llm_hf_model_file: str
-    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
-        "llama2",
-        description=(
-            "The prompt style to use for the chat engine. "
-            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
-            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
-            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
-            "If `mistral` - use the `mistral prompt style. It shoudl look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]"
-            "`llama2` is the historic behaviour. `default` might work better with your custom models."
-        ),
-    )
-
     tfs_z: float = Field(
         1.0,
         description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
@@ -8,9 +8,9 @@ llm:
   max_new_tokens: 512
   context_window: 3900
   tokenizer: mistralai/Mistral-7B-Instruct-v0.2
+  prompt_style: "mistral"

 llamacpp:
-  prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf

@@ -24,4 +24,4 @@ vectorstore:
   database: qdrant

 qdrant:
-  path: local_data/private_gpt/qdrant
+  path: local_data/private_gpt/qdrant
@@ -36,6 +36,7 @@ ui:

 llm:
   mode: llamacpp
+  prompt_style: "mistral"
   # Should be matching the selected model
   max_new_tokens: 512
   context_window: 3900
@@ -53,7 +54,6 @@ rag:
     top_n: 1

 llamacpp:
-  prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
   tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
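For completeness, the openailike mode now honours the same llm-level knobs that this commit wires into OpenAILike (temperature, context_window, max_new_tokens and prompt_style). A hedged sketch of such a profile; the llm keys are the fields referenced in the diff above, while the openai values (api_base, api_key, model) are placeholders to point at your own OpenAI-compatible server:

llm:
  mode: openailike
  prompt_style: "chatml"         # any of: default, llama2, tag, mistral, chatml
  temperature: 0.1
  context_window: 3900
  max_new_tokens: 512

openai:
  api_base: http://localhost:8000/v1   # placeholder endpoint
  api_key: EMPTY                       # placeholder key
  model: my-local-model                # placeholder model name

With max_tokens now tied to max_new_tokens, and temperature and context_window passed through, an OpenAI-compatible backend picks up the same settings as the llamacpp and ollama implementations.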