feat(llm): adds several settings for llamacpp and ollama (#1703)
Parent: 410bf7a71f
Commit: 02dc83e8e9
@@ -1,4 +1,5 @@
 """private-gpt."""
+
 import logging
 import os
 
@@ -39,16 +39,23 @@ class LLMComponent:
                     ) from e

                 prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
+                settings_kwargs = {
+                    "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
+                    "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
+                    "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
+                    "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama llama-cpp
+                    "n_gpu_layers": -1,
+                    "offload_kqv": True,
+                }
                 self.llm = LlamaCPP(
                     model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
-                    temperature=0.1,
+                    temperature=settings.llm.temperature,
                     max_new_tokens=settings.llm.max_new_tokens,
                     context_window=settings.llm.context_window,
                     generate_kwargs={},
                     callback_manager=LlamaIndexSettings.callback_manager,
                     # All to GPU
-                    model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
+                    model_kwargs=settings_kwargs,
                     # transform inputs into Llama2 format
                     messages_to_prompt=prompt_style.messages_to_prompt,
                     completion_to_prompt=prompt_style.completion_to_prompt,
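
Aside from the diff itself, a quick illustration of what the new top_k / top_p knobs control. This is a toy sketch with made-up numbers, not private-gpt or llama.cpp code: both settings prune the next-token distribution before sampling.

# Toy sketch (made-up values, not repository code): top_k keeps only the k most
# probable tokens, top_p then keeps the smallest prefix whose cumulative
# probability reaches p, and the survivors are renormalized before sampling.
def filter_top_k_top_p(probs: dict[str, float], top_k: int, top_p: float) -> dict[str, float]:
    ranked = sorted(probs.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
    kept, cumulative = [], 0.0
    for token, p in ranked:
        kept.append((token, p))
        cumulative += p
        if cumulative >= top_p:
            break
    total = sum(p for _, p in kept)
    return {token: p / total for token, p in kept}

probs = {"the": 0.5, "a": 0.2, "cat": 0.15, "dog": 0.1, "xylophone": 0.05}
print(filter_top_k_top_p(probs, top_k=40, top_p=0.9))  # drops only the long tail
print(filter_top_k_top_p(probs, top_k=2, top_p=1.0))   # keeps just "the" and "a"
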
@@ -108,8 +115,22 @@ class LLMComponent:
                     ) from e

                 ollama_settings = settings.ollama
+
+                settings_kwargs = {
+                    "tfs_z": ollama_settings.tfs_z,  # ollama and llama-cpp
+                    "num_predict": ollama_settings.num_predict,  # ollama only
+                    "top_k": ollama_settings.top_k,  # ollama and llama-cpp
+                    "top_p": ollama_settings.top_p,  # ollama and llama-cpp
+                    "repeat_last_n": ollama_settings.repeat_last_n,  # ollama
+                    "repeat_penalty": ollama_settings.repeat_penalty,  # ollama llama-cpp
+                }
+
                 self.llm = Ollama(
-                    model=ollama_settings.llm_model, base_url=ollama_settings.api_base
+                    model=ollama_settings.llm_model,
+                    base_url=ollama_settings.api_base,
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    additional_kwargs=settings_kwargs,
                 )
             case "mock":
                 self.llm = MockLLM()
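
For context rather than as part of the commit: the keys collected into settings_kwargs line up with Ollama's request options, so they can be sanity-checked against a running server directly. A hedged sketch, assuming a local Ollama instance on the default port with the mistral model already pulled:

# Hedged sketch: assumes a local Ollama server at http://localhost:11434 with
# the "mistral" model pulled. The options dict mirrors the settings above;
# the values are examples, not recommendations.
import requests

response = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "mistral",
        "prompt": "Name three HTTP methods.",
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_ctx": 3900,
            "num_predict": 128,
            "tfs_z": 1.0,
            "top_k": 40,
            "top_p": 0.9,
            "repeat_last_n": 64,
            "repeat_penalty": 1.2,
        },
    },
    timeout=120,
)
response.raise_for_status()
print(response.json()["response"])
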
@@ -137,9 +137,11 @@ class VectorStoreComponent:
             index=index,
             similarity_top_k=similarity_top_k,
             doc_ids=context_filter.docs_ids if context_filter else None,
-            filters=_doc_id_metadata_filter(context_filter)
-            if self.settings.vectorstore.database != "qdrant"
-            else None,
+            filters=(
+                _doc_id_metadata_filter(context_filter)
+                if self.settings.vectorstore.database != "qdrant"
+                else None
+            ),
         )

     def close(self) -> None:
@@ -1,4 +1,5 @@
 """FastAPI app creation, logger configuration and main API routes."""
+
 import logging

 from fastapi import Depends, FastAPI, Request
@@ -12,6 +12,7 @@ Authorization can be done by following fastapi's guides:
 * https://fastapi.tiangolo.com/tutorial/security/
 * https://fastapi.tiangolo.com/tutorial/dependencies/dependencies-in-path-operation-decorators/
 """
+
 # mypy: ignore-errors
 # Disabled mypy error: All conditional function variants must have identical signatures
 # We are changing the implementation of the authenticated method, based on
@@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
         "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
         "gpt-3.5-turbo LLM.",
     )
+    temperature: float = Field(
+        0.1,
+        description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
+    )


 class VectorstoreSettings(BaseModel):
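
A side note on the new temperature field: with pydantic, the first positional argument to Field is the default, and a value parsed from the settings YAML overrides it. A minimal sketch using a hypothetical stand-in model, not the actual Settings classes:

# Minimal pydantic sketch; LLMSettingsSketch is a hypothetical stand-in, used
# only to show default-vs-override behaviour.
from pydantic import BaseModel, Field

class LLMSettingsSketch(BaseModel):
    temperature: float = Field(
        0.1,
        description="The temperature of the model.",
    )

print(LLMSettingsSketch().temperature)                 # 0.1 -> the default applies
print(LLMSettingsSketch(temperature=0.7).temperature)  # 0.7 -> e.g. set in a profile YAML
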
@@ -119,6 +123,23 @@ class LlamaCPPSettings(BaseModel):
         ),
     )
+
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )


 class HuggingFaceSettings(BaseModel):
     embedding_hf_model_name: str = Field(
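
Of the new fields, tfs_z is the least self-explanatory. A rough, simplified illustration of tail free sampling follows; it is not llama.cpp's exact implementation, just the general idea of cutting the flat tail of the distribution based on its curvature:

# Rough sketch of tail free sampling (tfs_z); simplified, not llama.cpp's exact
# code. Tokens are sorted by probability, the absolute, normalized second
# differences locate where the tail flattens out, and everything past the point
# where their cumulative sum exceeds z is dropped. z = 1.0 disables the filter.
def tail_free_filter(probs: list[float], z: float) -> list[float]:
    ranked = sorted(probs, reverse=True)
    if z >= 1.0 or len(ranked) <= 3:
        return ranked
    first = [ranked[i] - ranked[i + 1] for i in range(len(ranked) - 1)]
    second = [abs(first[i] - first[i + 1]) for i in range(len(first) - 1)]
    total = sum(second) or 1.0
    keep, cumulative = len(ranked), 0.0
    for i, s in enumerate(second):
        cumulative += s / total
        if cumulative > z:
            keep = i + 1
            break
    kept = ranked[:keep]
    norm = sum(kept)
    return [p / norm for p in kept]

probs = [0.45, 0.25, 0.15, 0.05, 0.04, 0.03, 0.02, 0.01]
print(tail_free_filter(probs, z=1.0))  # unchanged: filter disabled
print(tail_free_filter(probs, z=0.9))  # the flat low-probability tail is dropped
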
@@ -184,6 +205,30 @@ class OllamaSettings(BaseModel):
         None,
         description="Model to use. Example: 'nomic-embed-text'.",
     )
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    num_predict: int = Field(
+        None,
+        description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_last_n: int = Field(
+        64,
+        description="Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )


 class UISettings(BaseModel):
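
The repeat_last_n / repeat_penalty pair is easiest to see on raw scores. A simplified sketch, not the llama.cpp implementation: tokens that appeared among the last repeat_last_n generated tokens get their scores pushed down by repeat_penalty before the next token is chosen.

# Simplified sketch of repeat_last_n / repeat_penalty (not llama.cpp's exact
# code). Score values and token strings are made up for illustration.
def penalize_repeats(
    logits: dict[str, float],
    history: list[str],
    repeat_last_n: int,
    repeat_penalty: float,
) -> dict[str, float]:
    if repeat_last_n == 0:       # 0 disables the penalty window
        recent: set[str] = set()
    elif repeat_last_n < 0:      # -1 is documented as "use the whole context"
        recent = set(history)
    else:
        recent = set(history[-repeat_last_n:])
    adjusted = dict(logits)
    for token in recent & adjusted.keys():
        value = adjusted[token]
        # Positive scores are divided, negative scores multiplied, so a penalty
        # greater than 1.0 always makes a recently seen token less likely.
        adjusted[token] = value / repeat_penalty if value > 0 else value * repeat_penalty
    return adjusted

logits = {"blue": 2.0, "green": 1.5, "grey": -0.5}
print(penalize_repeats(logits, history=["the", "sky", "is", "blue"], repeat_last_n=64, repeat_penalty=1.1))
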
@@ -1,4 +1,5 @@
 """This file should be imported only and only if you want to run the UI locally."""
+
 import itertools
 import logging
 import time
@@ -5,6 +5,7 @@ llm:
   mode: ollama
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1 #The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 embedding:
   mode: ollama
@@ -13,10 +14,14 @@ ollama:
   llm_model: mistral
   embedding_model: nomic-embed-text
   api_base: http://localhost:11434
+  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
+  repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

 vectorstore:
   database: qdrant

 qdrant:
   path: local_data/private_gpt/qdrant
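
A quick way to sanity-check that a profile picks up the new keys, assuming the file is named settings-ollama.yaml as in the upstream repository and that PyYAML is available; private-gpt's own loader also merges this profile on top of settings.yaml, which this snippet does not reproduce:

# Hedged sketch: reads the ollama profile and prints the newly added knobs.
# Assumes the upstream file name and skips private-gpt's profile-merging logic.
import yaml

with open("settings-ollama.yaml") as fh:
    profile = yaml.safe_load(fh)

print("temperature:", profile["llm"]["temperature"])
for key in ("tfs_z", "top_k", "top_p", "repeat_last_n", "repeat_penalty"):
    print(key, "=", profile["ollama"][key])
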
@@ -39,11 +39,16 @@ llm:
   # Should be matching the selected model
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 llamacpp:
   prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
+  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
+  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

 embedding:
   # Should be matching the value above in most cases
@@ -5,6 +5,7 @@ NOTE: We are not testing the switch based on the config in
 is currently architecture (it is hard to patch the `settings` and the app while
 the tests are directly importing them).
 """
+
 from typing import Annotated

 import pytest