From 9c192ddd732a598786592df1705623be1894a067 Mon Sep 17 00:00:00 2001
From: Gianni Acquisto <32604056+gianniacquisto@users.noreply.github.com>
Date: Sun, 26 Nov 2023 19:17:29 +0100
Subject: [PATCH] Added max_new_tokens as a config option to llm yaml block
 (#1317)

* added max_new_tokens as a configuration option to the llm block in settings

* Update fern/docs/pages/manual/settings.mdx

Co-authored-by: lopagela

* Update private_gpt/settings/settings.py

Add default value for max_new_tokens = 256

Co-authored-by: lopagela

* Addressed location of docs comment

* reformatting from running 'make check'

* remove default config value from settings.yaml

---------

Co-authored-by: lopagela
---
 fern/docs/pages/installation/installation.mdx | 15 +++++++++++++++
 private_gpt/components/llm/llm_component.py   |  1 +
 private_gpt/settings/settings.py              |  4 ++++
 3 files changed, 20 insertions(+)

diff --git a/fern/docs/pages/installation/installation.mdx b/fern/docs/pages/installation/installation.mdx
index 61fc012..baad4a4 100644
--- a/fern/docs/pages/installation/installation.mdx
+++ b/fern/docs/pages/installation/installation.mdx
@@ -89,6 +89,21 @@ Currently, not all the parameters of `llama.cpp` and `llama-cpp-python` are avai
 In case you need to customize parameters such as the number of layers loaded into the GPU, you
 might change these at the `llm_component.py` file under the `private_gpt/components/llm/llm_component.py`.
 
+##### Available LLM config options
+
+The `llm` section of the settings allows for the following configurations:
+
+- `mode`: how to run your LLM (`local`, `openai`, `sagemaker`, or `mock`)
+- `max_new_tokens`: the maximum number of new tokens the LLM will generate and add to the context window (by default Llama.cpp uses `256`)
+
+Example:
+
+```yaml
+llm:
+  mode: local
+  max_new_tokens: 256
+```
+
 If you are getting an out of memory error, you might also try a smaller model or stick to the proposed
 recommended models, instead of custom tuning the parameters.
 
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 58c4ed0..cfe3a73 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -31,6 +31,7 @@ class LLMComponent:
                 self.llm = LlamaCPP(
                     model_path=str(models_path / settings.local.llm_hf_model_file),
                     temperature=0.1,
+                    max_new_tokens=settings.llm.max_new_tokens,
                     # llama2 has a context window of 4096 tokens,
                     # but we set it lower to allow for some wiggle room
                     context_window=3900,
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 532e2d1..7308104 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -82,6 +82,10 @@ class DataSettings(BaseModel):
 
 class LLMSettings(BaseModel):
     mode: Literal["local", "openai", "sagemaker", "mock"]
+    max_new_tokens: int = Field(
+        256,
+        description="The maximum number of tokens that the LLM is authorized to generate in one completion.",
+    )
 
 
 class VectorstoreSettings(BaseModel):
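
For reference, a minimal, self-contained sketch of how the new `max_new_tokens` field behaves: it mirrors the `LLMSettings` field added in `private_gpt/settings/settings.py` above, but it is not the project's actual settings loader (which populates these models from the `llm:` block of the YAML settings file), and the example values shown are hypothetical.

```python
# Illustrative sketch only: reproduces the pydantic field added by this patch
# to show how the default of 256 and an explicit override behave.
from typing import Literal

from pydantic import BaseModel, Field


class LLMSettings(BaseModel):
    mode: Literal["local", "openai", "sagemaker", "mock"]
    max_new_tokens: int = Field(
        256,
        description="The maximum number of tokens that the LLM is authorized "
        "to generate in one completion.",
    )


# Key present in the `llm:` block of the YAML -> the explicit value wins.
print(LLMSettings(mode="local", max_new_tokens=512).max_new_tokens)  # 512

# Key omitted from the YAML -> the Field default of 256 applies; this is the
# value handed to LlamaCPP via settings.llm.max_new_tokens in llm_component.py.
print(LLMSettings(mode="local").max_new_tokens)  # 256
```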