From 9c192ddd732a598786592df1705623be1894a067 Mon Sep 17 00:00:00 2001
From: Gianni Acquisto <32604056+gianniacquisto@users.noreply.github.com>
Date: Sun, 26 Nov 2023 19:17:29 +0100
Subject: [PATCH] Added max_new_tokens as a config option to llm yaml block
 (#1317)

* added max_new_tokens as a configuration option to the llm block in settings

* Update fern/docs/pages/manual/settings.mdx

Co-authored-by: lopagela

* Update private_gpt/settings/settings.py

Add default value for max_new_tokens = 256

Co-authored-by: lopagela

* Addressed location of docs comment

* reformatting from running 'make check'

* remove default config value from settings.yaml

---------

Co-authored-by: lopagela
---
 fern/docs/pages/installation/installation.mdx | 15 +++++++++++++++
 private_gpt/components/llm/llm_component.py   |  1 +
 private_gpt/settings/settings.py              |  4 ++++
 3 files changed, 20 insertions(+)

diff --git a/fern/docs/pages/installation/installation.mdx b/fern/docs/pages/installation/installation.mdx
index 61fc012..baad4a4 100644
--- a/fern/docs/pages/installation/installation.mdx
+++ b/fern/docs/pages/installation/installation.mdx
@@ -89,6 +89,21 @@ Currently, not all the parameters of `llama.cpp` and `llama-cpp-python` are avai
 In case you need to customize parameters such as the number of layers loaded into the GPU, you
 might change these at the `llm_component.py` file under the `private_gpt/components/llm/llm_component.py`.
 
+##### Available LLM config options
+
+The `llm` section of the settings allows for the following configurations:
+
+- `mode`: how to run your LLM (`local`, `openai`, `sagemaker`, or `mock`)
+- `max_new_tokens`: the maximum number of new tokens the LLM will generate and add to the context window (by default Llama.cpp uses `256`)
+
+Example:
+
+```yaml
+llm:
+  mode: local
+  max_new_tokens: 256
+```
+
 If you are getting an out of memory error, you might also try a smaller model or stick to the proposed
 recommended models, instead of custom tuning the parameters.
 
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 58c4ed0..cfe3a73 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -31,6 +31,7 @@ class LLMComponent:
                 self.llm = LlamaCPP(
                     model_path=str(models_path / settings.local.llm_hf_model_file),
                     temperature=0.1,
+                    max_new_tokens=settings.llm.max_new_tokens,
                     # llama2 has a context window of 4096 tokens,
                     # but we set it lower to allow for some wiggle room
                     context_window=3900,
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 532e2d1..7308104 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -82,6 +82,10 @@ class DataSettings(BaseModel):
 
 class LLMSettings(BaseModel):
     mode: Literal["local", "openai", "sagemaker", "mock"]
+    max_new_tokens: int = Field(
+        256,
+        description="The maximum number of tokens that the LLM is authorized to generate in one completion.",
+    )
 
 
 class VectorstoreSettings(BaseModel):
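
For reference, a minimal, self-contained sketch of how the new `max_new_tokens` field behaves: it mirrors the `LLMSettings` field added in `private_gpt/settings/settings.py` above, but it is not the project's actual settings loader (which populates these models from the `llm:` block of the YAML settings file), and the example values shown are hypothetical.

```python
# Illustrative sketch only: reproduces the pydantic field added by this patch
# to show how the default of 256 and an explicit override behave.
from typing import Literal

from pydantic import BaseModel, Field


class LLMSettings(BaseModel):
    mode: Literal["local", "openai", "sagemaker", "mock"]
    max_new_tokens: int = Field(
        256,
        description="The maximum number of tokens that the LLM is authorized "
        "to generate in one completion.",
    )


# Key present in the `llm:` block of the YAML -> the explicit value wins.
print(LLMSettings(mode="local", max_new_tokens=512).max_new_tokens)  # 512

# Key omitted from the YAML -> the Field default of 256 applies; this is the
# value handed to LlamaCPP via settings.llm.max_new_tokens in llm_component.py.
print(LLMSettings(mode="local").max_new_tokens)  # 256
```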