feat(settings): Configurable context_window and tokenizer (#1437)
parent 6eeb95ec7f
commit 4780540870
private_gpt/components/llm/llm_component.py

@@ -1,11 +1,13 @@
 import logging
 
 from injector import inject, singleton
+from llama_index import set_global_tokenizer
 from llama_index.llms import MockLLM
 from llama_index.llms.base import LLM
+from transformers import AutoTokenizer  # type: ignore
 
 from private_gpt.components.llm.prompt_helper import get_prompt_style
-from private_gpt.paths import models_path
+from private_gpt.paths import models_cache_path, models_path
 from private_gpt.settings.settings import Settings
 
 logger = logging.getLogger(__name__)
@@ -18,6 +20,14 @@ class LLMComponent:
     @inject
     def __init__(self, settings: Settings) -> None:
         llm_mode = settings.llm.mode
+        if settings.llm.tokenizer:
+            set_global_tokenizer(
+                AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path=settings.llm.tokenizer,
+                    cache_dir=str(models_cache_path),
+                )
+            )
+
         logger.info("Initializing the LLM in mode=%s", llm_mode)
         match settings.llm.mode:
             case "local":
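Setting a global tokenizer matters because llama_index uses it to count tokens when packing prompts against the context window, and counts from its default gpt-3.5-turbo tokenizer can diverge from the model's own. A minimal sketch of that divergence (not part of the commit; the sample text is illustrative, the model ids are the ones this commit mentions):

    from transformers import AutoTokenizer

    text = "PrivateGPT lets you query your documents locally."
    # The tokenizer that matches the configured local model...
    mistral = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    # ...can yield a different token count than an unrelated tokenizer.
    bert = AutoTokenizer.from_pretrained("bert-base-uncased")
    print(len(mistral.encode(text)), len(bert.encode(text)))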
@@ -29,9 +39,7 @@
                     model_path=str(models_path / settings.local.llm_hf_model_file),
                     temperature=0.1,
                     max_new_tokens=settings.llm.max_new_tokens,
-                    # llama2 has a context window of 4096 tokens,
-                    # but we set it lower to allow for some wiggle room
-                    context_window=3900,
+                    context_window=settings.llm.context_window,
                     generate_kwargs={},
                     # All to GPU
                     model_kwargs={"n_gpu_layers": -1},
@@ -46,6 +54,8 @@
 
                 self.llm = SagemakerLLM(
                     endpoint_name=settings.sagemaker.llm_endpoint_name,
+                    max_new_tokens=settings.llm.max_new_tokens,
+                    context_window=settings.llm.context_window,
                 )
             case "openai":
                 from llama_index.llms import OpenAI
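Passing both values to SagemakerLLM mirrors the local path: an LLM wrapper typically reserves max_new_tokens out of the context window and leaves the rest for the prompt. A rough sketch of that budgeting, using the defaults introduced below (illustrative arithmetic, not SagemakerLLM internals):

    context_window = 3900   # default from LLMSettings below
    max_new_tokens = 256    # default completion budget
    prompt_budget = context_window - max_new_tokens
    print(prompt_budget)    # 3644 tokens left for prompt + retrieved context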
private_gpt/settings/settings.py

@@ -86,6 +86,18 @@ class LLMSettings(BaseModel):
         256,
         description="The maximum number of tokens that the LLM is authorized to generate in one completion.",
     )
+    context_window: int = Field(
+        3900,
+        description="The maximum number of context tokens for the model.",
+    )
+    tokenizer: str = Field(
+        None,
+        description="The model id of a predefined tokenizer hosted inside a model repo on "
+        "huggingface.co. Valid model ids can be located at the root-level, like "
+        "`bert-base-uncased`, or namespaced under a user or organization name, "
+        "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer "
+        "matching the gpt-3.5-turbo LLM.",
+    )
 
 
 class VectorstoreSettings(BaseModel):
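With these defaults, a bare LLMSettings keeps the previous behaviour: 3900 sits just under llama2's 4096-token window to leave wiggle room, and an unset tokenizer leaves llama_index on its gpt-3.5-turbo default. A self-contained sketch of how the fields resolve (Optional[str] is used here, rather than the commit's bare str, so the example validates on its own):

    from typing import Optional
    from pydantic import BaseModel, Field

    class LLMSettings(BaseModel):
        max_new_tokens: int = Field(256)
        context_window: int = Field(3900)
        tokenizer: Optional[str] = Field(None)

    s = LLMSettings()
    print(s.context_window)  # 3900 -- just under llama2's 4096
    print(s.tokenizer)       # None -> keep the gpt-3.5-turbo tokenizer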
scripts/setup

@@ -3,6 +3,7 @@ import os
 import argparse
 
 from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import AutoTokenizer
 
 from private_gpt.paths import models_path, models_cache_path
 from private_gpt.settings.settings import settings
@@ -15,8 +16,9 @@ if __name__ == '__main__':
     resume_download = args.resume
 
 os.makedirs(models_path, exist_ok=True)
-embedding_path = models_path / "embedding"
 
+# Download Embedding model
+embedding_path = models_path / "embedding"
 print(f"Downloading embedding {settings().local.embedding_hf_model_name}")
 snapshot_download(
     repo_id=settings().local.embedding_hf_model_name,
@@ -24,9 +26,9 @@ snapshot_download(
     local_dir=embedding_path,
 )
 print("Embedding model downloaded!")
-print("Downloading models for local execution...")
 
 # Download LLM and create a symlink to the model file
+print(f"Downloading LLM {settings().local.llm_hf_model_file}")
 hf_hub_download(
     repo_id=settings().local.llm_hf_repo_id,
     filename=settings().local.llm_hf_model_file,
@@ -34,6 +36,14 @@ hf_hub_download(
     local_dir=models_path,
     resume_download=resume_download,
 )
-
 print("LLM model downloaded!")
+
+# Download Tokenizer
+print(f"Downloading tokenizer {settings().llm.tokenizer}")
+AutoTokenizer.from_pretrained(
+    pretrained_model_name_or_path=settings().llm.tokenizer,
+    cache_dir=models_cache_path,
+)
+print("Tokenizer downloaded!")
 
 print("Setup done")
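Fetching the tokenizer at setup time means later runs resolve it from the shared cache instead of the network. A sketch of that offline load (local_files_only is not used by the commit; it is shown only to make the cache dependency explicit):

    from transformers import AutoTokenizer
    from private_gpt.paths import models_cache_path
    from private_gpt.settings.settings import settings

    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=settings().llm.tokenizer,
        cache_dir=str(models_cache_path),
        local_files_only=True,  # fail fast if the setup script has not run yet
    )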
settings.yaml

@@ -34,6 +34,10 @@ ui:
 
 llm:
   mode: local
+  # Should be matching the selected model
+  max_new_tokens: 512
+  context_window: 32768
+  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
 
 embedding:
   # Should be matching the value above in most cases
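All three values are model-specific (Mistral-7B-Instruct-v0.2 advertises a 32768-token window), so swapping models means updating them together. A quick sanity check one could run against the loaded settings (illustrative, not part of the commit):

    from private_gpt.settings.settings import settings

    llm = settings().llm
    # The completion budget must fit inside the window, with room left for the prompt.
    assert llm.max_new_tokens < llm.context_window
    print(llm.tokenizer, llm.context_window, llm.max_new_tokens)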