diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 535952c..a06076e 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -27,6 +27,7 @@ import numpy as np
 import numpy.typing as npt
+# Disable warning for model and model_alias settings
 BaseSettings.model_config['protected_namespaces'] = ()
@@ -58,14 +59,10 @@ class Settings(BaseSettings):
         description="Split layers across multiple GPUs in proportion.",
     )
     rope_freq_base: float = Field(
-        default=10000, ge=1, description="RoPE base frequency"
+        default=0.0, description="RoPE base frequency"
     )
     rope_freq_scale: float = Field(
-        default=1.0, description="RoPE frequency scaling factor"
-    )
-    low_vram: bool = Field(
-        default=False,
-        description="Whether to use less VRAM. This will reduce performance.",
+        default=0.0, description="RoPE frequency scaling factor"
     )
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
@@ -106,6 +103,10 @@ class Settings(BaseSettings):
         default=False,
         description="Enable NUMA support.",
     )
+    chat_format: str = Field(
+        default="llama-2",
+        description="Chat format to use.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -349,7 +350,6 @@ def create_app(settings: Optional[Settings] = None):
         tensor_split=settings.tensor_split,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
-        low_vram=settings.low_vram,
         mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
         logits_all=settings.logits_all,
@@ -361,6 +361,8 @@ def create_app(settings: Optional[Settings] = None):
         last_n_tokens_size=settings.last_n_tokens_size,
         lora_base=settings.lora_base,
         lora_path=settings.lora_path,
+        numa=settings.numa,
+        chat_format=settings.chat_format,
         verbose=settings.verbose,
     )
     if settings.cache:
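
For reference, a minimal usage sketch (not part of the patch) of how the new chat_format and numa settings flow into create_app after this change; the model path and field values below are assumptions for illustration only.

# Illustrative only -- assumes llama_cpp.server is installed and a local
# GGUF model exists at the hypothetical path below.
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical path
    chat_format="llama-2",  # new field: selects the chat prompt template
    numa=False,             # now forwarded to llama_cpp.Llama
    rope_freq_base=0.0,     # 0.0 defers to the value stored in the model
    rope_freq_scale=0.0,    # 0.0 defers to the model's own scaling factor
)
app = create_app(settings=settings)
# Serve the FastAPI app, e.g.: uvicorn.run(app, host="localhost", port=8000)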