Update server params

This commit is contained in:
Andrei Betlen 2023-09-29 19:59:12 -04:00
parent 3720c739d4
commit d9bce17794

View file

@@ -27,6 +27,7 @@ import numpy as np
import numpy.typing as npt
# Disable warning for model and model_alias settings
BaseSettings.model_config['protected_namespaces'] = ()
@@ -58,14 +59,10 @@ class Settings(BaseSettings):
description="Split layers across multiple GPUs in proportion.",
)
rope_freq_base: float = Field(
default=10000, ge=1, description="RoPE base frequency"
default=0.0, description="RoPE base frequency"
)
rope_freq_scale: float = Field(
default=1.0, description="RoPE frequency scaling factor"
)
low_vram: bool = Field(
default=False,
description="Whether to use less VRAM. This will reduce performance.",
default=0.0, description="RoPE frequency scaling factor"
)
mul_mat_q: bool = Field(
default=True, description="if true, use experimental mul_mat_q kernels"
@@ -106,6 +103,10 @@ class Settings(BaseSettings):
default=False,
description="Enable NUMA support.",
)
chat_format: str = Field(
default="llama-2",
description="Chat format to use.",
)
cache: bool = Field(
default=False,
description="Use a cache to reduce processing times for evaluated prompts.",
@@ -349,7 +350,6 @@ def create_app(settings: Optional[Settings] = None):
tensor_split=settings.tensor_split,
rope_freq_base=settings.rope_freq_base,
rope_freq_scale=settings.rope_freq_scale,
low_vram=settings.low_vram,
mul_mat_q=settings.mul_mat_q,
f16_kv=settings.f16_kv,
logits_all=settings.logits_all,
@@ -361,6 +361,8 @@ def create_app(settings: Optional[Settings] = None):
last_n_tokens_size=settings.last_n_tokens_size,
lora_base=settings.lora_base,
lora_path=settings.lora_path,
numa=settings.numa,
chat_format=settings.chat_format,
verbose=settings.verbose,
)
if settings.cache: