Offload KQV by default

This commit is contained in:
Andrei Betlen 2024-01-18 11:08:57 -05:00
parent 6bfe98bd80
commit 48c3b77e6f
2 changed files with 2 additions and 2 deletions

View file

@@ -77,7 +77,7 @@ class Llama:
mul_mat_q: bool = True,
logits_all: bool = False,
embedding: bool = False,
-offload_kqv: bool = False,
+offload_kqv: bool = True,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params

View file

@@ -90,7 +90,7 @@ class ModelSettings(BaseSettings):
logits_all: bool = Field(default=True, description="Whether to return logits.")
embedding: bool = Field(default=True, description="Whether to use embeddings.")
offload_kqv: bool = Field(
-default=False, description="Whether to offload kqv to the GPU."
+default=True, description="Whether to offload kqv to the GPU."
)
# Sampling Params
last_n_tokens_size: int = Field(