From cdf59768f52cbf3e54bfe2877d0e5cd3049c04a6 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 14 May 2023 00:04:22 -0400
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama.py      | 4 ++++
 llama_cpp/llama_cpp.py  | 9 +++++----
 llama_cpp/server/app.py | 6 ++++++
 vendor/llama.cpp        | 2 +-
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4295ba7..362ebd9 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -83,6 +83,7 @@ class Llama:
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -129,6 +130,7 @@ class Llama:
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -1081,6 +1083,7 @@ class Llama:
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1100,6 +1103,7 @@ class Llama:
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 870eced..71e78d9 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -68,7 +68,7 @@ _lib_base_name = "llama"
 _lib = _load_shared_library(_lib_base_name)
 
 # C types
-LLAMA_FILE_VERSION = c_int(1)
+LLAMA_FILE_VERSION = c_int(2)
 LLAMA_FILE_MAGIC = b"ggjt"
 LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
 LLAMA_SESSION_MAGIC = b"ggsn"
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
     _fields_ = [
         ("n_ctx", c_int),  # text context
         ("n_parts", c_int),  # -1 for default
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
         ("seed", c_int),  # RNG seed, 0 for random
         ("f16_kv", c_bool),  # use fp16 for KV cache
         (
@@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
 # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
@@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)
 
 
 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 3415a5a..8a83674 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
         description="The path to the model to use for generating completions."
     )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index b608b55..08737ef 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit b608b55a3ea8e4760c617418538465449175bdb8
+Subproject commit 08737ef720f0510c7ec2aa84d7f70c691073c35d
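For reviewers, a minimal usage sketch of the new `n_gpu_layers` parameter through the public `Llama` constructor (not part of the patch; the model path and layer count are illustrative, and offloading only takes effect with a llama.cpp build compiled with GPU support such as cuBLAS):

    from llama_cpp import Llama

    # Hypothetical model path; n_gpu_layers=0 (the default) keeps every layer on the CPU.
    llm = Llama(
        model_path="./models/7B/ggml-model-q4_0.bin",
        n_gpu_layers=32,  # number of transformer layers to store in VRAM
    )
    # Llama.__call__ returns an OpenAI-style completion dict.
    out = llm("Q: What is the capital of France? A:", max_tokens=16)
    print(out["choices"][0]["text"])

The server picks the same field up via pydantic's BaseSettings, so (assuming the default case-insensitive env-var naming) something like `N_GPU_LAYERS=32 python3 -m llama_cpp.server` should configure it without code changes.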