From d11ccc30361a947177e46d31c632988661f5af02 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 23 Mar 2024 17:14:15 -0400 Subject: [PATCH 01/22] fix(server): minor type fixes --- llama_cpp/server/app.py | 6 +++--- llama_cpp/server/types.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2e1081e..d5e2f7e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -493,7 +493,7 @@ async def tokenize( ) -> TokenizeInputResponse: tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - return {"tokens": tokens} + return TokenizeInputResponse(tokens=tokens) @router.post( @@ -508,7 +508,7 @@ async def count_query_tokens( ) -> TokenizeInputCountResponse: tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - return {"count": len(tokens)} + return TokenizeInputCountResponse(count=len(tokens)) @router.post( @@ -523,4 +523,4 @@ async def detokenize( ) -> DetokenizeInputResponse: text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") - return {"text": text} + return DetokenizeInputResponse(text=text) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index c8b2ebc..378f8d7 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -268,7 +268,7 @@ class ModelList(TypedDict): class TokenizeInputRequest(BaseModel): model: Optional[str] = model_field - input: Optional[str] = Field(description="The input to tokenize.") + input: str = Field(description="The input to tokenize.") model_config = { "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} From 364678bde55ca75db2788563e2d75e6303720e6d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 24 Mar 2024 12:27:49 -0400 Subject: [PATCH 02/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 50ccaf5..a0e584d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652 +Subproject commit a0e584defd8c16e7a51ab895f595df0448d710d0 From a93b9149f8ebab3219680e8f8cd46e53177aad74 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 25 Mar 2024 11:10:14 -0400 Subject: [PATCH 03/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a0e584d..2f34b86 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a0e584defd8c16e7a51ab895f595df0448d710d0 +Subproject commit 2f34b865b62b1d2b5eb8a27885e4de220deeacbd From b64fa4e2c05385d7cb36154dce42a29ca2b669ee Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 25 Mar 2024 23:09:07 -0400 Subject: [PATCH 04/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2f34b86..e190f1f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2f34b865b62b1d2b5eb8a27885e4de220deeacbd +Subproject commit e190f1fca6f60d80944f9e8709d343a025c4d245 From 901fe02461fbad0aa817060b8a4062924b89f418 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 26 Mar 2024 22:58:53 -0400 Subject: [PATCH 05/22] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 30 ++++++++++++++++++++---------- vendor/llama.cpp | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1b8f6ca..dc39e4c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -175,8 +175,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -# define LLAMA_SESSION_VERSION 4 -LLAMA_SESSION_VERSION = 4 +# define LLAMA_SESSION_VERSION 5 +LLAMA_SESSION_VERSION = 5 # struct llama_model; @@ -274,6 +274,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6 # LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -677,6 +678,7 @@ It might not exist for progress report where '.' is output repeatedly.""" # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored # bool pure; // quantize all tensors to the default type # void * imatrix; // pointer to importance matrix data +# void * kv_overrides; // pointer to vector containing overrides # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -691,6 +693,7 @@ class llama_model_quantize_params(ctypes.Structure): only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type imatrix (ctypes.c_void_p): pointer to importance matrix data + kv_overrides (ctypes.c_void_p): pointer to vector containing overrides """ _fields_ = [ @@ -703,6 +706,7 @@ class llama_model_quantize_params(ctypes.Structure): ("only_copy", ctypes.c_bool), ("pure", ctypes.c_bool), ("imatrix", ctypes.c_void_p), + ("kv_overrides", ctypes.c_void_p), ] @@ -1838,9 +1842,9 @@ def llama_synchronize(ctx: llama_context_p, /): # // Token logits obtained from the last call to llama_decode() -# // The logits for the last token are stored in the last row -# // Logits for which llama_batch.logits[i] == 0 are undefined -# // Rows: n_tokens provided with llama_batch +# // The logits for which llama_batch.logits[i] != 0 are stored contiguously +# // in the order they have appeared in the batch. +# // Rows: number of tokens for which llama_batch.logits[i] != 0 # // Cols: n_vocab # LLAMA_API float * llama_get_logits(struct llama_context * ctx); @ctypes_function( @@ -1859,7 +1863,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: # // Logits for the ith token. Equivalent to: -# // llama_get_logits(ctx) + i*n_vocab +# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab +# // returns NULL for invalid ids. # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_logits_ith", @@ -1874,8 +1879,12 @@ def llama_get_logits_ith( ... -# // Get all output token embeddings -# // shape: [n_tokens*n_embd] (1-dimensional) +# // Get all output token embeddings. +# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, +# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously +# // in the order they have appeared in the batch. +# // shape: [n_outputs*n_embd] +# // Otherwise, returns NULL. # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) @@ -1886,9 +1895,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ... -# // Get the embeddings for the ith token -# // llama_get_embeddings(ctx) + i*n_embd +# // Get the embeddings for the ith token. Equivalent to: +# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd # // shape: [n_embd] (1-dimensional) +# // returns NULL for invalid ids. # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_embeddings_ith", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e190f1f..32c8486 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e190f1fca6f60d80944f9e8709d343a025c4d245 +Subproject commit 32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 From 125b2358c9ea92cbcb416e2bd2c8f93132e641ac Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 28 Mar 2024 12:06:46 -0400 Subject: [PATCH 06/22] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 10 +++++++--- vendor/llama.cpp | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index dc39e4c..1db47be 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -199,14 +199,18 @@ llama_seq_id = ctypes.c_int32 # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab -# LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece -# LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding -# LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece +# LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback +# LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE +# LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece # }; LLAMA_VOCAB_TYPE_NONE = 0 +"""For models without vocab""" LLAMA_VOCAB_TYPE_SPM = 1 +"""LLaMA tokenizer based on byte-level BPE with byte fallback""" LLAMA_VOCAB_TYPE_BPE = 2 +"""GPT-2 tokenizer based on byte-level BPE""" LLAMA_VOCAB_TYPE_WPM = 3 +"""BERT tokenizer based on WordPiece""" # // note: these values should be synchronized with ggml_rope diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 32c8486..5106ef4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 +Subproject commit 5106ef482c65ac60ac14da9a68c7b37bca4c6993 From dcbe57fcf8c4af829ef0085720b8730b0516f99a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 29 Mar 2024 12:45:27 -0400 Subject: [PATCH 07/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5106ef4..0695747 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 5106ef482c65ac60ac14da9a68c7b37bca4c6993 +Subproject commit 069574775cea67d8a977904e4d534aff47a671f4 From 1e60dba082464b8828dca0a6d05a2fe46fcc4f7c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 29 Mar 2024 13:34:23 -0400 Subject: [PATCH 08/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 0695747..ba0c7c7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 069574775cea67d8a977904e4d534aff47a671f4 +Subproject commit ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c From aa9f1ae011fbc22893750209af500fee3167f21c Mon Sep 17 00:00:00 2001 From: windspirit95 <32880949+windspirit95@users.noreply.github.com> Date: Mon, 1 Apr 2024 02:30:13 +0900 Subject: [PATCH 09/22] feat: Add logprobs support to chat completions (#1311) * Add logprobs return in ChatCompletionResponse * Fix duplicate field * Set default to false * Simplify check * Add server example --------- Co-authored-by: Andrei Betlen --- llama_cpp/llama.py | 1 + llama_cpp/llama_chat_format.py | 5 +++++ llama_cpp/llama_types.py | 1 + llama_cpp/server/app.py | 12 ++++++++++++ llama_cpp/server/types.py | 10 +++++++++- 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fed84d5..66caaa9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1653,6 +1653,7 @@ class Llama: top_k=top_k, min_p=min_p, typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, stream=stream, stop=stop, seed=seed, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ccf4fd0..06cf9ce 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -231,6 +231,7 @@ def _convert_text_completion_to_chat( "role": "assistant", "content": completion["choices"][0]["text"], }, + "logprobs": completion["choices"][0]["logprobs"], "finish_reason": completion["choices"][0]["finish_reason"], } ], @@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat( "delta": { "role": "assistant", }, + "logprobs": None, "finish_reason": None, } ], @@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat( if chunk["choices"][0]["finish_reason"] is None else {} ), + "logprobs": chunk["choices"][0]["logprobs"], "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler( temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, + logprobs: int = 0, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, @@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler( top_k=top_k, min_p=min_p, typical_p=typical_p, + logprobs=logprobs, stream=stream, stop=stop, seed=seed, diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 1b1befe..87e000f 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict): class ChatCompletionResponseChoice(TypedDict): index: int message: "ChatCompletionResponseMessage" + logprobs: Optional[CompletionLogprobs] finish_reason: Optional[str] diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index d5e2f7e..815ed3c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -405,6 +405,18 @@ async def create_chat_completion( } }, }, + "logprobs": { + "summary": "Logprobs", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + "logprobs": True, + "top_logprobs": 10 + }, + }, } ), llama_proxy: LlamaProxy = Depends(get_llama_proxy), diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 378f8d7..ce9c87a 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logprobs: Optional[int] = Field(None) seed: Optional[int] = Field(None) # ignored or currently unsupported @@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. Defaults to inf", ) + logprobs: Optional[bool] = Field( + default=False, + description="Whether to output the logprobs or not. Default is True" + ) + top_logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.", + ) temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field From f165048a69b2ad5b70d4fd2ec4adebfb411b5b47 Mon Sep 17 00:00:00 2001 From: Limour <93720049+Limour-dev@users.noreply.github.com> Date: Mon, 1 Apr 2024 22:19:28 +0800 Subject: [PATCH 10/22] feat: add support for KV cache quantization options (#1307) * add KV cache quantization options https://github.com/abetlen/llama-cpp-python/discussions/1220 https://github.com/abetlen/llama-cpp-python/issues/1305 * Add ggml_type * Use ggml_type instead of string for quantization * Add server support --------- Co-authored-by: Andrei Betlen --- llama_cpp/llama.py | 59 ++++++++++----------------------- llama_cpp/llama_cpp.py | 64 ++++++++++++++++++++++++++++++++++++ llama_cpp/server/model.py | 3 ++ llama_cpp/server/settings.py | 9 +++++ 4 files changed, 94 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 66caaa9..dcc7be7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -105,6 +105,9 @@ class Llama: draft_model: Optional[LlamaDraftModel] = None, # Tokenizer Override tokenizer: Optional[BaseLlamaTokenizer] = None, + # KV cache quantization + type_k: Optional[int] = None, + type_v: Optional[int] = None, # Misc verbose: bool = True, # Extra Params @@ -172,6 +175,8 @@ class Llama: draft_model: Optional draft model to use for speculative decoding. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. verbose: Print verbose output to stderr. + type_k: KV cache data type for K (default: f16) + type_v: KV cache data type for V (default: f16) Raises: ValueError: If the model path does not exist. @@ -298,7 +303,11 @@ class Llama: ) # Must be set to True for speculative decoding self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - + # KV cache quantization + if type_k is not None: + self.context_params.type_k = type_k + if type_v is not None: + self.context_params.type_v = type_v # Sampling Params self.last_n_tokens_size = last_n_tokens_size @@ -1724,6 +1733,7 @@ class Llama: n_threads=self.context_params.n_threads, n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, + pooling_type=self.context_params.pooling_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, yarn_ext_factor=self.context_params.yarn_ext_factor, @@ -1733,6 +1743,7 @@ class Llama: yarn_orig_ctx=self.context_params.yarn_orig_ctx, logits_all=self.context_params.logits_all, embedding=self.context_params.embeddings, + offload_kqv=self.context_params.offload_kqv, # Sampling Params last_n_tokens_size=self.last_n_tokens_size, # LoRA Params @@ -1744,51 +1755,17 @@ class Llama: # Chat Format Params chat_format=self.chat_format, chat_handler=self.chat_handler, + # Speculative Decidng + draft_model=self.draft_model, + # KV cache quantization + type_k=self.context_params.type_k, + type_v=self.context_params.type_v, # Misc verbose=self.verbose, ) def __setstate__(self, state): - self.__init__( - model_path=state["model_path"], - # Model Params - n_gpu_layers=state["n_gpu_layers"], - split_mode=state["split_mode"], - main_gpu=state["main_gpu"], - tensor_split=state["tensor_split"], - vocab_only=state["vocab_only"], - use_mmap=state["use_mmap"], - use_mlock=state["use_mlock"], - kv_overrides=state["kv_overrides"], - # Context Params - seed=state["seed"], - n_ctx=state["n_ctx"], - n_batch=state["n_batch"], - n_threads=state["n_threads"], - n_threads_batch=state["n_threads_batch"], - rope_freq_base=state["rope_freq_base"], - rope_freq_scale=state["rope_freq_scale"], - rope_scaling_type=state["rope_scaling_type"], - yarn_ext_factor=state["yarn_ext_factor"], - yarn_attn_factor=state["yarn_attn_factor"], - yarn_beta_fast=state["yarn_beta_fast"], - yarn_beta_slow=state["yarn_beta_slow"], - yarn_orig_ctx=state["yarn_orig_ctx"], - logits_all=state["logits_all"], - embedding=state["embedding"], - # Sampling Params - last_n_tokens_size=state["last_n_tokens_size"], - # LoRA Params - lora_base=state["lora_base"], - lora_path=state["lora_path"], - # Backend Params - numa=state["numa"], - # Chat Format Params - chat_format=state["chat_format"], - chat_handler=state["chat_handler"], - # Misc - verbose=state["verbose"], - ) + self.__init__(**state) def save_state(self) -> LlamaState: assert self._ctx.ctx is not None diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1db47be..accc02c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa byref = ctypes.byref # type: ignore +# from ggml.h +# // NOTE: always add types at the end of the enum to keep backward compatibility +# enum ggml_type { +# GGML_TYPE_F32 = 0, +# GGML_TYPE_F16 = 1, +# GGML_TYPE_Q4_0 = 2, +# GGML_TYPE_Q4_1 = 3, +# // GGML_TYPE_Q4_2 = 4, support has been removed +# // GGML_TYPE_Q4_3 = 5, support has been removed +# GGML_TYPE_Q5_0 = 6, +# GGML_TYPE_Q5_1 = 7, +# GGML_TYPE_Q8_0 = 8, +# GGML_TYPE_Q8_1 = 9, +# GGML_TYPE_Q2_K = 10, +# GGML_TYPE_Q3_K = 11, +# GGML_TYPE_Q4_K = 12, +# GGML_TYPE_Q5_K = 13, +# GGML_TYPE_Q6_K = 14, +# GGML_TYPE_Q8_K = 15, +# GGML_TYPE_IQ2_XXS = 16, +# GGML_TYPE_IQ2_XS = 17, +# GGML_TYPE_IQ3_XXS = 18, +# GGML_TYPE_IQ1_S = 19, +# GGML_TYPE_IQ4_NL = 20, +# GGML_TYPE_IQ3_S = 21, +# GGML_TYPE_IQ2_S = 22, +# GGML_TYPE_IQ4_XS = 23, +# GGML_TYPE_I8 = 24, +# GGML_TYPE_I16 = 25, +# GGML_TYPE_I32 = 26, +# GGML_TYPE_I64 = 27, +# GGML_TYPE_F64 = 28, +# GGML_TYPE_IQ1_M = 29, +# GGML_TYPE_COUNT, +# }; +GGML_TYPE_F32 = 0 +GGML_TYPE_F16 = 1 +GGML_TYPE_Q4_0 = 2 +GGML_TYPE_Q4_1 = 3 +GGML_TYPE_Q5_0 = 6 +GGML_TYPE_Q5_1 = 7 +GGML_TYPE_Q8_0 = 8 +GGML_TYPE_Q8_1 = 9 +GGML_TYPE_Q2_K = 10 +GGML_TYPE_Q3_K = 11 +GGML_TYPE_Q4_K = 12 +GGML_TYPE_Q5_K = 13 +GGML_TYPE_Q6_K = 14 +GGML_TYPE_Q8_K = 15 +GGML_TYPE_IQ2_XXS = 16 +GGML_TYPE_IQ2_XS = 17 +GGML_TYPE_IQ3_XXS = 18 +GGML_TYPE_IQ1_S = 19 +GGML_TYPE_IQ4_NL = 20 +GGML_TYPE_IQ3_S = 21 +GGML_TYPE_IQ2_S = 22 +GGML_TYPE_IQ4_XS = 23 +GGML_TYPE_I8 = 24 +GGML_TYPE_I16 = 25 +GGML_TYPE_I32 = 26 +GGML_TYPE_I64 = 27 +GGML_TYPE_F64 = 28 +GGML_TYPE_IQ1_M = 29 +GGML_TYPE_COUNT = 30 # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index dace8d5..c24fca6 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -175,6 +175,9 @@ class LlamaProxy: chat_handler=chat_handler, # Speculative Decoding draft_model=draft_model, + # KV Cache Quantization + type_k=settings.type_k, + type_v=settings.type_v, # Tokenizer tokenizer=tokenizer, # Misc diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index daa913f..9ebdd0d 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -159,6 +159,15 @@ class ModelSettings(BaseSettings): default=10, description="Number of tokens to predict using the draft model.", ) + # KV Cache Quantization + type_k: Optional[int] = Field( + default=None, + description="Type of the key cache quantization.", + ) + type_v: Optional[int] = Field( + default=None, + description="Type of the value cache quantization.", + ) # Misc verbose: bool = Field( default=True, description="Whether to print debug information." From a0f373e31009d678312ea8d9a3c70bac96232490 Mon Sep 17 00:00:00 2001 From: lawfordp2017 Date: Mon, 1 Apr 2024 08:21:00 -0600 Subject: [PATCH 11/22] fix: Changed local API doc references to hosted (#1317) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3323f38..1e2b640 100644 --- a/README.md +++ b/README.md @@ -321,7 +321,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1` ### JSON and JSON Schema Mode -To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion). +To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion). #### JSON Mode @@ -529,7 +529,7 @@ llama = Llama( ### Embeddings -To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding). +To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding). ```python import llama_cpp From 45bf5ae5829aab6d38f5920b620f653fb9261f16 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 Apr 2024 10:28:22 -0400 Subject: [PATCH 12/22] chore: Bump version --- CHANGELOG.md | 11 ++++++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a85eaa4..8786c01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.58] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c +- feat: add support for KV cache quantization options by @Limour-dev in #1307 +- feat: Add logprobs support to chat completions by @windspirit95 in #1311 +- fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289 +- feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273 +- fix: Changed local API doc references to hosted by by @lawfordp2017 in #1317 + ## [0.2.57] - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 @@ -24,7 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.2.55] -- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 +- feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5 - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244 ## [0.2.54] diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 1e802fa..e246595 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.57" \ No newline at end of file +__version__ = "0.2.58" \ No newline at end of file From 62aad610e155fdb74064316ee7e83a1251bc8fc0 Mon Sep 17 00:00:00 2001 From: Yuri Mikhailov Date: Tue, 2 Apr 2024 04:25:43 +0900 Subject: [PATCH 13/22] fix: last tokens passing to sample_repetition_penalties function (#1295) Co-authored-by: ymikhaylov Co-authored-by: Andrei --- llama_cpp/_internals.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 22d0bef..79f6543 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -730,12 +730,14 @@ class _LlamaSamplingContext: if len(self.prev) > 0: nl_token = ctx_main.model.token_nl() nl_logit = logits_array[nl_token] - if self.params.penalty_last_n > 0: + last_tokens = self.prev[-self.params.penalty_last_n:] + last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) + if last_tokens_size > 0: + last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens) ctx_main.sample_repetition_penalties( token_data_array, - # TODO: Only create this once - (llama_cpp.llama_token * len(self.prev))(*self.prev), - self.params.penalty_last_n, + last_tokens_p, + last_tokens_size, self.params.penalty_repeat, self.params.penalty_freq, self.params.penalty_present, From e465157804b2aa734ac3b8f7957d3a92acf13fa9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 Apr 2024 00:55:19 -0400 Subject: [PATCH 14/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ba0c7c7..5260486 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c +Subproject commit 52604860f93063ef98863921da697576af1c7665 From 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 Apr 2024 15:30:31 -0400 Subject: [PATCH 15/22] fix: segfault when logits_all=False. Closes #1319 --- llama_cpp/llama.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index dcc7be7..e07d57a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -535,14 +535,16 @@ class Llama: # Save tokens self.input_ids[n_past : n_past + n_tokens] = batch # Save logits - rows = n_tokens - cols = self._n_vocab - offset = ( - 0 if self.context_params.logits_all else n_tokens - 1 - ) # NOTE: Only save the last token logits if logits_all is False - self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[ - : - ] = self._ctx.get_logits()[offset * cols : rows * cols] + if self.context_params.logits_all: + rows = n_tokens + cols = self._n_vocab + logits = self._ctx.get_logits()[: rows * cols] + self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits + else: + rows = 1 + cols = self._n_vocab + logits = self._ctx.get_logits()[: rows * cols] + self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits # Update n_tokens self.n_tokens += n_tokens From 5a930ee9a1a54d12d48cdb7ce8cc63f6bc41d1bb Mon Sep 17 00:00:00 2001 From: Andrei Date: Wed, 3 Apr 2024 15:32:13 -0400 Subject: [PATCH 16/22] feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal (#1247) * Generate binary wheel index on release * Add total release downloads badge * Update download label * Use official cibuildwheel action * Add workflows to build CUDA and Metal wheels * Update generate index workflow * Update workflow name --- .github/workflows/build-and-release.yaml | 10 +- .github/workflows/build-wheels-cuda.yaml | 131 ++++++++++++++++++ .github/workflows/build-wheels-metal.yaml | 87 ++++++++++++ .../generate-index-from-release.yaml | 48 +++++++ README.md | 1 + scripts/release-to-pep-503.sh | 58 ++++++++ 6 files changed, 330 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/build-wheels-cuda.yaml create mode 100644 .github/workflows/build-wheels-metal.yaml create mode 100644 .github/workflows/generate-index-from-release.yaml create mode 100755 scripts/release-to-pep-503.sh diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 63c81f1..76b5f7f 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macOS-latest] + os: [ubuntu-20.04, windows-2019, macos-11] steps: - uses: actions/checkout@v3 @@ -23,19 +23,19 @@ jobs: with: python-version: "3.8" - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.12.1 - - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install -e .[all] - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse + uses: pypa/cibuildwheel@v2.16.5 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" + with: + package-dir: . + output-dir: wheelhouse - uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml new file mode 100644 index 0000000..a222dce --- /dev/null +++ b/.github/workflows/build-wheels-cuda.yaml @@ -0,0 +1,131 @@ +name: Build Wheels (CUDA) + +on: workflow_dispatch + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-20.04', 'windows-latest') + 'pyver' = @("3.10", "3.11", "3.12") + 'cuda' = @("12.1.1", "12.2.2", "12.3.2") + 'releasetag' = @("basic") + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: Setup Mamba + uses: conda-incubator/setup-miniconda@v2.2.0 + with: + activate-environment: "build" + python-version: ${{ matrix.pyver }} + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: VS Integration Cache + id: vs-integration-cache + if: runner.os == 'Windows' + uses: actions/cache@v3.3.2 + with: + path: ./MSBuildExtensions + key: cuda-${{ matrix.cuda }}-vs-integration + + - name: Get Visual Studio Integration + if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' + run: | + if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} + $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) + for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} + Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' + & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null + Remove-Item 'cudainstaller.zip' + + - name: Install Visual Studio Integration + if: runner.os == 'Windows' + run: | + $y = (gi '.\MSBuildExtensions').fullname + '\*' + (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) + $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') + echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV + + - name: Install Dependencies + env: + MAMBA_DOWNLOAD_FAILFAST: "0" + MAMBA_NO_LOW_SPEED_LIMIT: "1" + run: | + $cudaVersion = $env:CUDAVER + mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion + python -m pip install build wheel + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + if ($IsLinux) { + $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH + } + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all' + $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" + if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' + } + if ($env:AVXVER -eq 'AVX512') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on' + } + if ($env:AVXVER -eq 'basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' + } + python -m build --wheel + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + - uses: softprops/action-gh-release@v1 + with: + files: dist/* + # Set tag_name to -cu + tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml new file mode 100644 index 0000000..2cca477 --- /dev/null +++ b/.github/workflows/build-wheels-metal.yaml @@ -0,0 +1,87 @@ +name: Build Wheels (Metal) + +on: workflow_dispatch + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('macos-11', 'macos-12', 'macos-13') + 'pyver' = @('3.10', '3.11', '3.12') + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ matrix.os }} Python ${{ matrix.pyver }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + env: + OSVER: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: Install Dependencies + run: | + python -m pip install build wheel cmake + + - name: Build Wheel + run: | + XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer" + XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin" + export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on" + [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0" + [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0" + [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0" + + export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64" + VERBOSE=1 python -m build --wheel + + if [[ "$OSVER" == "macos-13" ]]; then + export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" + export MACOSX_DEPLOYMENT_TARGET="14.0" + VERBOSE=1 python -m build --wheel + fi + + for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done + + export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64" + VERBOSE=1 python -m build --wheel + + if [[ "$OSVER" == "macos-13" ]]; then + export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" + export MACOSX_DEPLOYMENT_TARGET="14.0" + VERBOSE=1 python -m build --wheel + fi + + - uses: softprops/action-gh-release@v1 + with: + files: dist/* + # set release name to -metal + tag_name: ${{ github.ref_name }}-metal + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml new file mode 100644 index 0000000..9042d6c --- /dev/null +++ b/.github/workflows/generate-index-from-release.yaml @@ -0,0 +1,48 @@ +name: Wheels Index + +on: + # Trigger on any new release + release: + types: [published] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v4 + - name: Build + run: | + ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' + ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' + ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' + ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + # Upload entire repository + path: 'index' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/README.md b/README.md index 1e2b640..f251f45 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) +[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]() Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. This package provides: diff --git a/scripts/release-to-pep-503.sh b/scripts/release-to-pep-503.sh new file mode 100755 index 0000000..00ae567 --- /dev/null +++ b/scripts/release-to-pep-503.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Get output directory or default to index/whl/cpu +output_dir=${1:-"index/whl/cpu"} + +# Create output directory +mkdir -p $output_dir + +# Change to output directory +pushd $output_dir + +# Create an index html file +echo "" > index.html +echo "" >> index.html +echo " " >> index.html +echo " " >> index.html +echo " llama-cpp-python" >> index.html +echo "
" >> index.html +echo " " >> index.html +echo "" >> index.html +echo "" >> index.html + +# Create llama-cpp-python directory +mkdir -p llama-cpp-python + +# Change to llama-cpp-python directory +pushd llama-cpp-python + +# Create an index html file +echo "" > index.html +echo "" >> index.html +echo " " >> index.html +echo "

Links for llama-cpp-python

" >> index.html + +# Get all releases +releases=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases | jq -r .[].tag_name) + +# Get pattern from second arg or default to valid python package version pattern +pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} + +# Filter releases by pattern +releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern) + +# For each release, get all assets +for release in $releases; do + assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets) + echo "

$release

" >> index.html + for asset in $(echo $assets | jq -r .[].browser_download_url); do + if [[ $asset == *".whl" ]]; then + echo " $asset" >> index.html + echo "
" >> index.html + fi + done +done + +echo " " >> index.html +echo "" >> index.html +echo "" >> index.html From 5a5193636b4fe6c1b186bf3c8ee18e1b96401bed Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 Apr 2024 15:35:28 -0400 Subject: [PATCH 17/22] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5260486..60cdf40 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 52604860f93063ef98863921da697576af1c7665 +Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640 From 34081ddc5bbe2fe667ab201abcdbda64c5caa49c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 Apr 2024 15:38:27 -0400 Subject: [PATCH 18/22] chore: Bump version --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8786c01..fc4e29b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.59] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c +- feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247 +- fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3 +- fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295 + ## [0.2.58] - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index e246595..2f5219c 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.58" \ No newline at end of file +__version__ = "0.2.59" \ No newline at end of file From 612e78d32297002b52b264f29ae32bcd85ad54c6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 Apr 2024 16:15:29 -0400 Subject: [PATCH 19/22] fix(ci): use correct script name --- scripts/{release-to-pep-503.sh => releases-to-pep-503.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{release-to-pep-503.sh => releases-to-pep-503.sh} (100%) diff --git a/scripts/release-to-pep-503.sh b/scripts/releases-to-pep-503.sh similarity index 100% rename from scripts/release-to-pep-503.sh rename to scripts/releases-to-pep-503.sh From c50309e52ab90ffc71ebfc083f7397d8d79851eb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 Apr 2024 02:49:19 -0400 Subject: [PATCH 20/22] docs: LLAMA_CUBLAS -> LLAMA_CUDA --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f251f45..5e16614 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,10 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
cuBLAS (CUDA) -To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing: +To install with cuBLAS, set the `LLAMA_CUDA=on` environment variable before installing: ```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python ```
@@ -569,7 +569,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this: ```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' +CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35 ``` From 1db3b58fdc7c216749401a2901fc64495032f1ff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 Apr 2024 02:57:06 -0400 Subject: [PATCH 21/22] docs: Add docs explaining how to install pre-built wheels. --- README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/README.md b/README.md index 5e16614..a0ef83c 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho If this fails, add `--verbose` to the `pip install` see the full cmake build log. +**Pre-built Wheel (New)** + +It is also possible to install a pre-built wheel with basic CPU support. + +```bash +pip install llama-cpp-python \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu +``` + ### Installation Configuration `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list. @@ -108,6 +117,30 @@ To install with cuBLAS, set the `LLAMA_CUDA=on` environment variable before inst CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python ``` +**Pre-built Wheel (New)** + +It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: + +- CUDA Version is 12.1, 12.2 or 12.3 +- Python Version is 3.10, 3.11 or 3.12 + +```bash +pip install llama-cpp-python \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/ +``` + +Where `` is one of the following: +- `cu121`: CUDA 12.1 +- `cu122`: CUDA 12.2 +- `cu123`: CUDA 12.3 + +For example, to install the CUDA 12.1 wheel: + +```bash +pip install llama-cpp-python \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121 +``` +
@@ -119,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python ``` +**Pre-built Wheel (New)** + +It is also possible to install a pre-built wheel with Metal support. As long as your system meets some requirements: + +- MacOS Version is 11.0 or later +- Python Version is 3.10, 3.11 or 3.12 + +```bash +pip install llama-cpp-python \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal +``` +
From 909ef66951fb9f3f5c724841b0c98a598a2e0802 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 Apr 2024 03:08:47 -0400 Subject: [PATCH 22/22] docs: Rename cuBLAS section to CUDA --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0ef83c..c4e194b 100644 --- a/README.md +++ b/README.md @@ -109,9 +109,9 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
-cuBLAS (CUDA) +CUDA -To install with cuBLAS, set the `LLAMA_CUDA=on` environment variable before installing: +To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing: ```bash CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python