From 2a9979fce16ea647918473b85b6f03effb7e1e4c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 Apr 2024 02:48:26 -0400 Subject: [PATCH 01/43] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 12 +++++++++--- vendor/llama.cpp | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c2b909e..7f5d265 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1167,13 +1167,19 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... -# LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); +# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); +@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) +def llama_pooling_type(ctx: llama_context_p, /) -> int: + ... + + +# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) def llama_vocab_type(model: llama_model_p, /) -> int: ... -# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); +# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) def llama_rope_type(model: llama_model_p, /) -> int: ... @@ -3091,7 +3097,7 @@ def llama_sample_token_greedy( ... -# /// @details Randomly selects a token from the candidates based on their probabilities. +# /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. # LLAMA_API llama_token llama_sample_token( # struct llama_context * ctx, # llama_token_data_array * candidates); diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4e96a81..784e11d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4e96a812b3ce7322a29a3008db2ed73d9087b176 +Subproject commit 784e11dea1f5ce9638851b2b0dddb107e2a609c8 From de37420fcf52f759768cec51ddbd5cc768cd9ef3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 Apr 2024 03:08:32 -0400 Subject: [PATCH 02/43] fix(ci): Fix python macos test runners issue --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 77df546..06af61f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -57,7 +57,7 @@ jobs: build-macos: - runs-on: macos-latest + runs-on: macos-13 strategy: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] From 266abfc1a36a4b33c0da3f3689676827df3e6a88 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 Apr 2024 03:09:46 -0400 Subject: [PATCH 03/43] fix(ci): Fix metal tests as well --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 06af61f..6b0e7e8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -107,7 +107,7 @@ jobs: build-macos-metal: - runs-on: macos-latest + runs-on: macos-13 steps: - uses: actions/checkout@v3 From 7f52335c50be425d7dce6302ae38ecff87b0ee74 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 Apr 2024 21:21:29 -0400 Subject: [PATCH 04/43] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 4 ++++ vendor/llama.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 7f5d265..3b96adc 100644 --- 
a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -811,6 +811,7 @@ It might not exist for progress report where '.' is output repeatedly.""" # bool quantize_output_tensor; // quantize output.weight # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored # bool pure; // quantize all tensors to the default type +# bool keep_split; // quantize to the same number of shards # void * imatrix; // pointer to importance matrix data # void * kv_overrides; // pointer to vector containing overrides # } llama_model_quantize_params; @@ -826,6 +827,7 @@ class llama_model_quantize_params(ctypes.Structure): quantize_output_tensor (bool): quantize output.weight only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type + keep_split (bool): quantize to the same number of shards imatrix (ctypes.c_void_p): pointer to importance matrix data kv_overrides (ctypes.c_void_p): pointer to vector containing overrides """ @@ -839,6 +841,7 @@ class llama_model_quantize_params(ctypes.Structure): quantize_output_tensor: bool only_copy: bool pure: bool + keep_split: bool imatrix: ctypes.c_void_p kv_overrides: ctypes.c_void_p @@ -851,6 +854,7 @@ class llama_model_quantize_params(ctypes.Structure): ("quantize_output_tensor", ctypes.c_bool), ("only_copy", ctypes.c_bool), ("pure", ctypes.c_bool), + ("keep_split", ctypes.c_bool), ("imatrix", ctypes.c_void_p), ("kv_overrides", ctypes.c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 784e11d..46e12c4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 784e11dea1f5ce9638851b2b0dddb107e2a609c8 +Subproject commit 46e12c4692a37bdd31a0432fc5153d7d22bc7f72 From fcfea66857c73c9b95326835debb80dbc0b76f17 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 Apr 2024 21:21:48 -0400 Subject: [PATCH 05/43] fix: pydantic deprecation warning --- llama_cpp/server/settings.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index eab5a8a..0c858f9 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -2,8 +2,10 @@ from __future__ import annotations import multiprocessing -from typing import Optional, List, Literal, Union -from pydantic import Field, root_validator +from typing import Optional, List, Literal, Union, Dict, cast +from typing_extensions import Self + +from pydantic import Field, model_validator from pydantic_settings import BaseSettings import llama_cpp @@ -173,15 +175,16 @@ class ModelSettings(BaseSettings): default=True, description="Whether to print debug information." 
) - @root_validator(pre=True) # pre=True to ensure this runs before any other validation - def set_dynamic_defaults(cls, values): + @model_validator(mode="before") # pre=True to ensure this runs before any other validation + def set_dynamic_defaults(self) -> Self: # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count() cpu_count = multiprocessing.cpu_count() + values = cast(Dict[str, int], self) if values.get('n_threads', 0) == -1: values['n_threads'] = cpu_count if values.get('n_threads_batch', 0) == -1: values['n_threads_batch'] = cpu_count - return values + return self class ServerSettings(BaseSettings): From f6ed21f9a26874625b020996fed1f107c1990f50 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Thu, 25 Apr 2024 20:32:44 -0500 Subject: [PATCH 06/43] feat: Allow for possibly non-pooled embeddings (#1380) * allow for possibly non-pooled embeddings * add more to embeddings section in README.md --------- Co-authored-by: Andrei --- README.md | 8 +++++- llama_cpp/_internals.py | 14 ++++++++++ llama_cpp/llama.py | 57 +++++++++++++++++++++++++++------------- llama_cpp/llama_cpp.py | 6 +++++ llama_cpp/llama_types.py | 2 +- 5 files changed, 67 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index cd41a38..b5e7d20 100644 --- a/README.md +++ b/README.md @@ -575,7 +575,7 @@ llama = Llama( ### Embeddings -To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding). +To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly. ```python import llama_cpp @@ -589,6 +589,12 @@ embeddings = llm.create_embedding("Hello, world!") embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"]) ``` +There are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by "pooling" token level embeddings together, usually by averaging them or using the first token. + +Models that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. Thus the dimensionality of the return type will be one higher for token level embeddings. + +It is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation oriented model to yield sequence level embeddings is currently not possible, but you can always do the pooling manually. + ### Adjusting the Context Window The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. 
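A minimal sketch of the two embedding modes described in the README addition above, assuming a local GGUF model path (placeholder) and the `pooling_type` flag on the `Llama` constructor that the text refers to; this is an illustration, not part of the patch:

```python
import llama_cpp

# Sequence-level embeddings (default for embedding models): one vector per input string.
llm = llama_cpp.Llama(model_path="./model.gguf", embedding=True)  # placeholder path
sentence_vectors = llm.embed(["Hello, world!", "Goodbye, world!"])  # [n_inputs][n_embd]

# Token-level embeddings: disable pooling so each input yields one vector per token.
llm_tokens = llama_cpp.Llama(
    model_path="./model.gguf",  # placeholder path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_NONE,
)
token_vectors = llm_tokens.embed("Hello, world!")  # [n_tokens][n_embd]
```

With pooling disabled the return value gains one list dimension, which is what the widened `Union[List[float], List[List[float]]]` type later in this patch accounts for.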
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index ff2d657..cc3d989 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -273,6 +273,10 @@ class _LlamaContext: assert self.ctx is not None return llama_cpp.llama_n_ctx(self.ctx) + def pooling_type(self) -> int: + assert self.ctx is not None + return llama_cpp.llama_pooling_type(self.ctx) + def kv_cache_clear(self): assert self.ctx is not None llama_cpp.llama_kv_cache_clear(self.ctx) @@ -641,6 +645,16 @@ def _should_add_bos(model: _LlamaModel) -> bool: return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM +# Embedding functions + + +def _normalize_embedding(embedding): + norm = float(np.linalg.norm(embedding)) + if norm == 0.0: + return embedding + return [v / norm for v in embedding] + + # Python wrappers over common/sampling structs diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0a576d4..481842b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -50,6 +50,7 @@ from ._internals import ( _LlamaTokenDataArray, # type: ignore _LlamaSamplingParams, # type: ignore _LlamaSamplingContext, # type: ignore + _normalize_embedding, # type: ignore ) from ._logger import set_verbose from ._utils import suppress_stdout_stderr @@ -760,7 +761,7 @@ class Llama: input = input if isinstance(input, list) else [input] # get numeric embeddings - embeds: List[List[float]] + embeds: Union[List[List[float]], List[List[List[float]]]] total_tokens: int embeds, total_tokens = self.embed(input, return_count=True) # type: ignore @@ -787,7 +788,7 @@ class Llama: def embed( self, input: Union[str, List[str]], - normalize: bool = True, + normalize: bool = False, truncate: bool = True, return_count: bool = False, ): @@ -803,6 +804,10 @@ class Llama: n_embd = self.n_embd() n_batch = self.n_batch + # get pooling information + pooling_type = self.pooling_type() + logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE + if self.context_params.embeddings == False: raise RuntimeError( "Llama model must be created with embedding=True to call this method" @@ -820,29 +825,37 @@ class Llama: self._batch.reset() # decode and fetch embeddings - data: List[List[float]] = [] + data: Union[List[List[float]], List[List[List[float]]]] = [] - def decode_batch(n_seq: int): + def decode_batch(seq_sizes: List[int]): assert self._ctx.ctx is not None llama_cpp.llama_kv_cache_clear(self._ctx.ctx) self._ctx.decode(self._batch) self._batch.reset() # store embeddings - for i in range(n_seq): - ptr = llama_cpp.llama_get_embeddings_seq( - self._ctx.ctx, i - ) - if not ptr: - raise RuntimeError("Failed to get embeddings from sequence pooling type is not set") - embedding: List[float] = ptr[:n_embd] - if normalize: - norm = float(np.linalg.norm(embedding)) - embedding = [v / norm for v in embedding] - data.append(embedding) + if pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE: + pos: int = 0 + for i, size in enumerate(seq_sizes): + ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx) + embedding: List[List[float]] = [ + ptr[pos + j * n_embd : pos + (j + 1) * n_embd] for j in range(size) + ] + if normalize: + embedding = [_normalize_embedding(e) for e in embedding] + data.append(embedding) + pos += size + else: + for i in range(len(seq_sizes)): + ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i) + embedding: List[float] = ptr[:n_embd] + if normalize: + embedding = _normalize_embedding(embedding) + data.append(embedding) # init state total_tokens = 0 + s_batch = [] t_batch = 0 p_batch = 0 @@ -863,17 +876,21 
@@ class Llama: # time to eval batch if t_batch + n_tokens > n_batch: - decode_batch(p_batch) + decode_batch(s_batch) + s_batch = [] t_batch = 0 p_batch = 0 # add to batch - self._batch.add_sequence(tokens, p_batch, False) + self._batch.add_sequence(tokens, p_batch, logits_all) + + # update batch stats + s_batch.append(n_tokens) t_batch += n_tokens p_batch += 1 # hanlde last batch - decode_batch(p_batch) + decode_batch(s_batch) if self.verbose: llama_cpp.llama_print_timings(self._ctx.ctx) @@ -1845,6 +1862,10 @@ class Llama: """Return the newline token.""" return self._model.token_nl() + def pooling_type(self) -> str: + """Return the pooling type.""" + return self._ctx.pooling_type() + @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3b96adc..609dd36 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1189,6 +1189,12 @@ def llama_rope_type(model: llama_model_p, /) -> int: ... +# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_model * model); +@ctypes_function("llama_pooling_type", [llama_model_p_ctypes], ctypes.c_int) +def llama_pooling_type(model: llama_model_p, /) -> int: + ... + + # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); @ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_vocab(model: llama_model_p, /) -> int: diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 87e000f..4677785 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -24,7 +24,7 @@ class EmbeddingUsage(TypedDict): class Embedding(TypedDict): index: int object: str - embedding: List[float] + embedding: Union[List[float], List[List[float]]] class CreateEmbeddingResponse(TypedDict): From 173ebc78782b2eaed40b3b29989502cb6ecdea44 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 Apr 2024 21:36:09 -0400 Subject: [PATCH 07/43] fix: Remove duplicate pooling_type definition and add misisng n_vocab definition in bindings --- llama_cpp/llama_cpp.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 609dd36..3b96adc 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1189,12 +1189,6 @@ def llama_rope_type(model: llama_model_p, /) -> int: ... -# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_model * model); -@ctypes_function("llama_pooling_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_pooling_type(model: llama_model_p, /) -> int: - ... 
- - # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); @ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_vocab(model: llama_model_p, /) -> int: From 65edc906712eb205afaedc7d69f27e33be162228 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 Apr 2024 10:11:31 -0400 Subject: [PATCH 08/43] chore: Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25b8835..806876c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.65] + +- feat: Update llama.cpp to ggerganov/llama.cpp@46e12c4692a37bdd31a0432fc5153d7d22bc7f72 +- feat: Allow for possibly non-pooled embeddings by @iamlemec in #1380 + ## [0.2.64] - feat: Update llama.cpp to ggerganov/llama.cpp@4e96a812b3ce7322a29a3008db2ed73d9087b176 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index f736458..6db6333 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.64" \ No newline at end of file +__version__ = "0.2.65" \ No newline at end of file From 9e7f738220a28de2c0b3fdac8ceceb5a2c4d02ec Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sun, 28 Apr 2024 02:47:07 +0200 Subject: [PATCH 09/43] ci: Update dependabot.yml (#1391) Add github-actions update --- .github/dependabot.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 91abb11..c58c9ae 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -9,3 +9,7 @@ updates: directory: "/" # Location of package manifests schedule: interval: "weekly" + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" From c58b56123d8659195609098e14eacd13b7374742 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sun, 28 Apr 2024 02:47:49 +0200 Subject: [PATCH 10/43] ci: Update action versions in build-wheels-metal.yaml (#1390) * Bump actions/setup-python@v4 to v5 * Update build-wheels-metal.yaml * Update build-wheels-metal.yaml * Update build-wheels-metal.yaml --- .github/workflows/build-wheels-metal.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 2cca477..fc798c8 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -41,7 +41,7 @@ jobs: with: submodules: "recursive" - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.pyver }} @@ -78,7 +78,7 @@ jobs: VERBOSE=1 python -m build --wheel fi - - uses: softprops/action-gh-release@v1 + - uses: softprops/action-gh-release@v2 with: files: dist/* # set release name to -metal From e6bbfb863c38e7575e4fe87a823ac6ce2e15c27c Mon Sep 17 00:00:00 2001 From: iyubondyrev <76585902+iyubondyrev@users.noreply.github.com> Date: Sun, 28 Apr 2024 02:48:47 +0200 Subject: [PATCH 11/43] examples: fix quantize example (#1387) @iyubondyrev thank you! 
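A minimal sketch of the corrected call pattern the diff below establishes: the quantization type is carried on a `llama_model_quantize_params` struct obtained from `llama_model_quantize_default_params()`, not passed as a bare integer. File paths here are placeholders and the ftype constant is one option from the enum in `llama_cpp.py`:

```python
import llama_cpp

fname_inp = b"./model-f16.gguf"   # placeholder input path
fname_out = b"./model-q4_0.gguf"  # placeholder output path

params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0  # 2: q4_0, see llama_cpp.py for the full enum

if llama_cpp.llama_model_quantize(fname_inp, fname_out, params) != 0:
    raise RuntimeError("Failed to quantize model")
```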
--- examples/low_level_api/quantize.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py index 8bd03f8..cdb038a 100644 --- a/examples/low_level_api/quantize.py +++ b/examples/low_level_api/quantize.py @@ -4,14 +4,16 @@ import llama_cpp def main(args): + fname_inp = args.fname_inp.encode("utf-8") + fname_out = args.fname_out.encode("utf-8") if not os.path.exists(fname_inp): raise RuntimeError(f"Input file does not exist ({fname_inp})") if os.path.exists(fname_out): raise RuntimeError(f"Output file already exists ({fname_out})") - fname_inp = args.fname_inp.encode("utf-8") - fname_out = args.fname_out.encode("utf-8") - itype = args.itype - return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype) + ftype = args.type + args = llama_cpp.llama_model_quantize_default_params() + args.ftype = ftype + return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args) if return_code != 0: raise RuntimeError("Failed to quantize model") @@ -20,6 +22,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("fname_inp", type=str, help="Path to input model") parser.add_argument("fname_out", type=str, help="Path to output model") - parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") + parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum") args = parser.parse_args() main(args) + From f178636e1b2a30b12498aa656011779796b9ba11 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Sun, 28 Apr 2024 08:49:52 +0800 Subject: [PATCH 12/43] fix: Functionary bug fixes (#1385) * fix completion tokens tracking, prompt forming * fix 'function_call' and 'tool_calls' depending on 'functions' and 'tools', incompatibility with python 3.8 * Updated README * fix for openai server compatibility --------- Co-authored-by: Andrei --- README.md | 2 + llama_cpp/llama_chat_format.py | 96 +++++++++++++++++++--------------- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index b5e7d20..a33524c 100644 --- a/README.md +++ b/README.md @@ -484,6 +484,8 @@ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is requi tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF") ) ``` + +**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.). 
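A minimal sketch of the usage the note above describes, combining the Functionary chat format with tool calling; the repo id and tokenizer follow the README's example, while the GGUF filename and the `get_current_weather` tool are placeholders for illustration:

```python
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

llm = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.2-GGUF",
    filename="functionary-small-v2.2.q4_0.gguf",  # placeholder: pick the quantization you downloaded
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
)

# Only the chat messages and any extra context are needed; the Functionary
# default system prompts are inserted automatically by the chat handler.
response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What's the weather like in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",  # hypothetical tool, for illustration only
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }
    ],
    tool_choice="auto",
)
```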
### Multi-modal Models diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17b570a..71aac80 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1828,27 +1828,35 @@ def functionary_v1_v2_chat_handler( version: Literal["v1", "v2"], functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Union[Dict, str] = "auto", ): all_messages: List[llama_types.ChatCompletionRequestMessage] = [] - if functions is not None: + if tool_choice == "none": all_messages.append( llama_types.ChatCompletionRequestSystemMessage( - role="system", content=generate_schema_from_functions(functions) + role="system", content=generate_schema_from_functions([]) ) ) - elif tools is not None: - all_messages.append( - llama_types.ChatCompletionRequestSystemMessage( - role="system", - content=generate_schema_from_functions( - [ - tool["function"] - for tool in tools - if tool["type"] == "function" - ] - ), + else: + if functions is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=generate_schema_from_functions(functions) + ) + ) + elif tools is not None and tool_choice != "none": + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", + content=generate_schema_from_functions( + [ + tool["function"] + for tool in tools + if tool["type"] == "function" + ] + ), + ) ) - ) all_messages.append( llama_types.ChatCompletionRequestSystemMessage( @@ -1888,7 +1896,7 @@ def functionary_v1_v2_chat_handler( function_call = "auto" prompt = prepare_messages_for_inference( - messages, tokenizer, version, functions, tools + messages, tokenizer, version, functions, tools, function_call ) # If no tools/functions are provided @@ -1985,17 +1993,12 @@ def functionary_v1_v2_chat_handler( content = "" function_calls, function_bodies = [], [] + completion_tokens = 0 if version == "v1": # If no or "auto" tool_choice/function_call if isinstance(function_call, str) and function_call == "auto": stops = ["\n", END_ASSISTANT_TOKEN] - # If tool_choice/function_call is "none" - elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference( - messages, tokenizer, version, [], [] - ) - stops = END_ASSISTANT_TOKEN # If tool_choice/function_call is provided elif isinstance(function_call, dict): prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" @@ -2009,12 +2012,15 @@ def functionary_v1_v2_chat_handler( completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + # If the generation does not involve a function call if ( START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text ): + completion["usage"]["completion_tokens"] = completion_tokens return _convert_completion_to_chat(completion, stream=stream) # type: ignore # If the generation involves a function call in completion, generate the parameters elif ( @@ -2032,23 +2038,14 @@ def functionary_v1_v2_chat_handler( ) grammar = get_grammar(function_calls[-1]) completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + completion_tokens += completion["usage"]["completion_tokens"] function_bodies.append(completion["choices"][0]["text"].strip()) # If the prompt involves a function call, just append generated parameters to function_bodies else: 
function_bodies.append(completion_text.strip()) else: - # If tool_choice/function_call is "none" - if isinstance(function_call, str) and function_call == "none": - prompt = ( - prepare_messages_for_inference(messages, tokenizer, version, [], []) - + "all\n<|content|>" - ) - stops = [STOP_TOKEN, FROM_TOKEN] - completion = create_completion(stop=stops) - completion["choices"][0]["text"] = completion["choices"][0]["text"].strip() - return _convert_completion_to_chat(completion, stream=stream) # type: ignore # If tool_choice/function_call is provided - elif isinstance(function_call, dict): + if isinstance(function_call, dict): prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" function_call = function_call["name"] function_calls.append(function_call) @@ -2056,6 +2053,7 @@ def functionary_v1_v2_chat_handler( stops = [STOP_TOKEN, FROM_TOKEN] completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] function_bodies.append(completion_text.strip()) # If "auto" or no tool_choice/function_call elif isinstance(function_call, str) and function_call == "auto": @@ -2065,6 +2063,7 @@ def functionary_v1_v2_chat_handler( stops = CONTENT_TOKEN completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] function_name = completion_text.strip() if function_name == "all": prompt += "all\n<|content|>" @@ -2077,12 +2076,23 @@ def functionary_v1_v2_chat_handler( stops = [RECIPIENT_TOKEN, STOP_TOKEN] completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] if function_name == "all": - content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n") + if completion_text.endswith("\n<|from|>assistant\n"): + content += completion_text[:-len("\n<|from|>assistant\n")] + if completion_text.endswith("\n<|from|> assistant\n"): + content += completion_text[-len("\n<|from|> assistant\n")] + else: + content += completion_text content = content.lstrip() # Check whether the model wants to generate another turn if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text: - cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip() + if completion_text.endswith("\n<|from|>assistant\n"): + cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip() + elif completion_text.endswith("\n<|from|> assistant\n"): + cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip() + else: + cleaned_completion_text = completion_text.strip() prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" else: break @@ -2092,6 +2102,7 @@ def functionary_v1_v2_chat_handler( prompt += completion_text.strip() grammar = None completion = create_completion(stop=stops) + completion_tokens += completion["usage"]["completion_tokens"] if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]: prompt += "\n<|from|>assistant\n<|recipient|>" else: @@ -2120,12 +2131,16 @@ def functionary_v1_v2_chat_handler( ) # TODO: support stream mode - function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = { - "function_call": { - "name": 
tool_calls[0]["function"]["name"], - "arguments": tool_calls[0]["function"]["arguments"], - } - } if len(tool_calls) == 1 else {} + function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {} + if len(tool_calls) > 0: + if tools is not None: + function_call_dict["tool_calls"] = tool_calls + else: + function_call_dict["function_call"] = { + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], + } + completion["usage"]["completion_tokens"] = completion_tokens return llama_types.CreateChatCompletionResponse( id="chat" + completion["id"], object="chat.completion", @@ -2138,7 +2153,6 @@ def functionary_v1_v2_chat_handler( "message": { "role": "assistant", "content": None if content == "" else content, - "tool_calls": tool_calls, **function_call_dict, }, "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", From 17bdfc818f50988ddb6dfc5a30cbec68571e4c56 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:50:28 -0400 Subject: [PATCH 13/43] chore(deps): bump conda-incubator/setup-miniconda from 2.2.0 to 3.0.4 (#1397) Bumps [conda-incubator/setup-miniconda](https://github.com/conda-incubator/setup-miniconda) from 2.2.0 to 3.0.4. - [Release notes](https://github.com/conda-incubator/setup-miniconda/releases) - [Changelog](https://github.com/conda-incubator/setup-miniconda/blob/main/CHANGELOG.md) - [Commits](https://github.com/conda-incubator/setup-miniconda/compare/v2.2.0...v3.0.4) --- updated-dependencies: - dependency-name: conda-incubator/setup-miniconda dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-wheels-cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index a222dce..9647036 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -52,7 +52,7 @@ jobs: python-version: ${{ matrix.pyver }} - name: Setup Mamba - uses: conda-incubator/setup-miniconda@v2.2.0 + uses: conda-incubator/setup-miniconda@v3.0.4 with: activate-environment: "build" python-version: ${{ matrix.pyver }} From 27038db3d6cbab301dc96e21d7f24fc53c5ccbf5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:50:39 -0400 Subject: [PATCH 14/43] chore(deps): bump actions/cache from 3.3.2 to 4.0.2 (#1398) Bumps [actions/cache](https://github.com/actions/cache) from 3.3.2 to 4.0.2. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v3.3.2...v4.0.2) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-wheels-cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 9647036..274b946 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -65,7 +65,7 @@ jobs: - name: VS Integration Cache id: vs-integration-cache if: runner.os == 'Windows' - uses: actions/cache@v3.3.2 + uses: actions/cache@v4.0.2 with: path: ./MSBuildExtensions key: cuda-${{ matrix.cuda }}-vs-integration From 79318ba1d181f5765d3abfab00d27ec5c42a3e8d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:50:50 -0400 Subject: [PATCH 15/43] chore(deps): bump docker/login-action from 2 to 3 (#1399) Bumps [docker/login-action](https://github.com/docker/login-action) from 2 to 3. - [Release notes](https://github.com/docker/login-action/releases) - [Commits](https://github.com/docker/login-action/compare/v2...v3) --- updated-dependencies: - dependency-name: docker/login-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 750b91e..726f872 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -23,7 +23,7 @@ jobs: uses: docker/setup-buildx-action@v2 - name: Login to GitHub Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} From 7074c4d256e06c31a6386bb099bad1e891284334 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:51:02 -0400 Subject: [PATCH 16/43] chore(deps): bump docker/build-push-action from 4 to 5 (#1400) Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 4 to 5. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/v4...v5) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 726f872..68cc06f 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -31,7 +31,7 @@ jobs: - name: Build and push id: docker_build - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: . file: "docker/simple/Dockerfile" From c07db99e5b34b9d791ac9d14cdde46928f4694ae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:51:13 -0400 Subject: [PATCH 17/43] chore(deps): bump pypa/cibuildwheel from 2.16.5 to 2.17.0 (#1401) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.16.5 to 2.17.0. 
- [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.16.5...v2.17.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-and-release.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 07742f1..2137da3 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -29,7 +29,7 @@ jobs: python -m pip install -e .[all] - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" @@ -55,7 +55,7 @@ jobs: platforms: linux/arm64 - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" From c9b85bf09859161d3ab9de49ed448297488e0c89 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 Apr 2024 23:41:54 -0400 Subject: [PATCH 18/43] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 152 ++++++++++++++++++----------------------- vendor/llama.cpp | 2 +- 2 files changed, 69 insertions(+), 85 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3b96adc..d00dfcb 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -552,19 +552,25 @@ class llama_batch(ctypes.Structure): # LLAMA_KV_OVERRIDE_TYPE_INT, # LLAMA_KV_OVERRIDE_TYPE_FLOAT, # LLAMA_KV_OVERRIDE_TYPE_BOOL, +# LLAMA_KV_OVERRIDE_TYPE_STR, # }; LLAMA_KV_OVERRIDE_TYPE_INT = 0 LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 +LLAMA_KV_OVERRIDE_TYPE_STR = 3 # struct llama_model_kv_override { -# char key[128]; # enum llama_model_kv_override_type tag; + +# char key[128]; + + # union { -# int64_t int_value; -# double float_value; -# bool bool_value; +# int64_t val_i64; +# double val_f64; +# bool val_bool; +# char val_str[128]; # }; # }; class llama_model_kv_override_value(ctypes.Union): @@ -572,16 +578,28 @@ class llama_model_kv_override_value(ctypes.Union): ("int_value", ctypes.c_int64), ("float_value", ctypes.c_double), ("bool_value", ctypes.c_bool), + ("str_value", ctypes.c_char * 128), ] + if TYPE_CHECKING: + int_value: int + float_value: float + bool_value: bool + str_value: bytes + class llama_model_kv_override(ctypes.Structure): _fields_ = [ - ("key", ctypes.c_char * 128), ("tag", ctypes.c_int), + ("key", ctypes.c_char * 128), ("value", llama_model_kv_override_value), ] + if TYPE_CHECKING: + tag: int + key: bytes + value: Union[int, float, bool, bytes] + # struct llama_model_params { # int32_t n_gpu_layers; // number of layers to store in VRAM @@ -609,9 +627,10 @@ class llama_model_kv_override(ctypes.Structure): # // Keep the booleans together to avoid misalignment during copy-by-value. 
-# bool vocab_only; // only load the vocabulary, no weights -# bool use_mmap; // use mmap if possible -# bool use_mlock; // force system to keep model in RAM +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool check_tensors; // validate model tensor data # }; class llama_model_params(ctypes.Structure): """Parameters for llama_model @@ -626,7 +645,8 @@ class llama_model_params(ctypes.Structure): kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data vocab_only (bool): only load the vocabulary, no weights use_mmap (bool): use mmap if possible - use_mlock (bool): force system to keep model in RAM""" + use_mlock (bool): force system to keep model in RAM + check_tensors (bool): validate model tensor data""" if TYPE_CHECKING: n_gpu_layers: int @@ -639,6 +659,7 @@ class llama_model_params(ctypes.Structure): vocab_only: bool use_mmap: bool use_mlock: bool + check_tensors: bool _fields_ = [ ("n_gpu_layers", ctypes.c_int32), @@ -651,6 +672,7 @@ class llama_model_params(ctypes.Structure): ("vocab_only", ctypes.c_bool), ("use_mmap", ctypes.c_bool), ("use_mlock", ctypes.c_bool), + ("check_tensors", ctypes.c_bool), ] @@ -1041,8 +1063,7 @@ GGML_NUMA_STRATEGY_COUNT = 5 [ctypes.c_int], None, ) -def llama_numa_init(numa: int, /): - ... +def llama_numa_init(numa: int, /): ... # // Call once at the end of the program - currently only used for MPI @@ -1067,8 +1088,7 @@ def llama_backend_free(): ) def llama_load_model_from_file( path_model: bytes, params: llama_model_params, / -) -> Optional[llama_model_p]: - ... +) -> Optional[llama_model_p]: ... # LLAMA_API void llama_free_model(struct llama_model * model); @@ -1077,8 +1097,7 @@ def llama_load_model_from_file( [llama_model_p_ctypes], None, ) -def llama_free_model(model: llama_model_p, /): - ... +def llama_free_model(model: llama_model_p, /): ... # LLAMA_API struct llama_context * llama_new_context_with_model( @@ -1091,8 +1110,7 @@ def llama_free_model(model: llama_model_p, /): ) def llama_new_context_with_model( model: llama_model_p, params: llama_context_params, / -) -> Optional[llama_context_p]: - ... +) -> Optional[llama_context_p]: ... # // Frees all allocated memory @@ -1113,104 +1131,87 @@ def llama_free(ctx: llama_context_p, /): [], ctypes.c_int64, ) -def llama_time_us() -> int: - ... +def llama_time_us() -> int: ... # LLAMA_API size_t llama_max_devices(void); @ctypes_function("llama_max_devices", [], ctypes.c_size_t) -def llama_max_devices() -> int: - ... +def llama_max_devices() -> int: ... # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) -def llama_supports_mmap() -> bool: - ... +def llama_supports_mmap() -> bool: ... # LLAMA_API bool llama_supports_mlock (void); @ctypes_function("llama_supports_mlock", [], ctypes.c_bool) -def llama_supports_mlock() -> bool: - ... +def llama_supports_mlock() -> bool: ... # LLAMA_API bool llama_supports_gpu_offload(void); @ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool) -def llama_supports_gpu_offload() -> bool: - ... +def llama_supports_gpu_offload() -> bool: ... # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: - ... 
+def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ctx(ctx: llama_context_p, /) -> int: - ... +def llama_n_ctx(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_batch(ctx: llama_context_p, /) -> int: - ... +def llama_n_batch(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); @ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ubatch(ctx: llama_context_p, /) -> int: - ... +def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); @ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_seq_max(ctx: llama_context_p, /) -> int: - ... +def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) -def llama_pooling_type(ctx: llama_context_p, /) -> int: - ... +def llama_pooling_type(ctx: llama_context_p, /) -> int: ... # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_vocab_type(model: llama_model_p, /) -> int: - ... +def llama_vocab_type(model: llama_model_p, /) -> int: ... # LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_rope_type(model: llama_model_p, /) -> int: - ... +def llama_rope_type(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); @ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_vocab(model: llama_model_p, /) -> int: - ... +def llama_n_vocab(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_ctx_train(model: llama_model_p, /) -> int: - ... +def llama_n_ctx_train(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_n_embd (const struct llama_model * model); @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_embd(model: llama_model_p, /) -> int: - ... +def llama_n_embd(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_n_layer (const struct llama_model * model); @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_layer(model: llama_model_p, /) -> int: - ... +def llama_n_layer(model: llama_model_p, /) -> int: ... # // Get the model's RoPE frequency scaling factor @@ -1912,8 +1913,7 @@ def llama_state_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API DEPRECATED(bool llama_load_session_file( @@ -1941,8 +1941,7 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: - ... +) -> int: ... 
# LLAMA_API bool llama_state_save_file( @@ -1966,8 +1965,7 @@ def llama_state_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API DEPRECATED(bool llama_save_session_file( @@ -1992,8 +1990,7 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: - ... +) -> int: ... # // Get the exact size needed to copy the KV cache of a single sequence @@ -2071,8 +2068,7 @@ def llama_state_seq_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: - ... +) -> int: ... # LLAMA_API size_t llama_state_seq_load_file( @@ -2102,8 +2098,7 @@ def llama_state_seq_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: - ... +) -> int: ... # // @@ -2366,8 +2361,7 @@ def llama_get_embeddings_seq( ) def llama_token_get_text( model: llama_model_p, token: Union[llama_token, int], / -) -> bytes: - ... +) -> bytes: ... # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); @@ -2376,8 +2370,7 @@ def llama_token_get_text( ) def llama_token_get_score( model: llama_model_p, token: Union[llama_token, int], / -) -> float: - ... +) -> float: ... # LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); @@ -2386,8 +2379,7 @@ def llama_token_get_score( ) def llama_token_get_type( model: llama_model_p, token: Union[llama_token, int], / -) -> int: - ... +) -> int: ... # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) @@ -2395,9 +2387,7 @@ def llama_token_get_type( @ctypes_function( "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool ) -def llama_token_is_eog( - model: llama_model_p, token: Union[llama_token, int], / -) -> bool: +def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool: """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)""" ... @@ -2466,20 +2456,17 @@ def llama_token_prefix(model: llama_model_p) -> int: # LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token) -def llama_token_middle(model: llama_model_p, /) -> int: - ... +def llama_token_middle(model: llama_model_p, /) -> int: ... # LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token) -def llama_token_suffix(model: llama_model_p, /) -> int: - ... +def llama_token_suffix(model: llama_model_p, /) -> int: ... # LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle @ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) -def llama_token_eot(model: llama_model_p, /) -> int: - ... +def llama_token_eot(model: llama_model_p, /) -> int: ... # // @@ -2620,8 +2607,7 @@ def llama_chat_apply_template( chat: CtypesArray[llama_chat_message], n_msg: int, /, -) -> int: - ... +) -> int: ... # // @@ -3234,8 +3220,7 @@ def llama_beam_search( n_past: Union[ctypes.c_int, int], n_predict: Union[ctypes.c_int, int], /, -): - ... +): ... # /// @details Build a split GGUF final path for this chunk. 
@@ -3354,5 +3339,4 @@ def llama_log_set( [ctypes.c_void_p, llama_context_p_ctypes], None, ) -def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): - ... +def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 46e12c4..4dba7e8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 46e12c4692a37bdd31a0432fc5153d7d22bc7f72 +Subproject commit 4dba7e8114d84241c842b986e008af8b88d1a019 From a411612b385cef100d76145da1fbd02a7b7cc894 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 Apr 2024 23:42:19 -0400 Subject: [PATCH 19/43] feat: Add support for str type kv_overrides --- llama_cpp/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 481842b..96aac66 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -73,7 +73,7 @@ class Llama: vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, - kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, + kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None, # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, @@ -254,6 +254,13 @@ class Llama: elif isinstance(v, float): self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT self._kv_overrides_array[i].value.float_value = v + elif isinstance(v, str): # type: ignore + v_bytes = v.encode("utf-8") + if len(v_bytes) > 128: # TODO: Make this a constant + raise ValueError(f"Value for {k} is too long: {v}") + v_bytes = v_bytes.ljust(128, b"\0") + self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR + self._kv_overrides_array[i].value.str_value[:128] = v_bytes else: raise ValueError(f"Unknown value type for {k}: {v}") From 2355ce2227476ea69f1d6142dedd2f4b7f27a86b Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sun, 28 Apr 2024 05:44:47 +0200 Subject: [PATCH 20/43] ci: Add support for pre-built cuda 12.4.1 wheels (#1388) * Add support for cuda 12.4.1 * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml * Update build-wheels-cuda.yaml Revert --- .github/workflows/build-wheels-cuda.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 274b946..ae9e863 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -22,7 +22,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-20.04', 'windows-latest') 'pyver' = @("3.10", "3.11", "3.12") - 'cuda' = @("12.1.1", "12.2.2", "12.3.2") + 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") 'releasetag' = @("basic") } @@ -47,7 +47,7 @@ jobs: with: submodules: "recursive" - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.pyver }} @@ -74,7 +74,7 @@ jobs: if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' run: | if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} - $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) + $links = (Invoke-RestMethod 
'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null @@ -122,7 +122,7 @@ jobs: # write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - uses: softprops/action-gh-release@v1 + - uses: softprops/action-gh-release@v2 with: files: dist/* # Set tag_name to -cu From 0c3bc4b92813381f3ffddcc9b579e221ee48667b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Apr 2024 12:37:22 -0400 Subject: [PATCH 21/43] fix(ci): Update generate wheel index script to include cu12.3 and cu12.4 Closes #1406 --- .github/workflows/generate-index-from-release.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index 9042d6c..858e507 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -37,6 +37,8 @@ jobs: ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' + ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' + ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' - name: Upload artifact uses: actions/upload-pages-artifact@v3 From 03c654a3d9889bcf8a0402c3b7478ad85fbaf111 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Tue, 30 Apr 2024 04:52:23 +0200 Subject: [PATCH 22/43] ci(fix): Workflow actions updates and fix arm64 wheels not included in release (#1392) * Update test.yaml Bump actions/checkout@v3 to v4 Bump action/setup-python@v4 to v5 * Update test-pypi.yaml Bum actions/setup-python@v4 to v5 * Update build-and-release.yaml Bump softprops/action-gh-release@v1 to v2 Bump actions/checkout@v3 to v4 Bump actions/setup-python@v3 to v5 * Update publish.yaml Bump actions/checkout@v3 to v4 Bump actions/sertup-python@v4 to v5 * Update publish-to-test.yaml Bump actions/checkout@v3 to v4 Bump actions/setup-python @v4 to v5 * Update test-pypi.yaml Add Python 3.12 * Update build-and-release.yaml * Update build-docker.yaml Bump docker/setup-qemu-action@v2 to v3 Bump docker/setup-buildx-action@v2 to v3 * Update build-and-release.yaml * Update build-and-release.yaml --- .github/workflows/build-and-release.yaml | 19 +++++++++---------- .github/workflows/build-docker.yaml | 6 +++--- .github/workflows/publish-to-test.yaml | 6 +++--- .github/workflows/publish.yaml | 4 ++-- .github/workflows/test-pypi.yaml | 14 +++++++------- .github/workflows/test.yaml | 18 +++++++++--------- 6 files changed, 33 insertions(+), 34 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 2137da3..76bf708 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -14,12 +14,12 @@ jobs: os: [ubuntu-20.04, windows-2019, macos-11] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" # Used to host 
cibuildwheel - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v5 with: python-version: "3.8" @@ -62,23 +62,22 @@ jobs: CIBW_ARCHS: "aarch64" CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" with: - output-dir: wheelhouse/ + output-dir: wheelhouse - name: Upload wheels as artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: - name: wheels-${{ matrix.version }} - path: wheelhouse/*.whl + path: ./wheelhouse/*.whl build_sdist: name: Build source distribution runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v5 with: python-version: "3.8" - name: Install dependencies @@ -102,8 +101,8 @@ jobs: with: name: artifact path: dist - - uses: softprops/action-gh-release@v1 + - uses: softprops/action-gh-release@v2 with: files: dist/* env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 68cc06f..4ebe3bb 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -12,15 +12,15 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: "recursive" - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Login to GitHub Container Registry uses: docker/login-action@v3 diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml index 47e7c40..2bf0ea9 100644 --- a/.github/workflows/publish-to-test.yaml +++ b/.github/workflows/publish-to-test.yaml @@ -16,11 +16,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.8" - name: Append Dev Version to __version__ @@ -40,4 +40,4 @@ jobs: uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository-url: https://test.pypi.org/legacy/ \ No newline at end of file + repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 1afdd66..bc4ec90 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -10,11 +10,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.8" - name: Install dependencies diff --git a/.github/workflows/test-pypi.yaml b/.github/workflows/test-pypi.yaml index cc6a3a7..aa8e8fa 100644 --- a/.github/workflows/test-pypi.yaml +++ b/.github/workflows/test-pypi.yaml @@ -8,11 +8,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -28,11 +28,11 @@ jobs: runs-on: windows-latest strategy: matrix: - 
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -48,11 +48,11 @@ jobs: runs-on: macos-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -61,4 +61,4 @@ jobs: python3 -m pip install --verbose llama-cpp-python[all] - name: Test with pytest run: | - python3 -c "import llama_cpp" \ No newline at end of file + python3 -c "import llama_cpp" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6b0e7e8..292343c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -21,7 +21,7 @@ jobs: with: submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -40,11 +40,11 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -63,11 +63,11 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -83,11 +83,11 @@ jobs: # runs-on: ubuntu-latest # steps: - # - uses: actions/checkout@v3 + # - uses: actions/checkout@v4 # with: # submodules: "recursive" # - name: Set up Python 3.8 - # uses: actions/setup-python@v4 + # uses: actions/setup-python@v5 # with: # python-version: "3.8" # - name: Set up OpenCL & CLBlast @@ -110,11 +110,11 @@ jobs: runs-on: macos-13 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "recursive" - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.8" - name: Install dependencies From 32c000f3ec5d0417612256a92297362c4c21c4ce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:52:58 -0400 Subject: [PATCH 23/43] chore(deps): bump softprops/action-gh-release from 1 to 2 (#1408) Bumps [softprops/action-gh-release](https://github.com/softprops/action-gh-release) from 1 to 2. - [Release notes](https://github.com/softprops/action-gh-release/releases) - [Changelog](https://github.com/softprops/action-gh-release/blob/master/CHANGELOG.md) - [Commits](https://github.com/softprops/action-gh-release/compare/v1...v2) --- updated-dependencies: - dependency-name: softprops/action-gh-release dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From be43018e09ef795fc8b214a4d13fd702dc670a74 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:53:21 -0400 Subject: [PATCH 24/43] chore(deps): bump actions/configure-pages from 4 to 5 (#1411) Bumps [actions/configure-pages](https://github.com/actions/configure-pages) from 4 to 5. - [Release notes](https://github.com/actions/configure-pages/releases) - [Commits](https://github.com/actions/configure-pages/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/configure-pages dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/generate-index-from-release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index 858e507..500c461 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -31,7 +31,7 @@ jobs: - name: Checkout uses: actions/checkout@v4 - name: Setup Pages - uses: actions/configure-pages@v4 + uses: actions/configure-pages@v5 - name: Build run: | ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' From df2b5b5d4485c485b6043d193f10092452d760d6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:53:42 -0400 Subject: [PATCH 25/43] chore(deps): bump actions/upload-artifact from 3 to 4 (#1412) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3 to 4. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-and-release.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 76bf708..78bcb87 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -37,7 +37,7 @@ jobs: package-dir: . 
output-dir: wheelhouse - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: path: ./wheelhouse/*.whl @@ -65,7 +65,7 @@ jobs: output-dir: wheelhouse - name: Upload wheels as artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: path: ./wheelhouse/*.whl @@ -87,7 +87,7 @@ jobs: - name: Build source distribution run: | python -m build --sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: path: ./dist/*.tar.gz From 97fb860eba42e1018abc9b603f8e4602e84dd153 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Apr 2024 23:34:55 -0400 Subject: [PATCH 26/43] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 21 +++++++++++++++++++++ tests/test_llama.py | 2 +- vendor/llama.cpp | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d00dfcb..9c8f778 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -284,6 +284,27 @@ LLAMA_VOCAB_TYPE_WPM = 3 """BERT tokenizer based on WordPiece""" +# // pre-tokenization types +# enum llama_vocab_pre_type { +# LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, +# LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, +# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, +# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, +# LLAMA_VOCAB_PRE_TYPE_FALCON = 4, +# LLAMA_VOCAB_PRE_TYPE_MPT = 5, +# LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, +# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, +# }; +LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 +LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 +LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 +LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 +LLAMA_VOCAB_PRE_TYPE_FALCON = 4 +LLAMA_VOCAB_PRE_TYPE_MPT = 5 +LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 +LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 + + # // note: these values should be synchronized with ggml_rope # // TODO: maybe move this enum to ggml.h (ggml_rope_type) # enum llama_rope_type { diff --git a/tests/test_llama.py b/tests/test_llama.py index fa2f6df..469ef91 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -6,7 +6,7 @@ from scipy.special import log_softmax import llama_cpp -MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf" +MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf" def test_llama_cpp_tokenization(): diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4dba7e8..8843a98 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4dba7e8114d84241c842b986e008af8b88d1a019 +Subproject commit 8843a98c2ba97a25e93319a104f9ddfaf83ce4c4 From fe2da0953895000e20655d2c991308f702508494 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 30 Apr 2024 01:35:38 -0400 Subject: [PATCH 27/43] feat: Generic Chat Formats, Tool Calling, and Huggingface Pull Support for Multimodal Models (Obsidian, LLaVA1.6, Moondream) (#1147) * Test dummy image tags in chat templates * Format and improve types for llava_cpp.py * Add from_pretrained support to llava chat format. 
* Refactor llava chat format to use a jinja2 * Revert chat format test * Add moondream support (wip) * Update moondream chat format * Update moondream chat format * Update moondream prompt * Add function calling support * Cache last image embed * Add Llava1.6 support * Add nanollava support * Add obisidian support * Remove unnecessary import * Re-order multimodal chat formats * Logits all no longer required for multi-modal models * Update README.md * Update docs * Update README * Fix typo * Update README * Fix typo --- README.md | 41 ++- docs/server.md | 2 + llama_cpp/llama_chat_format.py | 633 +++++++++++++++++++++++++++------ llama_cpp/llava_cpp.py | 111 ++++-- llama_cpp/server/model.py | 71 +++- 5 files changed, 712 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index a33524c..d34caf4 100644 --- a/README.md +++ b/README.md @@ -490,14 +490,15 @@ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is requi ### Multi-modal Models -`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to -read information from both text and images. +`llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images. You'll first need to download one of the available multi-modal models in GGUF format: - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1) +- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) +- [moondream2](https://huggingface.co/vikhyatk/moondream2) Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. @@ -509,7 +510,6 @@ Then you'll need to use a custom chat handler to load the clip model and process model_path="./path/to/llava/llama-model.gguf", chat_handler=chat_handler, n_ctx=2048, # n_ctx should be increased to accomodate the image embedding - logits_all=True,# needed to make llava work ) >>> llm.create_chat_completion( messages = [ @@ -517,14 +517,45 @@ Then you'll need to use a custom chat handler to load the clip model and process { "role": "user", "content": [ - {"type": "image_url", "image_url": {"url": "https://.../image.png"}}, - {"type" : "text", "text": "Describe this image in detail please."} + {"type" : "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } } ] } ] ) ``` +You can also pull the model from the Hugging Face Hub using the `from_pretrained` method. 
+ +```python +>>> from llama_cpp import Llama +>>> from llama_cpp.llama_chat_format import MoondreamChatHandler +>>> chat_handler = MoondreamChatHandler.from_pretrained( + repo_id="vikhyatk/moondream2", + filename="*mmproj*", +) +>>> llm = Llama.from_pretrained( + repo_id="vikhyatk/moondream2" + filename="*text-model*", + chat_handler=chat_handler, + n_ctx=2048, # n_ctx should be increased to accomodate the image embedding +) +>>> llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": [ + {"type" : "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } } + + ] + } + ] +) +``` + +**Note**: Multi-modal models also support tool calling and JSON mode. +
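As a minimal sketch of the JSON-mode support noted above (the model paths and image URL here are placeholders, not files shipped with the project):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Placeholder paths: point these at a real mmproj and LLaVA text model.
chat_handler = Llava15ChatHandler(clip_model_path="./path/to/mmproj.gguf")
llm = Llama(
    model_path="./path/to/llava-model.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,  # increased context to accommodate the image embedding
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image as a JSON object."},
                # Placeholder URL; base64 data: URIs are also accepted.
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
            ],
        }
    ],
    # Constrains sampling to valid JSON via a grammar.
    response_format={"type": "json_object"},
)
print(response["choices"][0]["message"]["content"])
```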
Loading a Local Image diff --git a/docs/server.md b/docs/server.md index c66c9cc..cd6f86c 100644 --- a/docs/server.md +++ b/docs/server.md @@ -98,6 +98,8 @@ You'll first need to download one of the available multi-modal models in GGUF fo - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1) +- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) +- [moondream2](https://huggingface.co/vikhyatk/moondream2) Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 71aac80..cbd8d20 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -6,6 +6,8 @@ import ctypes import dataclasses import random import string + +from contextlib import ExitStack from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast import jinja2 @@ -2163,42 +2165,80 @@ def functionary_v1_v2_chat_handler( class Llava15ChatHandler: - _clip_free = None + DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.role == 'user' %}" + "{% if message.content is string %}" + "\nUSER: {{ message.content }}" + "{% elif message.content is iterable %}" + "\nUSER: " + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% endif %}" + "{% if message.role == 'assistant' and message.content is not none %}" + "\nASSISTANT: {{ message.content }}" + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}" + "\nASSISTANT: " + "{% endif %}" + ) def __init__(self, clip_model_path: str, verbose: bool = False): import llama_cpp.llava_cpp as llava_cpp - self._llava_cpp = llava_cpp self.clip_model_path = clip_model_path self.verbose = verbose - self._clip_free = self._llava_cpp._libllava.clip_free # type: ignore + + self._llava_cpp = llava_cpp # TODO: Fix + self._exit_stack = ExitStack() + self._last_image_embed: Optional[llava_cpp.CtypesPointer[llava_cpp.llava_image_embed]] = None + self._last_image_hash: Optional[int] = None if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") with suppress_stdout_stderr(disable=self.verbose): - self.clip_ctx = self._llava_cpp.clip_model_load( + clip_ctx = self._llava_cpp.clip_model_load( self.clip_model_path.encode(), 0 ) - def __del__(self): - with suppress_stdout_stderr(disable=self.verbose): - if self.clip_ctx is not None and self._clip_free is not None: - self._clip_free(self.clip_ctx) - self.clip_ctx = None + if clip_ctx is None: + raise ValueError(f"Failed to load clip model: {clip_model_path}") + + self.clip_ctx = clip_ctx + + def clip_free(): + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.clip_free(self.clip_ctx) + + 
self._exit_stack.callback(clip_free) + + def last_image_embed_free(): + with suppress_stdout_stderr(disable=self.verbose): + if self._last_image_embed is not None: + self._llava_cpp.llava_image_embed_free(self._last_image_embed) + self._last_image_embed = None + + self._exit_stack.callback(last_image_embed_free) def load_image(self, image_url: str) -> bytes: - if image_url.startswith("data:"): - import base64 - - image_bytes = base64.b64decode(image_url.split(",")[1]) - return image_bytes - else: - import urllib.request - - with urllib.request.urlopen(image_url) as f: - image_bytes = f.read() - return image_bytes + return self._load_image(image_url) def __call__( self, @@ -2216,6 +2256,7 @@ class Llava15ChatHandler: typical_p: float = 1.0, stream: bool = False, stop: Optional[Union[str, List[str]]] = [], + seed: Optional[int] = None, response_format: Optional[ llama_types.ChatCompletionRequestResponseFormat ] = None, @@ -2230,121 +2271,477 @@ class Llava15ChatHandler: model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: - assert ( - llama.context_params.logits_all is True - ) # BUG: logits_all=True is required for llava assert self.clip_ctx is not None - system_prompt = _get_system_message(messages) - system_prompt = ( - system_prompt - if system_prompt != "" - else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." 
- ) - user_role = "\nUSER:" - assistant_role = "\nASSISTANT:" - llama.reset() - llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True)) - for message in messages: - if message["role"] == "user" and message["content"] is not None: - if isinstance(message["content"], str): - llama.eval( - llama.tokenize( - f"{user_role} {message['content']}".encode("utf8"), - add_bos=False, - ) - ) - else: - assert isinstance(message["content"], list) - llama.eval( - llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False) - ) - for content in message["content"]: - if content["type"] == "text": - llama.eval( - llama.tokenize( - f"{content['text']}".encode("utf8"), add_bos=False - ) - ) - if content["type"] == "image_url": - image_bytes = ( - self.load_image(content["image_url"]["url"]) - if isinstance(content["image_url"], dict) - else self.load_image(content["image_url"]) - ) - import array - data_array = array.array("B", image_bytes) - c_ubyte_ptr = ( - ctypes.c_ubyte * len(data_array) - ).from_buffer(data_array) - with suppress_stdout_stderr(disable=self.verbose): - embed = ( - self._llava_cpp.llava_image_embed_make_with_bytes( - self.clip_ctx, - llama.context_params.n_threads, - c_ubyte_ptr, - len(image_bytes), - ) - ) - try: - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - assert llama.n_ctx() >= n_past.value - llama.n_tokens = n_past.value - finally: - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_image_embed_free(embed) - if message["role"] == "assistant" and message["content"] is not None: - llama.eval( - llama.tokenize( - f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False + system_prompt = _get_system_message(messages) + if system_prompt == "": + messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages + + image_urls = self.get_image_urls(messages) + template = jinja2.Template(self.CHAT_FORMAT) + text = template.render(messages=messages, add_generation_prompt=True) + split_text = self.split_text_on_image_urls(text, image_urls) + + def embed_image_bytes(image_bytes: bytes): + if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash: + return self._last_image_embed + with suppress_stdout_stderr(disable=self.verbose): + embed = ( + self._llava_cpp.llava_image_embed_make_with_bytes( + self.clip_ctx, + llama.context_params.n_threads_batch, + (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), + len(image_bytes), ) ) - assert llama.n_ctx() >= llama.n_tokens - llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False)) - assert llama.n_ctx() >= llama.n_tokens + self._last_image_embed = embed + self._last_image_hash = hash(image_bytes) + return embed + # Evaluate prompt + llama.reset() + for i, (type_, value) in enumerate(split_text): + if type_ == "text": + tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0) + if llama.n_tokens + len(tokens) > llama.n_ctx(): + raise ValueError("Prompt exceeds n_ctx") # TODO: Fix + llama.eval(tokens) + else: + image_bytes = self.load_image(value) + embed = embed_image_bytes(image_bytes) + if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): + raise ValueError("Prompt exceeds n_ctx") # TODO: Fix + n_past = ctypes.c_int(llama.n_tokens) + n_past_p = 
ctypes.pointer(n_past) + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.llava_eval_image_embed( + llama.ctx, + embed, + llama.n_batch, + n_past_p, + ) + llama.n_tokens = n_past.value + + # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format(response_format) - return _convert_completion_to_chat( - llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, - ), + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None: + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, stream=stream, + stop=stop, + seed=seed, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logit_bias=logit_bias, ) + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + return _convert_completion_to_chat(completion_or_chunks, stream=stream) + + @staticmethod + def _load_image(image_url: str) -> bytes: + # TODO: Add Pillow support for other image formats beyond (jpg, png) + if image_url.startswith("data:"): + import base64 + + image_bytes = base64.b64decode(image_url.split(",")[1]) + return image_bytes + else: + import urllib.request + + with urllib.request.urlopen(image_url) as f: + image_bytes = f.read() + return image_bytes + + @staticmethod + def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): + image_urls: List[str] = [] + for message in messages: + if message["role"] == "user": + if message["content"] is None: + 
continue + for content in message["content"]: + if isinstance(content, dict) and "type" in content: + if content["type"] == "image_url": + if ( + isinstance(content["image_url"], dict) + and "url" in content["image_url"] + ): + image_urls.append(content["image_url"]["url"]) + else: + image_urls.append(content["image_url"]) + return image_urls + + @staticmethod + def split_text_on_image_urls(text: str, image_urls: List[str]): + def find_first(s: str, substrs: List[str]): + for i, substr in enumerate(substrs): + pos = s.find(substr) + if pos != -1: + return pos, i + return None, None + + split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + remaining = text + while remaining: + # Find first image_url + pos, i = find_first(remaining, image_urls) + if pos is not None and i is not None: + if pos > 0: + split_text.append(("text", remaining[:pos])) + split_text.append(("image_url", image_urls[i])) + remaining = remaining[pos + len(image_urls[i]) :] + else: + split_text.append(("text", remaining)) + remaining = "" + return split_text + + @classmethod + def from_pretrained( + cls, + repo_id: str, + filename: Optional[str], + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + cache_dir: Optional[Union[str, os.PathLike[str]]] = None, + **kwargs: Any, + ) -> "Llava15ChatHandler": + import fnmatch + from pathlib import Path + try: + from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore + from huggingface_hub.utils import validate_repo_id # type: ignore + except ImportError: + raise ImportError( + "Llama.from_pretrained requires the huggingface-hub package. " + "You can install it with `pip install huggingface-hub`." + ) + + validate_repo_id(repo_id) + + hffs = HfFileSystem() + + files = [ + file["name"] if isinstance(file, dict) else file + for file in hffs.ls(repo_id) # type: ignore + ] + + # split each file into repo_id, subfolder, filename + file_list: List[str] = [] + for file in files: + rel_path = Path(file).relative_to(repo_id) + file_list.append(str(rel_path)) + + matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore + + if len(matching_files) == 0: + raise ValueError( + f"No file found in {repo_id} that match {filename}\n\n" + f"Available Files:\n{json.dumps(file_list)}" + ) + + if len(matching_files) > 1: + raise ValueError( + f"Multiple files found in {repo_id} matching {filename}\n\n" + f"Available Files:\n{json.dumps(files)}" + ) + + (matching_file,) = matching_files + + subfolder = str(Path(matching_file).parent) + filename = Path(matching_file).name + + # download the file + hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=cast(Union[str, Path, None], local_dir), + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + ) + + if local_dir is None: + model_path = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + local_files_only=True, + ) + else: + model_path = os.path.join(local_dir, filename) + + return cls( + clip_model_path=model_path, + **kwargs, + ) + +class ObsidianChatHandler(Llava15ChatHandler): + # Prompt Format + # The model followed ChatML format. 
However, with ### as the seperator + + # <|im_start|>user + # What is this sign about?\n + # ### + # <|im_start|>assistant + # The sign is about bullying, and it is placed on a black background with a red background. + # ### + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System message + "{% if message.role == 'system' %}" + "<|im_start|>system\n" + "{{ message.content }}\n" + "###\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "<|im_start|>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "###\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + "<|im_start|>assistant\n" + "{{ message.content }}" + "###\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + +class MoondreamChatHandler(Llava15ChatHandler): + # Chat Format: + # f"\n\n{chat_history}Question: {question}\n\nAnswer:" + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'user' %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + + # + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}\n\n" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}\n\n" + "{% endif %}" + "{% endif %}" + + # Question: + "{% if content.type == 'text' %}" + "Question: {{ content.text }}\n\n" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + # Question: + "{% if message.content is string %}" + "Question: {{ message.content }}\n\n" + "{% endif %}" + + "{% endif %}" + + # Answer: + "{% if message.role == 'assistant' %}" + "Answer:{{ message.content }}\n\n" + "{% endif %}" + "{% endfor %}" + + # Generation prompt + "{% if add_generation_prompt %}" + "Answer:" + "{% endif %}" + ) + +class Llava16ChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. " + + # Example prompt + # "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? 
ASSISTANT:" + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.role == 'user' %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + + # + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}\n" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}\n" + "{% endif %}" + "{% endif %}" + + # Question: + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + + "{% endif %}" + + # Answer: + "{% if message.role == 'assistant' %}" + "{{ message.content }}" + "{% endif %}" + "{% endfor %}" + + # Generation prompt + "{% if add_generation_prompt %}" + "Answer:" + "{% endif %}" + ) + +class NanoLlavaChatHandler(Llava15ChatHandler): + # Prompt Format + # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: + + # <|im_start|>system + # Answer the question<|im_end|><|im_start|>user + # + # What is the picture about?<|im_end|><|im_start|>assistant + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System message + "{% if message.role == 'system' %}" + "<|im_start|>system\n" + "{{ message.content }}" + "<|im_end|>" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "<|im_start|>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<|im_end|>" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + "<|im_start|>assistant\n" + "{{ message.content }}" + "<|im_end|>" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) @register_chat_completion_handler("chatml-function-calling") diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 543c87d..3ded1f2 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import sys import os import ctypes @@ -14,10 +16,22 @@ from ctypes import ( Structure, ) import pathlib -from typing import List, Union, NewType, Optional, TypeVar, Callable, Any +from typing import ( + List, + Union, + NewType, + Optional, + TypeVar, + Callable, + Any, + TYPE_CHECKING, + Generic, +) +from typing_extensions import TypeAlias import llama_cpp.llama_cpp as llama_cpp + # Load the library def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names @@ -62,7 +76,7 @@ def _load_shared_library(lib_base_name: str): for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path), **cdll_args) # type: ignore + return ctypes.CDLL(str(_lib_path), **cdll_args) # type: ignore except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") @@ -79,8 +93,27 @@ _libllava = _load_shared_library(_libllava_base_name) # ctypes helper +if TYPE_CHECKING: + CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore + + CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore + + 
CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore + + CtypesVoidPointer: TypeAlias = ctypes.c_void_p + + class CtypesRef(Generic[CtypesCData]): + pass + + CtypesPointerOrRef: TypeAlias = Union[ + CtypesPointer[CtypesCData], CtypesRef[CtypesCData] + ] + + CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore + F = TypeVar("F", bound=Callable[..., Any]) + def ctypes_function_for_shared_library(lib: ctypes.CDLL): def ctypes_function( name: str, argtypes: List[Any], restype: Any, enabled: bool = True @@ -111,6 +144,7 @@ ctypes_function = ctypes_function_for_shared_library(_libllava) clip_ctx_p = NewType("clip_ctx_p", int) clip_ctx_p_ctypes = c_void_p + # struct llava_image_embed { # float * embed; # int n_image_pos; @@ -121,36 +155,72 @@ class llava_image_embed(Structure): ("n_image_pos", c_int), ] + # /** sanity check for clip <-> llava embed size match */ # LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); -@ctypes_function("llava_validate_embed_size", [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], c_bool) -def llava_validate_embed_size(ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /) -> bool: - ... +@ctypes_function( + "llava_validate_embed_size", + [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], + c_bool, +) +def llava_validate_embed_size( + ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / +) -> bool: ... # /** build an image embed from image file bytes */ # LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); -@ctypes_function("llava_image_embed_make_with_bytes", [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], POINTER(llava_image_embed)) -def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int], /) -> "_Pointer[llava_image_embed]": - ... +@ctypes_function( + "llava_image_embed_make_with_bytes", + [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], + POINTER(llava_image_embed), +) +def llava_image_embed_make_with_bytes( + ctx_clip: clip_ctx_p, + n_threads: Union[c_int, int], + image_bytes: CtypesArray[c_uint8], + image_bytes_length: Union[c_int, int], + /, +) -> "_Pointer[llava_image_embed]": ... + # /** build an image embed from a path to an image filename */ # LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -@ctypes_function("llava_image_embed_make_with_filename", [clip_ctx_p_ctypes, c_int, c_char_p], POINTER(llava_image_embed)) -def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /) -> "_Pointer[llava_image_embed]": - ... +@ctypes_function( + "llava_image_embed_make_with_filename", + [clip_ctx_p_ctypes, c_int, c_char_p], + POINTER(llava_image_embed), +) +def llava_image_embed_make_with_filename( + ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / +) -> "_Pointer[llava_image_embed]": ... + # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); # /** free an embedding made with llava_image_embed_make_* */ @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) -def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): - ... +def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ... 
+ # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ # LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -@ctypes_function("llava_eval_image_embed", [llama_cpp.llama_context_p_ctypes, POINTER(llava_image_embed), c_int, POINTER(c_int)], c_bool) -def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]", /) -> bool: - ... +@ctypes_function( + "llava_eval_image_embed", + [ + llama_cpp.llama_context_p_ctypes, + POINTER(llava_image_embed), + c_int, + POINTER(c_int), + ], + c_bool, +) +def llava_eval_image_embed( + ctx_llama: llama_cpp.llama_context_p, + embed: "_Pointer[llava_image_embed]", + n_batch: Union[c_int, int], + n_past: "_Pointer[c_int]", + /, +) -> bool: ... ################################################ @@ -161,11 +231,12 @@ def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointe # /** load mmproj model */ # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) -def clip_model_load(fname: bytes, verbosity: Union[c_int, int], /) -> Optional[clip_ctx_p]: - ... +def clip_model_load( + fname: bytes, verbosity: Union[c_int, int], / +) -> Optional[clip_ctx_p]: ... + # /** free mmproj model */ # CLIP_API void clip_free(struct clip_ctx * ctx); @ctypes_function("clip_free", [clip_ctx_p_ctypes], None) -def clip_free(ctx: clip_ctx_p, /): - ... +def clip_free(ctx: clip_ctx_p, /): ... 
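For illustration, a minimal sketch of how these rebound helpers compose — the same sequence the chat handler wraps — assuming hypothetical local paths for the text model, mmproj, and image:

```python
import ctypes
from llama_cpp import Llama
import llama_cpp.llava_cpp as llava_cpp

# Hypothetical paths, for illustration only.
llm = Llama(model_path="./llava-text-model.gguf", n_ctx=2048)
clip_ctx = llava_cpp.clip_model_load(b"./mmproj-model.gguf", 0)

with open("./image.jpg", "rb") as f:
    image_bytes = f.read()

embed = llava_cpp.llava_image_embed_make_with_bytes(
    clip_ctx,
    llm.context_params.n_threads_batch,
    (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
    len(image_bytes),
)

# Write the image embedding into the llama context, advancing n_past.
n_past = ctypes.c_int(llm.n_tokens)
llava_cpp.llava_eval_image_embed(llm.ctx, embed, llm.n_batch, ctypes.pointer(n_past))
llm.n_tokens = n_past.value

# Free the embedding and the clip context when done.
llava_cpp.llava_image_embed_free(embed)
llava_cpp.clip_free(clip_ctx)
```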
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index c24fca6..1ad592d 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -72,9 +72,74 @@ class LlamaProxy: chat_handler = None if settings.chat_format == "llava-1-5": assert settings.clip_model_path is not None, "clip model not found" - chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose - ) + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Llava15ChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "obsidian": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.ObsidianChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "llava-1-6": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Llava16ChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "moondream": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.MoondreamChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "nanollava": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.NanoLlavaChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None From 26c7876ba0b31bd0e9f7bfc8f341b8b5b3d81fa7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 01:48:40 -0400 Subject: [PATCH 28/43] chore: Bump version --- CHANGELOG.md | 11 +++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 806876c..26df77f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.66] + +- feat: Update llama.cpp to 
ggerganov/llama.cpp@8843a98c2ba97a25e93319a104f9ddfaf83ce4c4 +- feat: Generic Chat Formats, Tool Calling, and Huggingface Pull Support for Multimodal Models (Obsidian, LLaVA1.6, Moondream) by @abetlen in #1147 +- ci(fix): Workflow actions updates and fix arm64 wheels not included in release by @Smartappli in #1392 +- ci: Add support for pre-built cuda 12.4.1 wheels by @Smartappli in #1388 +- feat: Add support for str type kv_overrides by @abetlen in a411612b385cef100d76145da1fbd02a7b7cc894 +- fix: Functionary bug fixes by @jeffrey-fong in #1385 +- examples: fix quantize example by @iyubondyrev in #1387 +- ci: Update dependabot.yml by @Smartappli in #1391 + ## [0.2.65] - feat: Update llama.cpp to ggerganov/llama.cpp@46e12c4692a37bdd31a0432fc5153d7d22bc7f72 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 6db6333..49657c9 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.65" \ No newline at end of file +__version__ = "0.2.66" \ No newline at end of file From d03f15bb73a1d520970357b702a9e7d4cc2a7a62 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 02:58:55 -0400 Subject: [PATCH 29/43] fix(ci): Fix bug in use of upload-artifact failing to merge multiple artifacts into a single release. --- .github/workflows/build-and-release.yaml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 78bcb87..bb3c9f8 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -39,9 +39,10 @@ jobs: - uses: actions/upload-artifact@v4 with: + name: wheels path: ./wheelhouse/*.whl - build_arm64_wheels: + build_wheels_arm64: name: Build arm64 wheels runs-on: ubuntu-latest steps: @@ -67,6 +68,7 @@ jobs: - name: Upload wheels as artifacts uses: actions/upload-artifact@v4 with: + name: wheels_arm64 path: ./wheelhouse/*.whl build_sdist: @@ -89,18 +91,25 @@ jobs: python -m build --sdist - uses: actions/upload-artifact@v4 with: + name: sdist path: ./dist/*.tar.gz release: name: Release - needs: [build_wheels, build_arm64_wheels, build_sdist] + needs: [build_wheels, build_wheels_arm64, build_sdist] runs-on: ubuntu-latest steps: + - name: Merge Artifacts + uses: actions/upload-artifact/merge@v4 + with: + name: release + - uses: actions/download-artifact@v3 with: - name: artifact + name: release path: dist + - uses: softprops/action-gh-release@v2 with: files: dist/* From 3489ef09d3775f4a87fb7114f619e8ba9cb6b656 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 03:08:46 -0400 Subject: [PATCH 30/43] fix: Ensure image renders before text in chat formats regardless of message content order. 
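The reordering can be pictured with a user message whose text part comes before its image part — after this change the rendered prompt still puts the image first (a sketch, with a placeholder URL):

```python
# User content lists the text before the image...
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            # Placeholder URL for illustration.
            {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
        ],
    }
]
# ...but the reordered templates emit image_url parts before text parts, so the
# llava-1-5 prompt renders roughly as:
#   "\nUSER: https://example.com/image.pngWhat's in this image?\nASSISTANT: "
# and the URL is then swapped for the image embedding when the prompt is evaluated.
```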
--- llama_cpp/llama_chat_format.py | 54 ++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cbd8d20..63eaf8a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2175,12 +2175,11 @@ class Llava15ChatHandler: "{% if message.role == 'user' %}" "{% if message.content is string %}" "\nUSER: {{ message.content }}" - "{% elif message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" "{% endif %}" + "{% if message.content is iterable %}" + "\nUSER: " + + "{% for content in message.content %}" "{% if content.type == 'image_url' and content.image_url is string %}" "{{ content.image_url }}" "{% endif %}" @@ -2188,6 +2187,13 @@ class Llava15ChatHandler: "{{ content.image_url.url }}" "{% endif %}" "{% endfor %}" + + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" "{% endif %}" "{% if message.role == 'assistant' and message.content is not none %}" @@ -2575,14 +2581,22 @@ class ObsidianChatHandler(Llava15ChatHandler): "{{ message.content }}" "{% endif %}" "{% if message.content is iterable %}" + + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" "{% if content.type == 'text' %}" "{{ content.text }}" "{% endif %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url }}" - "{% endif %}" "{% endfor %}" + "{% endif %}" "###\n" "{% endif %}" @@ -2606,9 +2620,9 @@ class MoondreamChatHandler(Llava15ChatHandler): "{% for message in messages %}" "{% if message.role == 'user' %}" "{% if message.content is iterable %}" - "{% for content in message.content %}" # + "{% for content in message.content %}" "{% if content.type == 'image_url' %}" "{% if content.image_url is string %}" "{{ content.image_url }}\n\n" @@ -2617,12 +2631,15 @@ class MoondreamChatHandler(Llava15ChatHandler): "{{ content.image_url.url }}\n\n" "{% endif %}" "{% endif %}" + "{% endfor %}" # Question: + "{% for content in message.content %}" "{% if content.type == 'text' %}" "Question: {{ content.text }}\n\n" "{% endif %}" "{% endfor %}" + "{% endif %}" # Question: @@ -2657,9 +2674,9 @@ class Llava16ChatHandler(Llava15ChatHandler): "{% endif %}" "{% if message.role == 'user' %}" "{% if message.content is iterable %}" - "{% for content in message.content %}" # + "{% for content in message.content %}" "{% if content.type == 'image_url' %}" "{% if content.image_url is string %}" "{{ content.image_url }}\n" @@ -2668,12 +2685,15 @@ class Llava16ChatHandler(Llava15ChatHandler): "{{ content.image_url.url }}\n" "{% endif %}" "{% endif %}" + "{% endfor %}" # Question: + "{% for content in message.content %}" "{% if content.type == 'text' %}" "{{ content.text }}" "{% endif %}" "{% endfor %}" + "{% endif %}" # Question: @@ -2719,14 +2739,22 @@ class NanoLlavaChatHandler(Llava15ChatHandler): "{{ message.content }}" "{% endif %}" "{% if message.content is iterable %}" + + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if 
content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" "{% if content.type == 'text' %}" "{{ content.text }}" "{% endif %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url }}" - "{% endif %}" "{% endfor %}" + "{% endif %}" "<|im_end|>" "{% endif %}" From f417cce28a3f3386206dabe98e87c038d05b228d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 03:11:02 -0400 Subject: [PATCH 31/43] chore: Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 26df77f..4b2c34b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.67] + +- fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656 +- fix(ci): Fix bug in use of upload-artifact failing to merge multiple artifacts into a single release by @abetlen in d03f15bb73a1d520970357b702a9e7d4cc2a7a62 + ## [0.2.66] - feat: Update llama.cpp to ggerganov/llama.cpp@8843a98c2ba97a25e93319a104f9ddfaf83ce4c4 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 49657c9..286baf4 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.66" \ No newline at end of file +__version__ = "0.2.67" \ No newline at end of file From c8cd8c17c6f178285579ce1f06347edb7324088c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 03:12:46 -0400 Subject: [PATCH 32/43] docs: Update README to include CUDA 12.4 wheels --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d34caf4..310873f 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. 
As long as your system meets some requirements: -- CUDA Version is 12.1, 12.2 or 12.3 +- CUDA Version is 12.1, 12.2, 12.3, or 12.4 - Python Version is 3.10, 3.11 or 3.12 ```bash @@ -133,6 +133,7 @@ Where `` is one of the following: - `cu121`: CUDA 12.1 - `cu122`: CUDA 12.2 - `cu123`: CUDA 12.3 +- `cu124`: CUDA 12.4 For example, to install the CUDA 12.1 wheel: From 6332527a698f5aa2f524773f42a0510ba5f650a0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Tue, 30 Apr 2024 15:16:14 +0200 Subject: [PATCH 33/43] fix(ci): Fix build-and-release.yaml (#1413) * Update build-and-release.yaml * Update build-and-release.yaml --- .github/workflows/build-and-release.yaml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index bb3c9f8..8fbd68f 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -39,7 +39,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-${{ matrix.os }} path: ./wheelhouse/*.whl build_wheels_arm64: @@ -100,14 +100,9 @@ jobs: runs-on: ubuntu-latest steps: - - name: Merge Artifacts - uses: actions/upload-artifact/merge@v4 + - uses: actions/download-artifact@v4 with: - name: release - - - uses: actions/download-artifact@v3 - with: - name: release + merge-multiple: true path: dist - uses: softprops/action-gh-release@v2 From 8c2b24d5aafffffadf37c3067ca12a45b59d95c4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 09:27:55 -0400 Subject: [PATCH 34/43] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 10 +++++++--- vendor/llama.cpp | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9c8f778..46aa516 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -242,8 +242,8 @@ LLAMA_FILE_MAGIC_GGSQ = 0x67677371 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -# define LLAMA_SESSION_VERSION 5 -LLAMA_SESSION_VERSION = 5 +# define LLAMA_SESSION_VERSION 6 +LLAMA_SESSION_VERSION = 6 # define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ @@ -730,6 +730,7 @@ class llama_model_params(ctypes.Structure): # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU +# bool flash_attn; // whether to use flash attention # // Abort callback @@ -766,6 +767,7 @@ class llama_context_params(ctypes.Structure): logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU + flash_attn (bool): whether to use flash attention abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback """ @@ -795,6 +797,7 @@ class llama_context_params(ctypes.Structure): logits_all: bool embeddings: bool offload_kqv: bool + flash_attn: bool abort_callback: Callable[[ctypes.c_void_p], bool] abort_callback_data: ctypes.c_void_p @@ -823,6 +826,7 @@ class 
llama_context_params(ctypes.Structure): ("logits_all", ctypes.c_bool), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), + ("flash_attn", ctypes.c_bool), ("abort_callback", ggml_abort_callback), ("abort_callback_data", ctypes.c_void_p), ] @@ -1615,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: ... -# // Clear the KV cache +# // Clear the KV cache - both cell info is erased and KV data is zeroed # LLAMA_API void llama_kv_cache_clear( # struct llama_context * ctx); @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8843a98..77e15be 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8843a98c2ba97a25e93319a104f9ddfaf83ce4c4 +Subproject commit 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 From 22d77eefd2edaf0148f53374d0cac74d0e25d06e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 09:29:16 -0400 Subject: [PATCH 35/43] feat: Add option to enable `flash_attn` to Lllama params and ModelSettings --- llama_cpp/llama.py | 4 ++++ llama_cpp/server/settings.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 96aac66..172f4c6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -92,6 +92,7 @@ class Llama: logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, + flash_attn: bool = False, # Sampling Params last_n_tokens_size: int = 64, # LoRA Params @@ -168,6 +169,7 @@ class Llama: logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. + flash_attn: Use flash attention. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. @@ -310,6 +312,7 @@ class Llama: ) # Must be set to True for speculative decoding self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv + self.context_params.flash_attn = flash_attn # KV cache quantization if type_k is not None: self.context_params.type_k = type_k @@ -1774,6 +1777,7 @@ class Llama: logits_all=self.context_params.logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, + flash_offload=self.context_params.flash_offload, # Sampling Params last_n_tokens_size=self.last_n_tokens_size, # LoRA Params diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 0c858f9..ed05a88 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -96,6 +96,9 @@ class ModelSettings(BaseSettings): offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) + flash_attn: bool = Field( + default=False, description="Whether to use flash attention." 
+ ) # Sampling Params last_n_tokens_size: int = Field( default=64, From 29b6e9a5c832b7d148044b45fb3cd6f3f923d96b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 09:32:47 -0400 Subject: [PATCH 36/43] fix: wrong parameter for flash attention in pickle __getstate__ --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 172f4c6..f927f0c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1777,7 +1777,7 @@ class Llama: logits_all=self.context_params.logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_offload=self.context_params.flash_offload, + flash_attn=self.context_params.flash_attn, # Sampling Params last_n_tokens_size=self.last_n_tokens_size, # LoRA Params From b14dd98922c7f18468ae202eadbaf58fe17f5320 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 09:39:56 -0400 Subject: [PATCH 37/43] chore: Bump version --- CHANGELOG.md | 6 ++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b2c34b..c9681f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.68] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ +- feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e +- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413 + ## [0.2.67] - fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 286baf4..63c6225 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.67" \ No newline at end of file +__version__ = "0.2.68" \ No newline at end of file From 26478ab293bde6374e2c350fed7b07c3027939b6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 10:11:38 -0400 Subject: [PATCH 38/43] docs: Update README.md --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 310873f..fce8eda 100644 --- a/README.md +++ b/README.md @@ -529,19 +529,22 @@ Then you'll need to use a custom chat handler to load the clip model and process You can also pull the model from the Hugging Face Hub using the `from_pretrained` method. 
 ```python
->>> from llama_cpp import Llama
->>> from llama_cpp.llama_chat_format import MoondreamChatHandler
->>> chat_handler = MoondreamChatHandler.from_pretrained(
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import MoondreamChatHandler
+
+chat_handler = MoondreamChatHandler.from_pretrained(
   repo_id="vikhyatk/moondream2",
   filename="*mmproj*",
 )
->>> llm = Llama.from_pretrained(
-  repo_id="vikhyatk/moondream2"
+
+llm = Llama.from_pretrained(
+  repo_id="vikhyatk/moondream2",
   filename="*text-model*",
   chat_handler=chat_handler,
   n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
->>> llm.create_chat_completion(
+
+response = llm.create_chat_completion(
     messages = [
         {
             "role": "user",
@@ -553,6 +556,7 @@ You can also pull the model from the Hugging Face Hub using the `from_pretrained
         }
     ]
 )
+print(response["choices"][0]["text"])
 ```
 **Note**: Multi-modal models also support tool calling and JSON mode.

From 945c62c567a862e1c631a86bf88b86ebacefd40d Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 30 Apr 2024 10:15:04 -0400
Subject: [PATCH 39/43] docs: Change all examples from interpreter style to script style.

---
 README.md | 74 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index fce8eda..6406714 100644
--- a/README.md
+++ b/README.md
@@ -277,20 +277,26 @@ The high-level API provides a simple managed interface through the [`Llama`](htt
 Below is a short example demonstrating how to use the high-level API for basic text completion:
 ```python
->>> from llama_cpp import Llama
->>> llm = Llama(
+from llama_cpp import Llama
+
+llm = Llama(
       model_path="./models/7B/llama-model.gguf",
       # n_gpu_layers=-1, # Uncomment to use GPU acceleration
       # seed=1337, # Uncomment to set a specific seed
       # n_ctx=2048, # Uncomment to increase the context window
 )
->>> output = llm(
+output = llm(
       "Q: Name the planets in the solar system? A: ", # Prompt
       max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
       stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
       echo=True # Echo the prompt back in the output
 ) # Generate a completion, can also call create_completion
->>> print(output)
+print(output)
+```
+
+By default `llama-cpp-python` generates completions in an OpenAI compatible format:
+
+```python
 {
   "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
   "object": "text_completion",
@@ -345,12 +351,12 @@ The model will format the messages into a single prompt using the following
 Set `verbose=True` to see the selected chat format.
 ```python
->>> from llama_cpp import Llama
->>> llm = Llama(
+from llama_cpp import Llama
+llm = Llama(
       model_path="path/to/llama-2/llama-model.gguf",
       chat_format="llama-2"
 )
->>> llm.create_chat_completion(
+llm.create_chat_completion(
       messages = [
           {"role": "system", "content": "You are an assistant who perfectly describes images."},
           {
@@ -375,9 +381,9 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the
 The following example will constrain the response to valid JSON strings only.
```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") ->>> llm.create_chat_completion( +from llama_cpp import Llama +llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") +llm.create_chat_completion( messages=[ { "role": "system", @@ -397,9 +403,9 @@ The following example will constrain the response to valid JSON strings only. To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument. ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") ->>> llm.create_chat_completion( +from llama_cpp import Llama +llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") +llm.create_chat_completion( messages=[ { "role": "system", @@ -424,9 +430,9 @@ To constrain the response further to a specific JSON Schema add the schema to th The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling") ->>> llm.create_chat_completion( +from llama_cpp import Llama +llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling") +llm.create_chat_completion( messages = [ { "role": "system", @@ -476,9 +482,9 @@ The various gguf-converted files for this set of models can be found [here](http Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. ```python ->>> from llama_cpp import Llama ->>> from llama_cpp.llama_tokenizer import LlamaHFTokenizer ->>> llm = Llama.from_pretrained( +from llama_cpp import Llama +from llama_cpp.llama_tokenizer import LlamaHFTokenizer +llm = Llama.from_pretrained( repo_id="meetkai/functionary-small-v2.2-GGUF", filename="functionary-small-v2.2.q4_0.gguf", chat_format="functionary-v2", @@ -504,15 +510,15 @@ You'll first need to download one of the available multi-modal models in GGUF fo Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. 
 ```python
->>> from llama_cpp import Llama
->>> from llama_cpp.llama_chat_format import Llava15ChatHandler
->>> chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
->>> llm = Llama(
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import Llava15ChatHandler
+chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
+llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
   n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
->>> llm.create_chat_completion(
+llm.create_chat_completion(
     messages = [
         {"role": "system", "content": "You are an assistant who perfectly describes images."},
         {
@@ -709,18 +715,18 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github
 Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
 ```python
->>> import llama_cpp
->>> import ctypes
->>> llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
->>> params = llama_cpp.llama_context_default_params()
+import llama_cpp
+import ctypes
+llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
+params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
->>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
->>> ctx = llama_cpp.llama_new_context_with_model(model, params)
->>> max_tokens = params.n_ctx
+model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
+ctx = llama_cpp.llama_new_context_with_model(model, params)
+max_tokens = params.n_ctx
 # use ctypes arrays for array params
->>> tokens = (llama_cpp.llama_token * int(max_tokens))()
->>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
->>> llama_cpp.llama_free(ctx)
+tokens = (llama_cpp.llama_token * int(max_tokens))()
+n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
+llama_cpp.llama_free(ctx)
 ```
 Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.

From 3226b3c5ef93998f5c412fb926ff136b50816bd7 Mon Sep 17 00:00:00 2001
From: Jonathan Soma
Date: Tue, 30 Apr 2024 14:33:23 -0400
Subject: [PATCH 40/43] fix: UTF-8 handling with grammars (#1415)

Use Python's built-in UTF-8 handling to get code points
---
 llama_cpp/llama_grammar.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py
index 6c7b57a..d9a3823 100644
--- a/llama_cpp/llama_grammar.py
+++ b/llama_cpp/llama_grammar.py
@@ -556,17 +556,11 @@ def add_rule(
 # }
 def decode_utf8(src: const_char_p) -> Tuple[int, const_char_p]:
     """Decodes a UTF-8 character from the source string."""
-    lookup = (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4)
-    first_byte = ord(src[0]) # type: int
-    highbits = first_byte >> 4 # type: int
-    len = lookup[highbits] # type: int
-    mask = (1 << (8 - len)) - 1 # type: int
-    value = first_byte & mask # type: int
-    end = src + len # type: const_char_p # may overrun!
- pos = src + 1 # type: const_char_p - while pos < end and pos[0]: - value = (value << 6) + (ord(pos[0]) & 0x3F) - pos += 1 + # Get the codepoint of the first character + value = ord(src[0]) + # Move the pointer ahead one character + pos = src + 1 + return value, pos From f116175a5a7c84569c88cad231855c1e6e59ff6e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 15:45:34 -0400 Subject: [PATCH 41/43] fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks. Closes #796 Closes #729 --- llama_cpp/_internals.py | 8 +++++--- llama_cpp/_utils.py | 25 +++++++++++-------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index cc3d989..d7409f6 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -15,6 +15,7 @@ import numpy.typing as npt from .llama_types import * from .llama_grammar import LlamaGrammar +from ._utils import suppress_stdout_stderr import llama_cpp.llama_cpp as llama_cpp @@ -47,9 +48,10 @@ class _LlamaModel: if not os.path.exists(path_model): raise ValueError(f"Model path does not exist: {path_model}") - self.model = llama_cpp.llama_load_model_from_file( - self.path_model.encode("utf-8"), self.params - ) + with suppress_stdout_stderr(disable=verbose): + self.model = llama_cpp.llama_load_model_from_file( + self.path_model.encode("utf-8"), self.params + ) if self.model is None: raise ValueError(f"Failed to load model from file: {path_model}") diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 4a10647..781b265 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,13 +1,15 @@ import os import sys -import sys from typing import Any, Dict # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor outnull_file = open(os.devnull, "w") errnull_file = open(os.devnull, "w") +STDOUT_FILENO = 1 +STDERR_FILENO = 2 + class suppress_stdout_stderr(object): # NOTE: these must be "saved" here to avoid exceptions when using # this context manager inside of a __del__ method @@ -22,12 +24,8 @@ class suppress_stdout_stderr(object): if self.disable: return self - # Check if sys.stdout and sys.stderr have fileno method - if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'): - return self # Return the instance without making changes - - self.old_stdout_fileno_undup = self.sys.stdout.fileno() - self.old_stderr_fileno_undup = self.sys.stderr.fileno() + self.old_stdout_fileno_undup = STDOUT_FILENO + self.old_stderr_fileno_undup = STDERR_FILENO self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup) self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup) @@ -47,15 +45,14 @@ class suppress_stdout_stderr(object): return # Check if sys.stdout and sys.stderr have fileno method - if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'): - self.sys.stdout = self.old_stdout - self.sys.stderr = self.old_stderr + self.sys.stdout = self.old_stdout + self.sys.stderr = self.old_stderr - self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) - self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) + self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) + self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) - self.os.close(self.old_stdout_fileno) - self.os.close(self.old_stderr_fileno) + self.os.close(self.old_stdout_fileno) + self.os.close(self.old_stderr_fileno) class MetaSingleton(type): From 
946156fb6c09389cc184cb5209f96aac54c08b85 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 15:46:45 -0400 Subject: [PATCH 42/43] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 77e15be..f364eb6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 +Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961 From 4f01c452b6c738dc56eacac3758119b12c57ea94 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Apr 2024 15:50:30 -0400 Subject: [PATCH 43/43] fix: Change default verbose value of verbose in image chat format handlers to True to match Llama --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 63eaf8a..0af410a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2205,7 +2205,7 @@ class Llava15ChatHandler: "{% endif %}" ) - def __init__(self, clip_model_path: str, verbose: bool = False): + def __init__(self, clip_model_path: str, verbose: bool = True): import llama_cpp.llava_cpp as llava_cpp self.clip_model_path = clip_model_path