Update llama.cpp

2024-03-06 01:32:00 -05:00 · 2024-03-06 01:32:00 -05:00 · 93dc56ace8
parent 87a6e5797e
commit 93dc56ace8
3 changed files with 30 additions and 12 deletions
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -293,7 +293,7 @@ class Llama:
        self.context_params.logits_all = (
            logits_all if draft_model is None else True
        )  # Must be set to True for speculative decoding
-        self.context_params.embedding = embedding
+        self.context_params.embeddings = embedding # TODO: Rename to embeddings
        self.context_params.offload_kqv = offload_kqv

        # Sampling Params
@ -787,7 +787,7 @@ class Llama:
        n_embd = self.n_embd()
        n_batch = self.n_batch

-        if self.context_params.embedding == False:
+        if self.context_params.embeddings == False:
            raise RuntimeError(
                "Llama model must be created with embedding=True to call this method"
            )
@ -1725,7 +1725,7 @@ class Llama:
            yarn_beta_slow=self.context_params.yarn_beta_slow,
            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
            logits_all=self.context_params.logits_all,
-            embedding=self.context_params.embedding,
+            embedding=self.context_params.embeddings,
            # Sampling Params
            last_n_tokens_size=self.last_n_tokens_size,
            # LoRA Params
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -399,7 +399,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(
 # // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
 # // - pos    : the positions of the respective token in the sequence
 # // - seq_id : the sequence to which the respective token belongs
-# // - logits : if zero, the logits for the respective token will not be output
+# // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
 # //
 # typedef struct llama_batch {
 #     int32_t n_tokens;
@ -409,7 +409,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(
 #     llama_pos    *  pos;
 #     int32_t      *  n_seq_id;
 #     llama_seq_id ** seq_id;
-#     int8_t       *  logits;
+#     int8_t       *  logits; // TODO: rename this to "output"


 #     // NOTE: helpers for smooth API transition - can be deprecated in the future
@ -572,7 +572,7 @@ class llama_model_params(ctypes.Structure):

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-#     bool embedding;   // embedding mode only
+#     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

 #     // Abort callback
@ -605,7 +605,7 @@ class llama_context_params(ctypes.Structure):
        type_k (int): data type for K cache
        type_v (int): data type for V cache
        logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        embedding (bool): embedding mode only
+        embeddings (bool): if true, extract embeddings (together with logits)
        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
@ -632,7 +632,7 @@ class llama_context_params(ctypes.Structure):
        ("type_k", ctypes.c_int),
        ("type_v", ctypes.c_int),
        ("logits_all", ctypes.c_bool),
-        ("embedding", ctypes.c_bool),
+        ("embeddings", ctypes.c_bool),
        ("offload_kqv", ctypes.c_bool),
        ("abort_callback", ggml_abort_callback),
        ("abort_callback_data", ctypes.c_void_p),
@ -1774,8 +1774,8 @@ def llama_get_logits_ith(
    ...


-# Get the embeddings for the input
-# shape: [n_embd] (1-dimensional)
+# // Get all output token embeddings
+# // shape: [n_tokens*n_embd] (1-dimensional)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function(
    "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@ -1786,8 +1786,9 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
    ...


-# // Get the embeddings for the ith sequence
+# // Get the embeddings for the ith token
 # // llama_get_embeddings(ctx) + i*n_embd
+# // shape: [n_embd] (1-dimensional)
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_embeddings_ith",
@ -1802,6 +1803,23 @@ def llama_get_embeddings_ith(
    ...


+# // Get the embeddings for a sequence id
+# // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+# // shape: [n_embd] (1-dimensional)
+# LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+@ctypes_function(
+    "llama_get_embeddings_seq",
+    [llama_context_p_ctypes, llama_seq_id],
+    ctypes.POINTER(ctypes.c_float),
+)
+def llama_get_embeddings_seq(
+    ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
+) -> CtypesArray[ctypes.c_float]:
+    """Get the embeddings for a sequence id
+    Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    shape: [n_embd] (1-dimensional)"""
+    ...
+
 # //
 # // Vocab
 # //
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72
+Subproject commit 8ced9f7e3225adb8501e9821ed1bbd92e3a5c7ae