Update llama.cpp

Andrei Betlen 2024-03-06 01:32:00 -05:00
parent 87a6e5797e
commit 93dc56ace8
3 changed files with 30 additions and 12 deletions

llama_cpp/llama.py

@@ -293,7 +293,7 @@ class Llama:
         self.context_params.logits_all = (
             logits_all if draft_model is None else True
         )  # Must be set to True for speculative decoding
-        self.context_params.embedding = embedding
+        self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         # Sampling Params
@@ -787,7 +787,7 @@ class Llama:
         n_embd = self.n_embd()
         n_batch = self.n_batch
-        if self.context_params.embedding == False:
+        if self.context_params.embeddings == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
@@ -1725,7 +1725,7 @@ class Llama:
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             logits_all=self.context_params.logits_all,
-            embedding=self.context_params.embedding,
+            embedding=self.context_params.embeddings,
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
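The rename is internal: the high-level constructor still takes `embedding=True` and now maps it onto `context_params.embeddings`. A minimal usage sketch (the model path is hypothetical):

from llama_cpp import Llama

# `embedding=True` is required for create_embedding(); otherwise the
# renamed check above raises RuntimeError.
llm = Llama(model_path="./models/example.gguf", embedding=True)

result = llm.create_embedding("Hello, world!")
vector = result["data"][0]["embedding"]  # list of n_embd floats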

llama_cpp/llama_cpp.py

@@ -399,7 +399,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(
 # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
 # // - pos : the positions of the respective token in the sequence
 # // - seq_id : the sequence to which the respective token belongs
-# // - logits : if zero, the logits for the respective token will not be output
+# // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
 # //
 # typedef struct llama_batch {
 #     int32_t n_tokens;
@@ -409,7 +409,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(
 #     llama_pos * pos;
 #     int32_t * n_seq_id;
 #     llama_seq_id ** seq_id;
-#     int8_t * logits;
+#     int8_t * logits; // TODO: rename this to "output"
 # // NOTE: helpers for smooth API transition - can be deprecated in the future
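The clarified comment makes `llama_batch.logits` a per-token output switch: positions whose entry is zero produce neither logits nor embeddings. A rough sketch of the usual pattern with the low-level bindings (token ids are made up):

import llama_cpp

tokens = [1, 15043, 3186]  # hypothetical token ids
batch = llama_cpp.llama_batch_init(len(tokens), 0, 1)
batch.n_tokens = len(tokens)
for i, tok in enumerate(tokens):
    batch.token[i] = tok
    batch.pos[i] = i
    batch.n_seq_id[i] = 1
    batch.seq_id[i][0] = 0
    batch.logits[i] = 0  # zero: no output for this position
batch.logits[len(tokens) - 1] = 1  # request output for the last token only
# llama_decode(ctx, batch) would run here, given a context `ctx`.
llama_cpp.llama_batch_free(batch)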
@@ -572,7 +572,7 @@ class llama_model_params(ctypes.Structure):
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-# bool embedding; // embedding mode only
+# bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 # // Abort callback
@@ -605,7 +605,7 @@ class llama_context_params(ctypes.Structure):
         type_k (int): data type for K cache
         type_v (int): data type for V cache
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        embedding (bool): embedding mode only
+        embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
@@ -632,7 +632,7 @@ class llama_context_params(ctypes.Structure):
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
         ("logits_all", ctypes.c_bool),
-        ("embedding", ctypes.c_bool),
+        ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
@@ -1774,8 +1774,8 @@ def llama_get_logits_ith(
     ...
-# Get the embeddings for the input
-# shape: [n_embd] (1-dimensional)
+# // Get all output token embeddings
+# // shape: [n_tokens*n_embd] (1-dimensional)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -1786,8 +1786,9 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
     ...
-# // Get the embeddings for the ith sequence
+# // Get the embeddings for the ith token
+# // llama_get_embeddings(ctx) + i*n_embd
 # // shape: [n_embd] (1-dimensional)
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 @ctypes_function(
     "llama_get_embeddings_ith",
@@ -1802,6 +1803,23 @@ def llama_get_embeddings_ith(
     ...
+# // Get the embeddings for a sequence id
+# // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+# // shape: [n_embd] (1-dimensional)
+# LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+@ctypes_function(
+    "llama_get_embeddings_seq",
+    [llama_context_p_ctypes, llama_seq_id],
+    ctypes.POINTER(ctypes.c_float),
+)
+def llama_get_embeddings_seq(
+    ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
+) -> CtypesArray[ctypes.c_float]:
+    """Get the embeddings for a sequence id
+    Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    shape: [n_embd] (1-dimensional)"""
+    ...
 # //
 # // Vocab
 # //
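All three getters return bare float pointers, read with plain ctypes indexing. A hedged sketch (assumes a loaded `model` and an already-decoded `ctx` from the low-level API):

import llama_cpp

n_embd = llama_cpp.llama_n_embd(model)

# Embedding for the ith token, i.e. llama_get_embeddings(ctx) + i*n_embd.
ith = llama_cpp.llama_get_embeddings_ith(ctx, 0)
if ith:
    token_embedding = [ith[j] for j in range(n_embd)]

# Pooled embedding for a sequence id; NULL when pooling_type is
# LLAMA_POOLING_TYPE_NONE, so test the pointer before reading.
seq = llama_cpp.llama_get_embeddings_seq(ctx, 0)
if seq:
    seq_embedding = [seq[j] for j in range(n_embd)]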

vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72
+Subproject commit 8ced9f7e3225adb8501e9821ed1bbd92e3a5c7ae