Update llama.cpp

2023-10-03 15:23:35 -04:00 · 2023-10-03 15:23:35 -04:00 · a7d17b8ac9
parent 305482bd41
commit a7d17b8ac9
2 changed files with 60 additions and 3 deletions
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -102,8 +102,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E

 # define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 1
-LLAMA_SESSION_VERSION = 1
+# define LLAMA_SESSION_VERSION 2
+LLAMA_SESSION_VERSION = 2


 # struct llama_model;
@ -624,6 +624,16 @@ _lib.llama_n_embd.argtypes = [llama_model_p]
 _lib.llama_n_embd.restype = c_int


+# // Get the model's RoPE frequency scaling factor
+# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
+def llama_rope_freq_scale_train(model: llama_model_p) -> float:
+    return _lib.llama_rope_freq_scale_train(model)  # direct pass-through to the same-named _lib symbol; no Python-side processing
+
+
+_lib.llama_rope_freq_scale_train.argtypes = [llama_model_p]  # one argument: the model handle
+_lib.llama_rope_freq_scale_train.restype = c_float  # matches the `float` return of the C prototype quoted above the def
+
+
 # // Get a string describing the model type
 # LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 def llama_model_desc(
@ -768,6 +778,8 @@ _lib.llama_get_kv_cache_token_count.restype = c_int


 # // Remove all tokens data of cells in [c0, c1)
+# // c0 < 0 : [0,  c1]
+# // c1 < 0 : [c0, inf)
 # LLAMA_API void llama_kv_cache_tokens_rm(
 #         struct llama_context * ctx,
 #                      int32_t   c0,
@ -783,6 +795,8 @@ _lib.llama_kv_cache_tokens_rm.restype = None


 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+# // p0 < 0 : [0,  p1]
+# // p1 < 0 : [p0, inf)
 # LLAMA_API void llama_kv_cache_seq_rm(
 #         struct llama_context * ctx,
 #                 llama_seq_id   seq_id,
@ -808,6 +822,8 @@ _lib.llama_kv_cache_seq_rm.restype = None

 # // Copy all tokens that belong to the specified sequence to another sequence
 # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+# // p0 < 0 : [0,  p1]
+# // p1 < 0 : [p0, inf)
 # LLAMA_API void llama_kv_cache_seq_cp(
 #         struct llama_context * ctx,
 #                 llama_seq_id   seq_id_src,
@ -851,6 +867,8 @@ _lib.llama_kv_cache_seq_keep.restype = None

 # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
 # // If the KV cache is RoPEd, the KV data is updated accordingly
+# // p0 < 0 : [0,  p1]
+# // p1 < 0 : [p0, inf)
 # LLAMA_API void llama_kv_cache_seq_shift(
 #         struct llama_context * ctx,
 #                 llama_seq_id   seq_id,
@ -1215,6 +1233,43 @@ _lib.llama_token_nl.argtypes = [llama_context_p]
 _lib.llama_token_nl.restype = llama_token


+# // codellama infill tokens
+# LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
+def llama_token_prefix(ctx: llama_context_p) -> int:
+    return _lib.llama_token_prefix(ctx)  # direct pass-through; C header describes the result as "Beginning of infill prefix"
+
+
+_lib.llama_token_prefix.argtypes = [llama_context_p]  # one argument: the context handle
+_lib.llama_token_prefix.restype = llama_token  # declared as llama_token; annotated as int on the Python side
+
+
+# LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
+def llama_token_middle(ctx: llama_context_p) -> int:
+    return _lib.llama_token_middle(ctx)  # direct pass-through; C header describes the result as "Beginning of infill middle"
+
+
+_lib.llama_token_middle.argtypes = [llama_context_p]  # one argument: the context handle
+_lib.llama_token_middle.restype = llama_token  # declared as llama_token; annotated as int on the Python side
+
+
+# LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
+def llama_token_suffix(ctx: llama_context_p) -> int:
+    return _lib.llama_token_suffix(ctx)  # direct pass-through; C header describes the result as "Beginning of infill suffix"
+
+
+_lib.llama_token_suffix.argtypes = [llama_context_p]  # one argument: the context handle
+_lib.llama_token_suffix.restype = llama_token  # declared as llama_token; annotated as int on the Python side
+
+
+# LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
+def llama_token_eot(ctx: llama_context_p) -> int:
+    return _lib.llama_token_eot(ctx)  # direct pass-through; C header describes the result as "End of infill middle"
+
+
+_lib.llama_token_eot.argtypes = [llama_context_p]  # one argument: the context handle
+_lib.llama_token_eot.restype = llama_token  # declared as llama_token; annotated as int on the Python side
+
+
 # //
 # // Tokenization
 # //
@ -1728,6 +1783,7 @@ _lib.llama_grammar_accept_token.restype = None
 # struct llama_beam_view {
 #     const llama_token * tokens;

+
 #     size_t n_tokens;
 #     float  p;        // Cumulative beam probability (renormalized relative to all beams)
 #     bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
@ -1794,6 +1850,7 @@ def llama_beam_search(
        ctx, callback, callback_data, n_beams, n_past, n_predict
    )

+
 _lib.llama_beam_search.argtypes = [
    llama_context_p,
    llama_beam_search_callback_fn_t,
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit f5ef5cfb18148131fcf45bdd2331f0db5ab7c3d0
+Subproject commit 79f34abddb72ac5ddbf118f3d87520b611a10a7d