From ff580031d2d5258f1644506514957ef1ce882963 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 19 Oct 2023 02:55:08 -0400
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama.py     |  4 ++-
 llama_cpp/llama_cpp.py | 66 +++++++++++++++++++++++++++++++++++-------
 vendor/llama.cpp       |  2 +-
 3 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 8951bed..8bb5efb 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -444,7 +444,7 @@ class Llama:
             maxlen=self._n_ctx if self.context_params.logits_all else 1,
         )
 
-    def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
+    def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]:
         """Tokenize a string.
 
         Args:
@@ -466,6 +466,7 @@ class Llama:
             tokens,
             n_ctx,
             add_bos,
+            special
         )
         if n_tokens < 0:
             n_tokens = abs(n_tokens)
@@ -477,6 +478,7 @@ class Llama:
             tokens,
             n_tokens,
             add_bos,
+            special
         )
         if n_tokens < 0:
             raise RuntimeError(
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 42e57a6..83d41ba 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -240,11 +240,11 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 # typedef struct llama_batch {
 #     int32_t n_tokens;
 #
-#     llama_token  * token;
-#     float        * embd;
-#     llama_pos    * pos;
-#     llama_seq_id * seq_id;
-#     int8_t       * logits;
+#     llama_token  *  token;
+#     float        *  embd;
+#     llama_pos    *  pos;
+#     llama_seq_id ** seq_id;
+#     int8_t       *  logits;
 #
 #     // NOTE: helpers for smooth API transition - can be deprecated in the future
@@ -262,7 +262,7 @@ class llama_batch(Structure):
         ("token", POINTER(llama_token)),
         ("embd", c_float_p),
         ("pos", POINTER(llama_pos)),
-        ("seq_id", POINTER(llama_seq_id)),
+        ("seq_id", POINTER(POINTER(llama_seq_id))),
         ("logits", POINTER(c_int8)),
         ("all_pos_0", llama_pos),
         ("all_pos_1", llama_pos),
@@ -1069,7 +1069,8 @@ _lib.llama_batch_get_one.argtypes = [
 _lib.llama_batch_get_one.restype = llama_batch
 
 
-# // Allocates a batch of tokens on the heap
+# // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+# // Each token can be assigned up to n_seq_max sequence ids
 # // The batch has to be freed with llama_batch_free()
 # // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
 # // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
@@ -1077,14 +1078,17 @@ _lib.llama_batch_get_one.restype = llama_batch
 # // All members are left uninitialized
 # LLAMA_API struct llama_batch llama_batch_init(
 #         int32_t n_tokens,
-#         int32_t embd);
+#         int32_t embd,
+#         int32_t n_seq_max);
 def llama_batch_init(
-    n_tokens: Union[c_int, int], embd: Union[c_int, int]
+    n_tokens: Union[c_int32, int],
+    embd: Union[c_int32, int],
+    n_seq_max: Union[c_int32, int],
 ) -> llama_batch:
-    return _lib.llama_batch_init(n_tokens, embd)
+    return _lib.llama_batch_init(n_tokens, embd, n_seq_max)
 
 
-_lib.llama_batch_init.argtypes = [c_int, c_int]
+_lib.llama_batch_init.argtypes = [c_int32, c_int32, c_int32]
 _lib.llama_batch_init.restype = llama_batch
 
 
@@ -1308,6 +1312,46 @@ _lib.llama_tokenize.argtypes = [
 _lib.llama_tokenize.restype = c_int
 
 
+# /// @details Convert the provided text into tokens.
+# /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+# /// @return Returns the number of tokens on success, no more than n_max_tokens
+# /// @return Returns a negative number on failure - the number of tokens that would have been returned
+# /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+# ///                Does not insert a leading space.
+# LLAMA_API int llama_tokenize(
+#     const struct llama_model * model,
+#                   const char * text,
+#                          int   text_len,
+#                  llama_token * tokens,
+#                          int   n_max_tokens,
+#                         bool   add_bos,
+#                         bool   special);
+def llama_tokenize(
+    model: llama_model_p,
+    text: bytes,
+    text_len: Union[c_int, int],
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: Union[c_int, int],
+    add_bos: Union[c_bool, bool],
+    special: Union[c_bool, bool],
+) -> int:
+    return _lib.llama_tokenize(
+        model, text, text_len, tokens, n_max_tokens, add_bos, special
+    )
+
+
+_lib.llama_tokenize.argtypes = [
+    llama_model_p,
+    c_char_p,
+    c_int,
+    llama_token_p,
+    c_int,
+    c_bool,
+    c_bool,
+]
+_lib.llama_tokenize.restype = c_int
+
+
 # // Token Id -> Piece.
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
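The user-visible effect of the llama_cpp.py changes above is the new `special` flag on the
high-level tokenizer. A minimal sketch of the intended usage follows; the model path is a
placeholder, and the exact token ids depend on the model's vocabulary:

    from llama_cpp import Llama

    llm = Llama(model_path="./models/model.gguf")  # placeholder path

    # Default behavior (special=False): control markers such as <s> are
    # treated as plaintext and split into ordinary text tokens.
    plain = llm.tokenize(b"<s>Hello</s>")

    # special=True allows the tokenizer to emit the control-token ids for
    # <s> and </s> instead of tokenizing them as text.
    ctrl = llm.tokenize(b"<s>Hello</s>", add_bos=False, special=True)

Per the upstream comment, tokenizing with special=True also does not insert a leading space.
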
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 11bff29..004797f 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 11bff290458f12f020b588792707f76ec658a27a
+Subproject commit 004797f6ac135383f8c1d1f5bd415ddee2f79318
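
At the ctypes level, the batch changes mean llama_batch_init now takes the extra n_seq_max
argument, and batch.seq_id is doubly indexed: seq_id[i] points to the array of sequence ids
for token i. Below is a hedged sketch of allocating and filling a one-token batch; it assumes
the llama_batch Structure in this patch matches the layout of the compiled library and that
the module's llama_batch_free wrapper is available, with illustrative token/position values:

    import llama_cpp

    # Token batch (embd=0) holding up to 512 tokens, each carrying at most
    # one sequence id (n_seq_max=1).
    batch = llama_cpp.llama_batch_init(512, 0, 1)

    batch.n_tokens = 1
    batch.token[0] = 1        # illustrative token id
    batch.pos[0] = 0          # position within the sequence
    batch.seq_id[0][0] = 0    # per-token sequence-id array, new in this layout
    batch.logits[0] = 1       # request logits at this position

    # Per the comment above, batches from llama_batch_init must be freed
    # with llama_batch_free().
    llama_cpp.llama_batch_free(batch)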