From fafe47114c22a9a1b316a81555e43a79bea2ede9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 17:47:21 -0400 Subject: [PATCH] Update llama.cpp --- llama_cpp/llama.py | 9 +- llama_cpp/llama_cpp.py | 219 +++++++++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 3 files changed, 186 insertions(+), 44 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6ccb823..332a882 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,7 +127,6 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.n_parts = n_parts self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv @@ -149,6 +148,10 @@ class Llama: self.lora_base = lora_base self.lora_path = lora_path + ### DEPRECATED ### + self.n_parts = n_parts + ### DEPRECATED ### + if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") @@ -1225,7 +1228,6 @@ class Llama: verbose=self.verbose, model_path=self.model_path, n_ctx=self.params.n_ctx, - n_parts=self.params.n_parts, n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, @@ -1239,6 +1241,9 @@ class Llama: n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + ### DEPRECATED ### + n_parts=self.n_parts, + ### DEPRECATED ### ) def __setstate__(self, state): diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0dcb16c..541ee00 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -72,31 +72,61 @@ _lib_base_name = "llama" # Load the library _lib = _load_shared_library(_lib_base_name) -# C types -LLAMA_FILE_VERSION = c_int(2) -LLAMA_FILE_MAGIC = b"ggjt" -LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" -LLAMA_SESSION_MAGIC = b"ggsn" +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# struct llama_context; llama_context_p = c_void_p +# typedef int llama_token; llama_token = c_int llama_token_p = POINTER(llama_token) +# typedef struct llama_token_data { +# llama_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; class llama_token_data(Structure): _fields_ = [ - ("id", llama_token), # token id - ("logit", c_float), # log-odds of the token - ("p", c_float), # probability of the token + ("id", llama_token), + ("logit", c_float), + ("p", c_float), ] llama_token_data_p = POINTER(llama_token_data) +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; class llama_token_data_array(Structure): _fields_ = [ ("data", llama_token_data_p), @@ -107,54 +137,72 @@ class 
llama_token_data_array(Structure): llama_token_data_array_p = POINTER(llama_token_data_array) +# typedef void (*llama_progress_callback)(float progress, void *ctx); llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +# struct llama_context_params { +# int n_ctx; // text context +# int n_gpu_layers; // number of layers to store in VRAM +# int seed; // RNG seed, -1 for random + +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only + + +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# }; class llama_context_params(Structure): _fields_ = [ - ("n_ctx", c_int), # text context - ("n_parts", c_int), # -1 for default - ("n_gpu_layers", c_int), # number of layers to store in VRAM - ("seed", c_int), # RNG seed, 0 for random - ("f16_kv", c_bool), # use fp16 for KV cache + ("n_ctx", c_int), + ("n_gpu_layers", c_int), + ("seed", c_int), + ("f16_kv", c_bool), ( "logits_all", c_bool, - ), # the llama_eval() call computes all logits, not just the last one - ("vocab_only", c_bool), # only load the vocabulary, no weights - ("use_mmap", c_bool), # use mmap if possible - ("use_mlock", c_bool), # force system to keep model in RAM - ("embedding", c_bool), # embedding mode only - # called with a progress value between 0 and 1, pass NULL to disable + ), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), ("progress_callback", llama_progress_callback), - # context pointer passed to the progress callback ("progress_callback_user_data", c_void_p), ] llama_context_params_p = POINTER(llama_context_params) +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# }; LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( - 4 -) # tok_embeddings.weight and output.weight are F16 -# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# Functions +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = 
c_int(9) +# LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -163,6 +211,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +# LLAMA_API bool llama_mmap_supported(); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -171,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool +# LLAMA_API bool llama_mlock_supported(); def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -179,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool -# Various functions for loading a ggml llama model. -# Allocate (almost) all memory needed for the model. -# Return NULL on failure +# // TODO: not great API - very likely to change +# // Initialize the llama + ggml backend +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(); +def llama_init_backend(): + return _lib.llama_init_backend() + + +_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.restype = None + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: @@ -193,6 +267,7 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); def llama_free(ctx: llama_context_p): return _lib.llama_free(ctx) @@ -204,6 +279,11 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# enum llama_ftype ftype, +# int nthread); def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int ) -> int: @@ -220,6 +300,11 @@ _lib.llama_model_quantize.restype = c_int # The model needs to be reloaded before applying a new adapter, otherwise the adapter # will be applied on top of the previous one # Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); def llama_apply_lora_from_file( ctx: llama_context_p, path_lora: c_char_p, @@ -234,6 +319,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -243,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the current rng seed. 
+# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) @@ -253,6 +340,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -264,6 +352,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. # Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); def llama_copy_state_data( ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: @@ -276,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] ) -> int: @@ -287,6 +377,7 @@ _lib.llama_set_state_data.restype = c_size_t # Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); def llama_load_session_file( ctx: llama_context_p, path_session: bytes, @@ -309,6 +400,7 @@ _lib.llama_load_session_file.argtypes = [ _lib.llama_load_session_file.restype = c_size_t +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); def llama_save_session_file( ctx: llama_context_p, path_session: bytes, @@ -331,6 +423,12 @@ _lib.llama_save_session_file.restype = c_size_t # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls # Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); def llama_eval( ctx: llama_context_p, tokens, # type: Array[llama_token] @@ -350,6 +448,12 @@ _lib.llama_eval.restype = c_int # Returns the number of tokens on success, no more than n_max_tokens # Returns a negative number on failure - the number of tokens that would have been returned # TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); def llama_tokenize( ctx: llama_context_p, text: bytes, @@ -364,6 +468,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,6 +477,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,6 +486,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); def llama_n_embd(ctx: llama_context_p) -> int: return 
_lib.llama_n_embd(ctx) @@ -393,6 +500,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); def llama_get_logits( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -405,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); def llama_get_embeddings( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -416,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -426,6 +536,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens +# LLAMA_API llama_token llama_token_bos(); def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,6 +545,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token +# LLAMA_API llama_token llama_token_eos(); def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,6 +554,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token +# LLAMA_API llama_token llama_token_nl(); def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -454,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -477,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -507,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
+# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] ): @@ -521,6 +637,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -540,6 +657,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -559,6 +677,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -578,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -596,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); def llama_sample_temperature( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -618,6 +739,7 @@ _lib.llama_sample_temperature.restype = None # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); def llama_sample_token_mirostat( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -645,6 +767,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. 
A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -666,6 +789,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -681,6 +805,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -698,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -706,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p] _lib.llama_print_timings.restype = None +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) @@ -715,9 +842,19 @@ _lib.llama_reset_timings.restype = None # Print system information +# LLAMA_API const char * llama_print_system_info(void); def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() _lib.llama_print_system_info.argtypes = [] _lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend() + _llama_initialized = True diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c238b58..7e4ea5b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c238b5873a1ea496db03ffcfe124c9d0d83afbc6 +Subproject commit 7e4ea5beff567f53be92f75f9089e6f11fa5dabd
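Reviewer note (not part of the patch): the new LLAMA_FILE_MAGIC_* values are plain ctypes c_uint constants, so they can be compared against the leading little-endian uint32 of a model file. A minimal sketch under that assumption follows; the model path is a placeholder, not something taken from this change.

import struct

import llama_cpp

# Placeholder path; the loader in the pinned llama.cpp revision expects GGJT version 3 ('ggjt').
path = "./models/7B/ggml-model-q4_0.bin"
with open(path, "rb") as f:
    (magic,) = struct.unpack("<I", f.read(4))
    if magic == llama_cpp.LLAMA_FILE_MAGIC_GGJT.value:
        (version,) = struct.unpack("<I", f.read(4))
        print(f"ggjt file, version {version}; loader expects {llama_cpp.LLAMA_FILE_VERSION.value}")
    elif magic == llama_cpp.LLAMA_FILE_MAGIC_GGML.value:
        print("unversioned ggml file (pre-ggjt)")
    else:
        print(f"unrecognized magic: 0x{magic:08x}")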
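Reviewer note (not part of the patch): a minimal, hedged sketch of driving the low-level bindings as they stand after this change. Importing the module now runs llama_init_backend() once via the new block at the end of llama_cpp.py, n_gpu_layers replaces the removed n_parts field, and sampling goes through the llama_token_data_array-based helpers. The model path, thread count, and generation length below are placeholders.

import ctypes

import llama_cpp  # importing the module now calls llama_init_backend() once at load time

params = llama_cpp.llama_context_default_params()
params.n_ctx = 512
params.n_gpu_layers = 0  # new field; n_parts is gone from llama_context_params

# Placeholder model path -- any GGJT v3 model compatible with the pinned llama.cpp revision.
ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model-q4_0.bin", params)
if not ctx:
    raise RuntimeError("llama_init_from_file() returned NULL")

# Tokenize the prompt (add_bos=True) into a fixed-size token buffer.
prompt = b"The quick brown fox"
tokens = (llama_cpp.llama_token * params.n_ctx)()
n_tokens = llama_cpp.llama_tokenize(ctx, prompt, tokens, params.n_ctx, True)
if n_tokens < 0:
    raise RuntimeError("prompt does not fit in the token buffer")

n_past = 0
n_threads = 4  # placeholder thread count
output = prompt.decode("utf-8", errors="ignore")

for _ in range(32):  # placeholder generation length
    if llama_cpp.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) != 0:
        raise RuntimeError("llama_eval() failed")
    n_past += n_tokens

    # Greedy sampling: wrap the last token's logits in a llama_token_data_array.
    # (Building the candidate list in pure Python is slow but keeps the sketch simple.)
    n_vocab = llama_cpp.llama_n_vocab(ctx)
    logits = llama_cpp.llama_get_logits(ctx)
    candidates = (llama_cpp.llama_token_data * n_vocab)(
        *[llama_cpp.llama_token_data(i, logits[i], 0.0) for i in range(n_vocab)]
    )
    candidates_array = llama_cpp.llama_token_data_array(candidates, n_vocab, False)
    token = llama_cpp.llama_sample_token_greedy(ctx, ctypes.byref(candidates_array))

    if token == llama_cpp.llama_token_eos():
        break
    output += llama_cpp.llama_token_to_str(ctx, token).decode("utf-8", errors="ignore")

    # Only the newly sampled token needs to be evaluated on the next iteration.
    tokens = (llama_cpp.llama_token * 1)(token)
    n_tokens = 1

print(output)
llama_cpp.llama_print_timings(ctx)
llama_cpp.llama_free(ctx)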