From 1eb130a6b2445f4f9a41424362a64c26f3424529 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 21 Apr 2023 17:40:27 -0400
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 9 ++++++---
 vendor/llama.cpp       | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 78d8e1f..97c6565 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -117,6 +117,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
+LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6)  # except 1d tensors
 
 
 # Functions
@@ -169,11 +171,12 @@ _lib.llama_free.restype = None
 
 # TODO: not great API - very likely to change
 # Returns 0 on success
-def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int:
-    return _lib.llama_model_quantize(fname_inp, fname_out, itype)
+# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
+def llama_model_quantize(fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int) -> c_int:
+    return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)
 
 
-_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
+_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
 _lib.llama_model_quantize.restype = c_int
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index c8c2c52..50cb666 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit c8c2c524827be8fd681a63f0e5a697b0bf4c587b
+Subproject commit 50cb666b8a2e35a49b08c0f6bc81138c8f6f2ac1
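
Note (not part of the patch): a minimal sketch of calling the updated binding, assuming the module layout at this commit. The model file paths are hypothetical placeholders; the constant and function come from llama_cpp/llama_cpp.py as patched above.

    import llama_cpp.llama_cpp as llama_cpp

    ret = llama_cpp.llama_model_quantize(
        b"./models/7B/ggml-model-f16.bin",   # fname_inp: source model (placeholder path)
        b"./models/7B/ggml-model-q4_2.bin",  # fname_out: quantized output (placeholder path)
        llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_2,   # ftype: target quantization format
        0,                                   # nthread: <=0 -> std::thread::hardware_concurrency()
    )
    assert ret == 0  # llama_model_quantize returns 0 on success

Passing 0 for nthread defers the thread count to the C++ side, which is the safe default when the caller does not know the host's core count.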