Update llama.cpp

Andrei Betlen 2023-04-21 17:40:27 -04:00
parent ba3959eafd
commit 1eb130a6b2
2 changed files with 7 additions and 4 deletions


@@ -117,6 +117,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
+LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5)  # except 1d tensors
+LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6)  # except 1d tensors

 # Functions
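
For readers following along: these constants are plain ctypes.c_int wrappers mirroring llama.cpp's llama_ftype enum values, not a Python enum. A minimal sketch of resolving a format code by name; the lookup table and helper are illustrative assumptions, not part of this commit, and assume the LLAMA_FTYPE_* constants above are in scope:

    # Hypothetical lookup table over the constants defined above.
    _FTYPE_BY_NAME = {
        "q4_1": LLAMA_FTYPE_MOSTLY_Q4_1,
        "q4_2": LLAMA_FTYPE_MOSTLY_Q4_2,  # new in this commit
    }

    def ftype_from_name(name: str) -> int:
        # Each constant is a ctypes.c_int; .value recovers the plain integer code.
        return _FTYPE_BY_NAME[name].value

    print(ftype_from_name("q4_2"))  # -> 5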
@@ -169,11 +171,12 @@ _lib.llama_free.restype = None
 # TODO: not great API - very likely to change
 # Returns 0 on success
-def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int:
-    return _lib.llama_model_quantize(fname_inp, fname_out, itype)
+# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
+def llama_model_quantize(fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int) -> c_int:
+    return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)

-_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
+_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
 _lib.llama_model_quantize.restype = c_int
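
A hedged usage sketch of the updated binding, assuming llama_model_quantize and the LLAMA_FTYPE_* constants are imported from this module; the model paths are hypothetical placeholders:

    ret = llama_model_quantize(
        b"./models/ggml-model-f16.bin",   # fname_inp: hypothetical input path
        b"./models/ggml-model-q4_2.bin",  # fname_out: hypothetical output path
        LLAMA_FTYPE_MOSTLY_Q4_2,          # ftype: target quantization format
        0,                                # nthread <= 0 -> hardware_concurrency()
    )
    assert ret == 0  # the C function returns 0 on success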

vendor/llama.cpp vendored

@@ -1 +1 @@
-Subproject commit c8c2c524827be8fd681a63f0e5a697b0bf4c587b
+Subproject commit 50cb666b8a2e35a49b08c0f6bc81138c8f6f2ac1