Update llama.cpp

Andrei Betlen 2023-09-28 22:42:03 -04:00
parent 4177ae6d34
commit 1a1c3dc418
4 changed files with 742 additions and 356 deletions
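The wrapper changes below track llama.cpp's split of the old all-in-one llama_context_params into separate model and context parameter structs, the renaming of the _with_model tokenizer helpers, the removal of n_threads from llama_eval, and the new scale argument for LoRA loading. The following is a minimal sketch of the new parameter split, not part of this commit, using only the binding calls that appear in the diff; the model path is a placeholder:

import multiprocessing
import llama_cpp

# Model-level parameters (weight loading, GPU offload) now come from llama_model_default_params()
model_params = llama_cpp.llama_model_default_params()
model_params.n_gpu_layers = 0x7FFFFFFF  # INT32 max: offload all layers, as the wrapper does below
model_params.use_mmap = True

# Per-context parameters (seed, context size, batching, threads, RoPE) come from llama_context_default_params()
context_params = llama_cpp.llama_context_default_params()
context_params.seed = llama_cpp.LLAMA_DEFAULT_SEED
context_params.n_ctx = 512
context_params.n_batch = 512
context_params.n_threads = max(multiprocessing.cpu_count() // 2, 1)

# "models/7B/model.gguf" is a placeholder path, not from this commit
model = llama_cpp.llama_load_model_from_file(b"models/7B/model.gguf", model_params)
ctx = llama_cpp.llama_new_context_with_model(model, context_params)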


@@ -30,6 +30,7 @@ import numpy.typing as npt
from .utils import suppress_stdout_stderr
class BaseLlamaCache(ABC):
"""Base cache class for a llama.cpp model."""
@@ -215,30 +216,37 @@ class Llama:
self,
model_path: str,
*,
# NOTE: These parameters are likely to change in the future.
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
n_batch: int = 512,
# Model Params
n_gpu_layers: int = 0,
main_gpu: int = 0,
tensor_split: Optional[List[float]] = None,
rope_freq_base: float = 10000.0,
rope_freq_scale: float = 1.0,
low_vram: bool = False,
mul_mat_q: bool = True,
f16_kv: bool = True,
logits_all: bool = False,
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
embedding: bool = False,
# Context Params
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
n_batch: int = 512,
n_threads: Optional[int] = None,
n_threads_batch: Optional[int] = None,
rope_freq_base: float = 10000.0,
rope_freq_scale: float = 1.0,
mul_mat_q: bool = True,
f16_kv: bool = True,
logits_all: bool = False,
embedding: bool = False,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params
lora_base: Optional[str] = None,
lora_scale: float = 1.0,
lora_path: Optional[str] = None,
# Backend Params
numa: bool = False,
# Misc
verbose: bool = True,
**kwargs # type: ignore
# Extra Params
**kwargs, # type: ignore
):
"""Load a llama.cpp model from `model_path`.
@@ -277,52 +285,64 @@ class Llama:
self.verbose = verbose
self.numa = numa
if not Llama.__backend_initialized:
if self.verbose:
llama_cpp.llama_backend_init(numa)
llama_cpp.llama_backend_init(self.numa)
else:
with suppress_stdout_stderr():
llama_cpp.llama_backend_init(numa)
llama_cpp.llama_backend_init(self.numa)
Llama.__backend_initialized = True
self.model_path = model_path
self.params = llama_cpp.llama_context_default_params()
self.params.seed = seed
self.params.n_ctx = n_ctx
self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers # 0x7FFFFFFF is INT32 max, will be auto set to all layers
self.params.main_gpu = main_gpu
self.params.rope_freq_base = rope_freq_base
self.params.rope_freq_scale = rope_freq_scale
self.params.low_vram = low_vram
self.params.mul_mat_q = mul_mat_q
self.params.f16_kv = f16_kv
self.params.logits_all = logits_all
self.params.vocab_only = vocab_only
self.params.use_mmap = use_mmap if lora_path is None else False
self.params.use_mlock = use_mlock
self.params.embedding = embedding
# Model Params
self.model_params = llama_cpp.llama_model_default_params()
self.model_params.n_gpu_layers = (
0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
self.model_params.main_gpu = main_gpu
self.tensor_split = tensor_split
self._p_tensor_split = None
if self.tensor_split is not None:
# Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
self._c_tensor_split = FloatArray(
*tensor_split
*tensor_split # type: ignore
) # keep a reference to the array so it is not gc'd
self.params.tensor_split = self._c_tensor_split
self.model_params.tensor_split = self._c_tensor_split
self.model_params.vocab_only = vocab_only
self.model_params.use_mmap = use_mmap if lora_path is None else False
self.model_params.use_mlock = use_mlock
self.n_batch = min(n_ctx, n_batch) # ???
self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
self.n_threads_batch = n_threads_batch or max(
multiprocessing.cpu_count() // 2, 1
)
# Context Params
self.context_params = llama_cpp.llama_context_default_params()
self.context_params.seed = seed
self.context_params.n_ctx = n_ctx
self.context_params.n_batch = self.n_batch
self.context_params.n_threads = self.n_threads
self.context_params.n_threads_batch = self.n_threads_batch
self.context_params.rope_freq_base = rope_freq_base
self.context_params.rope_freq_scale = rope_freq_scale
self.context_params.mul_mat_q = mul_mat_q
self.context_params.f16_kv = f16_kv
self.context_params.logits_all = logits_all
self.context_params.embedding = embedding
# Sampling Params
self.last_n_tokens_size = last_n_tokens_size
self.n_batch = min(n_ctx, n_batch)
self.cache: Optional[BaseLlamaCache] = None
self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
self.lora_base = lora_base
self.lora_scale = lora_scale
self.lora_path = lora_path
if not os.path.exists(model_path):
@@ -330,21 +350,23 @@ class Llama:
if verbose:
self.model = llama_cpp.llama_load_model_from_file(
self.model_path.encode("utf-8"), self.params
self.model_path.encode("utf-8"), self.model_params
)
else:
with suppress_stdout_stderr():
self.model = llama_cpp.llama_load_model_from_file(
self.model_path.encode("utf-8"), self.params
self.model_path.encode("utf-8"), self.model_params
)
assert self.model is not None
if verbose:
self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
self.ctx = llama_cpp.llama_new_context_with_model(
self.model, self.context_params
)
else:
with suppress_stdout_stderr():
self.ctx = llama_cpp.llama_new_context_with_model(
self.model, self.params
self.model, self.context_params
)
assert self.ctx is not None
@@ -353,6 +375,7 @@ class Llama:
if llama_cpp.llama_model_apply_lora_from_file(
self.model,
self.lora_path.encode("utf-8"),
self.lora_scale,
self.lora_base.encode("utf-8")
if self.lora_base is not None
else llama_cpp.c_char_p(0),
@@ -409,7 +432,7 @@ class Llama:
def eval_logits(self) -> Deque[List[float]]:
return deque(
self.scores[: self.n_tokens, :].tolist(),
maxlen=self._n_ctx if self.params.logits_all else 1,
maxlen=self._n_ctx if self.model_params.logits_all else 1,
)
def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
@@ -427,7 +450,7 @@ class Llama:
assert self.model is not None
n_ctx = self._n_ctx
tokens = (llama_cpp.llama_token * n_ctx)()
n_tokens = llama_cpp.llama_tokenize_with_model(
n_tokens = llama_cpp.llama_tokenize(
self.model,
text,
len(text),
@@ -438,7 +461,7 @@ class Llama:
if n_tokens < 0:
n_tokens = abs(n_tokens)
tokens = (llama_cpp.llama_token * n_tokens)()
n_tokens = llama_cpp.llama_tokenize_with_model(
n_tokens = llama_cpp.llama_tokenize(
self.model,
text,
len(text),
@@ -466,14 +489,16 @@ class Llama:
size = 32
buffer = (ctypes.c_char * size)()
for token in tokens:
n = llama_cpp.llama_token_to_piece_with_model(
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token), buffer, size
)
assert n <= size
output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
return (
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
)
def set_cache(self, cache: Optional[BaseLlamaCache]):
"""Set the cache.
@@ -504,17 +529,16 @@ class Llama:
tokens=(llama_cpp.llama_token * len(batch))(*batch),
n_tokens=n_tokens,
n_past=n_past,
n_threads=self.n_threads,
)
if return_code != 0:
raise RuntimeError(f"llama_eval returned {return_code}")
# Save tokens
self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
# Save logits
rows = n_tokens if self.params.logits_all else 1
rows = n_tokens if self.context_params.logits_all else 1
cols = self._n_vocab
offset = (
0 if self.params.logits_all else n_tokens - 1
0 if self.context_params.logits_all else n_tokens - 1
) # NOTE: Only save the last token logits if logits_all is False
self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
-1
@@ -545,11 +569,7 @@ class Llama:
n_vocab = self._n_vocab
n_ctx = self._n_ctx
top_k = n_vocab if top_k <= 0 else top_k
last_n_tokens_size = (
n_ctx
if last_n_tokens_size < 0
else last_n_tokens_size
)
last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size
logits: npt.NDArray[np.single] = self._scores[-1, :]
if logits_processor is not None:
@@ -610,7 +630,7 @@ class Llama:
mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
m=mirostat_m,
)
elif mirostat_mode== 2:
elif mirostat_mode == 2:
mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau)
llama_cpp.llama_sample_temperature(
ctx=self.ctx,
@@ -802,7 +822,7 @@ class Llama:
def create_embedding(
self, input: Union[str, List[str]], model: Optional[str] = None
) -> Embedding:
) -> CreateEmbeddingResponse:
"""Embed a string.
Args:
@@ -814,7 +834,7 @@ class Llama:
assert self.ctx is not None
model_name: str = model if model is not None else self.model_path
if self.params.embedding == False:
if self.model_params.embedding == False:
raise RuntimeError(
"Llama model must be created with embedding=True to call this method"
)
@@ -900,7 +920,11 @@ class Llama:
created: int = int(time.time())
completion_tokens: List[int] = []
# Add blank space to start of prompt to match OG llama tokenizer
prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()]
prompt_tokens: List[int] = (
self.tokenize(prompt.encode("utf-8"))
if prompt != ""
else [self.token_bos()]
)
text: bytes = b""
returned_tokens: int = 0
stop = (
@@ -932,7 +956,7 @@ class Llama:
else:
stop_sequences = []
if logprobs is not None and self.params.logits_all is False:
if logprobs is not None and self.model_params.logits_all is False:
raise ValueError(
"logprobs is not supported for models created with logits_all=False"
)
@@ -1025,7 +1049,9 @@ class Llama:
for token in remaining_tokens:
token_end_position += len(self.detokenize([token]))
# Check if stop sequence is in the token
if token_end_position > (remaining_length - first_stop_position):
if token_end_position > (
remaining_length - first_stop_position
):
break
token_str = self.detokenize([token]).decode(
"utf-8", errors="ignore"
@@ -1082,7 +1108,7 @@ class Llama:
for i in range(1, len(remaining_tokens) + 1):
try:
bs = self.detokenize(remaining_tokens[:i])
ts = bs.decode('utf-8')
ts = bs.decode("utf-8")
decode_success = True
break
except UnicodeError:
@@ -1093,7 +1119,9 @@ class Llama:
# all remaining tokens cannot be decoded to a UTF-8 character
break
token_end_position += len(bs)
if token_end_position > (remaining_length - first_stop_position):
if token_end_position > (
remaining_length - first_stop_position
):
break
remaining_tokens = remaining_tokens[i:]
returned_tokens += i
@@ -1398,7 +1426,7 @@ class Llama:
model=model,
stopping_criteria=stopping_criteria,
logits_processor=logits_processor,
grammar=grammar
grammar=grammar,
)
if stream:
chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -1618,47 +1646,68 @@ class Llama:
def __getstate__(self):
return dict(
verbose=self.verbose,
model_path=self.model_path,
n_ctx=self.params.n_ctx,
n_gpu_layers=self.params.n_gpu_layers,
seed=self.params.seed,
f16_kv=self.params.f16_kv,
logits_all=self.params.logits_all,
vocab_only=self.params.vocab_only,
use_mmap=self.params.use_mmap,
use_mlock=self.params.use_mlock,
embedding=self.params.embedding,
low_vram=self.params.low_vram,
last_n_tokens_size=self.last_n_tokens_size,
n_batch=self.n_batch,
n_threads=self.n_threads,
lora_base=self.lora_base,
lora_path=self.lora_path,
# Model Params
n_gpu_layers=self.model_params.n_gpu_layers,
main_gpu=self.model_params.main_gpu,
tensor_split=self.tensor_split,
mul_mat_q=self.params.mul_mat_q,
vocab_only=self.model_params.vocab_only,
use_mmap=self.model_params.use_mmap,
use_mlock=self.model_params.use_mlock,
# Context Params
seed=self.context_params.seed,
n_ctx=self.context_params.n_ctx,
n_batch=self.n_batch,
n_threads=self.context_params.n_threads,
n_threads_batch=self.context_params.n_threads_batch,
rope_freq_base=self.context_params.rope_freq_base,
rope_freq_scale=self.context_params.rope_freq_scale,
mul_mat_q=self.context_params.mul_mat_q,
f16_kv=self.context_params.f16_kv,
logits_all=self.context_params.logits_all,
embedding=self.context_params.embedding,
# Sampling Params
last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params
lora_base=self.lora_base,
lora_scale=self.lora_scale,
lora_path=self.lora_path,
# Backend Params
numa=self.numa,
# Misc
verbose=self.verbose,
)
def __setstate__(self, state):
self.__init__(
model_path=state["model_path"],
n_ctx=state["n_ctx"],
# Model Params
n_gpu_layers=state["n_gpu_layers"],
seed=state["seed"],
f16_kv=state["f16_kv"],
logits_all=state["logits_all"],
main_gpu=state["main_gpu"],
tensor_split=state["tensor_split"],
vocab_only=state["vocab_only"],
use_mmap=state["use_mmap"],
use_mlock=state["use_mlock"],
embedding=state["embedding"],
low_vram=state["low_vram"],
n_threads=state["n_threads"],
# Context Params
seed=state["seed"],
n_ctx=state["n_ctx"],
n_batch=state["n_batch"],
n_threads=state["n_threads"],
n_threads_batch=state["n_threads_batch"],
rope_freq_base=state["rope_freq_base"],
rope_freq_scale=state["rope_freq_scale"],
mul_mat_q=state["mul_mat_q"],
f16_kv=state["f16_kv"],
logits_all=state["logits_all"],
embedding=state["embedding"],
# Sampling Params
last_n_tokens_size=state["last_n_tokens_size"],
# LoRA Params
lora_base=state["lora_base"],
lora_path=state["lora_path"],
tensor_split=state["tensor_split"],
mul_mat_q=state["mul_mat_q"],
# Backend Params
numa=state["numa"],
# Misc
verbose=state["verbose"],
)
@@ -1711,13 +1760,13 @@ class Llama:
def n_embd(self) -> int:
"""Return the embedding size."""
assert self.ctx is not None
return llama_cpp.llama_n_embd(self.ctx)
assert self.model is not None
return llama_cpp.llama_n_embd(self.model)
def n_vocab(self) -> int:
"""Return the vocabulary size."""
assert self.ctx is not None
return llama_cpp.llama_n_vocab(self.ctx)
assert self.model is not None
return llama_cpp.llama_n_vocab(self.model)
def tokenizer(self) -> "LlamaTokenizer":
"""Return the tokenizer for this model."""

File diff suppressed because it is too large


@@ -119,7 +119,7 @@ def test_llama_pickle():
def test_utf8(monkeypatch):
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
n_vocab = llama.n_vocab()
## Set up mock function
def mock_eval(*args, **kwargs):

vendor/llama.cpp (vendored)

@@ -1 +1 @@
Subproject commit a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
Subproject commit 16bc66d9479edd5ee12ec734973554d4493c5dfa