Update llama.cpp

This commit is contained in:
Andrei Betlen 2023-08-27 12:59:20 -04:00
parent 9ab49bc1d4
commit 4887973c22
4 changed files with 28 additions and 33 deletions

View file

@@ -445,17 +445,17 @@ class Llama:
""" """
assert self.model is not None assert self.model is not None
output = b"" output = b""
size = 8 size = 32
buffer = (ctypes.c_char * size)() buffer = (ctypes.c_char * size)()
for token in tokens: for token in tokens:
n = llama_cpp.llama_token_to_str_with_model( n = llama_cpp.llama_token_to_piece_with_model(
self.model, llama_cpp.llama_token(token), buffer, size self.model, llama_cpp.llama_token(token), buffer, size
) )
assert n <= size assert n <= size
output += bytes(buffer[:n]) output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt # NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token # this line removes a leading space if the first token is a beginning of sentence token
return output return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
def set_cache(self, cache: Optional[BaseLlamaCache]): def set_cache(self, cache: Optional[BaseLlamaCache]):
"""Set the cache. """Set the cache.

View file

@@ -973,48 +973,43 @@ _lib.llama_tokenize_with_model.argtypes = [
_lib.llama_tokenize_with_model.restype = c_int _lib.llama_tokenize_with_model.restype = c_int
# // Token Id -> String. Uses the vocabulary in the provided context # // Token Id -> Piece.
# // Does not write null terminator to the buffer # // Uses the vocabulary in the provided context.
# LLAMA_API int llama_token_to_str( # // Does not write null terminator to the buffer.
# // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
# LLAMA_API int llama_token_to_piece(
# const struct llama_context * ctx, # const struct llama_context * ctx,
# llama_token token, # llama_token token,
# char * buf, # char * buf,
# int length); # int length);
def llama_token_to_str( def llama_token_to_piece(
ctx: llama_context_p, token: llama_token, buf: bytes, length: c_int ctx: llama_context_p, token: llama_token, buf: bytes, length: c_int
) -> int: ) -> int:
return _lib.llama_token_to_str(ctx, token, buf, length) return _lib.llama_token_to_piece(ctx, token, buf, length)
_lib.llama_tokenize_with_model.argtypes = [ _lib.llama_token_to_piece.argtypes = [llama_context_p, llama_token, c_char_p, c_int]
llama_model_p, _lib.llama_token_to_piece.restype = c_int
c_char_p,
llama_token_p,
c_int,
c_bool,
]
_lib.llama_tokenize_with_model.restype = c_int
# LLAMA_API int llama_token_to_str_with_model( # LLAMA_API int llama_token_to_piece_with_model(
# const struct llama_model * model, # const struct llama_model * model,
# llama_token token, # llama_token token,
# char * buf, # char * buf,
# int length); # int length);
def llama_token_to_str_with_model( def llama_token_to_piece_with_model(
model: llama_model_p, token: llama_token, buf: bytes, length: c_int model: llama_model_p, token: llama_token, buf: bytes, length: c_int
) -> int: ) -> int:
return _lib.llama_token_to_str_with_model(model, token, buf, length) return _lib.llama_token_to_piece_with_model(model, token, buf, length)
_lib.llama_token_to_str_with_model.argtypes = [ _lib.llama_token_to_piece_with_model.argtypes = [
llama_model_p, llama_model_p,
llama_token, llama_token,
c_char_p, c_char_p,
c_int, c_int,
] ]
_lib.llama_token_to_str_with_model.restype = c_int _lib.llama_token_to_piece_with_model.restype = c_int
# // # //
# // Grammar # // Grammar

View file

@@ -14,16 +14,16 @@ def test_llama_cpp_tokenization():
tokens = llama.tokenize(text) tokens = llama.tokenize(text)
assert tokens[0] == llama.token_bos() assert tokens[0] == llama.token_bos()
assert tokens == [1, 10994, 2787] assert tokens == [1, 15043, 2787]
detokenized = llama.detokenize(tokens) detokenized = llama.detokenize(tokens)
assert detokenized == text assert detokenized == text
tokens = llama.tokenize(text, add_bos=False) tokens = llama.tokenize(text, add_bos=False)
assert tokens[0] != llama.token_bos() assert tokens[0] != llama.token_bos()
assert tokens == [10994, 2787] assert tokens == [15043, 2787]
detokenized = llama.detokenize(tokens) detokenized = llama.detokenize(tokens)
assert detokenized == text assert detokenized != text
@pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos") @pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos")

2
vendor/llama.cpp vendored

@@ -1 +1 @@
Subproject commit c1ac54b77aaba10d029084d152be786102010eb2 Subproject commit c10704d01e21e3dbe4d6ca1026ebff85349dd239