Update llama.cpp

This commit is contained in:
Andrei Betlen 2023-08-27 12:59:20 -04:00
parent 9ab49bc1d4
commit 4887973c22
4 changed files with 28 additions and 33 deletions

View file

@@ -445,17 +445,17 @@ class Llama:
""" """
assert self.model is not None assert self.model is not None
output = b"" output = b""
size = 8 size = 32
buffer = (ctypes.c_char * size)() buffer = (ctypes.c_char * size)()
for token in tokens: for token in tokens:
n = llama_cpp.llama_token_to_str_with_model( n = llama_cpp.llama_token_to_piece_with_model(
self.model, llama_cpp.llama_token(token), buffer, size self.model, llama_cpp.llama_token(token), buffer, size
) )
assert n <= size assert n <= size
output += bytes(buffer[:n]) output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt # NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token # this line removes a leading space if the first token is a beginning of sentence token
return output return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
def set_cache(self, cache: Optional[BaseLlamaCache]): def set_cache(self, cache: Optional[BaseLlamaCache]):
"""Set the cache. """Set the cache.

View file

@@ -973,48 +973,43 @@ _lib.llama_tokenize_with_model.argtypes = [
_lib.llama_tokenize_with_model.restype = c_int _lib.llama_tokenize_with_model.restype = c_int
# // Token Id -> String. Uses the vocabulary in the provided context # // Token Id -> Piece.
# // Does not write null terminator to the buffer # // Uses the vocabulary in the provided context.
# LLAMA_API int llama_token_to_str( # // Does not write null terminator to the buffer.
# // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
# LLAMA_API int llama_token_to_piece(
# const struct llama_context * ctx, # const struct llama_context * ctx,
# llama_token token, # llama_token token,
# char * buf, # char * buf,
# int length); # int length);
def llama_token_to_str( def llama_token_to_piece(
ctx: llama_context_p, token: llama_token, buf: bytes, length: c_int ctx: llama_context_p, token: llama_token, buf: bytes, length: c_int
) -> int: ) -> int:
return _lib.llama_token_to_str(ctx, token, buf, length) return _lib.llama_token_to_piece(ctx, token, buf, length)
_lib.llama_tokenize_with_model.argtypes = [ _lib.llama_token_to_piece.argtypes = [llama_context_p, llama_token, c_char_p, c_int]
llama_model_p, _lib.llama_token_to_piece.restype = c_int
c_char_p,
llama_token_p,
c_int,
c_bool,
]
_lib.llama_tokenize_with_model.restype = c_int
# LLAMA_API int llama_token_to_str_with_model( # LLAMA_API int llama_token_to_piece_with_model(
# const struct llama_model * model, # const struct llama_model * model,
# llama_token token, # llama_token token,
# char * buf, # char * buf,
# int length); # int length);
def llama_token_to_str_with_model( def llama_token_to_piece_with_model(
model: llama_model_p, token: llama_token, buf: bytes, length: c_int model: llama_model_p, token: llama_token, buf: bytes, length: c_int
) -> int: ) -> int:
return _lib.llama_token_to_str_with_model(model, token, buf, length) return _lib.llama_token_to_piece_with_model(model, token, buf, length)
_lib.llama_token_to_str_with_model.argtypes = [ _lib.llama_token_to_piece_with_model.argtypes = [
llama_model_p, llama_model_p,
llama_token, llama_token,
c_char_p, c_char_p,
c_int, c_int,
] ]
_lib.llama_token_to_str_with_model.restype = c_int _lib.llama_token_to_piece_with_model.restype = c_int
# // # //
# // Grammar # // Grammar

View file

@@ -14,16 +14,16 @@ def test_llama_cpp_tokenization():
tokens = llama.tokenize(text) tokens = llama.tokenize(text)
assert tokens[0] == llama.token_bos() assert tokens[0] == llama.token_bos()
assert tokens == [1, 10994, 2787] assert tokens == [1, 15043, 2787]
detokenized = llama.detokenize(tokens) detokenized = llama.detokenize(tokens)
assert detokenized == text assert detokenized == text
tokens = llama.tokenize(text, add_bos=False) tokens = llama.tokenize(text, add_bos=False)
assert tokens[0] != llama.token_bos() assert tokens[0] != llama.token_bos()
assert tokens == [10994, 2787] assert tokens == [15043, 2787]
detokenized = llama.detokenize(tokens) detokenized = llama.detokenize(tokens)
assert detokenized == text assert detokenized != text
@pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos") @pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos")

2
vendor/llama.cpp vendored

@@ -1 +1 @@
Subproject commit c1ac54b77aaba10d029084d152be786102010eb2 Subproject commit c10704d01e21e3dbe4d6ca1026ebff85349dd239