From ca01f98e09f2f4146d8adb19efbd48460a99068c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 25 May 2023 14:11:33 -0400
Subject: [PATCH] Add LlamaTokenizer class

---
 llama_cpp/llama.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index b7a8d79..7dd1acb 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1380,6 +1380,11 @@ class Llama:
         assert self.ctx is not None
         return llama_cpp.llama_n_vocab(self.ctx)
 
+    def tokenizer(self) -> "LlamaTokenizer":
+        """Return the tokenizer for this model."""
+        assert self.ctx is not None
+        return LlamaTokenizer(self)
+
     @staticmethod
     def token_eos() -> int:
         """Return the end-of-sequence token."""
@@ -1410,3 +1415,18 @@ class Llama:
             else:
                 break
         return longest_prefix
+
+
+class LlamaTokenizer:
+    def __init__(self, llama: Llama):
+        self.llama = llama
+
+    def encode(self, text: str) -> List[int]:
+        return self.llama.tokenize(text.encode("utf-8", errors="ignore"))
+
+    def decode(self, tokens: List[int]) -> str:
+        return self.llama.detokenize(tokens).decode("utf-8", errors="ignore")
+
+    @classmethod
+    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
+        return cls(Llama(model_path=path, vocab_only=True))
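
Usage note (not part of the patch): a minimal sketch of the API this commit adds. The model path below is a placeholder, and LlamaTokenizer is imported from llama_cpp.llama, the module the patch edits.

    from llama_cpp.llama import Llama, LlamaTokenizer

    # from_ggml_file loads only the vocabulary (vocab_only=True), no weights,
    # which is sufficient for encoding and decoding text.
    tokenizer = LlamaTokenizer.from_ggml_file("./models/ggml-model.bin")

    tokens = tokenizer.encode("Hello, world!")  # -> List[int]
    text = tokenizer.decode(tokens)             # decodes the ids back to a str

    # Equivalently, an already-loaded model exposes its tokenizer directly:
    # llama = Llama(model_path="./models/ggml-model.bin")
    # tokenizer = llama.tokenizer()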