From abc538fcd55a79293f68bc46b8d078ee7b88bc66 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 28 Jul 2023 01:43:00 -0400
Subject: [PATCH 01/10] fix: annoying bug where attribute exceptions were drowning out file not found exceptions

---
 llama_cpp/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 66c76c9..b52a398 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1503,10 +1503,10 @@ class Llama:
         return self._convert_text_completion_to_chat(completion)
 
     def __del__(self):
-        if self.model is not None:
+        if hasattr(self, "model") and self.model is not None:
             llama_cpp.llama_free_model(self.model)
             self.model = None
-        if self.ctx is not None:
+        if hasattr(self, "ctx") and self.ctx is not None:
             llama_cpp.llama_free(self.ctx)
             self.ctx = None
 

From a9b9f0397cd86509b3ea359e5260e329464dc032 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 28 Jul 2023 01:53:08 -0400
Subject: [PATCH 02/10] Format

---
 llama_cpp/llama.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index b52a398..2537af2 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -224,7 +224,7 @@ class Llama:
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
         n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None, # (TEMPORARY)
+        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -277,7 +277,9 @@ class Llama:
 
         if self.tensor_split is not None:
             FloatArray = (ctypes.c_float * len(self.tensor_split))(*self.tensor_split)
-            self._p_tensor_split = ctypes.POINTER(ctypes.c_float)(FloatArray) # keep a reference to the array so it is not gc'd
+            self._p_tensor_split = ctypes.POINTER(ctypes.c_float)(
+                FloatArray
+            )  # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._p_tensor_split
 
         self.params.rope_freq_base = rope_freq_base
@@ -959,9 +961,7 @@ class Llama:
             for token in remaining_tokens:
                 token_end_position += len(self.detokenize([token]))
                 # Check if stop sequence is in the token
-                if token_end_position >= (
-                    remaining_length - first_stop_position
-                ):
+                if token_end_position >= (remaining_length - first_stop_position):
                     break
                 logprobs_or_none: Optional[CompletionLogprobs] = None
                 if logprobs is not None:

From ce57920e608d075335dbd291476420f2abc491be Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 28 Jul 2023 14:45:18 -0400
Subject: [PATCH 03/10] Suppress llama.cpp output when loading model.

--- llama_cpp/llama.py | 23 +++++++++++++++++++---- llama_cpp/utils.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 llama_cpp/utils.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2537af2..47f71e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -27,6 +27,8 @@ from .llama_types import * import numpy as np import numpy.typing as npt +from .utils import suppress_stdout_stderr + class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" @@ -308,12 +310,25 @@ class Llama: if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self.model = llama_cpp.llama_load_model_from_file( - self.model_path.encode("utf-8"), self.params - ) + if verbose: + self.model = llama_cpp.llama_load_model_from_file( + self.model_path.encode("utf-8"), self.params + ) + else: + with suppress_stdout_stderr(): + self.model = llama_cpp.llama_load_model_from_file( + self.model_path.encode("utf-8"), self.params + ) assert self.model is not None - self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) + if verbose: + self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) + else: + with suppress_stdout_stderr(): + print("here") + self.ctx = llama_cpp.llama_new_context_with_model( + self.model, self.params + ) assert self.ctx is not None diff --git a/llama_cpp/utils.py b/llama_cpp/utils.py new file mode 100644 index 0000000..c14f53f --- /dev/null +++ b/llama_cpp/utils.py @@ -0,0 +1,38 @@ +import os +import sys + + +class suppress_stdout_stderr(object): + # Oddly enough this works better than the contextlib version + def __enter__(self): + self.outnull_file = open(os.devnull, "w") + self.errnull_file = open(os.devnull, "w") + + self.old_stdout_fileno_undup = sys.stdout.fileno() + self.old_stderr_fileno_undup = sys.stderr.fileno() + + self.old_stdout_fileno = os.dup(sys.stdout.fileno()) + self.old_stderr_fileno = os.dup(sys.stderr.fileno()) + + self.old_stdout = sys.stdout + self.old_stderr = sys.stderr + + os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup) + os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup) + + sys.stdout = self.outnull_file + sys.stderr = self.errnull_file + return self + + def __exit__(self, *_): + sys.stdout = self.old_stdout + sys.stderr = self.old_stderr + + os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) + os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) + + os.close(self.old_stdout_fileno) + os.close(self.old_stderr_fileno) + + self.outnull_file.close() + self.errnull_file.close() From 39978ccaf5b8ca85bc6b72d719e746ea305ad37f Mon Sep 17 00:00:00 2001 From: bretello Date: Thu, 3 Aug 2023 18:22:52 +0200 Subject: [PATCH 04/10] add `mul_mat_q` parameter This also fixes a crash when loading the 70b llama2 model on MacOS with metal and `n_gpu_layers=1` --- llama_cpp/llama_cpp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 423a4a0..bbb2a1e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -181,6 +181,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool low_vram; // if true, reduce VRAM usage at the cost of performance +# bool mul_mat_q; // if true, use experimental mul_mat_q kernels # bool f16_kv; // use fp16 for KV cache # bool logits_all; // the llama_eval() call computes all logits, not just the last one # bool vocab_only; // only load the vocabulary, no weights @@ -203,6 +204,7 @@ class llama_context_params(Structure): ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), + ("mul_mat_q", c_bool), ("f16_kv", c_bool), ("logits_all", c_bool), ("vocab_only", c_bool), From 9f499af6b0253273d03834eac6b36c5767c57d48 Mon Sep 17 00:00:00 2001 From: bretello Date: Thu, 3 Aug 2023 18:23:26 +0200 Subject: [PATCH 05/10] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 41c6741..8183159 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e +Subproject commit 8183159cf3def112f6d1fe94815fce70e1bffa12 From 097fba25e53866beb08d1cff250a00d75e178127 Mon Sep 17 00:00:00 2001 From: Mike Zeng Date: Sat, 5 Aug 2023 02:00:04 -0500 Subject: [PATCH 06/10] Fixed spelling error "lowe-level API" to "low-level API" --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ea1e07f..7c515d0 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggm ## Low-level API The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. -The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). Below is a short example demonstrating how to use the low-level API to tokenize a prompt: From 4cf2fc7d3d2635190f670eff41f0d1e52462f59c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 20:09:55 +0000 Subject: [PATCH 07/10] Bump mkdocs from 1.5.1 to 1.5.2 Bumps [mkdocs](https://github.com/mkdocs/mkdocs) from 1.5.1 to 1.5.2. - [Release notes](https://github.com/mkdocs/mkdocs/releases) - [Commits](https://github.com/mkdocs/mkdocs/compare/1.5.1...1.5.2) --- updated-dependencies: - dependency-name: mkdocs dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1dcbfe6..932f15f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -744,13 +744,13 @@ files = [ [[package]] name = "mkdocs" -version = "1.5.1" +version = "1.5.2" description = "Project documentation with Markdown." 
optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs-1.5.1-py3-none-any.whl", hash = "sha256:67e889f8d8ba1fe5decdfc59f5f8f21d6a8925a129339e93dede303bdea03a98"}, - {file = "mkdocs-1.5.1.tar.gz", hash = "sha256:f2f323c62fffdf1b71b84849e39aef56d6852b3f0a5571552bca32cefc650209"}, + {file = "mkdocs-1.5.2-py3-none-any.whl", hash = "sha256:60a62538519c2e96fe8426654a67ee177350451616118a41596ae7c876bb7eac"}, + {file = "mkdocs-1.5.2.tar.gz", hash = "sha256:70d0da09c26cff288852471be03c23f0f521fc15cf16ac89c7a3bfb9ae8d24f9"}, ] [package.dependencies] @@ -1757,4 +1757,4 @@ server = ["fastapi", "pydantic-settings", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6718d680fa89f9518a232c1110ba43958d3e21c54c4dbd9129effa4f40a02b81" +content-hash = "4bfb67dfb72b02c845376211f7f958b2ece8c985944fbd03d246c858e846ddf6" diff --git a/pyproject.toml b/pyproject.toml index e3fcd0e..c636d5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ pydantic-settings = { version = ">=2.0.1", optional = true } [tool.poetry.group.dev.dependencies] black = "^23.7.0" twine = "^4.0.2" -mkdocs = "^1.4.3" +mkdocs = "^1.5.2" mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.21" pytest = "^7.4.0" From 83f8438c4fc6a3b561c0a6881fa5f46c74d993bf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 20:10:12 +0000 Subject: [PATCH 08/10] Bump fastapi from 0.100.1 to 0.101.0 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.100.1 to 0.101.0. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.100.1...0.101.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot]
---
 poetry.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1dcbfe6..667d88d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -384,17 +384,17 @@ test = ["pytest (>=6)"]
 
 [[package]]
 name = "fastapi"
-version = "0.100.1"
+version = "0.101.0"
 description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "fastapi-0.100.1-py3-none-any.whl", hash = "sha256:ec6dd52bfc4eff3063cfcd0713b43c87640fefb2687bbbe3d8a08d94049cdf32"},
-    {file = "fastapi-0.100.1.tar.gz", hash = "sha256:522700d7a469e4a973d92321ab93312448fbe20fca9c8da97effc7e7bc56df23"},
+    {file = "fastapi-0.101.0-py3-none-any.whl", hash = "sha256:494eb3494d89e8079c20859d7ca695f66eaccc40f46fe8c75ab6186d15f05ffd"},
+    {file = "fastapi-0.101.0.tar.gz", hash = "sha256:ca2ae65fe42f6a34b5cf6c994337149154b1b400c39809d7b2dccdceb5ae77af"},
 ]
 
 [package.dependencies]
-pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<3.0.0"
+pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
 starlette = ">=0.27.0,<0.28.0"
 typing-extensions = ">=4.5.0"
 

From f6a7850e1a316c5168ba51cbdbb669d774cd0c15 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 8 Aug 2023 14:30:58 -0400
Subject: [PATCH 09/10] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 2 ++
 vendor/llama.cpp       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 423a4a0..bbb2a1e 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -181,6 +181,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels
 # bool f16_kv; // use fp16 for KV cache
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool vocab_only; // only load the vocabulary, no weights
@@ -203,6 +204,7 @@ class llama_context_params(Structure):
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
+        ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("vocab_only", c_bool),
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 41c6741..f5bfea0 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e
+Subproject commit f5bfea0580e417f99850d5456ca541d871a3e48c

From d015bdb4f8ab5591a9147443ec3e0d4f1d0a3192 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 8 Aug 2023 14:35:06 -0400
Subject: [PATCH 10/10] Add mul_mat_q option

---
 llama_cpp/llama.py      | 4 ++++
 llama_cpp/server/app.py | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 47f71e9..9a8c090 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -227,6 +227,7 @@ class Llama:
         rope_freq_scale: float = 1.0,
         n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
         rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
+        mul_mat_q: Optional[bool] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -293,6 +294,9 @@ class Llama: if rms_norm_eps is not None: self.params.rms_norm_eps = rms_norm_eps + if mul_mat_q is not None: + self.params.mul_mat_q = mul_mat_q + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4afcfd5..3d5238b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -103,6 +103,10 @@ class Settings(BaseSettings): default=None, description="TEMPORARY", ) + mul_mat_q: Optional[bool] = Field( + default=None, + description="TEMPORARY", + ) class ErrorResponse(TypedDict):
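
Taken together, these patches change how `Llama.__init__` is driven: patch 03 routes llama.cpp's load-time output through the new `suppress_stdout_stderr` helper whenever `verbose=False`, and patch 10 forwards a new optional `mul_mat_q` flag into `llama_context_params`. Below is a minimal usage sketch of the resulting constructor; the model path and the specific parameter values are illustrative placeholders, not values taken from these patches.

    from llama_cpp import Llama

    # Hypothetical GGML model file -- substitute a model that actually exists on disk.
    MODEL_PATH = "/models/llama-2-70b.ggmlv3.q4_0.bin"

    llm = Llama(
        model_path=MODEL_PATH,
        n_gpu_layers=1,   # the Metal case called out in the mul_mat_q patch message
        n_gqa=8,          # (TEMPORARY) required for the llama2 70b models
        mul_mat_q=True,   # opt in to the experimental mul_mat_q kernels; None keeps the default
        verbose=False,    # llama.cpp loading logs are now silenced via suppress_stdout_stderr
    )

    output = llm("Q: Name the planets in the solar system. A: ", max_tokens=64)
    print(output["choices"][0]["text"])

On the server side, `mul_mat_q` is added as an ordinary `Settings` field, so it should be configurable like the other fields (for example through a `MUL_MAT_Q` environment variable under the usual pydantic-settings mapping); that mapping is an assumption based on the existing `Settings` pattern rather than something shown in this diff.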