Merge pull request #604 from aliencaocao/main-1

Add doc string for n_gpu_layers argument and make -1 offload all layers
This commit is contained in:
Andrei 2023-08-14 22:40:10 -04:00 committed by GitHub
commit b99e758045
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -239,6 +239,7 @@ class Llama:
n_ctx: Maximum context size.
n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
seed: Random seed. -1 for random.
+n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
f16_kv: Use half-precision for key/value cache.
logits_all: Return logits for all tokens, not just the last token.
vocab_only: Only load the vocabulary, no weights.
@ -267,7 +268,7 @@ class Llama:
self.params = llama_cpp.llama_context_default_params()
self.params.n_ctx = n_ctx
-self.params.n_gpu_layers = n_gpu_layers
+self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers # 0x7FFFFFFF is INT32 max, will be auto set to all layers
self.params.seed = seed
self.params.f16_kv = f16_kv
self.params.logits_all = logits_all