diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 63c81f1..76b5f7f 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macOS-latest] + os: [ubuntu-20.04, windows-2019, macos-11] steps: - uses: actions/checkout@v3 @@ -23,19 +23,19 @@ jobs: with: python-version: "3.8" - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.12.1 - - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install -e .[all] - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse + uses: pypa/cibuildwheel@v2.16.5 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" + with: + package-dir: . + output-dir: wheelhouse - uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml new file mode 100644 index 0000000..a222dce --- /dev/null +++ b/.github/workflows/build-wheels-cuda.yaml @@ -0,0 +1,131 @@ +name: Build Wheels (CUDA) + +on: workflow_dispatch + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-20.04', 'windows-latest') + 'pyver' = @("3.10", "3.11", "3.12") + 'cuda' = @("12.1.1", "12.2.2", "12.3.2") + 'releasetag' = @("basic") + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: Setup Mamba + uses: conda-incubator/setup-miniconda@v2.2.0 + with: + activate-environment: "build" + python-version: ${{ matrix.pyver }} + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: VS Integration Cache + id: vs-integration-cache + if: runner.os == 'Windows' + uses: actions/cache@v3.3.2 + with: + path: ./MSBuildExtensions + key: cuda-${{ matrix.cuda }}-vs-integration + + - name: Get Visual Studio Integration + if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' + run: | + if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} + $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) + for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} + Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' + & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null + Remove-Item 'cudainstaller.zip' + + - name: Install Visual Studio Integration + if: runner.os == 'Windows' + 
run: | + $y = (gi '.\MSBuildExtensions').fullname + '\*' + (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) + $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') + echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV + + - name: Install Dependencies + env: + MAMBA_DOWNLOAD_FAILFAST: "0" + MAMBA_NO_LOW_SPEED_LIMIT: "1" + run: | + $cudaVersion = $env:CUDAVER + mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion + python -m pip install build wheel + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + if ($IsLinux) { + $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH + } + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all' + $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" + if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' + } + if ($env:AVXVER -eq 'AVX512') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on' + } + if ($env:AVXVER -eq 'basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' + } + python -m build --wheel + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + - uses: softprops/action-gh-release@v1 + with: + files: dist/* + # Set tag_name to -cu + tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml new file mode 100644 index 0000000..2cca477 --- /dev/null +++ b/.github/workflows/build-wheels-metal.yaml @@ -0,0 +1,87 @@ +name: Build Wheels (Metal) + +on: workflow_dispatch + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('macos-11', 'macos-12', 'macos-13') + 'pyver' = @('3.10', '3.11', '3.12') + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ matrix.os }} Python ${{ matrix.pyver }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + env: + OSVER: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: Install Dependencies + run: | + python -m pip install build wheel cmake + + - name: Build Wheel + run: | + XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer" + XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin" + export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on" + [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0" + [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0" + [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0" + + export 
CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64" + VERBOSE=1 python -m build --wheel + + if [[ "$OSVER" == "macos-13" ]]; then + export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" + export MACOSX_DEPLOYMENT_TARGET="14.0" + VERBOSE=1 python -m build --wheel + fi + + for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done + + export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64" + VERBOSE=1 python -m build --wheel + + if [[ "$OSVER" == "macos-13" ]]; then + export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" + export MACOSX_DEPLOYMENT_TARGET="14.0" + VERBOSE=1 python -m build --wheel + fi + + - uses: softprops/action-gh-release@v1 + with: + files: dist/* + # set release name to -metal + tag_name: ${{ github.ref_name }}-metal + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml new file mode 100644 index 0000000..9042d6c --- /dev/null +++ b/.github/workflows/generate-index-from-release.yaml @@ -0,0 +1,48 @@ +name: Wheels Index + +on: + # Trigger on any new release + release: + types: [published] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v4 + - name: Build + run: | + ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' + ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' + ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' + ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + # Upload entire repository + path: 'index' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index a85eaa4..fc4e29b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.59] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c +- feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247 +- fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3 +- fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295 + +## [0.2.58] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c +- feat: add support for KV cache quantization options by @Limour-dev in #1307 +- feat: Add logprobs support to chat completions by @windspirit95 in #1311 +- fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289 +- feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273 +- fix: Changed local API doc references to hosted by by @lawfordp2017 in #1317 + ## [0.2.57] - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 @@ -24,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.2.55] -- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 +- feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5 - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244 ## [0.2.54] diff --git a/README.md b/README.md index 3323f38..c4e194b 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) +[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]() Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. This package provides: @@ -43,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho If this fails, add `--verbose` to the `pip install` see the full cmake build log. 
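Whichever install route is used (the source build described above or the pre-built wheels introduced below), a short import is enough to confirm what actually got installed. This is only a sanity-check sketch; `__version__` is the string bumped in `llama_cpp/__init__.py` later in this patch.

```python
import llama_cpp

# This patch bumps the version string in llama_cpp/__init__.py to "0.2.59"
print(llama_cpp.__version__)
print(llama_cpp.__file__)  # shows which environment the package landed in
```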
+**Pre-built Wheel (New)** + +It is also possible to install a pre-built wheel with basic CPU support. + +```bash +pip install llama-cpp-python \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu +``` + ### Installation Configuration `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list. @@ -99,12 +109,36 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
-<summary>cuBLAS (CUDA)</summary>
+<summary>CUDA</summary>

-To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
+To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:

 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
+```
+
+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:
+
+- CUDA Version is 12.1, 12.2 or 12.3
+- Python Version is 3.10, 3.11 or 3.12
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
+```
+
+Where `<cuda-version>` is one of the following:
+- `cu121`: CUDA 12.1
+- `cu122`: CUDA 12.2
+- `cu123`: CUDA 12.3
+
+For example, to install the CUDA 12.1 wheel:
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 ```
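After installing a CUDA wheel (or a source build with `LLAMA_CUDA=on`), offloading layers at load time is the quickest smoke test. A minimal sketch, assuming a local GGUF model at a placeholder path; `n_gpu_layers=-1` offloads every layer and only helps if the build really has GPU support.

```python
from llama_cpp import Llama

# Placeholder path: point this at any local GGUF model file
llm = Llama(
    model_path="./models/7B/llama-model.gguf",
    n_gpu_layers=-1,  # offload all layers; use 0 to force CPU-only
    verbose=True,     # startup log reports whether the GPU backend was used
)

out = llm("Q: Name the planets in the solar system. A: ", max_tokens=32)
print(out["choices"][0]["text"])
```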
@@ -118,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python ``` +**Pre-built Wheel (New)** + +It is also possible to install a pre-built wheel with Metal support. As long as your system meets some requirements: + +- MacOS Version is 11.0 or later +- Python Version is 3.10, 3.11 or 3.12 + +```bash +pip install llama-cpp-python \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal +``` +
@@ -321,7 +367,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1` ### JSON and JSON Schema Mode -To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion). +To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion). #### JSON Mode @@ -529,7 +575,7 @@ llama = Llama( ### Embeddings -To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding). +To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding). ```python import llama_cpp @@ -568,7 +614,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this: ```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' +CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35 ``` diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 1e802fa..2f5219c 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.57" \ No newline at end of file +__version__ = "0.2.59" \ No newline at end of file diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 22d0bef..79f6543 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -730,12 +730,14 @@ class _LlamaSamplingContext: if len(self.prev) > 0: nl_token = ctx_main.model.token_nl() nl_logit = logits_array[nl_token] - if self.params.penalty_last_n > 0: + last_tokens = self.prev[-self.params.penalty_last_n:] + last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) + if last_tokens_size > 0: + last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens) ctx_main.sample_repetition_penalties( token_data_array, - # TODO: Only create this once - (llama_cpp.llama_token * len(self.prev))(*self.prev), - self.params.penalty_last_n, + last_tokens_p, + last_tokens_size, self.params.penalty_repeat, self.params.penalty_freq, self.params.penalty_present, diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fed84d5..e07d57a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -105,6 +105,9 @@ class Llama: draft_model: Optional[LlamaDraftModel] = None, # Tokenizer Override tokenizer: Optional[BaseLlamaTokenizer] = None, + # KV cache quantization + type_k: Optional[int] = None, + type_v: Optional[int] = None, # Misc verbose: bool = True, # Extra Params @@ -172,6 +175,8 @@ class Llama: draft_model: Optional draft model to use for speculative decoding. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. verbose: Print verbose output to stderr. + type_k: KV cache data type for K (default: f16) + type_v: KV cache data type for V (default: f16) Raises: ValueError: If the model path does not exist. 
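The new `type_k` / `type_v` parameters take ggml type codes, and the `GGML_TYPE_*` constants added to `llama_cpp/llama_cpp.py` further down in this patch can be passed straight through. A minimal sketch with a placeholder model path; q8_0 is chosen purely as an illustration, and whether a given K/V combination is accepted depends on the underlying llama.cpp build.

```python
import llama_cpp
from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/llama-model.gguf",  # placeholder path
    n_ctx=4096,
    # Quantize the KV cache to q8_0 instead of the default f16.
    # The constants are module-level ints mirroring ggml.h's enum ggml_type.
    type_k=llama_cpp.GGML_TYPE_Q8_0,
    type_v=llama_cpp.GGML_TYPE_Q8_0,
)
```

The same two knobs are exposed to the server through `ModelSettings.type_k` / `ModelSettings.type_v` later in this diff.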
@@ -298,7 +303,11 @@ class Llama: ) # Must be set to True for speculative decoding self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - + # KV cache quantization + if type_k is not None: + self.context_params.type_k = type_k + if type_v is not None: + self.context_params.type_v = type_v # Sampling Params self.last_n_tokens_size = last_n_tokens_size @@ -526,14 +535,16 @@ class Llama: # Save tokens self.input_ids[n_past : n_past + n_tokens] = batch # Save logits - rows = n_tokens - cols = self._n_vocab - offset = ( - 0 if self.context_params.logits_all else n_tokens - 1 - ) # NOTE: Only save the last token logits if logits_all is False - self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[ - : - ] = self._ctx.get_logits()[offset * cols : rows * cols] + if self.context_params.logits_all: + rows = n_tokens + cols = self._n_vocab + logits = self._ctx.get_logits()[: rows * cols] + self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits + else: + rows = 1 + cols = self._n_vocab + logits = self._ctx.get_logits()[: rows * cols] + self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits # Update n_tokens self.n_tokens += n_tokens @@ -1653,6 +1664,7 @@ class Llama: top_k=top_k, min_p=min_p, typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, stream=stream, stop=stop, seed=seed, @@ -1723,6 +1735,7 @@ class Llama: n_threads=self.context_params.n_threads, n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, + pooling_type=self.context_params.pooling_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, yarn_ext_factor=self.context_params.yarn_ext_factor, @@ -1732,6 +1745,7 @@ class Llama: yarn_orig_ctx=self.context_params.yarn_orig_ctx, logits_all=self.context_params.logits_all, embedding=self.context_params.embeddings, + offload_kqv=self.context_params.offload_kqv, # Sampling Params last_n_tokens_size=self.last_n_tokens_size, # LoRA Params @@ -1743,51 +1757,17 @@ class Llama: # Chat Format Params chat_format=self.chat_format, chat_handler=self.chat_handler, + # Speculative Decidng + draft_model=self.draft_model, + # KV cache quantization + type_k=self.context_params.type_k, + type_v=self.context_params.type_v, # Misc verbose=self.verbose, ) def __setstate__(self, state): - self.__init__( - model_path=state["model_path"], - # Model Params - n_gpu_layers=state["n_gpu_layers"], - split_mode=state["split_mode"], - main_gpu=state["main_gpu"], - tensor_split=state["tensor_split"], - vocab_only=state["vocab_only"], - use_mmap=state["use_mmap"], - use_mlock=state["use_mlock"], - kv_overrides=state["kv_overrides"], - # Context Params - seed=state["seed"], - n_ctx=state["n_ctx"], - n_batch=state["n_batch"], - n_threads=state["n_threads"], - n_threads_batch=state["n_threads_batch"], - rope_freq_base=state["rope_freq_base"], - rope_freq_scale=state["rope_freq_scale"], - rope_scaling_type=state["rope_scaling_type"], - yarn_ext_factor=state["yarn_ext_factor"], - yarn_attn_factor=state["yarn_attn_factor"], - yarn_beta_fast=state["yarn_beta_fast"], - yarn_beta_slow=state["yarn_beta_slow"], - yarn_orig_ctx=state["yarn_orig_ctx"], - logits_all=state["logits_all"], - embedding=state["embedding"], - # Sampling Params - last_n_tokens_size=state["last_n_tokens_size"], - # LoRA Params - lora_base=state["lora_base"], - lora_path=state["lora_path"], - # Backend Params - numa=state["numa"], - # Chat 
Format Params - chat_format=state["chat_format"], - chat_handler=state["chat_handler"], - # Misc - verbose=state["verbose"], - ) + self.__init__(**state) def save_state(self) -> LlamaState: assert self._ctx.ctx is not None diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ccf4fd0..06cf9ce 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -231,6 +231,7 @@ def _convert_text_completion_to_chat( "role": "assistant", "content": completion["choices"][0]["text"], }, + "logprobs": completion["choices"][0]["logprobs"], "finish_reason": completion["choices"][0]["finish_reason"], } ], @@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat( "delta": { "role": "assistant", }, + "logprobs": None, "finish_reason": None, } ], @@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat( if chunk["choices"][0]["finish_reason"] is None else {} ), + "logprobs": chunk["choices"][0]["logprobs"], "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler( temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, + logprobs: int = 0, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, @@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler( top_k=top_k, min_p=min_p, typical_p=typical_p, + logprobs=logprobs, stream=stream, stop=stop, seed=seed, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1b8f6ca..accc02c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa byref = ctypes.byref # type: ignore +# from ggml.h +# // NOTE: always add types at the end of the enum to keep backward compatibility +# enum ggml_type { +# GGML_TYPE_F32 = 0, +# GGML_TYPE_F16 = 1, +# GGML_TYPE_Q4_0 = 2, +# GGML_TYPE_Q4_1 = 3, +# // GGML_TYPE_Q4_2 = 4, support has been removed +# // GGML_TYPE_Q4_3 = 5, support has been removed +# GGML_TYPE_Q5_0 = 6, +# GGML_TYPE_Q5_1 = 7, +# GGML_TYPE_Q8_0 = 8, +# GGML_TYPE_Q8_1 = 9, +# GGML_TYPE_Q2_K = 10, +# GGML_TYPE_Q3_K = 11, +# GGML_TYPE_Q4_K = 12, +# GGML_TYPE_Q5_K = 13, +# GGML_TYPE_Q6_K = 14, +# GGML_TYPE_Q8_K = 15, +# GGML_TYPE_IQ2_XXS = 16, +# GGML_TYPE_IQ2_XS = 17, +# GGML_TYPE_IQ3_XXS = 18, +# GGML_TYPE_IQ1_S = 19, +# GGML_TYPE_IQ4_NL = 20, +# GGML_TYPE_IQ3_S = 21, +# GGML_TYPE_IQ2_S = 22, +# GGML_TYPE_IQ4_XS = 23, +# GGML_TYPE_I8 = 24, +# GGML_TYPE_I16 = 25, +# GGML_TYPE_I32 = 26, +# GGML_TYPE_I64 = 27, +# GGML_TYPE_F64 = 28, +# GGML_TYPE_IQ1_M = 29, +# GGML_TYPE_COUNT, +# }; +GGML_TYPE_F32 = 0 +GGML_TYPE_F16 = 1 +GGML_TYPE_Q4_0 = 2 +GGML_TYPE_Q4_1 = 3 +GGML_TYPE_Q5_0 = 6 +GGML_TYPE_Q5_1 = 7 +GGML_TYPE_Q8_0 = 8 +GGML_TYPE_Q8_1 = 9 +GGML_TYPE_Q2_K = 10 +GGML_TYPE_Q3_K = 11 +GGML_TYPE_Q4_K = 12 +GGML_TYPE_Q5_K = 13 +GGML_TYPE_Q6_K = 14 +GGML_TYPE_Q8_K = 15 +GGML_TYPE_IQ2_XXS = 16 +GGML_TYPE_IQ2_XS = 17 +GGML_TYPE_IQ3_XXS = 18 +GGML_TYPE_IQ1_S = 19 +GGML_TYPE_IQ4_NL = 20 +GGML_TYPE_IQ3_S = 21 +GGML_TYPE_IQ2_S = 22 +GGML_TYPE_IQ4_XS = 23 +GGML_TYPE_I8 = 24 +GGML_TYPE_I16 = 25 +GGML_TYPE_I32 = 26 +GGML_TYPE_I64 = 27 +GGML_TYPE_F64 = 28 +GGML_TYPE_IQ1_M = 29 +GGML_TYPE_COUNT = 30 # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); @@ -175,8 +239,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -# define LLAMA_SESSION_VERSION 4 
-LLAMA_SESSION_VERSION = 4 +# define LLAMA_SESSION_VERSION 5 +LLAMA_SESSION_VERSION = 5 # struct llama_model; @@ -199,14 +263,18 @@ llama_seq_id = ctypes.c_int32 # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab -# LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece -# LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding -# LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece +# LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback +# LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE +# LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece # }; LLAMA_VOCAB_TYPE_NONE = 0 +"""For models without vocab""" LLAMA_VOCAB_TYPE_SPM = 1 +"""LLaMA tokenizer based on byte-level BPE with byte fallback""" LLAMA_VOCAB_TYPE_BPE = 2 +"""GPT-2 tokenizer based on byte-level BPE""" LLAMA_VOCAB_TYPE_WPM = 3 +"""BERT tokenizer based on WordPiece""" # // note: these values should be synchronized with ggml_rope @@ -274,6 +342,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6 # LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -677,6 +746,7 @@ It might not exist for progress report where '.' is output repeatedly.""" # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored # bool pure; // quantize all tensors to the default type # void * imatrix; // pointer to importance matrix data +# void * kv_overrides; // pointer to vector containing overrides # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -691,6 +761,7 @@ class llama_model_quantize_params(ctypes.Structure): only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type imatrix (ctypes.c_void_p): pointer to importance matrix data + kv_overrides (ctypes.c_void_p): pointer to vector containing overrides """ _fields_ = [ @@ -703,6 +774,7 @@ class llama_model_quantize_params(ctypes.Structure): ("only_copy", ctypes.c_bool), ("pure", ctypes.c_bool), ("imatrix", ctypes.c_void_p), + ("kv_overrides", ctypes.c_void_p), ] @@ -1838,9 +1910,9 @@ def llama_synchronize(ctx: llama_context_p, /): # // Token logits obtained from the last call to llama_decode() -# // The logits for the last token are stored in the last row -# // Logits for which llama_batch.logits[i] == 0 are undefined -# // Rows: n_tokens provided with llama_batch +# // The logits for which llama_batch.logits[i] != 0 are stored contiguously +# // in the order they have appeared in the batch. +# // Rows: number of tokens for which llama_batch.logits[i] != 0 # // Cols: n_vocab # LLAMA_API float * llama_get_logits(struct llama_context * ctx); @ctypes_function( @@ -1859,7 +1931,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: # // Logits for the ith token. Equivalent to: -# // llama_get_logits(ctx) + i*n_vocab +# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab +# // returns NULL for invalid ids. # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_logits_ith", @@ -1874,8 +1947,12 @@ def llama_get_logits_ith( ... 
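On the user-facing side, the logits changes above pair with the new `logprobs` support in chat completions (`llama_chat_format.py` and the server request types elsewhere in this patch). A rough usage sketch with a placeholder model path; per-token logprobs require the model to be created with `logits_all=True`.

```python
from llama_cpp import Llama

# Placeholder model path. logprobs/top_logprobs mirror the OpenAI-style
# fields added to CreateChatCompletionRequest elsewhere in this patch.
llm = Llama(model_path="./models/7B/llama-model.gguf", logits_all=True)

resp = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    logprobs=True,
    top_logprobs=5,
)
print(resp["choices"][0]["logprobs"])
```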
-# // Get all output token embeddings -# // shape: [n_tokens*n_embd] (1-dimensional) +# // Get all output token embeddings. +# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, +# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously +# // in the order they have appeared in the batch. +# // shape: [n_outputs*n_embd] +# // Otherwise, returns NULL. # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) @@ -1886,9 +1963,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ... -# // Get the embeddings for the ith token -# // llama_get_embeddings(ctx) + i*n_embd +# // Get the embeddings for the ith token. Equivalent to: +# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd # // shape: [n_embd] (1-dimensional) +# // returns NULL for invalid ids. # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_embeddings_ith", diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 1b1befe..87e000f 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict): class ChatCompletionResponseChoice(TypedDict): index: int message: "ChatCompletionResponseMessage" + logprobs: Optional[CompletionLogprobs] finish_reason: Optional[str] diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2e1081e..815ed3c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -405,6 +405,18 @@ async def create_chat_completion( } }, }, + "logprobs": { + "summary": "Logprobs", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + "logprobs": True, + "top_logprobs": 10 + }, + }, } ), llama_proxy: LlamaProxy = Depends(get_llama_proxy), @@ -493,7 +505,7 @@ async def tokenize( ) -> TokenizeInputResponse: tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - return {"tokens": tokens} + return TokenizeInputResponse(tokens=tokens) @router.post( @@ -508,7 +520,7 @@ async def count_query_tokens( ) -> TokenizeInputCountResponse: tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - return {"count": len(tokens)} + return TokenizeInputCountResponse(count=len(tokens)) @router.post( @@ -523,4 +535,4 @@ async def detokenize( ) -> DetokenizeInputResponse: text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") - return {"text": text} + return DetokenizeInputResponse(text=text) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index dace8d5..c24fca6 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -175,6 +175,9 @@ class LlamaProxy: chat_handler=chat_handler, # Speculative Decoding draft_model=draft_model, + # KV Cache Quantization + type_k=settings.type_k, + type_v=settings.type_v, # Tokenizer tokenizer=tokenizer, # Misc diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index daa913f..9ebdd0d 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -159,6 +159,15 @@ class ModelSettings(BaseSettings): default=10, description="Number of tokens to predict using the draft model.", ) + # KV Cache Quantization + type_k: Optional[int] = Field( + default=None, + description="Type of 
the key cache quantization.",
+    )
+    type_v: Optional[int] = Field(
+        default=None,
+        description="Type of the value cache quantization.",
+    )
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index c8b2ebc..ce9c87a 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
     presence_penalty: Optional[float] = presence_penalty_field
     frequency_penalty: Optional[float] = frequency_penalty_field
     logit_bias: Optional[Dict[str, float]] = Field(None)
-    logprobs: Optional[int] = Field(None)
     seed: Optional[int] = Field(None)

     # ignored or currently unsupported
@@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    logprobs: Optional[bool] = Field(
+        default=False,
+        description="Whether to output the logprobs or not. Default is False."
+    )
+    top_logprobs: Optional[int] = Field(
+        default=None,
+        ge=0,
+        description="The number of logprobs to generate. If None, no logprobs are generated. Requires logprobs to be set to True.",
+    )
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -268,7 +276,7 @@ class ModelList(TypedDict):

 class TokenizeInputRequest(BaseModel):
     model: Optional[str] = model_field
-    input: Optional[str] = Field(description="The input to tokenize.")
+    input: str = Field(description="The input to tokenize.")

     model_config = {
         "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh
new file mode 100755
index 0000000..00ae567
--- /dev/null
+++ b/scripts/releases-to-pep-503.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Get output directory or default to index/whl/cpu
+output_dir=${1:-"index/whl/cpu"}
+
+# Create output directory
+mkdir -p $output_dir
+
+# Change to output directory
+pushd $output_dir
+
+# Create an index html file
+echo "<!DOCTYPE html>" > index.html
+echo "<html>" >> index.html
+echo "  <head></head>" >> index.html
+echo "  <body>" >> index.html
+echo "    <a href=\"llama-cpp-python/\">llama-cpp-python</a>" >> index.html
+echo "    <br>" >> index.html
+echo "  </body>" >> index.html
+echo "</html>" >> index.html
+echo "" >> index.html
+
+# Create llama-cpp-python directory
+mkdir -p llama-cpp-python
+
+# Change to llama-cpp-python directory
+pushd llama-cpp-python
+
+# Create an index html file
+echo "<!DOCTYPE html>" > index.html
+echo "<html>" >> index.html
+echo "  <body>" >> index.html
+echo "    <h1>Links for llama-cpp-python</h1>" >> index.html
+
+# Get all releases
+releases=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases | jq -r .[].tag_name)
+
+# Get pattern from second arg or default to valid python package version pattern
+pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"}
+
+# Filter releases by pattern
+releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern)
+
+# For each release, get all assets
+for release in $releases; do
+    assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets)
+    echo "    <h2>$release</h2>" >> index.html
+    for asset in $(echo $assets | jq -r .[].browser_download_url); do
+        if [[ $asset == *".whl" ]]; then
+            echo "    <a href=\"$asset\">$asset</a>" >> index.html
+            echo "    <br>" >> index.html
+        fi
+    done
+done
+
+echo "  </body>" >> index.html
+echo "</html>" >> index.html
+echo "" >> index.html
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 50ccaf5..60cdf40 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652
+Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640