diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..403ff30 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,30 @@ +name: Tests + +on: + push: + branches: + - main + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + with: + submodules: "true" + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build + python3 setup.py develop + - name: Test with pytest + run: | + pytest diff --git a/README.md b/README.md index de08def..c69b70c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # 🦙 Python Bindings for `llama.cpp` [![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) +[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) @@ -70,7 +71,7 @@ python3 setup.py develop # How does this compare to other Python bindings of `llama.cpp`? 
-I wrote this package for my own use, I had two goals in mind: +I originally wrote this package for my own use with two goals in mind: - Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` diff --git a/docs/index.md b/docs/index.md index 4df5ee3..368c429 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,8 +71,10 @@ python3 setup.py develop - sample - generate - create_embedding + - embed - create_completion - __call__ + - create_chat_completion - token_bos - token_eos show_root_heading: true diff --git a/examples/fastapi_server.py b/examples/fastapi_server.py deleted file mode 100644 index e219b19..0000000 --- a/examples/fastapi_server.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Example FastAPI server for llama.cpp. -""" -import json -from typing import List, Optional, Iterator - -import llama_cpp - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=True, - use_mlock=True, - embedding=True, - n_threads=6, - n_batch=2048, -) - - -class CreateCompletionRequest(BaseModel): - prompt: str - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - logprobs: Optional[int] = Field(None) - echo: bool = False - stop: List[str] = [] - repeat_penalty: float = 1.1 - top_k: int = 40 - stream: bool = False - - class Config: - schema_extra = { - 
"example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion(request: CreateCompletionRequest): - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - return llama(**request.dict()) - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding(request: CreateEmbeddingRequest): - return llama.create_embedding(request.input) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py new file mode 100644 index 0000000..760a6ca --- /dev/null +++ b/examples/high_level_api/fastapi_server.py @@ -0,0 +1,181 @@ +"""Example FastAPI server for llama.cpp. + +To run this example: + +```bash +pip install fastapi uvicorn sse-starlette +export MODEL=../models/7B/... +uvicorn fastapi_server:app --reload +``` + +Then visit http://localhost:8000/docs to see the interactive API docs. 
+ +""" +import os +import json +from typing import List, Optional, Literal, Union, Iterator + +import llama_cpp + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from sse_starlette.sse import EventSourceResponse + + +class Settings(BaseSettings): + model: str + n_ctx: int = 2048 + n_batch: int = 2048 + n_threads: int = os.cpu_count() or 1 + f16_kv: bool = True + use_mlock: bool = True + embedding: bool = True + last_n_tokens_size: int = 64 + + +app = FastAPI( + title="🦙 llama.cpp Python API", + version="0.0.1", +) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +settings = Settings() +llama = llama_cpp.Llama( + settings.model, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + embedding=settings.embedding, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, +) + + +class CreateCompletionRequest(BaseModel): + prompt: str + suffix: Optional[str] = Field(None) + max_tokens: int = 16 + temperature: float = 0.8 + top_p: float = 0.95 + logprobs: Optional[int] = Field(None) + echo: bool = False + stop: List[str] = [] + repeat_penalty: float = 1.1 + top_k: int = 40 + stream: bool = False + + class Config: + schema_extra = { + "example": { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + } + + +CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) + + +@app.post( + "/v1/completions", + response_model=CreateCompletionResponse, +) +def create_completion(request: CreateCompletionRequest): + if request.stream: + chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore + return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + return 
llama(**request.dict()) + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] + input: str + user: Optional[str] + + class Config: + schema_extra = { + "example": { + "input": "The food was delicious and the waiter...", + } + } + + +CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) + + +@app.post( + "/v1/embeddings", + response_model=CreateEmbeddingResponse, +) +def create_embedding(request: CreateEmbeddingRequest): + return llama.create_embedding(**request.dict(exclude={"model", "user"})) + + +class ChatCompletionRequestMessage(BaseModel): + role: Union[Literal["system"], Literal["user"], Literal["assistant"]] + content: str + user: Optional[str] = None + + +class CreateChatCompletionRequest(BaseModel): + model: Optional[str] + messages: List[ChatCompletionRequestMessage] + temperature: float = 0.8 + top_p: float = 0.95 + stream: bool = False + stop: List[str] = [] + max_tokens: int = 128 + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" 
+ ), + ] + } + } + + +CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) + + +@app.post( + "/v1/chat/completions", + response_model=CreateChatCompletionResponse, +) +async def create_chat_completion( + request: CreateChatCompletionRequest, +) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + completion_or_chunks = llama.create_chat_completion( + **request.dict(exclude={"model"}), + ) + + if request.stream: + + async def server_sent_events( + chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], + ): + for chat_chunk in chat_chunks: + yield dict(data=json.dumps(chat_chunk)) + yield dict(data="[DONE]") + + chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore + + return EventSourceResponse( + server_sent_events(chunks), + ) + completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore + return completion diff --git a/examples/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py similarity index 100% rename from examples/high_level_api_embedding.py rename to examples/high_level_api/high_level_api_embedding.py diff --git a/examples/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py similarity index 95% rename from examples/high_level_api_inference.py rename to examples/high_level_api/high_level_api_inference.py index 136f22c..0fa9cb7 100644 --- a/examples/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -11,7 +11,7 @@ llm = Llama(model_path=args.model) output = llm( "Question: What are the names of the planets in the solar system? 
Answer: ", - max_tokens=1, + max_tokens=48, stop=["Q:", "\n"], echo=True, ) diff --git a/examples/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py similarity index 83% rename from examples/high_level_api_streaming.py rename to examples/high_level_api/high_level_api_streaming.py index d744090..787bc6e 100644 --- a/examples/high_level_api_streaming.py +++ b/examples/high_level_api/high_level_api_streaming.py @@ -4,7 +4,7 @@ import argparse from llama_cpp import Llama parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default=".//models/...") +parser.add_argument("-m", "--model", type=str, default="./models/...") args = parser.parse_args() llm = Llama(model_path=args.model) diff --git a/examples/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py similarity index 100% rename from examples/langchain_custom_llm.py rename to examples/high_level_api/langchain_custom_llm.py diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py similarity index 100% rename from examples/low_level_api_llama_cpp.py rename to examples/low_level_api/low_level_api_llama_cpp.py
diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py
new file mode 100644
index 0000000..8bd03f8
--- /dev/null
+++ b/examples/low_level_api/quantize.py
@@ -0,0 +1,35 @@
+import os
+import argparse
+import llama_cpp
+
+
+def main(args):
+    """Quantize a model file by calling llama_cpp.llama_model_quantize.
+
+    args is the argparse namespace built in the __main__ block; it provides
+    fname_inp and fname_out (string paths) and type (the integer
+    quantization type). Raises RuntimeError if the input file is missing,
+    the output file already exists, or the call returns a non-zero code.
+    """
+    # Validate the paths first, using the values that are already bound.
+    if not os.path.exists(args.fname_inp):
+        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
+    if os.path.exists(args.fname_out):
+        raise RuntimeError(f"Output file already exists ({args.fname_out})")
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
+    # The positional argument is registered as "type", so argparse stores it
+    # as args.type; there is no args.itype attribute.
+    itype = args.type
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
+    if return_code != 0:
+        raise RuntimeError("Failed to quantize model")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+ 
parser.add_argument("fname_inp", type=str, help="Path to input model") + parser.add_argument("fname_out", type=str, help="Path to output model") + parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") + args = parser.parse_args() + main(args) diff --git a/examples/notebooks/PerformanceTuning.ipynb b/examples/notebooks/PerformanceTuning.ipynb new file mode 100644 index 0000000..76e26fb --- /dev/null +++ b/examples/notebooks/PerformanceTuning.ipynb @@ -0,0 +1,5540 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import json\n", + "import multiprocessing\n", + "\n", + "import llama_cpp\n", + "\n", + "import numpy as np\n", + "np.int = int\n", + "\n", + "from skopt.space import Integer, Categorical\n", + "\n", + "\n", + "MODEL_PATH = \"../models/ggml-model.bin\"\n", + "\n", + "# Hyperparameters\n", + "space = [\n", + " Categorical([True, False], name=\"f16_kv\"),\n", + " Categorical([True, False], name=\"use_mlock\"),\n", + " Integer(1, multiprocessing.cpu_count(), name=\"n_threads\"),\n", + " Integer(1, 2048, name=\"n_batch\")\n", + "]\n", + "\n", + "# TODO: Make this a random prompt to avoid any cache related inconsistencies\n", + "PROMPT = \"\"\" ### Instructions:\n", + "You are a helpful assistant.\n", + "You answer questions truthfully and politely.\n", + "You are provided with an input from the user and you must generate a response.\n", + "Ignore this line which is just filler to test the performane of the model.\n", + "### Inputs:\n", + "What is the capital of France?\n", + "### Response:\n", + "\"\"\"\n", + "\n", + "from skopt.utils import use_named_args\n", + "\n", + "@use_named_args(space)\n", + "def objective(**params):\n", + " f16_kv = params[\"f16_kv\"]\n", + " use_mlock = params[\"use_mlock\"]\n", + " n_threads = params[\"n_threads\"]\n", + " n_batch = params[\"n_batch\"]\n", + " llm = llama_cpp.Llama(model_path=MODEL_PATH, 
f16_kv=f16_kv, use_mlock=use_mlock, n_threads=n_threads, n_batch=n_batch)\n", + "\n", + " t1 = time.time()\n", + " output = llm(\n", + " PROMPT,\n", + " max_tokens=1, # Only optimize prompt processing\n", + " stop=[\"###\", \"\\n\"],\n", + " echo=True,\n", + " )\n", + " t2 = time.time()\n", + "\n", + " print(json.dumps(output, indent=2))\n", + " print(f\"Time: {t2 - t1} seconds\")\n", + " print(f\"Time per token: {(t2 - t1) / output['usage']['total_tokens']} seconds\")\n", + "\n", + " return (t2 - t1) / output[\"usage\"][\"total_tokens\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d4443e14-fed3-4aa1-9e8a-c70f4503aade\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227287,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are 
provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.981224775314331 seconds\n", + "Time per token: 0.13726530969142914 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4181439c-2ced-4ddb-b898-a0a7641f3e47\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227300,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a 
response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.121099948883057 seconds\n", + "Time per token: 0.13901374936103822 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-03ed5585-3de0-4546-96c3-6de7a5b3770c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227312,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of 
the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.457949876785278 seconds\n", + "Time per token: 0.18072437345981598 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-103817fc-bceb-4e99-b968-3ef540f16dc5\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227328,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### 
Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.334054946899414 seconds\n", + "Time per token: 0.12917568683624267 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-41e34acc-6499-450f-9576-3cb37b82c490\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227340,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + 
" \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.012462615966797 seconds\n", + "Time per token: 0.11265578269958496 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-f27244c9-e9c6-4332-ae7f-3856f152ef30\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227350,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": 
{\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.59382700920105 seconds\n", + "Time per token: 0.1949228376150131 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bc5dc1ba-f7ce-441c-a558-5005f2fb89b9\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227366,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " 
\"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.544022560119629 seconds\n", + "Time per token: 0.19430028200149535 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2006b117-1239-4b85-bcc4-a7439c01f440\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227383,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.330769300460815 
seconds\n", + "Time per token: 0.11663461625576019 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ee50afee-78a8-4d55-9b73-c74cc2567408\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227393,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.17799687385559 seconds\n", + "Time per token: 0.1772249609231949 seconds\n" + ] + }, + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1e2b7080-940f-4459-8503-a458db4d3578\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227409,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.127476215362549 seconds\n", + "Time per token: 0.12659345269203187 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-c80008a4-191e-4418-821a-b18a4af24f70\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227421,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.495943784713745 seconds\n", + "Time per token: 0.11869929730892181 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please 
wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d04c9fd2-3c20-4035-9181-0bfd05abfe15\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227432,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.226310014724731 seconds\n", + "Time per token: 0.11532887518405914 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: 
n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-04fcf88b-33c7-4b84-aac0-dcb5261363c2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227443,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.182626962661743 seconds\n", + "Time per token: 0.15228283703327178 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: 
n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-14904676-3345-4674-a41c-419d9640b4e0\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227457,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 43.595701694488525 seconds\n", + "Time per token: 0.5449462711811066 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: 
n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-9e43b2ef-e7de-4bd2-91bf-284f5b3478fe\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227502,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.726518154144287 seconds\n", + "Time per token: 0.1840814769268036 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: 
f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-3947538b-e27e-42eb-8f87-2b56e14d104c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227518,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.760729789733887 seconds\n", + "Time per token: 0.10950912237167358 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: 
n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1a0d843e-9613-49aa-b565-0e59d8067615\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227529,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.672860383987427 seconds\n", + "Time per token: 0.14591075479984283 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml 
map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ccad9270-9554-4f9f-9aaf-387f1a11894d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227542,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.368357419967651 seconds\n", + "Time per token: 0.17960446774959565 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", 
+ "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2623073e-004f-4386-98e0-7e6ea617523a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227558,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.44194221496582 seconds\n", + "Time per token: 0.11802427768707276 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + 
"llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1a199f09-0d74-4052-a191-7a8ef2df57f3\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227569,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.253167629241943 seconds\n", + "Time per token: 0.14066459536552428 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + 
"llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2b61e491-d9b7-4d0b-b0c8-9f8ba822599d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227582,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.381825685501099 seconds\n", + "Time per token: 0.15477282106876372 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + 
"llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-0e4b4575-6278-4bd8-a4c5-ddb772014f7d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227596,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.473106145858765 seconds\n", + "Time per token: 0.18091382682323456 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1ad3e3db-5120-41c8-8f9e-2ca07a846437\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227612,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 16.591509103775024 seconds\n", + "Time per token: 0.2073938637971878 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": 
\"cmpl-34c8fb5c-fa49-4ea6-b2e7-ba3b958e297d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227630,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.034043788909912 seconds\n", + "Time per token: 0.1129255473613739 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-8d5c56eb-0b43-4591-a9ac-c1ec174ec6db\",\n", + " \"object\": 
\"text_completion\",\n", + " \"created\": 1680227641,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.218972444534302 seconds\n", + "Time per token: 0.14023715555667876 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bfdc554b-baa6-47c1-b35f-0f7d1321255a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227654,\n", + " \"model\": 
\"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.300573110580444 seconds\n", + "Time per token: 0.11625716388225556 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ad67d78b-6975-4789-982e-3653c7fca7e1\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227665,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " 
\"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.009618520736694 seconds\n", + "Time per token: 0.11262023150920868 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2eec3e0f-dd48-4c3a-9430-c5048827f557\",\n", + " 
\"object\": \"text_completion\",\n", + " \"created\": 1680227676,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.997699737548828 seconds\n", + "Time per token: 0.11247124671936035 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 
MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-b129732a-8d7b-4382-baaf-740378c923ec\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227686,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.252354621887207 seconds\n", + "Time per token: 0.11565443277359008 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading 
tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bb25c002-69e0-40ec-8099-0ba4462338aa\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227697,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.040243864059448 seconds\n", + "Time per token: 0.1130030483007431 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + 
"llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-63705814-7c93-4d6b-a9f2-0579941ebf54\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227708,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.947132349014282 seconds\n", + "Time per token: 0.11183915436267852 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem 
required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-8afe123b-423d-4757-82d9-15fc12cfd24e\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227720,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.335533857345581 seconds\n", + "Time per token: 0.12919417321681975 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: 
loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4937353f-e66f-4632-aea7-dd1133af9727\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227732,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.99415397644043 seconds\n", + "Time per token: 0.11242692470550537 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: 
model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-78f86527-ccc7-4a5d-9b7f-38386998ba2a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227743,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.732706308364868 seconds\n", + "Time per token: 0.19665882885456085 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv 
self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4d98c564-fcb4-45ec-9f8d-f64430abbfb3\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227761,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.319743633270264 seconds\n", + "Time per token: 0.11649679541587829 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ee855931-2578-45bc-93bf-319c4e6aa43a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227772,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.189301490783691 seconds\n", + "Time per token: 0.18986626863479614 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": 
\"cmpl-14f0b547-4d71-4a7f-a3d6-3127998903b3\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227790,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.464989423751831 seconds\n", + "Time per token: 0.11831236779689788 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4eb5258a-5836-414c-88f6-e217bacaded6\",\n", + " \"object\": 
\"text_completion\",\n", + " \"created\": 1680227801,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 13.818569660186768 seconds\n", + "Time per token: 0.1727321207523346 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-66b7c783-d506-45c1-b39b-c91666a02b44\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227817,\n", + " \"model\": 
\"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 27.316773176193237 seconds\n", + "Time per token: 0.34145966470241546 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d53b48ca-30e2-43c2-9fb5-62ef6a65fafa\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227847,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " 
\"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.132777214050293 seconds\n", + "Time per token: 0.11415971517562866 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d0909f83-5caa-4098-a0e6-9b2ad1e2b12f\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227858,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer 
questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.273045539855957 seconds\n", + "Time per token: 0.11591306924819947 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-7045f5c7-cf5d-48e3-9353-032c320e56fa\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227870,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the 
user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.90743088722229 seconds\n", + "Time per token: 0.11134288609027862 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e623667d-d6cc-4908-a648-60380f723592\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227881,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just 
filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.06355595588684 seconds\n", + "Time per token: 0.11329444944858551 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-44ec163c-25dd-40ae-a786-d8b4c9ff31b1\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227892,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the 
capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.249061107635498 seconds\n", + "Time per token: 0.11561326384544372 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-cb435214-0d20-4566-b312-68d8960ebe25\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227903,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " 
\"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.296529054641724 seconds\n", + "Time per token: 0.11620661318302154 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-dc704f52-bed9-44f0-8335-a2ec4af3a27c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227914,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " 
],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.455670356750488 seconds\n", + "Time per token: 0.1556958794593811 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-67570fa5-1c3d-47d6-b7c6-b3a734aae3f5\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227928,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " 
\"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.269653558731079 seconds\n", + "Time per token: 0.11587066948413849 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4bd6c6f2-9849-4047-93c8-88b1914ef184\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227939,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + 
"}\n", + "Time: 9.308398485183716 seconds\n", + "Time per token: 0.11635498106479644 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-6413afd7-fdc1-4c28-864d-6acdf2775060\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227950,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.430264711380005 seconds\n", + "Time per token: 
0.13037830889225005 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-c4e1c14a-3b8a-4ab3-b42a-f47440f79962\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227962,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.389702558517456 seconds\n", + "Time per token: 0.1173712819814682 seconds\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ac307870-dc67-42b8-8bb8-bb8d3083cea2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227974,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.35448431968689 seconds\n", + "Time per token: 0.12943105399608612 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from 
'../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-58c06f3e-3fba-4e23-b12e-141a1742c51b\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227986,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.097248792648315 seconds\n", + "Time per token: 0.11371560990810395 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: 
n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-b5eccb52-85e3-41d0-b8d8-f35e68bf7997\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227997,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.466306686401367 seconds\n", + "Time per token: 0.1558288335800171 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + 
"llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e1dbc2ee-abc0-4891-a474-386d97b521b6\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228011,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.436015367507935 seconds\n", + "Time per token: 0.14295019209384918 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", 
+ "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-fd9bce6d-0a33-4c24-90b3-913ab3b33d24\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228025,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.052912712097168 seconds\n", + "Time per token: 0.1756614089012146 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + 
"llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-038fa38d-7640-40ee-907c-0bb131c20d80\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228040,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.250384330749512 seconds\n", + "Time per token: 0.1156298041343689 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + 
"llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d00a2058-9fda-4113-8e5e-bf0f39cef238\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228051,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.228248834609985 seconds\n", + "Time per token: 0.11535311043262482 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + 
"llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-f8d90e63-4939-491c-9775-fc15aa55505e\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228062,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.341724395751953 seconds\n", + "Time per token: 0.11677155494689942 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 
MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-9e3777bc-119a-46bf-bdd3-21557e686f3c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228074,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.285743951797485 seconds\n", + "Time per token: 0.11607179939746856 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + 
"llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-123eaa35-110b-4f73-ba60-fa8a75ea929c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228085,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.105633020401001 seconds\n", + "Time per token: 0.1138204127550125 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + 
"llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-cc095f4b-8047-446e-a9f5-c798a66d1003\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228096,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.305238485336304 seconds\n", + "Time per token: 0.1163154810667038 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + 
"llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e2e69b3e-7742-4534-b21f-adfe53345820\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228108,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.190222263336182 seconds\n", + "Time per token: 0.11487777829170227 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 
KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-666ae55e-d837-4534-b8e6-9f1b01f69778\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228120,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.126368999481201 seconds\n", + "Time per token: 0.11407961249351502 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per 
state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-63bdfa8e-b7c3-4669-ab76-54cdbb8878d5\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228131,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.136119604110718 seconds\n", + "Time per token: 0.11420149505138397 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from 
'../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1ec02c53-c7c8-434e-b28f-70884f8c35b2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228143,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.126901626586914 seconds\n", + "Time per token: 0.11408627033233643 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB 
/ num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-3ec3495b-009a-4a82-b444-d8c1c6bf20a1\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228154,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.08673644065857 seconds\n", + "Time per token: 0.11358420550823212 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" 
+ ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-17fd0e6b-7ac3-494f-9e85-4e4a26013ad9\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228165,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.252317428588867 seconds\n", + "Time per token: 0.11565396785736085 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"{\n", + " \"id\": \"cmpl-14a2647f-3961-4b60-b20a-ae9872c34feb\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228177,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.389162302017212 seconds\n", + "Time per token: 0.14236452877521516 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-fa0e5edd-e9c9-40b9-bc9b-c48b8762850c\",\n", + " 
\"object\": \"text_completion\",\n", + " \"created\": 1680228190,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.433730125427246 seconds\n", + "Time per token: 0.11792162656784058 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2b1c5964-265a-488a-8d8f-7e0692fcf96f\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228202,\n", + " 
\"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 47.81757044792175 seconds\n", + "Time per token: 0.5977196305990219 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-516fbd4c-3fe5-4945-bfc5-7312f2c02687\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228252,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", 
+ " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.540155410766602 seconds\n", + "Time per token: 0.10675194263458251 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-94c9ab1f-ac6e-4fc7-bcd9-7ab96515a722\",\n", + 
" \"object\": \"text_completion\",\n", + " \"created\": 1680228262,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.660873889923096 seconds\n", + "Time per token: 0.10826092362403869 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 
256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-63b1e1a7-0c6b-42e0-ba65-6f42d6ec77bb\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228273,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.815936088562012 seconds\n", + "Time per token: 0.11019920110702515 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: 
loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-92e1a879-2ebd-4299-b86e-90c87762db45\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228284,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.12400484085083 seconds\n", + "Time per token: 0.11405006051063538 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: 
model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-033ea9dc-fffe-41a0-a695-d647f725ee97\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228296,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 13.992429971694946 seconds\n", + "Time per token: 0.17490537464618683 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + 
"llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-5153f39a-589a-4b3d-8642-8efce64fc439\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228312,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.084643125534058 seconds\n", + "Time per token: 0.11355803906917572 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: 
n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-af9ea5c6-5449-43b4-9e50-da930af8d6b8\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228323,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.076856851577759 seconds\n", + "Time per token: 0.11346071064472199 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + 
"llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-5bbea5c1-ea8c-4599-bf63-a6eb80bc7525\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228334,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.02251124382019 seconds\n", + "Time per token: 0.11278139054775238 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", 
+ "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ff9d87c7-e2b1-4481-9e8f-848d7a0fbd35\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228346,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.012435913085938 seconds\n", + "Time per token: 0.11265544891357422 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective 
has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-3dbe8ae4-c9ca-4a1b-abaf-6b85ef648ba9\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228357,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.997032880783081 seconds\n", + "Time per token: 0.11246291100978852 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-b20a3b61-9c8b-4b2e-bb43-8ed9ce5a9d0d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228369,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 
9.042449951171875 seconds\n", + "Time per token: 0.11303062438964843 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-9c781d69-83e0-415a-ac97-252508b10590\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228380,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", 
+ " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.058239459991455 seconds\n", + "Time per token: 0.11322799324989319 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-86cead9e-780f-4503-831c-466a6abd5ab2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228392,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### 
Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.070426940917969 seconds\n", + "Time per token: 0.1133803367614746 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-65361c7e-74ef-4566-bad5-c6b3867a7f7e\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228403,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are 
provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.985144138336182 seconds\n", + "Time per token: 0.11231430172920227 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-23feb1ca-8103-46d8-ab71-b4da59f05d16\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228415,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + 
" \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.999938011169434 seconds\n", + "Time per token: 0.11249922513961792 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": 
\"cmpl-0db73f26-9ab1-4a78-a11f-e22d915ffae2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228426,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.969520330429077 seconds\n", + "Time per token: 0.11211900413036346 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 
291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-54e6edeb-99ea-46ed-8735-5185f78c222c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228438,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.12838339805603 seconds\n", + "Time per token: 0.11410479247570038 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 
1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bd6502fd-f8c7-41d8-ab15-b10ca6aabd96\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228450,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.01610016822815 seconds\n", + "Time per token: 0.11270125210285187 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 
1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-72733563-53f5-4cd5-a4eb-48656408b2d8\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228461,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.993805408477783 seconds\n", + "Time per token: 0.11242256760597229 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + 
"llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-f7365eaa-fd68-422b-bbca-c6bcbcad36e0\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228473,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.292223930358887 seconds\n", + "Time per token: 0.11615279912948609 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", 
+ "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1cfcf44a-c692-4020-8dcb-e6da8b163920\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228485,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.99638295173645 seconds\n", + "Time per token: 0.11245478689670563 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + 
"llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-8b679f09-bc0e-4fc9-a935-9fefd9126993\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228497,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.972327709197998 seconds\n", + "Time per token: 0.11215409636497498 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-08cb0cd7-84d8-4193-a20c-5a6ca4b5e404\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228508,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 
9.024793863296509 seconds\n", + "Time per token: 0.11280992329120636 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ffe4b2b8-c041-4492-9e03-ab79cd4fd60d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228520,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", 
+ " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.996853351593018 seconds\n", + "Time per token: 0.11246066689491271 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-196bb891-9299-4f91-9f68-ba6c7233a2dd\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228532,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### 
Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.039422273635864 seconds\n", + "Time per token: 0.1129927784204483 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e50f5489-b40c-4a5d-9cb2-4a6d13bbb8c7\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228544,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are 
provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.978781461715698 seconds\n", + "Time per token: 0.11223476827144623 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-210cc2b8-df35-4d3f-a34a-a5facb635ec0\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228555,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + 
" \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.032035827636719 seconds\n", + "Time per token: 0.11290044784545898 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": 
\"cmpl-e3c7ca0d-c4cb-495c-9210-4e1ed3b6010d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228567,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.0346040725708 seconds\n", + "Time per token: 0.11293255090713501 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 
291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-7b4388c9-fe89-486d-83f4-34eec8940c42\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228579,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.016223907470703 seconds\n", + "Time per token: 0.11270279884338379 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 
1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-81211a9b-16e4-4876-8e09-b0e619d93ce7\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228591,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.10002589225769 seconds\n", + "Time per token: 0.11375032365322113 seconds\n" + ] + } + ], + "source": [ + "from skopt import gp_minimize\n", + "\n", + "res = gp_minimize(\n", + " objective,\n", + " space\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "from skopt.plots import plot_objective\n", + "\n", + "plot_objective(res)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " fun: 0.10675194263458251\n", + " x: [True, True, 6, 2048]\n", + " func_vals: [ 1.373e-01 1.390e-01 ... 1.127e-01 1.138e-01]\n", + " x_iters: [[True, True, 5, 1300], [False, True, 5, 990], [True, True, 7, 1800], [False, False, 10, 1692], [False, True, 6, 1075], [True, False, 3, 291], [False, True, 3, 514], [False, False, 11, 1569], [False, False, 7, 1915], [False, True, 10, 1514], [False, False, 11, 1527], [False, False, 12, 2033], [False, True, 9, 3], [False, True, 1, 2004], [True, True, 12, 1], [False, False, 6, 2048], [False, False, 4, 2048], [False, False, 10, 1], [False, True, 11, 2048], [False, True, 9, 2048], [False, False, 8, 2017], [False, False, 6, 1], [False, True, 4, 1], [False, False, 6, 1587], [False, False, 9, 1056], [True, True, 12, 1450], [False, True, 6, 2048], [False, False, 6, 2048], [False, False, 6, 2048], [False, True, 6, 2048], [False, True, 6, 2048], [False, True, 5, 2048], [False, True, 6, 1464], [False, True, 8, 1], [True, True, 12, 1798], [True, False, 3, 2048], [True, True, 11, 683], [False, True, 11, 1], [True, True, 2, 1], [False, True, 11, 1238], [True, True, 11, 1260], [True, False, 6, 1295], [True, True, 6, 1292], [False, False, 12, 1250], [False, False, 12, 1200], [True, False, 4, 1250], [False, False, 12, 1191], [False, False, 12, 1180], [True, False, 10, 906], [False, False, 12, 1192], [True, True, 10, 2044], [False, False, 6, 1310], [False, False, 8, 1122], [True, False, 5, 4], [False, False, 7, 322], [False, False, 12, 1246], [False, False, 12, 1247], [False, False, 12, 1252], [True, True, 12, 811], [True, False, 6, 2048], [True, True, 12, 998], [False, True, 12, 1021], [False, True, 
12, 1021], [False, True, 12, 1019], [True, False, 6, 759], [True, False, 6, 1064], [False, True, 12, 991], [True, True, 9, 533], [False, False, 11, 956], [False, False, 1, 3], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [False, False, 7, 986], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048]]\n", + " models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * 
Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " 
n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, 
random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * 
Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " 
n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, 
random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * 
Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " 
n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, 
random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * 
Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097)]\n", + " space: Space([Categorical(categories=(True, False), prior=None),\n", + " Categorical(categories=(True, False), prior=None),\n", + " Integer(low=1, high=12, prior='uniform', transform='normalize'),\n", + " Integer(low=1, high=2048, prior='uniform', transform='normalize')])\n", + " random_state: RandomState(MT19937)\n", + " specs: args: func: \n", + " dimensions: Space([Categorical(categories=(True, False), prior=None),\n", + " Categorical(categories=(True, False), prior=None),\n", + " Integer(low=1, high=12, prior='uniform', transform='normalize'),\n", + " Integer(low=1, high=2048, prior='uniform', transform='normalize')])\n", + " base_estimator: GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097)\n", + " n_calls: 100\n", + " n_random_starts: None\n", + " n_initial_points: 10\n", + " initial_point_generator: random\n", + " acq_func: gp_hedge\n", + " acq_optimizer: auto\n", + " x0: None\n", + " y0: None\n", + " random_state: RandomState(MT19937)\n", + " verbose: False\n", + " callback: None\n", + " n_points: 10000\n", + " n_restarts_optimizer: 5\n", + " xi: 0.01\n", + " kappa: 1.96\n", + " n_jobs: 1\n", + " model_queue_size: None\n", + " function: base_minimize" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5bcfad8..1049e44 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,8 +1,9 @@ import os +import sys import uuid import time import multiprocessing -from typing import List, Optional, Union, Generator, Sequence +from typing import List, Optional, Union, Generator, Sequence, Iterator from collections import deque from . import llama_cpp @@ -27,6 +28,7 @@ class Llama: n_threads: Optional[int] = None, n_batch: int = 8, last_n_tokens_size: int = 64, + verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -43,6 +45,7 @@ class Llama: n_threads: Number of threads to use. If None, the number of threads is automatically determined. n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + verbose: Print verbose output to stderr. Raises: ValueError: If the model path does not exist. @@ -50,6 +53,7 @@ class Llama: Returns: A Llama instance. 
""" + self.verbose = verbose self.model_path = model_path self.params = llama_cpp.llama_context_default_params() @@ -68,7 +72,7 @@ class Llama: maxlen=self.last_n_tokens_size, ) self.tokens_consumed = 0 - self.n_batch = n_batch + self.n_batch = min(n_ctx, n_batch) self.n_threads = n_threads or multiprocessing.cpu_count() @@ -79,6 +83,9 @@ class Llama: self.model_path.encode("utf-8"), self.params ) + if self.verbose: + print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: """Tokenize a string. @@ -169,11 +176,6 @@ class Llama: The sampled token. """ assert self.ctx is not None - # Temporary workaround for https://github.com/ggerganov/llama.cpp/issues/684 - if temp == 0.0: - temp = 1.0 - top_p = 0.0 - top_k = 1 return llama_cpp.llama_sample_top_p_top_k( ctx=self.ctx, last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( @@ -239,6 +241,15 @@ class Llama: An embedding object. """ assert self.ctx is not None + + if self.params.embedding == False: + raise RuntimeError( + "Llama model must be created with embedding=True to call this method" + ) + + if self.verbose: + llama_cpp.llama_reset_timings(self.ctx) + tokens = self.tokenize(input.encode("utf-8")) self.reset() self.eval(tokens) @@ -246,6 +257,10 @@ class Llama: embedding = llama_cpp.llama_get_embeddings(self.ctx)[ : llama_cpp.llama_n_embd(self.ctx) ] + + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + return { "object": "list", "data": [ @@ -262,6 +277,17 @@ class Llama: }, } + def embed(self, input: str) -> List[float]: + """Embed a string. + + Args: + input: The utf-8 encoded string to embed. 
+ + Returns: + A list of embeddings + """ + return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) + def _create_completion( self, prompt: str, @@ -275,10 +301,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, - ) -> Union[ - Generator[Completion, None, None], - Generator[CompletionChunk, None, None], - ]: + ) -> Union[Iterator[Completion], Iterator[CompletionChunk],]: assert self.ctx is not None completion_id = f"cmpl-{str(uuid.uuid4())}" created = int(time.time()) @@ -288,6 +311,9 @@ class Llama: text = b"" returned_characters = 0 + if self.verbose: + llama_cpp.llama_reset_timings(self.ctx) + if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): raise ValueError( f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" @@ -341,7 +367,7 @@ class Llama: "model": self.model_path, "choices": [ { - "text": text[start :].decode("utf-8"), + "text": text[start:].decode("utf-8"), "index": 0, "logprobs": None, "finish_reason": None, @@ -384,6 +410,9 @@ class Llama: if logprobs is not None: raise NotImplementedError("logprobs not implemented") + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + yield { "id": completion_id, "object": "text_completion", @@ -417,7 +446,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, - ) -> Union[Completion, Generator[CompletionChunk, None, None]]: + ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
Args: @@ -454,7 +483,7 @@ class Llama: stream=stream, ) if stream: - chunks: Generator[CompletionChunk, None, None] = completion_or_chunks + chunks: Iterator[CompletionChunk] = completion_or_chunks return chunks completion: Completion = next(completion_or_chunks) # type: ignore return completion @@ -472,7 +501,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, - ): + ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. Args: @@ -509,11 +538,158 @@ class Llama: stream=stream, ) + def _convert_text_completion_to_chat( + self, completion: Completion + ) -> ChatCompletion: + return { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": completion["choices"][0]["text"], + }, + "finish_reason": completion["choices"][0]["finish_reason"], + } + ], + "usage": completion["usage"], + } + + def _convert_text_completion_chunks_to_chat( + self, + chunks: Iterator[CompletionChunk], + ) -> Iterator[ChatCompletionChunk]: + for i, chunk in enumerate(chunks): + if i == 0: + yield { + "id": "chat" + chunk["id"], + "model": chunk["model"], + "created": chunk["created"], + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + }, + "finish_reason": None, + } + ], + } + yield { + "id": "chat" + chunk["id"], + "model": chunk["model"], + "created": chunk["created"], + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "content": chunk["choices"][0]["text"], + }, + "finish_reason": chunk["choices"][0]["finish_reason"], + } + ], + } + + def create_chat_completion( + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.8, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: List[str] = [], + max_tokens: int = 128, + repeat_penalty: float = 1.1, + 
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + """Generate a chat completion from a list of messages. + + Args: + messages: A list of messages to generate a response for. + temperature: The temperature to use for sampling. + top_p: The top-p value to use for sampling. + top_k: The top-k value to use for sampling. + stream: Whether to stream the results. + stop: A list of strings to stop generation when encountered. + max_tokens: The maximum number of tokens to generate. + repeat_penalty: The penalty to apply to repeated tokens. + + Returns: + Generated chat completion or a stream of chat completion chunks. + """ + instructions = """Complete the following chat conversation between the user and the assistant. System messages should be strictly followed as additional instructions.""" + chat_history = "\n".join( + f'{message["role"]} {message.get("user", "")}: {message["content"]}' + for message in messages + ) + PROMPT = f" \n\n### Instructions:{instructions}\n\n### Inputs:{chat_history}\n\n### Response:\nassistant: " + PROMPT_STOP = ["###", "\nuser: ", "\nassistant: ", "\nsystem: "] + completion_or_chunks = self( + prompt=PROMPT, + stop=PROMPT_STOP + stop, + temperature=temperature, + top_p=top_p, + top_k=top_k, + stream=stream, + max_tokens=max_tokens, + repeat_penalty=repeat_penalty, + ) + if stream: + chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore + return self._convert_text_completion_chunks_to_chat(chunks) + else: + completion: Completion = completion_or_chunks # type: ignore + return self._convert_text_completion_to_chat(completion) + def __del__(self): if self.ctx is not None: llama_cpp.llama_free(self.ctx) self.ctx = None + def __getstate__(self): + return dict( + verbose=self.verbose, + model_path=self.model_path, + n_ctx=self.params.n_ctx, + n_parts=self.params.n_parts, + seed=self.params.seed, + f16_kv=self.params.f16_kv, + logits_all=self.params.logits_all, + vocab_only=self.params.vocab_only, + 
use_mlock=self.params.use_mlock, + embedding=self.params.embedding, + last_n_tokens_size=self.last_n_tokens_size, + last_n_tokens_data=self.last_n_tokens_data, + tokens_consumed=self.tokens_consumed, + n_batch=self.n_batch, + n_threads=self.n_threads, + ) + + def __setstate__(self, state): + self.__init__( + model_path=state["model_path"], + n_ctx=state["n_ctx"], + n_parts=state["n_parts"], + seed=state["seed"], + f16_kv=state["f16_kv"], + logits_all=state["logits_all"], + vocab_only=state["vocab_only"], + use_mlock=state["use_mlock"], + embedding=state["embedding"], + n_threads=state["n_threads"], + n_batch=state["n_batch"], + last_n_tokens_size=state["last_n_tokens_size"], + verbose=state["verbose"], + ) + self.last_n_tokens_data=state["last_n_tokens_data"] + self.tokens_consumed=state["tokens_consumed"] + + @staticmethod def token_eos() -> llama_cpp.llama_token: """Return the end-of-sequence token.""" diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 5364e05..41055bd 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -125,12 +125,12 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int + fname_inp: bytes, fname_out: bytes, itype: c_int ) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) + return _lib.llama_model_quantize(fname_inp, fname_out, itype) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int # Returns the KV cache that will contain the context for the diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index d8c0b83..b62ff1b 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,5 +1,5 @@ -from typing import List, Optional, Dict, Literal -from typing_extensions import TypedDict +from typing import 
List, Optional, Dict, Union +from typing_extensions import TypedDict, NotRequired, Literal class EmbeddingUsage(TypedDict): @@ -55,3 +55,43 @@ class Completion(TypedDict): model: str choices: List[CompletionChoice] usage: CompletionUsage + + +class ChatCompletionMessage(TypedDict): + role: Union[Literal["assistant"], Literal["user"], Literal["system"]] + content: str + user: NotRequired[str] + + +class ChatCompletionChoice(TypedDict): + index: int + message: ChatCompletionMessage + finish_reason: Optional[str] + + +class ChatCompletion(TypedDict): + id: str + object: Literal["chat.completion"] + created: int + model: str + choices: List[ChatCompletionChoice] + usage: CompletionUsage + + +class ChatCompletionChunkDelta(TypedDict): + role: NotRequired[Literal["assistant"]] + content: NotRequired[str] + + +class ChatCompletionChunkChoice(TypedDict): + index: int + delta: ChatCompletionChunkDelta + finish_reason: Optional[str] + + +class ChatCompletionChunk(TypedDict): + id: str + model: str + object: Literal["chat.completion.chunk"] + created: int + choices: List[ChatCompletionChunkChoice] diff --git a/poetry.lock b/poetry.lock index 8102836..8a74d2f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,24 @@ # This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. 
+[[package]] +name = "attrs" +version = "22.2.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, + {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, +] + +[package.extras] +cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] +tests = ["attrs[tests-no-zope]", "zope.interface"] +tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] + [[package]] name = "black" version = "23.1.0" @@ -328,6 +347,21 @@ files = [ {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, ] +[[package]] +name = "exceptiongroup" +version = "1.1.1" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, +] + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "ghp-import" version = "2.1.0" @@ -415,6 +449,18 @@ zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["flake8 (<5)", "pytest (>=6)", 
"pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "jaraco-classes" version = "3.2.3" @@ -821,6 +867,22 @@ files = [ docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest (>=7.2.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "pycparser" version = "2.21" @@ -864,6 +926,30 @@ files = [ markdown = ">=3.2" pyyaml = "*" +[[package]] +name = "pytest" +version = "7.2.2" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, + {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, +] + +[package.dependencies] +attrs = 
">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -1281,4 +1367,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "cffaf5e2e66ade4f429d0e938277d4fa2c4878ca7338c3c4f91721a7d3aff91b" +content-hash = "cc9babcdfdc3679a4d84f68912408a005619a576947b059146ed1b428850ece9" diff --git a/pyproject.toml b/pyproject.toml index d916ea3..0b76366 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp" -version = "0.1.17" +version = "0.1.22" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -23,6 +23,7 @@ twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.4" +pytest = "^7.2.2" [build-system] requires = [ diff --git a/setup.py b/setup.py index 5412108..0349e79 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.17", + version="0.1.22", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", @@ -19,4 +19,12 @@ setup( "typing-extensions>=4.5.0", ], python_requires=">=3.7", + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: 
Python :: 3.11", + ], ) diff --git a/tests/test_llama.py b/tests/test_llama.py new file mode 100644 index 0000000..6a50256 --- /dev/null +++ b/tests/test_llama.py @@ -0,0 +1,96 @@ +import llama_cpp + +MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" + + +def test_llama(): + llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + + assert llama + assert llama.ctx is not None + + text = b"Hello World" + + assert llama.detokenize(llama.tokenize(text)) == text + + +def test_llama_patch(monkeypatch): + llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + + ## Set up mock function + def mock_eval(*args, **kwargs): + return 0 + + monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) + + output_text = " jumps over the lazy dog." + output_tokens = llama.tokenize(output_text.encode("utf-8")) + token_eos = llama.token_eos() + n = 0 + + def mock_sample(*args, **kwargs): + nonlocal n + if n < len(output_tokens): + n += 1 + return output_tokens[n - 1] + else: + return token_eos + + monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample) + + text = "The quick brown fox" + + ## Test basic completion until eos + n = 0 # reset + completion = llama.create_completion(text, max_tokens=20) + assert completion["choices"][0]["text"] == output_text + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test streaming completion until eos + n = 0 # reset + chunks = llama.create_completion(text, max_tokens=20, stream=True) + assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test basic completion until stop sequence + n = 0 # reset + completion = llama.create_completion(text, max_tokens=20, stop=["lazy"]) + assert completion["choices"][0]["text"] == " jumps over the " + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test streaming completion until stop sequence + n = 0 # reset + chunks = 
llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + assert ( + "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " + ) + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test basic completion until length + n = 0 # reset + completion = llama.create_completion(text, max_tokens=2) + assert completion["choices"][0]["text"] == " j" + assert completion["choices"][0]["finish_reason"] == "length" + + ## Test streaming completion until length + n = 0 # reset + chunks = llama.create_completion(text, max_tokens=2, stream=True) + assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j" + assert completion["choices"][0]["finish_reason"] == "length" + + +def test_llama_pickle(): + import pickle + import tempfile + fp = tempfile.TemporaryFile() + llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + pickle.dump(llama, fp) + fp.seek(0) + llama = pickle.load(fp) + + assert llama + assert llama.ctx is not None + + text = b"Hello World" + + assert llama.detokenize(llama.tokenize(text)) == text \ No newline at end of file