Resolve merge conflicts

This commit is contained in:
Shouyi Wang 2023-07-14 14:37:01 +10:00
commit 579f526246
8 changed files with 129 additions and 77 deletions

View file

@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](
## [Unreleased]
## [0.1.71]
### Added
- (llama.cpp) Update llama.cpp
### Fixed
- (server) Fix several pydantic v2 migration bugs
## [0.1.70]
### Fixed
- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion`
- (server) Fix settings field names that were changed by the pydantic v2 migration
## [0.1.69]
### Added
- (server) Streaming requests can now be interrupted prematurely when a concurrent request is made. This can be controlled with the `interrupt_requests` setting.

View file

@ -833,18 +833,14 @@ class Llama:
if self.verbose:
if max_tokens <= 0:
# Unlimited, depending on n_ctx.
if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
raise ValueError(
f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
raise ValueError(
f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
if max_tokens <= 0:
# Unlimited, depending on n_ctx.
max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens)
# Truncate max_tokens if requested tokens would exceed the context window
max_tokens = (

View file

@ -326,13 +326,23 @@ _lib.llama_mlock_supported.restype = c_bool
# // Initialize the llama + ggml backend
# // If numa is true, use NUMA optimizations
# // Call once at the start of the program
# LLAMA_API void llama_init_backend(bool numa);
def llama_init_backend(numa: c_bool):
return _lib.llama_init_backend(numa)
# LLAMA_API void llama_backend_init(bool numa);
def llama_backend_init(numa: c_bool):
return _lib.llama_backend_init(numa)
_lib.llama_init_backend.argtypes = [c_bool]
_lib.llama_init_backend.restype = None
_lib.llama_backend_init.argtypes = [c_bool]
_lib.llama_backend_init.restype = None
# // Call once at the end of the program - currently only used for MPI
# LLAMA_API void llama_backend_free();
def llama_backend_free():
return _lib.llama_backend_free()
_lib.llama_backend_free.argtypes = []
_lib.llama_backend_free.restype = None
# LLAMA_API struct llama_model * llama_load_model_from_file(
@ -819,6 +829,39 @@ _lib.llama_sample_frequency_and_presence_penalties.argtypes = [
_lib.llama_sample_frequency_and_presence_penalties.restype = None
# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance"
# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
# LLAMA_API void llama_sample_classifier_free_guidance(
# struct llama_context * ctx,
# llama_token_data_array * candidates,
# struct llama_context * guidance_ctx,
# float scale,
# float smooth_factor);
def llama_sample_classifier_free_guidance(
ctx: llama_context_p,
candidates, # type: _Pointer[llama_token_data_array]
guidance_ctx: llama_context_p,
scale: c_float,
smooth_factor: c_float,
return _lib.llama_sample_classifier_free_guidance(
ctx, candidates, guidance_ctx, scale, smooth_factor
_lib.llama_sample_classifier_free_guidance.argtypes = [
_lib.llama_sample_classifier_free_guidance.restype = None
# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_softmax(
@ -1063,5 +1106,5 @@ _lib.llama_print_system_info.restype = c_char_p
_llama_initialized = False
if not _llama_initialized:
_llama_initialized = True

View file

@ -30,14 +30,14 @@ from import create_app, Settings
if __name__ == "__main__":
parser = argparse.ArgumentParser()
for name, field in Settings.__model_fields__.items():
description = field.field_info.description
for name, field in Settings.model_fields.items():
description = field.description
if field.default is not None and description is not None:
description += f" (default: {field.default})"
type=field.annotation if field.annotation is not None else str,

View file

@ -84,12 +84,8 @@ class Settings(BaseSettings):
verbose: bool = Field(
default=True, description="Whether to print debug information."
host: str = Field(
default="localhost", description="Listen address"
port: int = Field(
default=8000, description="Listen port"
host: str = Field(default="localhost", description="Listen address")
port: int = Field(default=8000, description="Listen port")
interrupt_requests: bool = Field(
description="Whether to interrupt requests when a new request is received.",
@ -183,7 +179,7 @@ def get_settings():
yield settings
model_field = Field(description="The model to use for generating completions.")
model_field = Field(description="The model to use for generating completions.", default=None)
max_tokens_field = Field(
default=16, ge=1, le=2048, description="The maximum number of tokens to generate."
@ -247,21 +243,18 @@ mirostat_mode_field = Field(
description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)"
description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
mirostat_tau_field = Field(
description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text"
description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
mirostat_eta_field = Field(
description="Mirostat learning rate"
default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate"
@ -299,22 +292,23 @@ class CreateCompletionRequest(BaseModel):
model: Optional[str] = model_field
n: Optional[int] = 1
best_of: Optional[int] = 1
user: Optional[str] = Field(None)
user: Optional[str] = Field(default=None)
# llama.cpp specific parameters
top_k: int = top_k_field
repeat_penalty: float = repeat_penalty_field
logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
class Config:
schema_extra = {
"example": {
model_config = {
"json_schema_extra": {
"examples": [
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
"stop": ["\n", "###"],
def make_logit_bias_processor(
@ -333,7 +327,7 @@ def make_logit_bias_processor(
elif logit_bias_type == "tokens":
for token, score in logit_bias.items():
token = token.encode('utf-8')
token = token.encode("utf-8")
for input_id in llama.tokenize(token, add_bos=False):
to_bias[input_id] = score
@ -357,7 +351,7 @@ async def create_completion(
request: Request,
body: CreateCompletionRequest,
llama: llama_cpp.Llama = Depends(get_llama),
) -> llama_cpp.Completion:
if isinstance(body.prompt, list):
assert len(body.prompt) <= 1
body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
@ -369,7 +363,7 @@ async def create_completion(
kwargs = body.dict(exclude=exclude)
kwargs = body.model_dump(exclude=exclude)
if body.logit_bias is not None:
kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
@ -401,7 +395,7 @@ async def create_completion(
return EventSourceResponse(
recv_chan, data_sender_callable=partial(event_publisher, send_chan)
) # type: ignore
completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore
return completion
@ -410,16 +404,17 @@ async def create_completion(
class CreateEmbeddingRequest(BaseModel):
model: Optional[str] = model_field
input: Union[str, List[str]] = Field(description="The input to embed.")
user: Optional[str]
user: Optional[str] = Field(default=None)
class Config:
schema_extra = {
"example": {
model_config = {
"json_schema_extra": {
"examples": [
"input": "The food was delicious and the waiter...",
@ -429,7 +424,7 @@ async def create_embedding(
request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
return await run_in_threadpool(
llama.create_embedding, **request.dict(exclude={"user"})
llama.create_embedding, **request.model_dump(exclude={"user"})
@ -466,23 +461,24 @@ class CreateChatCompletionRequest(BaseModel):
repeat_penalty: float = repeat_penalty_field
logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
class Config:
schema_extra = {
"example": {
model_config = {
"json_schema_extra": {
"examples": [
"messages": [
role="system", content="You are a helpful assistant."
role="user", content="What is the capital of France?"
@ -491,14 +487,14 @@ async def create_chat_completion(
body: CreateChatCompletionRequest,
llama: llama_cpp.Llama = Depends(get_llama),
settings: Settings = Depends(get_settings),
) -> Union[llama_cpp.ChatCompletion]: # type: ignore
) -> llama_cpp.ChatCompletion:
exclude = {
kwargs = body.dict(exclude=exclude)
kwargs = body.model_dump(exclude=exclude)
if body.logit_bias is not None:
kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
@ -531,7 +527,7 @@ async def create_chat_completion(
return EventSourceResponse(
data_sender_callable=partial(event_publisher, send_chan),
) # type: ignore
completion: llama_cpp.ChatCompletion = await run_in_threadpool(
llama.create_chat_completion, **kwargs # type: ignore
@ -551,8 +547,6 @@ class ModelList(TypedDict):
data: List[ModelData]
async def get_models(
settings: Settings = Depends(get_settings),

View file

@ -1,6 +1,6 @@
name = "llama_cpp_python"
version = "0.1.68"
version = "0.1.71"
description = "Python bindings for the llama.cpp library"
authors = ["Andrei Betlen <>"]
license = "MIT"

View file

@ -10,7 +10,7 @@ setup(
description="A Python wrapper for llama.cpp",
author="Andrei Betlen",
@ -18,7 +18,7 @@ setup(
packages=["llama_cpp", "llama_cpp.server"],
install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
"server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
"server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],

vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 64639555ff93c8ead2b80becb49cc6b60aeac240
Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36