From 1e90597983e8273568e1d50c90e471c5290d3822 Mon Sep 17 00:00:00 2001
From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com>
Date: Wed, 5 Apr 2023 17:37:06 -0400
Subject: [PATCH 1/4] Add pydantic dep; errors if pydantic isn't present

Also throws errors relating to TypedDict or subclass() if the version
is too old or too new...
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 7e4193a..f50fe8d 100644
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,7 @@ setup(
     entry_points={"console_scripts": ["llama_cpp.server=llama_cpp.server:main"]},
     install_requires=[
         "typing-extensions>=4.5.0",
+        "pydantic==1.10.7",
     ],
     extras_require={
         "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"],

From 76a82babef9703b814ae4cea28cc63c2340ed743 Mon Sep 17 00:00:00 2001
From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com>
Date: Wed, 5 Apr 2023 17:44:53 -0400
Subject: [PATCH 2/4] Set n_batch to the default value of 8

I think this is leftover from when n_ctx was missing and n_batch was
2048.
---
 llama_cpp/server/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 0362cff..67ca115 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -27,7 +27,7 @@ from sse_starlette.sse import EventSourceResponse
 class Settings(BaseSettings):
     model: str
     n_ctx: int = 2048
-    n_batch: int = 2048
+    n_batch: int = 8
     n_threads: int = os.cpu_count() or 1
     f16_kv: bool = True
     use_mlock: bool = True

From c283edd7f29acef7c24755da638c418cb69a22f1 Mon Sep 17 00:00:00 2001
From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com>
Date: Wed, 5 Apr 2023 18:17:29 -0400
Subject: [PATCH 3/4] Set n_batch to the default value and reduce thread count

Change the batch size to the llama.cpp default of 8. I've seen issues
in llama.cpp where batch size affects the quality of generations (it
shouldn't), so default to 8 in case that's still a problem.

Set the auto-determined thread count to half the system count. ggml
will sometimes peg cores at 100% while doing nothing. This is being
addressed, but until then it makes for a bad user experience.
---
 examples/high_level_api/fastapi_server.py | 6 +++---
 llama_cpp/server/__main__.py              | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py
index b7d2565..a649692 100644
--- a/examples/high_level_api/fastapi_server.py
+++ b/examples/high_level_api/fastapi_server.py
@@ -27,10 +27,10 @@ from sse_starlette.sse import EventSourceResponse
 class Settings(BaseSettings):
     model: str
     n_ctx: int = 2048
-    n_batch: int = 2048
-    n_threads: int = os.cpu_count() or 1
+    n_batch: int = 8
+    n_threads: int = int(os.cpu_count() / 2) or 1
     f16_kv: bool = True
-    use_mlock: bool = True
+    use_mlock: bool = False  # mlock fails silently on platforms that don't support it (e.g. Windows); took forever to figure out...
     embedding: bool = True
     last_n_tokens_size: int = 64

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 67ca115..b474f67 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -28,9 +28,9 @@ class Settings(BaseSettings):
     model: str
     n_ctx: int = 2048
     n_batch: int = 8
-    n_threads: int = os.cpu_count() or 1
+    n_threads: int = int(os.cpu_count() / 2) or 1
     f16_kv: bool = True
-    use_mlock: bool = True
+    use_mlock: bool = False  # mlock fails silently on platforms that don't support it (e.g. Windows); took forever to figure out...
     embedding: bool = True
     last_n_tokens_size: int = 64
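A note on the thread-count default above: os.cpu_count() can return
None, in which case int(os.cpu_count() / 2) raises a TypeError before
the "or 1" fallback is ever reached. A minimal None-safe sketch of the
same half-of-cores default (the max/or guard is an assumption, not part
of this series):

    import os

    # Use half the cores, but never fewer than 1 thread, and don't
    # crash when os.cpu_count() returns None (it may on some platforms).
    n_threads = max((os.cpu_count() or 2) // 2, 1)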
From 2e91affea2640eb6ef51da85dc4b131528e78fe1 Mon Sep 17 00:00:00 2001
From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com>
Date: Wed, 5 Apr 2023 18:23:17 -0400
Subject: [PATCH 4/4] Ignore the .idea folder

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index d09b209..fd64c09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,4 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
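Since Settings subclasses pydantic's BaseSettings, the defaults changed
across these patches are only fallbacks: pydantic v1 fills fields from
matching environment variables (case-insensitively) before the defaults
apply. A minimal self-contained sketch against the pinned
pydantic==1.10.7 (it mirrors the fields from the diffs rather than
importing the real server module, reuses the None-safe thread default
from the note above, and the model path is a placeholder):

    import os

    from pydantic import BaseSettings  # pydantic v1 API, per the pin above


    class Settings(BaseSettings):
        model: str
        n_ctx: int = 2048
        n_batch: int = 8
        n_threads: int = max((os.cpu_count() or 2) // 2, 1)
        f16_kv: bool = True
        use_mlock: bool = False


    # Environment variables override the patched defaults, e.g. to opt
    # back in to mlock on platforms that do support it:
    os.environ["N_THREADS"] = "8"
    os.environ["USE_MLOCK"] = "true"

    settings = Settings(model="/path/to/model.bin")
    print(settings.n_threads, settings.use_mlock)  # -> 8 True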