Merge pull request PromtEngineer#478 from Dafterfly/main
Allow the number of GPU layers and the batch size to be configured in constants.py
PromtEngineer authored Sep 16, 2023
2 parents 121e35a + 23525d4 commit 25202dd
Showing 3 changed files with 19 additions and 5 deletions.
11 changes: 11 additions & 0 deletions constants.py
@@ -158,3 +158,14 @@
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
+
+#### If you get a "not enough space in the buffer" error, reduce the values below: start with half of the original values and keep halving until the error stops appearing
+
+N_GPU_LAYERS = 1000
+N_BATCH = 2048
+
+### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
+# N_GPU_LAYERS = 20
+# N_BATCH = 512
+
+
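To make the halving advice concrete, the sketch below (illustrative only, not part of the commit) retries LlamaCpp construction with progressively smaller n_batch values; the model path and context size are placeholders, and it assumes the buffer error surfaces as an exception.

# Illustrative sketch only, not part of this commit: apply the halving advice
# above by retrying LlamaCpp construction with smaller and smaller n_batch
# values until the "not enough space in the buffer" error stops.
from langchain.llms import LlamaCpp

MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"  # placeholder path
N_GPU_LAYERS = 1000
N_BATCH = 2048


def build_llm(n_batch):
    # n_ctx / max_tokens are placeholder values; use your own context window.
    return LlamaCpp(
        model_path=MODEL_PATH,
        n_ctx=4096,
        max_tokens=4096,
        n_gpu_layers=N_GPU_LAYERS,
        n_batch=n_batch,
    )


llm = None
n_batch = N_BATCH
while llm is None and n_batch >= 1:
    try:
        llm = build_llm(n_batch)
    except Exception:  # assumes the buffer error surfaces as an exception
        n_batch //= 2  # halve and retry, as the comment recommends
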
5 changes: 4 additions & 1 deletion load_models.py
@@ -13,6 +13,9 @@
from constants import (
    CONTEXT_WINDOW_SIZE,
    MAX_NEW_TOKENS
+    ,
+    N_GPU_LAYERS,
+    N_BATCH
)

def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
@@ -54,7 +57,7 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = 100 # set this based on your GPU
+        kwargs["n_batch"] = N_BATCH # set this based on your GPU & CPU RAM
        return LlamaCpp(**kwargs)
    except:
        if 'ggml' in model_basename:
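For orientation, the following is a hedged sketch of how the function above plausibly fits together after this change; the hf_hub_download step, MODELS_PATH, the log messages, the fallback branch, and the example repo/file names are assumptions, not a verbatim copy of load_models.py.

# Hedged sketch of how the pieces above plausibly fit together; the download
# step, MODELS_PATH, log messages, and the fallback branch are assumptions,
# not a verbatim copy of load_models.py.
from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_BATCH

MODELS_PATH = "./models"  # assumed cache directory


def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
    try:
        logging.info("Using LlamaCpp for GGML/GGUF quantized models")  # assumed message
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            cache_dir=MODELS_PATH,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": CONTEXT_WINDOW_SIZE,
            "max_tokens": MAX_NEW_TOKENS,
        }
        if device_type.lower() == "mps":
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = 100  # set this based on your GPU
        kwargs["n_batch"] = N_BATCH  # set this based on your GPU & CPU RAM
        return LlamaCpp(**kwargs)
    except Exception:
        if "ggml" in model_basename:
            logging.info("GGML load failed; llama-cpp may require GGUF instead")  # assumed
        return None


if __name__ == "__main__":
    import logging as stdlib_logging

    stdlib_logging.basicConfig(level=stdlib_logging.INFO)
    llm = load_quantized_model_gguf_ggml(
        "TheBloke/Llama-2-7B-Chat-GGML",    # example repo id
        "llama-2-7b-chat.ggmlv3.q4_0.bin",  # example basename
        "cuda",
        stdlib_logging,
    )
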
8 changes: 4 additions & 4 deletions run_localGPT.py
@@ -21,7 +21,7 @@
    pipeline,
)

-from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
+from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME, N_GPU_LAYERS, N_BATCH


def load_model(device_type, model_id, model_basename=None):
@@ -56,10 +56,10 @@ def load_model(device_type, model_id, model_basename=None):
                "max_tokens": max_ctx_size,
            }
            if device_type.lower() == "mps":
-                kwargs["n_gpu_layers"] = 1000
+                kwargs["n_gpu_layers"] = N_GPU_LAYERS
            if device_type.lower() == "cuda":
-                kwargs["n_gpu_layers"] = 1000
-                kwargs["n_batch"] = max_ctx_size
+                kwargs["n_gpu_layers"] = N_GPU_LAYERS
+                kwargs["n_batch"] = N_BATCH
            return LlamaCpp(**kwargs)

        else:
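To show the net effect of the change, here is a small illustrative sketch of the arguments the cuda branch now hands to LlamaCpp, with N_GPU_LAYERS and N_BATCH read from constants.py; the model path and the max_ctx_size value are placeholders for values defined elsewhere in load_model.

# Illustrative only: the effective LlamaCpp arguments the cuda branch now
# builds, with the tunables read from constants.py instead of hard-coded.
from langchain.llms import LlamaCpp

from constants import N_GPU_LAYERS, N_BATCH

max_ctx_size = 2048  # assumed value; defined earlier in load_model
kwargs = {
    "model_path": "./models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    "n_ctx": max_ctx_size,
    "max_tokens": max_ctx_size,
    "n_gpu_layers": N_GPU_LAYERS,  # previously hard-coded to 1000
    "n_batch": N_BATCH,            # previously max_ctx_size
}
llm = LlamaCpp(**kwargs)
print(llm("What does n_batch control in llama.cpp?"))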
