Merge pull request PromtEngineer#478 from Dafterfly/main
Allow the number of GPU layers and the batch size to be configured in constants.py
PromtEngineer authored Sep 16, 2023
2 parents 121e35a + 23525d4 commit 25202dd
Showing 3 changed files with 19 additions and 5 deletions.
11 changes: 11 additions & 0 deletions constants.py
@@ -158,3 +158,14 @@
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
+
+#### If you get a "not enough space in the buffer" error, reduce the values below: start with half of the original values and keep halving until the error stops appearing
+
+N_GPU_LAYERS = 1000
+N_BATCH = 2048
+
+### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
+# N_GPU_LAYERS = 20
+# N_BATCH = 512
+
+
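To make the halving advice concrete, the sketch below (illustrative only, not part of the commit) retries LlamaCpp construction with progressively smaller n_batch values; the model path and context size are placeholders, and it assumes the buffer error surfaces as an exception.

# Illustrative sketch only, not part of this commit: apply the halving advice
# above by retrying LlamaCpp construction with smaller and smaller n_batch
# values until the "not enough space in the buffer" error stops.
from langchain.llms import LlamaCpp

MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"  # placeholder path
N_GPU_LAYERS = 1000
N_BATCH = 2048


def build_llm(n_batch):
    # n_ctx / max_tokens are placeholder values; use your own context window.
    return LlamaCpp(
        model_path=MODEL_PATH,
        n_ctx=4096,
        max_tokens=4096,
        n_gpu_layers=N_GPU_LAYERS,
        n_batch=n_batch,
    )


llm = None
n_batch = N_BATCH
while llm is None and n_batch >= 1:
    try:
        llm = build_llm(n_batch)
    except Exception:  # assumes the buffer error surfaces as an exception
        n_batch //= 2  # halve and retry, as the comment recommends
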
5 changes: 4 additions & 1 deletion load_models.py
@@ -13,6 +13,9 @@
from constants import (
    CONTEXT_WINDOW_SIZE,
    MAX_NEW_TOKENS
+    ,
+    N_GPU_LAYERS,
+    N_BATCH
)

def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
@@ -54,7 +57,7 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = 100 # set this based on your GPU
+        kwargs["n_batch"] = N_BATCH # set this based on your GPU & CPU RAM
        return LlamaCpp(**kwargs)
    except:
        if 'ggml' in model_basename:
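For orientation, the following is a hedged sketch of how the function above plausibly fits together after this change; the hf_hub_download step, MODELS_PATH, the log messages, the fallback branch, and the example repo/file names are assumptions, not a verbatim copy of load_models.py.

# Hedged sketch of how the pieces above plausibly fit together; the download
# step, MODELS_PATH, log messages, and the fallback branch are assumptions,
# not a verbatim copy of load_models.py.
from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_BATCH

MODELS_PATH = "./models"  # assumed cache directory


def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
    try:
        logging.info("Using LlamaCpp for GGML/GGUF quantized models")  # assumed message
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            cache_dir=MODELS_PATH,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": CONTEXT_WINDOW_SIZE,
            "max_tokens": MAX_NEW_TOKENS,
        }
        if device_type.lower() == "mps":
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = 100  # set this based on your GPU
        kwargs["n_batch"] = N_BATCH  # set this based on your GPU & CPU RAM
        return LlamaCpp(**kwargs)
    except Exception:
        if "ggml" in model_basename:
            logging.info("GGML load failed; llama-cpp may require GGUF instead")  # assumed
        return None


if __name__ == "__main__":
    import logging as stdlib_logging

    stdlib_logging.basicConfig(level=stdlib_logging.INFO)
    llm = load_quantized_model_gguf_ggml(
        "TheBloke/Llama-2-7B-Chat-GGML",    # example repo id
        "llama-2-7b-chat.ggmlv3.q4_0.bin",  # example basename
        "cuda",
        stdlib_logging,
    )
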
8 changes: 4 additions & 4 deletions run_localGPT.py
@@ -21,7 +21,7 @@
    pipeline,
)

-from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
+from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME, N_GPU_LAYERS, N_BATCH


def load_model(device_type, model_id, model_basename=None):
@@ -56,10 +56,10 @@ def load_model(device_type, model_id, model_basename=None):
                "max_tokens": max_ctx_size,
            }
            if device_type.lower() == "mps":
-                kwargs["n_gpu_layers"] = 1000
+                kwargs["n_gpu_layers"] = N_GPU_LAYERS
            if device_type.lower() == "cuda":
-                kwargs["n_gpu_layers"] = 1000
-                kwargs["n_batch"] = max_ctx_size
+                kwargs["n_gpu_layers"] = N_GPU_LAYERS
+                kwargs["n_batch"] = N_BATCH
            return LlamaCpp(**kwargs)

        else:
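To show the net effect of the change, here is a small illustrative sketch of the arguments the cuda branch now hands to LlamaCpp, with N_GPU_LAYERS and N_BATCH read from constants.py; the model path and the max_ctx_size value are placeholders for values defined elsewhere in load_model.

# Illustrative only: the effective LlamaCpp arguments the cuda branch now
# builds, with the tunables read from constants.py instead of hard-coded.
from langchain.llms import LlamaCpp

from constants import N_GPU_LAYERS, N_BATCH

max_ctx_size = 2048  # assumed value; defined earlier in load_model
kwargs = {
    "model_path": "./models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    "n_ctx": max_ctx_size,
    "max_tokens": max_ctx_size,
    "n_gpu_layers": N_GPU_LAYERS,  # previously hard-coded to 1000
    "n_batch": N_BATCH,            # previously max_ctx_size
}
llm = LlamaCpp(**kwargs)
print(llm("What does n_batch control in llama.cpp?"))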
