
Commit

update memory calculations
count each layer independently when deciding gpu offloading
mxyng committed Apr 1, 2024
1 parent d338d70 commit 91b3e4d
Showing 7 changed files with 121 additions and 85 deletions.
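At a high level, the new accounting starts from the GPU's minimum reserved memory plus any projector and graph overhead, then walks the model block by block and offloads a layer only while it still fits in the reported free VRAM. A rough sketch of the idea in Go (simplified and hypothetical, not the exact code in the diff below):

    // countOffloadLayers is a simplified, illustrative version of the
    // per-layer offload decision implemented in llm/llm.go below.
    func countOffloadLayers(available, minimum, projector, graph, kvCache int64,
        layerSizes []int64, requested int) int {
        used := minimum + projector + graph
        layers := 0
        for _, layerMem := range layerSizes {
            // each layer carries its share of the KV cache
            layerMem += kvCache / int64(len(layerSizes))
            if available > used+layerMem && (requested < 0 || layers < requested) {
                used += layerMem
                layers++
            }
        }
        return layers
    }

The output layer is then offloaded only if every repeating layer fits as well, which mirrors the check near the end of the diff.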
17 changes: 16 additions & 1 deletion format/bytes.go
@@ -6,11 +6,15 @@ import (
)

const (
Byte = 1

KiloByte = Byte * 1000
MegaByte = KiloByte * 1000
GigaByte = MegaByte * 1000
TeraByte = GigaByte * 1000

KibiByte = Byte * 1024
MebiByte = KibiByte * 1024
)

func HumanBytes(b int64) string {
@@ -45,3 +49,14 @@ func HumanBytes(b int64) string {
return fmt.Sprintf("%d %s", int(value), unit)
}
}

func HumanBytes2(b int64) string {
switch {
case b >= MebiByte:
return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
case b >= KibiByte:
return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
default:
return fmt.Sprintf("%d B", b)
}
}
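For reference, a quick usage sketch of the new helper (values are illustrative). Note that it only scales up to MiB, so multi-gigabyte values are still reported in MiB:

    format.HumanBytes2(512)                  // "512 B"
    format.HumanBytes2(1536)                 // "1.5 KiB"
    format.HumanBytes2(377 * 1024 * 1024)    // "377.0 MiB"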
25 changes: 11 additions & 14 deletions gpu/gpu.go
@@ -20,13 +20,20 @@ import (
"strings"
"sync"
"unsafe"

"github.com/ollama/ollama/format"
)

type handles struct {
nvml *C.nvml_handle_t
cudart *C.cudart_handle_t
}

const (
cudaMinimumMemory = 377 * format.MebiByte
rocmMinimumMemory = 377 * format.MebiByte
)

var gpuMutex sync.Mutex
var gpuHandles *handles = nil

@@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo {
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
resp.MinimumMemory = cudaMinimumMemory
} else {
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
@@ -187,13 +195,15 @@ func GetGPUInfo() GpuInfo {
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
resp.MinimumMemory = cudaMinimumMemory
} else {
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else {
AMDGetGPUInfo(&resp)
if resp.Library != "" {
resp.MinimumMemory = rocmMinimumMemory
return resp
}
}
@@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) {
}
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
overhead := gpuInfo.FreeMemory / 10
gpus := uint64(gpuInfo.DeviceCount)
if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024
}
// Assigning full reported free memory for Tegras due to OS controlled caching.
if CudaTegra != "" {
// Setting overhead for non-Tegra devices
overhead = 0
}
avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
return int64(gpuInfo.FreeMemory), nil
}

return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
3 changes: 3 additions & 0 deletions gpu/types.go
@@ -14,6 +14,9 @@ type GpuInfo struct {
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`

// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory int64 `json:"-"`

// TODO add other useful attributes about the card here for discovery information
}

4 changes: 2 additions & 2 deletions llm/dyn_ext_server.go
@@ -39,7 +39,7 @@ import (

type dynExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
options *api.Options
}

// Note: current implementation does not support concurrent instantiations
@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}

func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
11 changes: 11 additions & 0 deletions llm/ggml.go
@@ -5,13 +5,24 @@ import (
"errors"
"fmt"
"io"
"strings"
)

type GGML struct {
container
model
}

func (ggml *GGML) LayerSize(prefix string) (n int64) {
for _, t := range ggml.Tensors() {
if strings.HasPrefix(t.Name, prefix) {
n += int64(t.size())
}
}

return
}
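As an illustration (hypothetical usage, not part of this diff), summing tensors by name prefix gives the size of a single transformer block or of the output layer:

    blkSize := ggml.LayerSize("blk.0.")    // all tensors of the first transformer block
    outSize := ggml.LayerSize("output.")   // output layer tensors
    total := blkSize + outSize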

const (
fileTypeF32 uint32 = iota
fileTypeF16
136 changes: 73 additions & 63 deletions llm/llm.go
@@ -5,10 +5,11 @@ import (
"fmt"
"log/slog"
"os"
"runtime"
"slices"
"strings"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)

@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
"mamba",
}

func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts *api.Options) (LLM, er
}
defer f.Close()

ggml, size, err := DecodeGGML(f)
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
@@ -49,92 +50,101 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
opts.NumCtx = 4
}

vram, _ := gpu.CheckVRAM()
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()

usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)

// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}

// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
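// For intuition (assumed, illustrative numbers): a 7B-class model with
// n_layer=32, n_embd=4096 and n_head=n_head_kv=32 at n_ctx=2048 gives
// 2*2*2048*32*4096 bytes, i.e. about 1 GiB of fp16 KV cache.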

// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
usedMemory += graph
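// Continuing the illustrative numbers above: with GQA = n_head/n_head_kv = 1
// and kv ≈ 1 GiB, this estimates roughly 1 GiB / 6 ≈ 171 MiB for the compute graph.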

if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
opts.NumGPU = 0
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}

info := gpu.GetGPUInfo()
switch runtime.GOOS {
case "darwin":
if opts.NumGPU == 0 {
break
}
requiredMemory := usedMemory

if size+kv+graph > vram {
slog.Info("not enough vram available, setting num_gpu=0")
opts.NumGPU = 0
break
}
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory

// TODO: implement layer splitting on macOS
opts.NumGPU = 999
default:
if info.Library == "cpu" {
slog.Info("GPU not available, falling back to CPU")
opts.NumGPU = 0
break
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}

// don't use GPU at all if no layers are loaded
if opts.NumGPU == 0 {
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
break
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer

// user-defined GPU count
if opts.NumGPU != -1 {
break
}
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}

// the "main" GPU needs the most memory and determines the limit
// of how many layers can be loaded. It needs to fit:
// 1. the full compute graph allocation for all devices (graph)
// 2. the proportional kv cache for all devices (kv * % layers)
// 3. the proportional model (size * % layers / # devices)
// This estimates the number of layers
maxlayers := int64(ggml.KV().BlockCount()) + 1
devices := int64(info.DeviceCount)
avg := vram / devices
layers := maxlayers * (avg - graph) / (kv + size/devices)
if layers > maxlayers {
layers = maxlayers
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)

if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}

// 1 + 2 must fit on the main gpu
min := graph + kv*layers/maxlayers
if layers <= 0 || min > avg {
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
return newLlmServer(info, model, adapters, projectors, opts)
}

opts.NumGPU = int(layers)
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()

opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlmServer(info, model, adapters, projectors, opts)
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}

prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
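// Grouping by the first two dot-separated name components means tensors such as
// "v.blk.0.attn_q.weight" and "v.blk.1.ffn_up.weight" (hypothetical names, for
// illustration) collapse to the single prefix "v.blk", so each group is sized once below.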

var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}

return ask
}

// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}

func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)

// Check to see if the user has requested a specific library instead of auto-detecting
10 changes: 5 additions & 5 deletions server/routes.go
@@ -68,7 +68,7 @@ var loaded struct {
var defaultSessionDuration = 5 * time.Minute

// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
needLoad := loaded.runner == nil || // is there a model loaded?
loaded.ModelPath != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D

loaded.Model = model
loaded.runner = llmRunner
loaded.Options = &opts
loaded.Options = opts
}

loaded.expireAt = time.Now().Add(sessionDuration)
@@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) {
sessionDuration = req.KeepAlive.Duration
}

if err := load(c, model, opts, sessionDuration); err != nil {
if err := load(c, model, &opts, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) {
sessionDuration = req.KeepAlive.Duration
}

if err := load(c, model, opts, sessionDuration); err != nil {
if err := load(c, model, &opts, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) {
sessionDuration = req.KeepAlive.Duration
}

if err := load(c, model, opts, sessionDuration); err != nil {
if err := load(c, model, &opts, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
