forked from ollama/ollama
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request ollama#1146 from dhiltgen/ext_server_cgo
Add cgo implementation for llama.cpp
- Loading branch information
Showing
55 changed files
with
3,205 additions
and
1,179 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,7 @@ | |
ollama | ||
app | ||
dist | ||
scripts | ||
llm/llama.cpp/ggml | ||
llm/llama.cpp/gguf | ||
.env | ||
.cache | ||
test_data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,4 +8,5 @@ ollama | |
ggml-metal.metal | ||
.cache | ||
*.exe | ||
.idea | ||
.idea | ||
test_data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
//go:build linux || windows | ||
|
||
package gpu | ||
|
||
/* | ||
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm | ||
#cgo windows LDFLAGS: -lpthread | ||
#include "gpu_info.h" | ||
*/ | ||
import "C" | ||
import ( | ||
"fmt" | ||
"log" | ||
"sync" | ||
"unsafe" | ||
|
||
"github.com/jmorganca/ollama/api" | ||
) | ||
|
||
type handles struct { | ||
cuda *C.cuda_handle_t | ||
rocm *C.rocm_handle_t | ||
} | ||
|
||
var gpuMutex sync.Mutex | ||
var gpuHandles *handles = nil | ||
|
||
// Note: gpuMutex must already be held | ||
func initGPUHandles() { | ||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing | ||
log.Printf("Detecting GPU type") | ||
gpuHandles = &handles{nil, nil} | ||
var resp C.cuda_init_resp_t | ||
C.cuda_init(&resp) | ||
if resp.err != nil { | ||
log.Printf("CUDA not detected: %s", C.GoString(resp.err)) | ||
C.free(unsafe.Pointer(resp.err)) | ||
|
||
var resp C.rocm_init_resp_t | ||
C.rocm_init(&resp) | ||
if resp.err != nil { | ||
log.Printf("ROCm not detected: %s", C.GoString(resp.err)) | ||
C.free(unsafe.Pointer(resp.err)) | ||
} else { | ||
log.Printf("Radeon GPU detected") | ||
rocm := resp.rh | ||
gpuHandles.rocm = &rocm | ||
} | ||
} else { | ||
log.Printf("Nvidia GPU detected") | ||
cuda := resp.ch | ||
gpuHandles.cuda = &cuda | ||
} | ||
} | ||
|
||
func GetGPUInfo() GpuInfo { | ||
// TODO - consider exploring lspci (and equivalent on windows) to check for | ||
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries | ||
gpuMutex.Lock() | ||
defer gpuMutex.Unlock() | ||
if gpuHandles == nil { | ||
initGPUHandles() | ||
} | ||
|
||
var memInfo C.mem_info_t | ||
resp := GpuInfo{"", "", 0, 0} | ||
if gpuHandles.cuda != nil { | ||
C.cuda_check_vram(*gpuHandles.cuda, &memInfo) | ||
if memInfo.err != nil { | ||
log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)) | ||
C.free(unsafe.Pointer(memInfo.err)) | ||
} else { | ||
resp.Driver = "CUDA" | ||
resp.Library = "cuda_server" | ||
} | ||
} else if gpuHandles.rocm != nil { | ||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo) | ||
if memInfo.err != nil { | ||
log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)) | ||
C.free(unsafe.Pointer(memInfo.err)) | ||
} else { | ||
resp.Driver = "ROCM" | ||
resp.Library = "rocm_server" | ||
} | ||
} | ||
if resp.Driver == "" { | ||
C.cpu_check_ram(&memInfo) | ||
resp.Driver = "CPU" | ||
// In the future we may offer multiple CPU variants to tune CPU features | ||
resp.Library = "default" | ||
} | ||
if memInfo.err != nil { | ||
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err)) | ||
C.free(unsafe.Pointer(memInfo.err)) | ||
return resp | ||
} | ||
resp.FreeMemory = uint64(memInfo.free) | ||
resp.TotalMemory = uint64(memInfo.total) | ||
return resp | ||
} | ||
|
||
func CheckVRAM() (int64, error) { | ||
gpuInfo := GetGPUInfo() | ||
if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" { | ||
return int64(gpuInfo.FreeMemory), nil | ||
} | ||
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation | ||
} | ||
|
||
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { | ||
if opts.NumGPU != -1 { | ||
return opts.NumGPU | ||
} | ||
info := GetGPUInfo() | ||
if info.Driver == "CPU" { | ||
return 0 | ||
} | ||
|
||
/* | ||
Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. | ||
We can store the model weights and the kv cache in vram, | ||
to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file. | ||
*/ | ||
bytesPerLayer := uint64(fileSizeBytes / numLayer) | ||
|
||
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors | ||
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4 | ||
|
||
log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer) | ||
|
||
return layers | ||
} |
Oops, something went wrong.