Merge pull request ollama#1146 from dhiltgen/ext_server_cgo
Add cgo implementation for llama.cpp
dhiltgen authored Dec 22, 2023
2 parents fabf2f3 + 495c06e commit 96fb441
Showing 55 changed files with 3,205 additions and 1,179 deletions.
3 changes: 1 addition & 2 deletions .dockerignore
@@ -2,8 +2,7 @@
ollama
app
dist
scripts
llm/llama.cpp/ggml
llm/llama.cpp/gguf
.env
.cache
test_data
3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ ollama
ggml-metal.metal
.cache
*.exe
.idea
.idea
test_data
5 changes: 0 additions & 5 deletions .gitmodules
@@ -1,8 +1,3 @@
[submodule "llm/llama.cpp/ggml"]
path = llm/llama.cpp/ggml
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
71 changes: 57 additions & 14 deletions Dockerfile.build
@@ -1,23 +1,65 @@
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH

# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
# Ubuntu 20.04 amd64 dependencies
FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.22.1
# ROCm only supports amd64
ARG ROCM_VERSION=6.0
ARG CLBLAST_VER=1.6.1

# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
mkdir --parents --mode=0755 /etc/apt/keyrings && \
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev

# CLBlast
RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install

ENV ROCM_PATH=/opt/rocm

# Ubuntu 20.04 arm64 dependencies
FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.27.6
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
apt-get update && \
apt-cache madison cuda && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION}

FROM base-${TARGETARCH}
ARG TARGETARCH
ARG GOFLAGS="'-ldflags -w -s'"
ARG CGO_CFLAGS
ARG GOLANG_VERSION=1.21.3

# Common toolchain
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10

# install go
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz

# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
@@ -26,6 +68,7 @@ COPY . .
ENV GOOS=linux
ENV GOARCH=$TARGETARCH
ENV GOFLAGS=$GOFLAGS
ENV CGO_CFLAGS=${CGO_CFLAGS}

RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .
10 changes: 8 additions & 2 deletions README.md
@@ -192,13 +192,19 @@ Install `cmake` and `go`:
brew install cmake go
```

Then generate dependencies and build:

Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```

More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)


### Running local builds
Next, start the server:

```
24 changes: 22 additions & 2 deletions cmd/cmd.go
@@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
}

if err := client.Generate(ctx, &request, fn); err != nil {
if errors.Is(err, context.Canceled) {
switch {
case errors.Is(err, context.Canceled):
return nil
case strings.Contains(err.Error(), "unsupported model format"):
// pull and retry to see if the model has been updated
parts := strings.Split(opts.Model, string(os.PathSeparator))
if len(parts) == 1 {
// this is a library model, log some info
fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
}
if err := PullHandler(cmd, []string{opts.Model}); err != nil {
fmt.Printf("Error: %s\n", err)
return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
}
// retry
if err := client.Generate(ctx, &request, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil
}
return err
}
default:
return err
}
return err
}
if opts.Prompt != "" {
fmt.Println()
35 changes: 32 additions & 3 deletions docs/development.md
@@ -34,6 +34,35 @@ Now you can run `ollama`:

## Building on Linux with GPU support

- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
- run `go build .`

### Linux/Windows CUDA (NVIDIA)
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```

### Linux ROCm (AMD)
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
Adjust the paths below (correct for Arch Linux) as appropriate for your distribution's install locations, then generate dependencies:
```
CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
```
Then build the binary:
```
go build .
```

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

## Containerized Build

If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
2 changes: 1 addition & 1 deletion go.mod
@@ -7,7 +7,7 @@ require (
github.com/gin-gonic/gin v1.9.1
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.3
github.com/stretchr/testify v1.8.4
golang.org/x/sync v0.3.0
)

3 changes: 2 additions & 1 deletion go.sum
@@ -98,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
134 changes: 134 additions & 0 deletions gpu/gpu.go
@@ -0,0 +1,134 @@
//go:build linux || windows

package gpu

/*
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -lpthread
#include "gpu_info.h"
*/
import "C"
import (
"fmt"
"log"
"sync"
"unsafe"

"github.com/jmorganca/ollama/api"
)

type handles struct {
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
}

var gpuMutex sync.Mutex
var gpuHandles *handles = nil

// Note: gpuMutex must already be held
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
log.Printf("Detecting GPU type")
gpuHandles = &handles{nil, nil}
var resp C.cuda_init_resp_t
C.cuda_init(&resp)
if resp.err != nil {
log.Printf("CUDA not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))

var resp C.rocm_init_resp_t
C.rocm_init(&resp)
if resp.err != nil {
log.Printf("ROCm not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
log.Printf("Radeon GPU detected")
rocm := resp.rh
gpuHandles.rocm = &rocm
}
} else {
log.Printf("Nvidia GPU detected")
cuda := resp.ch
gpuHandles.cuda = &cuda
}
}

func GetGPUInfo() GpuInfo {
// TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
if gpuHandles == nil {
initGPUHandles()
}

var memInfo C.mem_info_t
resp := GpuInfo{"", "", 0, 0}
if gpuHandles.cuda != nil {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Driver = "CUDA"
resp.Library = "cuda_server"
}
} else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Driver = "ROCM"
resp.Library = "rocm_server"
}
}
if resp.Driver == "" {
C.cpu_check_ram(&memInfo)
resp.Driver = "CPU"
// In the future we may offer multiple CPU variants to tune CPU features
resp.Library = "default"
}
if memInfo.err != nil {
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
return resp
}
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp
}

func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
return int64(gpuInfo.FreeMemory), nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU-based memory determination
}

func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
info := GetGPUInfo()
if info.Driver == "CPU" {
return 0
}

/*
Calculate bytes per layer: roughly the size of the model file divided by the number of layers.
Both the model weights and the KV cache can be stored in VRAM; to account for KV-cache
storage, add two additional layers to the layer count retrieved from the model file.
*/
bytesPerLayer := uint64(fileSizeBytes / numLayer)

// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4

log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer)

return layers
}
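The sketch below re-implements the `NumGPU` offload arithmetic in isolation so the 75% rule is easy to sanity-check; the helper name and the example sizes are illustrative, not part of this change:

```
package main

import "fmt"

// numGPULayers mirrors the heuristic in gpu.NumGPU: estimate bytes per
// layer as the model file size divided by the layer count, then offload
// only 3/4 of the layers that would fit in free VRAM, leaving headroom
// against OOM errors.
func numGPULayers(numLayer, fileSizeBytes, freeVRAM int64) int64 {
	bytesPerLayer := fileSizeBytes / numLayer
	return freeVRAM / bytesPerLayer * 3 / 4
}

func main() {
	const GiB = int64(1 << 30)
	// A 4 GiB model with 32 layers and 8 GiB of free VRAM:
	// bytesPerLayer = 128 MiB, 64 layers would fit, and 64 * 3/4 = 48.
	fmt.Println(numGPULayers(32, 4*GiB, 8*GiB)) // 48
}
```

As in the code above, the result is not clamped to the model's layer count, so loading this example model would still top out at its 32 real layers.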