Merge pull request ollama#1146 from dhiltgen/ext_server_cgo
Add cgo implementation for llama.cpp
dhiltgen authored Dec 22, 2023
2 parents fabf2f3 + 495c06e commit 96fb441
Showing 55 changed files with 3,205 additions and 1,179 deletions.
3 changes: 1 addition & 2 deletions .dockerignore
@@ -2,8 +2,7 @@
ollama
app
dist
scripts
llm/llama.cpp/ggml
llm/llama.cpp/gguf
.env
.cache
test_data
3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ ollama
ggml-metal.metal
.cache
*.exe
.idea
.idea
test_data
5 changes: 0 additions & 5 deletions .gitmodules
@@ -1,8 +1,3 @@
[submodule "llm/llama.cpp/ggml"]
path = llm/llama.cpp/ggml
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
71 changes: 57 additions & 14 deletions Dockerfile.build
@@ -1,23 +1,65 @@
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH

# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
# Ubuntu 20.04 amd64 dependencies
FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.22.1
# ROCm only supports amd64
ARG ROCM_VERSION=6.0
ARG CLBLAST_VER=1.6.1

# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
mkdir --parents --mode=0755 /etc/apt/keyrings && \
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev

# CLBlast
RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install

ENV ROCM_PATH=/opt/rocm

# Ubuntu 20.04 arm64 dependencies
FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.27.6
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
apt-get update && \
apt-cache madison cuda && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION}

FROM base-${TARGETARCH}
ARG TARGETARCH
ARG GOFLAGS="'-ldflags -w -s'"
ARG CGO_CFLAGS
ARG GOLANG_VERSION=1.21.3

# Common toolchain
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10

# install go
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz

# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
@@ -26,6 +68,7 @@ COPY . .
ENV GOOS=linux
ENV GOARCH=$TARGETARCH
ENV GOFLAGS=$GOFLAGS
ENV CGO_CFLAGS=${CGO_CFLAGS}

RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .
10 changes: 8 additions & 2 deletions README.md
@@ -192,13 +192,19 @@ Install `cmake` and `go`:
brew install cmake go
```

Then generate dependencies and build:

Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```

More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)


### Running local builds
Next, start the server:

```
24 changes: 22 additions & 2 deletions cmd/cmd.go
@@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
}

if err := client.Generate(ctx, &request, fn); err != nil {
if errors.Is(err, context.Canceled) {
switch {
case errors.Is(err, context.Canceled):
return nil
case strings.Contains(err.Error(), "unsupported model format"):
// pull and retry to see if the model has been updated
parts := strings.Split(opts.Model, string(os.PathSeparator))
if len(parts) == 1 {
// this is a library model, log some info
fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
}
if err := PullHandler(cmd, []string{opts.Model}); err != nil {
fmt.Printf("Error: %s\n", err)
return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
}
// retry
if err := client.Generate(ctx, &request, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil
}
return err
}
default:
return err
}
return err
}
if opts.Prompt != "" {
fmt.Println()
35 changes: 32 additions & 3 deletions docs/development.md
@@ -34,6 +34,35 @@ Now you can run `ollama`:

## Building on Linux with GPU support

- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
- run `go build .`

### Linux/Windows CUDA (NVIDIA)
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```

### Linux ROCm (AMD)
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
Adjust the paths below (correct for Arch Linux) as appropriate for your distribution's install locations, then generate dependencies:
```
CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
```
Then build the binary:
```
go build .
```

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

## Containerized Build

If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
2 changes: 1 addition & 1 deletion go.mod
@@ -7,7 +7,7 @@ require (
github.com/gin-gonic/gin v1.9.1
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.3
github.com/stretchr/testify v1.8.4
golang.org/x/sync v0.3.0
)

3 changes: 2 additions & 1 deletion go.sum
@@ -98,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
134 changes: 134 additions & 0 deletions gpu/gpu.go
@@ -0,0 +1,134 @@
//go:build linux || windows

package gpu

/*
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -lpthread
#include "gpu_info.h"
*/
import "C"
import (
"fmt"
"log"
"sync"
"unsafe"

"github.com/jmorganca/ollama/api"
)

type handles struct {
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
}

var gpuMutex sync.Mutex
var gpuHandles *handles = nil

// Note: gpuMutex must already be held
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
log.Printf("Detecting GPU type")
gpuHandles = &handles{nil, nil}
var resp C.cuda_init_resp_t
C.cuda_init(&resp)
if resp.err != nil {
log.Printf("CUDA not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))

var resp C.rocm_init_resp_t
C.rocm_init(&resp)
if resp.err != nil {
log.Printf("ROCm not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
log.Printf("Radeon GPU detected")
rocm := resp.rh
gpuHandles.rocm = &rocm
}
} else {
log.Printf("Nvidia GPU detected")
cuda := resp.ch
gpuHandles.cuda = &cuda
}
}

func GetGPUInfo() GpuInfo {
// TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
if gpuHandles == nil {
initGPUHandles()
}

var memInfo C.mem_info_t
resp := GpuInfo{"", "", 0, 0}
if gpuHandles.cuda != nil {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Driver = "CUDA"
resp.Library = "cuda_server"
}
} else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Driver = "ROCM"
resp.Library = "rocm_server"
}
}
if resp.Driver == "" {
C.cpu_check_ram(&memInfo)
resp.Driver = "CPU"
// In the future we may offer multiple CPU variants to tune CPU features
resp.Library = "default"
}
if memInfo.err != nil {
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
return resp
}
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp
}

func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
return int64(gpuInfo.FreeMemory), nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU-based memory determination
}

func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
info := GetGPUInfo()
if info.Driver == "CPU" {
return 0
}

/*
Calculate bytes per layer: roughly the size of the model file divided by the number of layers.
Both the model weights and the KV cache can be stored in VRAM; to account for KV-cache
storage, add two additional layers to the layer count retrieved from the model file.
*/
bytesPerLayer := uint64(fileSizeBytes / numLayer)

// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4

log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer)

return layers
}
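The sketch below re-implements the `NumGPU` offload arithmetic in isolation so the 75% rule is easy to sanity-check; the helper name and the example sizes are illustrative, not part of this change:

```
package main

import "fmt"

// numGPULayers mirrors the heuristic in gpu.NumGPU: estimate bytes per
// layer as the model file size divided by the layer count, then offload
// only 3/4 of the layers that would fit in free VRAM, leaving headroom
// against OOM errors.
func numGPULayers(numLayer, fileSizeBytes, freeVRAM int64) int64 {
	bytesPerLayer := fileSizeBytes / numLayer
	return freeVRAM / bytesPerLayer * 3 / 4
}

func main() {
	const GiB = int64(1 << 30)
	// A 4 GiB model with 32 layers and 8 GiB of free VRAM:
	// bytesPerLayer = 128 MiB, 64 layers would fit, and 64 * 3/4 = 48.
	fmt.Println(numGPULayers(32, 4*GiB, 8*GiB)) // 48
}
```

As in the code above, the result is not clamped to the model's layer count, so loading this example model would still top out at its 32 real layers.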