Skip to content

Commit

Permalink
Add cuda v12 variant and selection logic
Browse files Browse the repository at this point in the history
Based on compute capability and driver version, pick
v12 or v11 cuda variants.
  • Loading branch information
dhiltgen committed Aug 19, 2024
1 parent fc3b4cd commit 4fe3a55
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 48 deletions.
43 changes: 33 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG GOLANG_VERSION=1.22.5
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG CUDA_VERSION_11=11.3.1
ARG CUDA_VERSION_12=12.4.0
ARG ROCM_VERSION=6.1.2
ARG JETPACK_6=r36.2.0
ARG JETPACK_5=r35.4.1
Expand All @@ -13,7 +13,7 @@ COPY .git .git
COPY .gitmodules .gitmodules
COPY llm llm

FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
Expand All @@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \
CUDA_VARIANT="_v11" \
bash gen_linux.sh

FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \
CUDA_VARIANT="_v12" \
bash gen_linux.sh

FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
Expand All @@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ENV GOARCH arm64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh

FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
ARG CMAKE_VERSION
Expand Down Expand Up @@ -139,8 +160,10 @@ COPY . .
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
Expand All @@ -155,8 +178,8 @@ ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
## arm binary += 381M
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
Expand Down
43 changes: 43 additions & 0 deletions gpu/cuda_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,17 @@ package gpu

import (
"log/slog"
"os"
"regexp"
"runtime"
"strconv"
"strings"
)

// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")

func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
Expand All @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}

func cudaGetVariant(gpuInfo CudaGPUInfo) string {
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
if CudaTegra != "" {
ver := strings.Split(CudaTegra, ".")
if len(ver) > 0 {
return "jetpack" + ver[0]
}
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
r := regexp.MustCompile(` R(\d+) `)
m := r.FindSubmatch(data)
if len(m) != 2 {
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
} else {
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
// https://developer.nvidia.com/embedded/jetpack-archive
switch l4t {
case 35:
return "jetpack5"
case 36:
return "jetpack6"
default:
slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
}
}
}
}
}

if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
return "v11"
}
return "v12"
}
40 changes: 4 additions & 36 deletions gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ import (
"log/slog"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
Expand Down Expand Up @@ -66,10 +64,6 @@ var RocmComputeMin = 9
// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")

// Note: gpuMutex must already be held
func initCudaHandles() *cudaHandles {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
Expand Down Expand Up @@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList {

depPath := GetDepDir()

var cudaVariant string
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
if CudaTegra != "" {
ver := strings.Split(CudaTegra, ".")
if len(ver) > 0 {
cudaVariant = "jetpack" + ver[0]
}
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
r := regexp.MustCompile(` R(\d+) `)
m := r.FindSubmatch(data)
if len(m) != 2 {
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
} else {
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
// https://developer.nvidia.com/embedded/jetpack-archive
switch l4t {
case 35:
cudaVariant = "jetpack5"
case 36:
cudaVariant = "jetpack6"
default:
slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
}
}
}
}
}

// Load ALL libraries
cHandles = initCudaHandles()

Expand All @@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList {
gpuInfo := CudaGPUInfo{
GpuInfo: GpuInfo{
Library: "cuda",
Variant: cudaVariant,
},
index: i,
}
Expand All @@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.computeMajor = int(memInfo.major)
gpuInfo.computeMinor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
cudaVariant := cudaGetVariant(gpuInfo)
if depPath != "" {
gpuInfo.DependencyPath = depPath
// Check for variant specific directory
Expand All @@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
gpuInfo.Variant = cudaGetVariant(gpuInfo)

// query the management library as well so we can record any skew between the two
// which represents overhead on the GPU we must set aside on subsequent updates
Expand Down
6 changes: 4 additions & 2 deletions gpu/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ type CPUInfo struct {

type CudaGPUInfo struct {
GpuInfo
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
computeMajor int //nolint:unused,nolintlint
computeMinor int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo

Expand Down

0 comments on commit 4fe3a55

Please sign in to comment.