Skip to content

Commit

Permalink
Add VGPU
Browse files Browse the repository at this point in the history
  • Loading branch information
peizhaoyou committed Mar 16, 2021
1 parent e5e1bef commit 1619a9b
Show file tree
Hide file tree
Showing 10 changed files with 146 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

DOCKER ?= docker
ifeq ($(IMAGE),)
REGISTRY ?= nvcr.io/nvidia
REGISTRY ?= m7-ieg-pico-test01:5000/nvidia
IMAGE := $(REGISTRY)/k8s-device-plugin
endif
VERSION ?= v0.9.0
Expand Down
4 changes: 3 additions & 1 deletion docker/Dockerfile.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ LABEL summary="NVIDIA device plugin for Kubernetes"
LABEL description="See summary"

COPY ./LICENSE ./licenses/LICENSE
COPY --from=build /build/entrypoint.sh /entrypoint.sh
COPY --from=build /build/vgpu /etc/vgpu

COPY --from=build /build/nvidia-device-plugin /usr/bin/nvidia-device-plugin

ENTRYPOINT ["nvidia-device-plugin"]
ENTRYPOINT ["/entrypoint.sh"]
5 changes: 4 additions & 1 deletion docker/Dockerfile.ubuntu20.04
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
ARG CUDA_IMAGE=cuda
FROM nvcr.io/nvidia/${CUDA_IMAGE}:11.2.1-base-ubuntu20.04 as build

RUN rm -rf /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
g++ \
ca-certificates \
Expand Down Expand Up @@ -53,7 +54,9 @@ LABEL summary="NVIDIA device plugin for Kubernetes"
LABEL description="See summary"

COPY ./LICENSE ./licenses/LICENSE
COPY --from=build /build/entrypoint.sh /entrypoint.sh
COPY --from=build /build/vgpu /etc/vgpu

COPY --from=build /build/nvidia-device-plugin /usr/bin/nvidia-device-plugin

ENTRYPOINT ["nvidia-device-plugin"]
ENTRYPOINT ["/entrypoint.sh"]
3 changes: 3 additions & 0 deletions entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Container entrypoint: publish the bundled vGPU files, then run the plugin.

# Copy everything shipped in the image under /etc/vgpu into /usr/local/vgpu,
# overwriting existing copies. A failure here is not fatal: the plugin is
# still started afterwards.
cp -f /etc/vgpu/* /usr/local/vgpu/

# Replace this shell with the device plugin. "$@" is quoted so every argument
# is forwarded exactly as received; unquoted $@ would re-split arguments on
# whitespace and expand glob characters.
exec nvidia-device-plugin "$@"
31 changes: 24 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ var passDeviceSpecsFlag bool
var deviceListStrategyFlag string
var deviceIDStrategyFlag string
var nvidiaDriverRootFlag string
var deviceSplitCountFlag uint
var deviceMemoryScalingFlag float64

var version string // This should be set at build time to indicate the actual version

Expand All @@ -43,14 +45,15 @@ func main() {
c.Before = validateFlags
c.Action = start

migStrategyFlag = MigStrategyNone
c.Flags = []cli.Flag{
&cli.StringFlag{
Name: "mig-strategy",
Value: "none",
Usage: "the desired strategy for exposing MIG devices on GPUs that support it:\n\t\t[none | single | mixed]",
Destination: &migStrategyFlag,
EnvVars: []string{"MIG_STRATEGY"},
},
//&cli.StringFlag{
// Name: "mig-strategy",
// Value: "none",
// Usage: "the desired strategy for exposing MIG devices on GPUs that support it:\n\t\t[none | single | mixed]",
// Destination: &migStrategyFlag,
// EnvVars: []string{"MIG_STRATEGY"},
//},
&cli.BoolFlag{
Name: "fail-on-init-error",
Value: true,
Expand Down Expand Up @@ -86,6 +89,20 @@ func main() {
Destination: &nvidiaDriverRootFlag,
EnvVars: []string{"NVIDIA_DRIVER_ROOT"},
},
&cli.UintFlag{
Name: "device-split-count",
Value: 2,
Usage: "the number for NVIDIA device split)",
Destination: &deviceSplitCountFlag,
EnvVars: []string{"DEVICE_SPLIT_COUNT"},
},
&cli.Float64Flag{
Name: "device-memory-scaling",
Value: 1.0,
Usage: "the ratio for NVIDIA device memory scaling)",
Destination: &deviceMemoryScalingFlag,
EnvVars: []string{"DEVICE_MEMORY_SCALING"},
},
}

err := c.Run(os.Args)
Expand Down
5 changes: 5 additions & 0 deletions nvidia-device-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,12 @@ spec:
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: vgpu-dir
mountPath: /usr/local/vgpu
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: vgpu-dir
hostPath:
path: /usr/local/vgpu
31 changes: 24 additions & 7 deletions server.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ type NvidiaDevicePlugin struct {
cachedDevices []*Device
health chan *Device
stop chan interface{}
vDevices []*VDevice
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
Expand All @@ -84,13 +85,15 @@ func NewNvidiaDevicePlugin(resourceName string, resourceManager ResourceManager,

func (m *NvidiaDevicePlugin) initialize() {
m.cachedDevices = m.Devices()
m.vDevices = Device2VDevice(m.cachedDevices)
m.server = grpc.NewServer([]grpc.ServerOption{}...)
m.health = make(chan *Device)
m.stop = make(chan interface{})
}

func (m *NvidiaDevicePlugin) cleanup() {
close(m.stop)
m.vDevices = nil
m.cachedDevices = nil
m.server = nil
m.health = nil
Expand Down Expand Up @@ -201,7 +204,8 @@ func (m *NvidiaDevicePlugin) Register() error {
Endpoint: path.Base(m.socket),
ResourceName: m.resourceName,
Options: &pluginapi.DevicePluginOptions{
GetPreferredAllocationAvailable: (m.allocatePolicy != nil),
GetPreferredAllocationAvailable: false,
//GetPreferredAllocationAvailable: (m.allocatePolicy != nil),
},
}

Expand Down Expand Up @@ -240,6 +244,7 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
// GetPreferredAllocation returns the preferred allocation from the set of devices specified in the request
func (m *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
response := &pluginapi.PreferredAllocationResponse{}
// get device
for _, req := range r.ContainerRequests {
available, err := gpuallocator.NewDevicesFrom(req.AvailableDeviceIDs)
if err != nil {
Expand Down Expand Up @@ -271,15 +276,14 @@ func (m *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *plug
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
responses := pluginapi.AllocateResponse{}
for _, req := range reqs.ContainerRequests {
for _, id := range req.DevicesIDs {
if !m.deviceExists(id) {
return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", m.resourceName, id)
}
vdevices, err := VDevicesByIDs(m.vDevices, req.DevicesIDs)
if err != nil {
return nil, err
}

response := pluginapi.ContainerAllocateResponse{}

uuids := req.DevicesIDs
uuids := UniqueDeviceIDs(vdevices)
deviceIDs := m.deviceIDsFromUUIDs(uuids)

if deviceListStrategyFlag == DeviceListStrategyEnvvar {
Expand All @@ -293,6 +297,18 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Alloc
response.Devices = m.apiDeviceSpecs(nvidiaDriverRootFlag, uuids)
}

var mapEnvs []string
for i, vd := range vdevices {
limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i)
response.Envs[limitKey] = fmt.Sprintf("%vm", vd.memory)
mapEnvs = append(mapEnvs, fmt.Sprintf("%v:%v", i, vd.dev.ID))
}
response.Envs["NVIDIA_DEVICE_MAP"] = strings.Join(mapEnvs, " ")
response.Mounts = append(response.Mounts,
&pluginapi.Mount{ContainerPath: "/usr/local/vgpu/libvgpu.so",
HostPath: "/usr/local/vgpu/libvgpu.so", ReadOnly: true},
&pluginapi.Mount{ContainerPath: "/etc/ld.so.preload",
HostPath: "/usr/local/vgpu/ld.so.preload", ReadOnly: true})
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}

Expand Down Expand Up @@ -349,7 +365,8 @@ func (m *NvidiaDevicePlugin) deviceIDsFromUUIDs(uuids []string) []string {

func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
var pdevs []*pluginapi.Device
for _, d := range m.cachedDevices {
for _, d := range m.vDevices {
d.Health = d.dev.Health
pdevs = append(pdevs, &d.Device)
}
return pdevs
Expand Down
81 changes: 81 additions & 0 deletions vdevice.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"fmt"
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// VDevice is one virtual slice of a physical device. Device2VDevice creates
// several VDevices per physical Device, each with its own ID derived from the
// physical device's ID.
type VDevice struct {
// Embedded plugin API device; its ID is overwritten by Device2VDevice with
// "<physical ID>-<split index>".
pluginapi.Device
// dev is the physical device this virtual device was split from.
dev *Device
// memory is this slice's share of the physical device's memory, after
// scaling. NOTE(review): unit follows nvml's Device.Memory (presumably MiB) — confirm.
memory uint64
}

// Device2VDevice expands each physical device into deviceSplitCountFlag
// virtual devices.
//
// Every virtual device copies the physical device's plugin API description,
// gets the ID "<physical ID>-<index>" (index counted from 0), keeps a pointer
// back to the physical device, and is assigned an equal share of the device's
// memory: total memory * deviceMemoryScalingFlag / deviceSplitCountFlag.
//
// Errors are reported through check, which is defined elsewhere in the
// package (presumably it aborts on a non-nil error — confirm). The result is
// nil when devices is empty or deviceSplitCountFlag is 0.
func Device2VDevice(devices []*Device) []*VDevice {
	var vdevices []*VDevice
	for _, d := range devices {
		dev, err := nvml.NewDeviceByUUID(d.ID)
		check(err)
		// Memory is a pointer field; report a descriptive error instead of
		// crashing with a bare nil dereference when it is not populated.
		if dev.Memory == nil {
			check(fmt.Errorf("device %s: total memory not reported by NVML", d.ID))
		}
		// The per-slice share is identical for all slices of one device, so
		// compute it once outside the inner loop.
		memory := uint64(float64(*dev.Memory) * deviceMemoryScalingFlag / float64(deviceSplitCountFlag))
		for i := uint(0); i < deviceSplitCountFlag; i++ {
			vd := &VDevice{Device: d.Device, dev: d, memory: memory}
			vd.ID = fmt.Sprintf("%v-%v", d.ID, i)
			vdevices = append(vdevices, vd)
		}
	}
	return vdevices
}

// VDevicesByIDs resolves each requested ID to the virtual device carrying
// that ID. The result follows the order of ids and repeats a device when its
// ID is requested more than once. The first ID that matches no virtual device
// aborts the lookup with an error and a nil result.
func VDevicesByIDs(vdevices []*VDevice, ids []string) ([]*VDevice, error) {
	// Index the virtual devices by ID so each request is a single lookup.
	byID := make(map[string]*VDevice, len(vdevices))
	for i := range vdevices {
		byID[vdevices[i].ID] = vdevices[i]
	}

	var matched []*VDevice
	for _, id := range ids {
		vd, found := byID[id]
		if !found {
			return nil, fmt.Errorf("unknown device: %s", id)
		}
		matched = append(matched, vd)
	}
	return matched, nil
}

// UniqueDeviceIDs returns the IDs of the physical devices behind the given
// virtual devices, each ID once, in order of first appearance. The result is
// nil when vdevices is empty.
func UniqueDeviceIDs(vdevices []*VDevice) []string {
	seen := make(map[string]struct{}, len(vdevices))
	var unique []string
	for _, vd := range vdevices {
		id := vd.dev.ID
		if _, dup := seen[id]; dup {
			// Another slice of the same physical device was already recorded.
			continue
		}
		seen[id] = struct{}{}
		unique = append(unique, id)
	}
	return unique
}

//func VDeviceHealth(vdevices []*VDevice, id string, health string) {
// for _, vd := range vdevices {
// if vd.dev.ID == id {
// vd.Health = health
// }
// }
//}
1 change: 1 addition & 0 deletions vgpu/ld.so.preload
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/usr/local/vgpu/libvgpu.so
Binary file added vgpu/libvgpu.so
Binary file not shown.

0 comments on commit 1619a9b

Please sign in to comment.