Skip to content

Commit

Permalink
fix: when multi container use device allocate fail
Browse files Browse the repository at this point in the history
Signed-off-by: rongfu.leng <[email protected]>
  • Loading branch information
lengrongfu committed Mar 26, 2024
1 parent 9a1a707 commit ddc4054
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pkg/device-plugin/hygon/dcu/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest)
responses.ContainerResponses = append(responses.ContainerResponses, &car)
}
klog.Infoln("response=", responses)
device.PodAllocationTrySuccess(nodename, current)
device.PodAllocationTrySuccess(nodename, hygon.HygonDCUDevice, current)
return &responses, nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/device-plugin/mlu/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ func (m *CambriconDevicePlugin) allocateMLUShare(ctx context.Context, reqs *plug
responses.ContainerResponses = append(responses.ContainerResponses, &resp)
}
klog.Infoln("response=", responses)
device.PodAllocationTrySuccess(nodename, current)
device.PodAllocationTrySuccess(nodename, cambricon.CambriconMLUDevice, current)
return &responses, nil
}

Expand Down
3 changes: 2 additions & 1 deletion pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.
nodelock.ReleaseNodeLock(nodename)
return &pluginapi.AllocateResponse{}, err
}
klog.V(5).Infof("allocate pod name is %s/%s, annotation is %+v", current.Namespace, current.Name, current.Annotations)

for idx, req := range reqs.ContainerRequests {
// If the devices being allocated are replicas, then (conditionally)
Expand Down Expand Up @@ -406,7 +407,7 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.
}
}
klog.Infoln("Allocate Response", responses.ContainerResponses)
device.PodAllocationTrySuccess(nodename, current)
device.PodAllocationTrySuccess(nodename, nvidia.NvidiaGPUDevice, current)
return &responses, nil
}

Expand Down
10 changes: 7 additions & 3 deletions pkg/device/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,13 @@ func init() {
DevicesToHandle = append(DevicesToHandle, iluvatar.IluvatarGPUCommonWord)
}

func PodAllocationTrySuccess(nodeName string, pod *v1.Pod) {
refreshed, _ := client.GetClient().CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{})
annos := refreshed.Annotations[util.AssignedIDsToAllocateAnnotations]
func PodAllocationTrySuccess(nodeName string, devName string, pod *v1.Pod) {
refreshed, err := client.GetClient().CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{})
if err != nil {
klog.Errorf("get pods %s/%s error: %+v", pod.Namespace, pod.Name, err)
return
}
annos := refreshed.Annotations[util.InRequestDevices[devName]]
klog.Infoln("TrySuccess:", annos)
for _, val := range DevicesToHandle {
if strings.Contains(annos, val) {
Expand Down
9 changes: 5 additions & 4 deletions pkg/device/nvidia/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,11 @@ func (dev *NvidiaGPUDevices) CheckType(annos map[string]string, d util.DeviceUsa
// PatchAnnotations records the NVIDIA device list of a pod in its annotations.
//
// When pd contains a non-empty entry for NvidiaGPUDevice, the list is encoded
// with util.EncodePodSingleDevice and stored in the map behind annoinput under
// both util.InRequestDevices[NvidiaGPUDevice] and
// util.SupportDevices[NvidiaGPUDevice]. The map is modified in place and the
// same (dereferenced) map is returned; it is returned untouched when pd has no
// NVIDIA devices.
func (dev *NvidiaGPUDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
devlist, ok := pd[NvidiaGPUDevice]
if ok && len(devlist) > 0 {
// NOTE(review): the next two assignments are overwritten a few lines below,
// where the same keys are set to deviceStr. They look like the pre-change
// lines of this diff shown next to their replacement - confirm.
(*annoinput)[util.InRequestDevices[NvidiaGPUDevice]] = util.EncodePodSingleDevice(devlist)
(*annoinput)[util.SupportDevices[NvidiaGPUDevice]] = util.EncodePodSingleDevice(devlist)
//InRequestDevices := util.EncodePodDevices(util.InRequestDevices, m.devices)
//supportDevices := util.EncodePodDevices(util.SupportDevices, m.devices)
// Encode once and reuse the string for both annotation keys and the logs.
deviceStr := util.EncodePodSingleDevice(devlist)
(*annoinput)[util.InRequestDevices[NvidiaGPUDevice]] = deviceStr
(*annoinput)[util.SupportDevices[NvidiaGPUDevice]] = deviceStr
// Verbose (V(5)) trace of each key/value pair written above.
klog.V(5).Infof("pod add notation key [%s], values is [%s]", util.InRequestDevices[NvidiaGPUDevice], deviceStr)
klog.V(5).Infof("pod add notation key [%s], values is [%s]", util.SupportDevices[NvidiaGPUDevice], deviceStr)
}
return *annoinput
}
Expand Down
10 changes: 4 additions & 6 deletions pkg/util/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,10 @@ import (
const (
//ResourceName = "nvidia.com/gpu"
//ResourceName = "hami.io/vgpu"
AssignedTimeAnnotations = "hami.io/vgpu-time"
AssignedIDsAnnotations = "hami.io/vgpu-ids-new"
AssignedIDsToAllocateAnnotations = "hami.io/devices-to-allocate"
AssignedNodeAnnotations = "hami.io/vgpu-node"
BindTimeAnnotations = "hami.io/bind-time"
DeviceBindPhase = "hami.io/bind-phase"
AssignedTimeAnnotations = "hami.io/vgpu-time"
AssignedNodeAnnotations = "hami.io/vgpu-node"
BindTimeAnnotations = "hami.io/bind-time"
DeviceBindPhase = "hami.io/bind-phase"

DeviceBindAllocating = "allocating"
DeviceBindFailed = "failed"
Expand Down
4 changes: 3 additions & 1 deletion pkg/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ func DecodeContainerDevices(str string) (ContainerDevices, error) {
}

func DecodePodDevices(checklist map[string]string, annos map[string]string) (PodDevices, error) {
klog.V(5).Infof("checklist is [%+v], annos is [%+v]", checklist, annos)
if len(annos) == 0 {
return PodDevices{}, nil
}
Expand Down Expand Up @@ -232,7 +233,7 @@ func GetNextDeviceRequest(dtype string, p v1.Pod) (v1.Container, ContainerDevice
if err != nil {
return v1.Container{}, ContainerDevices{}, err
}
klog.InfoS("pdevices", pdevices)
klog.Infof("pod annotation decode vaule is %+v", pdevices)
res := ContainerDevices{}

pd, ok := pdevices[dtype]
Expand Down Expand Up @@ -325,6 +326,7 @@ func PatchPodAnnotations(pod *v1.Pod, annotations map[string]string) error {
if err != nil {
return err
}
klog.V(5).Infof("patch pod %s/%s annotation content is %s", pod.Namespace, pod.Name, string(bytes))
_, err = client.GetClient().CoreV1().Pods(pod.Namespace).
Patch(context.Background(), pod.Name, k8stypes.StrategicMergePatchType, bytes, metav1.PatchOptions{})
if err != nil {
Expand Down
70 changes: 70 additions & 0 deletions pkg/util/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,73 @@ func TestPodDevicesCoding(t *testing.T) {
})
}
}

// Test_DecodePodDevices checks that DecodePodDevices converts pod annotations
// into the expected PodDevices value, both for an empty annotation map and for
// annotations that list two NVIDIA devices.
func Test_DecodePodDevices(t *testing.T) {
	const vendor = "NVIDIA"

	// InRequestDevices and SupportDevices are package-level maps shared with
	// every other test in this package. Remember what they held for the vendor
	// key and put it back when the test finishes so no state leaks out.
	prevInRequest, hadInRequest := InRequestDevices[vendor]
	prevSupport, hadSupport := SupportDevices[vendor]
	t.Cleanup(func() {
		if hadInRequest {
			InRequestDevices[vendor] = prevInRequest
		} else {
			delete(InRequestDevices, vendor)
		}
		if hadSupport {
			SupportDevices[vendor] = prevSupport
		} else {
			delete(SupportDevices, vendor)
		}
	})
	InRequestDevices[vendor] = "hami.io/vgpu-devices-to-allocate"
	SupportDevices[vendor] = "hami.io/vgpu-devices-allocated"

	// args bundles the two inputs of DecodePodDevices.
	type args struct {
		checklist map[string]string
		annos     map[string]string
	}

	// Annotation value shared by both annotation keys in the second case.
	const twoDevices = "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76,NVIDIA,500,3:;GPU-ebe7c3f7-303d-558d-435e-99a160631fe4,NVIDIA,500,3:;"

	tests := []struct {
		name    string
		args    args
		want    PodDevices
		wantErr error
	}{
		{
			name: "annos len is 0",
			args: args{
				checklist: map[string]string{},
				annos:     make(map[string]string),
			},
			want:    PodDevices{},
			wantErr: nil,
		},
		{
			name: "annos having two device",
			args: args{
				checklist: InRequestDevices,
				annos: map[string]string{
					InRequestDevices[vendor]: twoDevices,
					SupportDevices[vendor]:   twoDevices,
				},
			},
			want: PodDevices{
				"NVIDIA": {
					{
						{
							UUID:      "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
							Type:      "NVIDIA",
							Usedmem:   500,
							Usedcores: 3,
						},
					},
					{
						{
							UUID:      "GPU-ebe7c3f7-303d-558d-435e-99a160631fe4",
							Type:      "NVIDIA",
							Usedmem:   500,
							Usedcores: 3,
						},
					},
				},
			},
			wantErr: nil,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			got, gotErr := DecodePodDevices(test.args.checklist, test.args.annos)
			assert.DeepEqual(t, test.wantErr, gotErr)
			assert.DeepEqual(t, test.want, got)
		})
	}
}

0 comments on commit ddc4054

Please sign in to comment.