Skip to content

Commit

Permalink
update: Gemini
Browse files Browse the repository at this point in the history
  • Loading branch information
StarCoral authored and justin0u0 committed May 19, 2022
1 parent 6442259 commit bfd3c33
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CONTAINER_PREFIX?=riyazhu
CONTAINER_NAME?=kubeshare-gemini-hook-init
CONTAINER_NAME?=kubeshare-gemini-scheduler
CONTAINER_VERSION?=db
CONTAINER_IMAGE=$(CONTAINER_PREFIX)/$(CONTAINER_NAME):$(CONTAINER_VERSION)

Expand Down
4 changes: 2 additions & 2 deletions deploy/node-daemon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ spec:
- name: "kubeshare-library"
mountPath: /kubeshare/library
- name: gemini-hook-init
image: riyazhu/kubeshare-gemini-hook-init:db #20211108 #20210706124950 #ncy9371/kubeshare-gemini-hook-init:20200429135835
image: cjobr/kubeshare-gemini-hook-init:20220401 #riyazhu/kubeshare-gemini-hook-init:db #20211108 #20210706124950 #ncy9371/kubeshare-gemini-hook-init:20200429135835
imagePullPolicy: Always
volumeMounts:
- name: "kubeshare-library"
Expand Down Expand Up @@ -93,7 +93,7 @@ spec:
- name: "kubeshare-log"
mountPath: "/kubeshare/log"
- name: gemini-scheduler
image: riyazhu/kubeshare-gemini-scheduler:db #20210706124950 #ncy9371/kubeshare-gemini-scheduler:20200429135835
image: cjobr/kubeshare-gemini-scheduler:20220127 #riyazhu/kubeshare-gemini-scheduler:db #20210706124950 #ncy9371/kubeshare-gemini-scheduler:20200429135835
imagePullPolicy: Always
volumeMounts:
- name: "kubeshare-scheduler"
Expand Down
5 changes: 3 additions & 2 deletions docker/kubeshare-gemini-hook-init/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# call from repo root

FROM nvidia/cuda:11.4.0-devel-ubuntu18.04 AS build
FROM nvidia/cuda:11.2.2-devel-ubuntu16.04 AS build
# nvidia/cuda:10.0-devel

WORKDIR /tmp/build
Expand All @@ -10,7 +10,7 @@ WORKDIR /tmp/build
COPY ./Gemini Gemini

RUN cd Gemini && \
make -C src CUDA_PATH=/usr/local/cuda/
make -C src CUDA_PATH=/usr/local/cuda/ DEBUG=1

# RUN apt update && \
# apt install -y git && \
Expand All @@ -25,3 +25,4 @@ FROM alpine:3.9

COPY --from=build /tmp/build/Gemini/lib/libgemhook.so.1 /libgemhook.so.1
CMD ["cp", "/libgemhook.so.1", "/kubeshare/library/libgemhook.so.1"]

1 change: 1 addition & 0 deletions docker/kubeshare-gemini-scheduler/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ COPY --from=build /tmp/build/Gemini/bin/gem-pmgr /gem-pmgr
#COPY --from=build /tmp/build/kubeshare/docker/kubeshare-gemini-scheduler/launch-backend.py /launch-backend.py
CMD ["/launcher-multigpus.sh", "/kubeshare/scheduler/config", "/kubeshare/scheduler/podmanagerport"]
#CMD ["/launch-backend.py", "/gem-schd", "/gem-pmgr"]

2 changes: 1 addition & 1 deletion pkg/scheduler/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func (kss *KubeShareScheduler) deletePod(obj interface{}) {
if port >= PodManagerPortStart {
kss.nodePodManagerPortBitmap[podStatus.nodeName].Unmask(port - PodManagerPortStart)
}
if podStatus.cells != nil {
if len(podStatus.cells) != 0 {
cell := podStatus.cells[0]
kss.reclaimResource(cell, request, memory)
}
Expand Down
26 changes: 26 additions & 0 deletions test/cifar10/cifar10.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Test: pod whose sharedgpu/gpu_request and sharedgpu/gpu_limit labels are
# both set correctly.
apiVersion: v1
kind: Pod
metadata:
  name: cifar10
  labels:
    # Kubernetes label values must be strings, so numeric values stay quoted.
    "sharedgpu/gpu_request": "1"
    "sharedgpu/gpu_limit": "1.0"
    "sharedgpu/priority": "100"
    "sharedgpu/gpu_model": "NVIDIA-GeForce-GTX-1080"
spec:
  schedulerName: kubeshare-scheduler
  restartPolicy: Never
  containers:
    - name: pytorch
      image: riyazhu/cifar10:test
      # Overrides the image's default command; the container only sleeps
      # (presumably so the workload can be started by hand -- confirm).
      command: ["sh", "-c", "sleep infinity"]
      imagePullPolicy: Always  # alternative: IfNotPresent
      volumeMounts:
        - name: datasets
          mountPath: "/datasets/"
  volumes:
    # Node-local directory exposed to the container at /datasets/.
    - name: datasets
      hostPath:
        path: "/home/riya/experiment/datasets/"
25 changes: 25 additions & 0 deletions test/cifar10/cifar10_pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# Test: pod whose sharedgpu/gpu_request and sharedgpu/gpu_limit labels are
# both set correctly.
# NOTE(review): no schedulerName is set, so the cluster's default scheduler
# handles this pod -- confirm that is intended.
apiVersion: v1
kind: Pod
metadata:
  name: cifar
  labels:
    # Kubernetes label values must be strings, so numeric values stay quoted.
    "sharedgpu/gpu_request": "1"
    "sharedgpu/gpu_limit": "1.0"
    "sharedgpu/priority": "100"
    "sharedgpu/gpu_model": "NVIDIA-GeForce-GTX-1080"
spec:
  restartPolicy: Never
  containers:
    - name: pytorch
      image: riyazhu/cifar10:test
      # Overrides the image's default command; the container only sleeps
      # (presumably so the workload can be started by hand -- confirm).
      command: ["sh", "-c", "sleep infinity"]
      imagePullPolicy: IfNotPresent
      volumeMounts:
        - name: datasets
          mountPath: "/datasets/"
  volumes:
    # Node-local directory exposed to the container at /datasets/.
    - name: datasets
      hostPath:
        path: "/home/riya/experiment/datasets/"
26 changes: 26 additions & 0 deletions test/lstm_wiki2/lstm1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Test: pod whose sharedgpu/gpu_request and sharedgpu/gpu_limit labels are
# both set correctly.
# NOTE(review): the file is lstm1.yaml but the pod is named lstm2 -- confirm
# the name is intentional.
apiVersion: v1
kind: Pod
metadata:
  name: lstm2
  labels:
    # Kubernetes label values must be strings, so numeric values stay quoted.
    "sharedgpu/gpu_request": "2.0"
    "sharedgpu/gpu_limit": "2.0"
    "sharedgpu/priority": "100"
    # "sharedgpu/gpu_model": "NVIDIA-GeForce-GTX-1080"
spec:
  schedulerName: kubeshare-scheduler
  restartPolicy: Never
  containers:
    - name: pytorch
      image: riyazhu/lstm-wiki2:test
      # The command override is disabled, so the image's default command runs.
      # command: ["sh", "-c", "sleep infinity"]
      imagePullPolicy: Always  # alternative: IfNotPresent
      volumeMounts:
        - name: datasets
          mountPath: "/datasets/"
  volumes:
    # Node-local directory exposed to the container at /datasets/.
    - name: datasets
      hostPath:
        path: "/home/riya/experiment/datasets/"
15 changes: 5 additions & 10 deletions test/mnist/mnist2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,22 @@ kind: Pod
metadata:
name: mnist2
labels:
"sharedgpu/gpu_request": "0.3"
"sharedgpu/gpu_request": "1"
"sharedgpu/gpu_limit": "1.0"
"sharedgpu/gpu_mem": "3073741824"
"sharedgpu/priority": "100"
"sharedgpu/gpu_model": "NVIDIA-GeForce-GTX-1080"
spec:
schedulerName: kubeshare-scheduler
restartPolicy: Never
containers:
- name: pytorch
image: riyazhu/mnist:20220510
image: riyazhu/mnist:test
command: ["sh", "-c", "sleep infinity"]
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "1"
memory: "4Gi"
imagePullPolicy: Always #IfNotPresent
volumeMounts:
- name: datasets
mountPath: "/workspace/datasets"
mountPath: "/datasets/"
volumes:
- name: datasets
hostPath:
path: "/home/riya/experiment/datasets"
path: "/home/riya/experiment/datasets/"
26 changes: 26 additions & 0 deletions test/mnist/mnist3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Test: pod whose sharedgpu/gpu_request and sharedgpu/gpu_limit labels are
# both set correctly, with the priority and gpu_model labels left out.
apiVersion: v1
kind: Pod
metadata:
  # Named after this file: test/mnist/mnist2.yaml already defines a pod called
  # mnist2, and two pods in one namespace cannot share a name.
  name: mnist3
  labels:
    # Kubernetes label values must be strings, so numeric values stay quoted.
    "sharedgpu/gpu_request": "1"
    "sharedgpu/gpu_limit": "1.0"
    # "sharedgpu/priority": "100"
    # "sharedgpu/gpu_model": "NVIDIA-GeForce-GTX-1080"
spec:
  schedulerName: kubeshare-scheduler
  restartPolicy: Never
  containers:
    - name: pytorch
      image: riyazhu/mnist:test
      # The command override is disabled, so the image's default command runs.
      # command: ["sh", "-c", "sleep infinity"]
      imagePullPolicy: Always  # alternative: IfNotPresent
      volumeMounts:
        - name: datasets
          mountPath: "/datasets/"
  volumes:
    # Node-local directory exposed to the container at /datasets/.
    - name: datasets
      hostPath:
        path: "/home/riya/experiment/datasets/"
26 changes: 26 additions & 0 deletions test/super_resolution/super_resolution.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Test: pod whose sharedgpu/gpu_request and sharedgpu/gpu_limit labels are
# both set correctly.
apiVersion: v1
kind: Pod
metadata:
  name: super-resolution
  labels:
    # Kubernetes label values must be strings, so numeric values stay quoted.
    "sharedgpu/gpu_request": "1"
    "sharedgpu/gpu_limit": "1.0"
    "sharedgpu/priority": "100"
    "sharedgpu/gpu_model": "NVIDIA-GeForce-GTX-1080"
spec:
  schedulerName: kubeshare-scheduler
  restartPolicy: Never
  containers:
    - name: pytorch
      image: riyazhu/super_resolution:test
      # Overrides the image's default command; the container only sleeps
      # (presumably so the workload can be started by hand -- confirm).
      command: ["sh", "-c", "sleep infinity"]
      imagePullPolicy: Always  # alternative: IfNotPresent
      volumeMounts:
        - name: datasets
          mountPath: "/datasets/"
  volumes:
    # Node-local directory exposed to the container at /datasets/.
    - name: datasets
      hostPath:
        path: "/home/riya/experiment/datasets/"

0 comments on commit bfd3c33

Please sign in to comment.