Skip to content

Commit

Permalink
[CI] Use k8s cluster (dmlc#2957)
Browse files Browse the repository at this point in the history
* add

* fix

* set default

* fix

* try master

* try fix

* try

* fix

* 111

* fix

* fix

* update

* ccc

* try

* fix

* fix

* try new machine

* fix

* fix

* fix

* Revert "fix"

This reverts commit e716d66.

* try

* more parallel

* use k8s for all

* fix name

* try not specify instance type

* ci

* use one yaml

* Revert "use one yaml"

This reverts commit 717d8d8.

* add timeout

* fix permission

* mount efs

* print

* fix pvc

* fix

* restrict num of gpu instances

* check

* fix

* fix
  • Loading branch information
VoVAllen authored Jun 4, 2021
1 parent 2df4a95 commit 6a56562
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 72 deletions.
129 changes: 58 additions & 71 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def cpp_unit_test_win64() {
def unit_test_linux(backend, dev) {
init_git()
unpack_lib("dgl-${dev}-linux", dgl_linux_libs)
timeout(time: 15, unit: 'MINUTES') {
timeout(time: 20, unit: 'MINUTES') {
sh "bash tests/scripts/task_unit_test.sh ${backend} ${dev}"
}
}
Expand Down Expand Up @@ -100,23 +100,23 @@ def is_authorized(name) {
}

pipeline {
agent any
triggers {
issueCommentTrigger('@dgl-bot .*')
}
agent any
stages {
stage('Regression Test Trigger') {
agent {
docker {
label 'linux-benchmark-node'
image 'dgllib/dgl-ci-lint'
alwaysPull true
}
kubernetes {
yamlFile 'docker/pods/ci-lint.yaml'
defaultContainer 'dgl-ci-lint'
}
}
when { triggeredBy 'IssueCommentCause' }
steps {
checkout scm
script {
// container('dgl-ci-lint') {
checkout scm
script {
def comment = env.GITHUB_COMMENT
def author = env.GITHUB_COMMENT_AUTHOR
echo("${env.GIT_URL}")
Expand All @@ -125,17 +125,17 @@ pipeline {
error('Not authorized to launch regression tests')
}
dir('benchmark_scripts_repo') {
checkout([$class: 'GitSCM', branches: [[name: '*/master']],
userRemoteConfigs: [[credentialsId: 'github', url: 'https://github.com/dglai/DGL_scripts.git']]])
checkout([$class: 'GitSCM', branches: [[name: '*/master']],
userRemoteConfigs: [[credentialsId: 'github', url: 'https://github.com/dglai/DGL_scripts.git']]])
}
sh('cp benchmark_scripts_repo/benchmark/* benchmarks/scripts/')
def command_lists = comment.split(' ')
def instance_type = command_lists[2].replace('.', '')
if (command_lists.size() != 5) {
pullRequest.comment('Cannot run the regression test due to unknown command')
error('Unknown command')
pullRequest.comment('Cannot run the regression test due to unknown command')
error('Unknown command')
} else {
pullRequest.comment("Start the Regression test. View at ${RUN_DISPLAY_URL}")
pullRequest.comment("Start the Regression test. View at ${RUN_DISPLAY_URL}")
}
def prNumber = env.BRANCH_NAME.replace('PR-', '')
dir('benchmarks/scripts') {
Expand All @@ -145,15 +145,15 @@ pipeline {
pullRequest.comment("Finished the Regression test. Result table is at https://dgl-asv-data.s3-us-west-2.amazonaws.com/${env.GIT_COMMIT}_${instance_type}/results/result.csv. Jenkins job link is ${RUN_DISPLAY_URL}. ")
currentBuild.result = 'SUCCESS'
return
}
}
// }
}
}
stage('Bot Instruction') {
agent {
docker {
label 'linux-benchmark-node'
image 'dgllib/dgl-ci-lint'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-lint.yaml'
defaultContainer 'dgl-ci-lint'
}
}
steps {
Expand All @@ -168,15 +168,14 @@ pipeline {
}
}
}
stage('CI'){
when { not {triggeredBy 'IssueCommentCause'} }
stages{
stage('CI') {
when { not { triggeredBy 'IssueCommentCause' } }
stages {
stage('Lint Check') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-lint'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-lint.yaml'
defaultContainer 'dgl-ci-lint'
}
}
steps {
Expand All @@ -189,14 +188,14 @@ pipeline {
}
}
}

stage('Build') {
parallel {
stage('CPU Build') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-cpu:conda'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-compile-cpu.yaml'
defaultContainer 'dgl-ci-cpu-compile'
}
}
steps {
Expand All @@ -210,11 +209,9 @@ pipeline {
}
stage('GPU Build') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-gpu:conda'
args '-u root'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-compile-gpu.yaml'
defaultContainer 'dgl-ci-gpu-compile'
}
}
steps {
Expand Down Expand Up @@ -247,10 +244,9 @@ pipeline {
parallel {
stage('C++ CPU') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-cpu:conda'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-cpu.yaml'
defaultContainer 'dgl-ci-cpu'
}
}
steps {
Expand All @@ -264,11 +260,9 @@ pipeline {
}
stage('C++ GPU') {
agent {
docker {
label 'linux-gpu-node'
image 'dgllib/dgl-ci-gpu:conda'
args '--runtime nvidia'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-gpu.yaml'
defaultContainer 'dgl-ci-gpu'
}
}
steps {
Expand All @@ -293,10 +287,9 @@ pipeline {
}
stage('Tensorflow CPU') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-cpu:conda'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-cpu.yaml'
defaultContainer 'dgl-ci-cpu'
}
}
stages {
Expand All @@ -314,11 +307,9 @@ pipeline {
}
stage('Tensorflow GPU') {
agent {
docker {
label 'linux-gpu-node'
image 'dgllib/dgl-ci-gpu:conda'
args '--runtime nvidia'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-gpu.yaml'
defaultContainer 'dgl-ci-gpu'
}
}
stages {
Expand All @@ -336,10 +327,9 @@ pipeline {
}
stage('Torch CPU') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-cpu:conda'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-cpu.yaml'
defaultContainer 'dgl-ci-cpu'
}
}
stages {
Expand All @@ -355,6 +345,8 @@ pipeline {
}
stage('Tutorial test') {
steps {
sh 'ls -l /tmp/dataset/*'
sh 'ls -l /tmp/dataset/'
tutorial_test_linux('pytorch')
}
}
Expand Down Expand Up @@ -387,11 +379,9 @@ pipeline {
}
stage('Torch GPU') {
agent {
docker {
label 'linux-gpu-node'
image 'dgllib/dgl-ci-gpu:conda'
args '--runtime nvidia'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-gpu.yaml'
defaultContainer 'dgl-ci-gpu'
}
}
stages {
Expand All @@ -415,10 +405,9 @@ pipeline {
}
stage('MXNet CPU') {
agent {
docker {
label 'linux-c52x-node'
image 'dgllib/dgl-ci-cpu:conda'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-cpu.yaml'
defaultContainer 'dgl-ci-cpu'
}
}
stages {
Expand All @@ -441,11 +430,9 @@ pipeline {
}
stage('MXNet GPU') {
agent {
docker {
label 'linux-gpu-node'
image 'dgllib/dgl-ci-gpu:conda'
args '--runtime nvidia'
alwaysPull true
kubernetes {
yamlFile 'docker/pods/ci-gpu.yaml'
defaultContainer 'dgl-ci-gpu'
}
}
stages {
Expand Down
22 changes: 22 additions & 0 deletions docker/pods/ci-compile-cpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Pod template for the CPU compile job.
# Referenced from the Jenkinsfile 'CPU Build' stage via
# `yamlFile 'docker/pods/ci-compile-cpu.yaml'` with
# `defaultContainer 'dgl-ci-cpu-compile'`, so the container name below must
# stay in sync with that stage.
apiVersion: v1
kind: Pod
spec:
securityContext:
# UID 0: processes in the pod run as root.
runAsUser: 0
containers:
- name: dgl-ci-cpu-compile
image: dgllib/dgl-ci-cpu:conda
# Always re-pull so a re-pushed `conda` tag is picked up on every run.
imagePullPolicy: Always
# NOTE(review): presumably keeps the container alive so Jenkins can exec
# build steps inside it - confirm against the Kubernetes plugin setup.
tty: true
resources:
requests:
# Scheduler reservation only; no CPU or memory limit is declared here.
cpu: 16
# The block below would pin the pod to c5.9xlarge nodes. It is currently
# disabled, so the pod can land on any node that satisfies the CPU request.
# affinity:
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: beta.kubernetes.io/instance-type
# operator: In
# values:
# - c5.9xlarge
22 changes: 22 additions & 0 deletions docker/pods/ci-compile-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Pod template for the GPU-flavoured compile job.
# Referenced from the Jenkinsfile 'GPU Build' stage via
# `yamlFile 'docker/pods/ci-compile-gpu.yaml'` with
# `defaultContainer 'dgl-ci-gpu-compile'`, so the container name below must
# stay in sync with that stage.
apiVersion: v1
kind: Pod
spec:
securityContext:
# UID 0: processes in the pod run as root.
runAsUser: 0
containers:
- name: dgl-ci-gpu-compile
image: dgllib/dgl-ci-gpu:conda
# Always re-pull so a re-pushed `conda` tag is picked up on every run.
imagePullPolicy: Always
# NOTE(review): presumably keeps the container alive so Jenkins can exec
# build steps inside it - confirm against the Kubernetes plugin setup.
tty: true
resources:
requests:
# Only CPU is requested; this template declares no nvidia.com/gpu resource,
# i.e. the build uses the GPU image but does not reserve a GPU.
cpu: 32
# The block below would pin the pod to c5.9xlarge nodes. It is currently
# disabled, so the pod can land on any node that satisfies the CPU request.
# affinity:
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: beta.kubernetes.io/instance-type
# operator: In
# values:
# - c5.9xlarge
20 changes: 20 additions & 0 deletions docker/pods/ci-cpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Pod template for the CPU test jobs.
# Referenced from the Jenkinsfile CPU test stages (e.g. 'C++ CPU',
# 'Tensorflow CPU', 'Torch CPU', 'MXNet CPU') via
# `yamlFile 'docker/pods/ci-cpu.yaml'` with `defaultContainer 'dgl-ci-cpu'`.
apiVersion: v1
kind: Pod
spec:
securityContext:
# UID 0: processes in the pod run as root.
runAsUser: 0
containers:
- name: dgl-ci-cpu
image: dgllib/dgl-ci-cpu:conda
# Always re-pull so a re-pushed `conda` tag is picked up on every run.
imagePullPolicy: Always
# NOTE(review): presumably keeps the container alive so Jenkins can exec
# test steps inside it - confirm against the Kubernetes plugin setup.
tty: true
resources:
requests:
# Scheduler reservation only; no CPU or memory limit is declared here.
cpu: 8
volumeMounts:
# Exposes the `persistent-storage` volume inside the container at
# /tmp/dataset (the path the 'Tutorial test' stage lists with `ls -l`).
- name: persistent-storage
mountPath: /tmp/dataset
volumes:
# Backed by the pre-existing PersistentVolumeClaim `ogb-efs-claim`; the claim
# itself is not defined in this file and must exist in the target namespace.
# NOTE(review): presumably an EFS-backed dataset cache - confirm.
- name: persistent-storage
persistentVolumeClaim:
claimName: ogb-efs-claim
22 changes: 22 additions & 0 deletions docker/pods/ci-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Pod template for the GPU test jobs.
# Referenced from the Jenkinsfile GPU test stages (e.g. 'C++ GPU',
# 'Tensorflow GPU', 'Torch GPU', 'MXNet GPU') via
# `yamlFile 'docker/pods/ci-gpu.yaml'` with `defaultContainer 'dgl-ci-gpu'`.
apiVersion: v1
kind: Pod
spec:
securityContext:
# UID 0: processes in the pod run as root.
runAsUser: 0
containers:
- name: dgl-ci-gpu
image: dgllib/dgl-ci-gpu:conda
# Always re-pull so a re-pushed `conda` tag is picked up on every run.
imagePullPolicy: Always
# NOTE(review): presumably keeps the container alive so Jenkins can exec
# test steps inside it - confirm against the Kubernetes plugin setup.
tty: true
resources:
limits:
nvidia.com/gpu: 1 # requesting 1 GPU
# Hard scheduling requirement: the pod may only be placed on nodes whose
# instance-type label equals g4dn.2xlarge.
# NOTE(review): `beta.kubernetes.io/instance-type` is the deprecated form of
# `node.kubernetes.io/instance-type` - confirm the cluster's nodes carry the
# newer label before switching the key.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: beta.kubernetes.io/instance-type
operator: In
values:
- g4dn.2xlarge
14 changes: 14 additions & 0 deletions docker/pods/ci-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Pod template for the lightweight lint/bot jobs.
# Referenced from the Jenkinsfile stages 'Regression Test Trigger',
# 'Bot Instruction' and 'Lint Check' via `yamlFile 'docker/pods/ci-lint.yaml'`
# with `defaultContainer 'dgl-ci-lint'`.
apiVersion: v1
kind: Pod
spec:
securityContext:
# UID 0: processes in the pod run as root.
runAsUser: 0
containers:
- name: dgl-ci-lint
# No tag is given, so the registry's default tag is used.
image: dgllib/dgl-ci-lint
# Always re-pull so a re-pushed image is picked up on every run.
imagePullPolicy: Always
# NOTE(review): presumably keeps the container alive so Jenkins can exec
# steps inside it - confirm against the Kubernetes plugin setup.
tty: true
resources:
requests:
# Scheduler reservation only; no CPU or memory limit is declared here.
cpu: 4
# The pod runs under the `dglciuser` service account, which must already exist
# in the target namespace.
# NOTE(review): presumably grants the permissions the regression-test trigger
# needs - confirm which role bindings are attached.
serviceAccountName: dglciuser
2 changes: 1 addition & 1 deletion tests/scripts/build_dgl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ rm -rf _download

pushd build
cmake $CMAKE_VARS ..
make -j8
# Cap parallel jobs at the number of available CPUs: a bare `make -j` places
# no limit on concurrent jobs and can exhaust memory on the CI pod.
make -j"$(nproc)"
popd

pushd python
Expand Down

0 comments on commit 6a56562

Please sign in to comment.