Merge pull request JuliaGPU#1200 from JuliaGPU/tb/ci
CI improvements.
maleadt authored Oct 15, 2021
2 parents aef1a6f + ca4b3ce commit 27c87a6
Showing 7 changed files with 81 additions and 89 deletions.
73 changes: 35 additions & 38 deletions .buildkite/pipeline.yml
@@ -20,33 +20,30 @@ steps:
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 120

- wait # smoke test: don't kick off a large number of tests if they'd all just fail

# XXX: disabled due to memory requirements; re-enable on the new CI machines
# - label: "Julia 1.6 (debug)"
# plugins:
# - JuliaCI/julia#v1:
# version: 1.6
# - JuliaCI/julia-test#v1:
# julia_args: "-g2"
# test_args: "--sanitize --quickfail --jobs=2"
# - JuliaCI/julia-coverage#v1:
# codecov: true
# dirs:
# - src
# - lib
# - examples
# agents:
# queue: "juliagpu"
# cuda: "11.0"
# cap: "sm_80" # test as much as possible
# env:
# JULIA_CUDA_VERSION: '11.4'
# JULIA_CUDA_DEBUG_INFO: 'false' # NVIDIA bug #3305774: ptxas segfaults without debug info
# JULIA_CUDA_USE_BINARYBUILDER: 'true'
# if: build.message !~ /\[skip tests\]/ &&
# !build.pull_request.draft
# timeout_in_minutes: 120
- label: "Julia 1.6 (debug)"
plugins:
- JuliaCI/julia#v1:
version: 1.6
- JuliaCI/julia-test#v1:
julia_args: "-g2"
test_args: "--sanitize --quickfail --jobs=1"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
- lib
- examples
agents:
queue: "juliagpu"
cuda: "11.0"
cap: "sm_80"
env:
JULIA_CUDA_VERSION: '11.4'
JULIA_CUDA_DEBUG_INFO: 'false' # NVIDIA bug #3305774: ptxas segfaults without debug info
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/ &&
!build.pull_request.draft
timeout_in_minutes: 180

- label: "Julia 1.7"
plugins:
@@ -244,17 +241,17 @@ steps:

# special tests

- label: "Windows"
plugins:
- JuliaCI/julia#v1:
version: 1.6
- JuliaCI/julia-test#v1: ~
# XXX: no coverage, as no secrets on Windows
agents:
queue: "juliagpu-windows"
cuda: "*"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
# - label: "Windows"
# plugins:
# - JuliaCI/julia#v1:
# version: 1.6
# - JuliaCI/julia-test#v1: ~
# # XXX: no coverage, as no secrets on Windows
# agents:
# queue: "juliagpu-windows"
# cuda: "*"
# if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
# timeout_in_minutes: 120

- label: "NNlibCUDA.jl"
plugins:
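For reference, the re-enabled "Julia 1.6 (debug)" job above passes `-g2` to Julia and `--sanitize --quickfail --jobs=1` to the test harness. A minimal sketch of reproducing that configuration locally via Pkg.test, assuming a CUDA-capable machine and that runtests.jl accepts these flags as the pipeline suggests:

```julia
using Pkg

# Hedged sketch: run the CUDA.jl test suite the way the debug CI job does,
# with debug-level codegen for Julia itself plus the harness flags from the
# pipeline above. Requires a local GPU; flag names come from the pipeline,
# not from any additional documentation.
Pkg.test("CUDA";
         julia_args = ["-g2"],
         test_args  = ["--sanitize", "--quickfail", "--jobs=1"])
```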
8 changes: 4 additions & 4 deletions test/cudnn/convolution.jl
@@ -184,10 +184,10 @@ false && @testset "cudnn/convolution" begin

# Test tensor format
cx2,cw2,cb2 = (x->permutedims(x,(3,1,2,4))).((cx,cw,cb))
whcn = cudnnConvolutionForward(cw,cx)
cwhn = cudnnConvolutionForward(cw2,cx2,format=CUDNN_TENSOR_NHWC)
whcn = cudnnConvolutionForward(cw,cx) |> Array
cwhn = cudnnConvolutionForward(cw2,cx2,format=CUDNN_TENSOR_NHWC) |> Array
@test cwhn ≈ permutedims(whcn,(3,1,2,4))
whcn = cudnnConvolutionForward(cw,cx;bias=cb)
cwhn = cudnnConvolutionForward(cw2,cx2;bias=cb2,format=CUDNN_TENSOR_NHWC)
whcn = cudnnConvolutionForward(cw,cx;bias=cb) |> Array
cwhn = cudnnConvolutionForward(cw2,cx2;bias=cb2,format=CUDNN_TENSOR_NHWC) |> Array
@test cwhn ≈ permutedims(whcn,(3,1,2,4))
end
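The convolution change above only moves the results to the host (`|> Array`) before comparing. A small, self-contained sketch of that pattern with the same WHCN/CWHN permutation, using plain CUDA arrays rather than the cuDNN calls (an illustration, not the test itself, and it assumes a CUDA-capable machine):

```julia
using CUDA, Test

# Sketch of the host-side comparison pattern: compute on the GPU, then
# convert both sides to Array so the ≈ check runs on CPU memory.
x_whcn = CUDA.rand(Float32, 8, 8, 3, 2)        # W,H,C,N layout
x_cwhn = permutedims(x_whcn, (3, 1, 2, 4))     # C,W,H,N layout
# permuting back with the inverse permutation must recover the original
@test Array(permutedims(x_cwhn, (2, 3, 1, 4))) ≈ Array(x_whcn)
```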
10 changes: 5 additions & 5 deletions test/cudnn/dropout.jl
@@ -23,10 +23,10 @@ using CUDA.CUDNN:
x = CUDA.rand(N)
d = cudnnDropoutDescriptor(P)
cudnnDropoutSeed[] = 1
y = cudnnDropoutForward(x; dropout = P)
@test isapprox(mean(Array(y).==0), P; atol = 3/sqrt(N))
@test y == cudnnDropoutForward(x, d)
@test y == cudnnDropoutForward!(similar(x), x; dropout = P)
@test y == cudnnDropoutForward!(similar(x), x, d)
y = cudnnDropoutForward(x; dropout = P) |> Array
@test isapprox(mean(y.==0), P; atol = 3/sqrt(N))
@test y == cudnnDropoutForward(x, d) |> Array
@test y == cudnnDropoutForward!(similar(x), x; dropout = P) |> Array
@test y == cudnnDropoutForward!(similar(x), x, d) |> Array
cudnnDropoutSeed[] = -1
end
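The dropout test now compares the zero fraction on the host. A plain-CPU illustration of that statistical check (the `3/sqrt(N)` tolerance on the fraction of zeroed elements), assuming standard inverted dropout; this does not call cuDNN:

```julia
using Test, Statistics

# Plain-Julia sketch of the check above: with dropout probability P,
# roughly P of the N outputs should be exactly zero, within a few
# standard deviations (3/sqrt(N)) of the expected fraction.
N, P = 1000, 0.7
x = rand(Float32, N)
keep = rand(N) .>= P              # keep each element with probability 1 - P
y = x .* keep ./ (1 - P)          # inverted-dropout scaling
@test isapprox(mean(y .== 0), P; atol = 3/sqrt(N))
```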
24 changes: 12 additions & 12 deletions test/cudnn/multiheadattn.jl
@@ -116,18 +116,18 @@ using CUDA.CUDNN:
qProjSize, kProjSize, vProjSize, oProjSize,
qoMaxSeqLength, kvMaxSeqLength, maxBatchSize,
maxBeamSize, residuals, currIdx, loWinIdx, hiWinIdx)
@test y ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values; axes,
seqLengthsQO, seqLengthsKV, attnMode, nHeads,
smScaler, mathType, qProjSize, kProjSize,
vProjSize, oProjSize, qoMaxSeqLength,
kvMaxSeqLength, maxBatchSize, maxBeamSize,
residuals, currIdx, loWinIdx, hiWinIdx)
@test y ≈ cudnnMultiHeadAttnForward(weights, queries, keys, values, attnDesc;
axes, seqLengthsQO, seqLengthsKV, residuals,
currIdx, loWinIdx, hiWinIdx)
@test y ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values, attnDesc;
axes, seqLengthsQO, seqLengthsKV, residuals,
currIdx, loWinIdx, hiWinIdx)
@test Array(y) ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values; axes,
seqLengthsQO, seqLengthsKV, attnMode, nHeads,
smScaler, mathType, qProjSize, kProjSize,
vProjSize, oProjSize, qoMaxSeqLength,
kvMaxSeqLength, maxBatchSize, maxBeamSize,
residuals, currIdx, loWinIdx, hiWinIdx) |> Array
@test Array(y) ≈ cudnnMultiHeadAttnForward(weights, queries, keys, values, attnDesc;
axes, seqLengthsQO, seqLengthsKV, residuals,
currIdx, loWinIdx, hiWinIdx) |> Array
@test Array(y) ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values, attnDesc;
axes, seqLengthsQO, seqLengthsKV, residuals,
currIdx, loWinIdx, hiWinIdx) |> Array
end

Q,K,V,B,T,F = 6,6,5,4,3,Float32
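Each attention check above pairs the allocating call (`cudnnMultiHeadAttnForward`) with its in-place variant writing into `zero(y)`, comparing everything on the host. A toy sketch of that allocating/mutating convention with made-up functions (not the cuDNN API):

```julia
using Test

# Hypothetical f/f! pair: f allocates its output, f! fills a caller-provided
# buffer. The tests above apply the same pattern to the cuDNN wrappers,
# converting both results to Array before comparing.
f(x) = 2 .* x
f!(out, x) = (out .= 2 .* x; out)

x = rand(Float32, 4)
y = f(x)
@test Array(y) ≈ f!(zero(y), x) |> Array
```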
18 changes: 9 additions & 9 deletions test/cudnn/normalization.jl
@@ -84,15 +84,15 @@ using CUDA.CUDNN:
y0 = randn!(similar(x))
y1 = alpha * y
y2 = y1 + beta * y0
@test y1 ≈ cudnnNormalizationForward(x, xmean, xvar, bias, scale; training, z, mode,
normOps, algo, alpha, epsilon, groupCnt,
format, exponentialAverageFactor, savedMean,
savedInvVariance, activationDesc)
@test y2 ≈ cudnnNormalizationForward!(copy(y0), x, xmean, xvar, bias, scale;
training, z, mode, normOps, algo, alpha, beta,
epsilon, groupCnt, format,
exponentialAverageFactor, savedMean,
savedInvVariance, activationDesc)
@test Array(y1) ≈ cudnnNormalizationForward(x, xmean, xvar, bias, scale; training, z, mode,
normOps, algo, alpha, epsilon, groupCnt,
format, exponentialAverageFactor, savedMean,
savedInvVariance, activationDesc) |> Array
@test Array(y2) ≈ cudnnNormalizationForward!(copy(y0), x, xmean, xvar, bias, scale;
training, z, mode, normOps, algo, alpha, beta,
epsilon, groupCnt, format,
exponentialAverageFactor, savedMean,
savedInvVariance, activationDesc) |> Array
end

x, z, s = (CUDA.randn(x...) for x in ((5,4,3,2),(5,4,3,2),(1,1,3,1)))
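The expected values in the normalization test encode cuDNN's usual output blending, where the result is scaled by `alpha` and optionally accumulated into the destination scaled by `beta` (hence `y1 = alpha * y` and `y2 = y1 + beta * y0`). A tiny sketch of that arithmetic, independent of any library call:

```julia
using Test

# alpha/beta blending: y1 is the pure scaled result, y2 blends it into the
# destination's prior contents y0 (values here are arbitrary stand-ins).
alpha, beta = 2.0f0, 0.5f0
y  = rand(Float32, 4)    # stand-in for the raw normalization output
y0 = rand(Float32, 4)    # stand-in for the destination's prior contents
y1 = alpha * y
y2 = y1 + beta * y0
@test y2 ≈ alpha * y + beta * y0
```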
22 changes: 11 additions & 11 deletions test/cudnn/rnn.jl
@@ -96,17 +96,17 @@ using CUDA.CUDNN:
_y = copy(y)
_hy = (hy === nothing ? hy : copy(hy[]))
_cy = (cy === nothing ? cy : copy(cy[]))
@test _y ≈ cudnnRNNForward!(y, w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode,
hiddenSize, algo, cellMode, biasMode, dirMode, inputMode,
mathPrec, mathType, inputSize, projSize, numLayers, dropout, auxFlags)
(_hy === hy === nothing || _hy ≈ hy[])
(_cy === cy === nothing || _cy ≈ cy[])
@test _y ≈ cudnnRNNForward(w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode)
(_hy === hy === nothing || _hy ≈ hy[])
(_cy === cy === nothing || _cy ≈ cy[])
@test _y ≈ cudnnRNNForward!(y, w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode)
(_hy === hy === nothing || _hy ≈ hy[])
(_cy === cy === nothing || _cy ≈ cy[])
@test Array(_y) ≈ cudnnRNNForward!(y, w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode,
hiddenSize, algo, cellMode, biasMode, dirMode, inputMode,
mathPrec, mathType, inputSize, projSize, numLayers, dropout, auxFlags) |> Array
(_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[]))
(_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[]))
@test Array(_y) ≈ cudnnRNNForward(w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) |> Array
(_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[]))
(_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[]))
@test Array(_y) ≈ cudnnRNNForward!(y, w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) |> Array
(_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[]))
(_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[]))
end

rnntest()
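The updated RNN checks guard the optional hidden/cell-state comparisons with a short-circuit `||`, so the `@test` only runs when that state was actually requested. A minimal sketch of the guard with a stand-in value (not the RNN call):

```julia
using Test

# When the optional output was not requested, `hy === nothing` short-circuits
# and the comparison is skipped; otherwise the @test runs on host copies.
hy  = Ref(rand(Float32, 3))   # stand-in for a returned hidden state
_hy = copy(hy[])
(_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[]))

hy  = nothing                 # when no state is returned ...
_hy = nothing
(_hy === hy === nothing || @test false)   # ... the guard skips the @test
```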
15 changes: 5 additions & 10 deletions test/runtests.jl
@@ -171,10 +171,14 @@ has_cudnn() || push!(skip_tests, "cudnn")
has_cusolvermg() || push!(skip_tests, "cusolvermg")
has_nvml() || push!(skip_tests, "nvml")
if !has_cutensor() || CUDA.version() < v"10.1" || first(picks).cap < v"7.0" || do_sanitize
push!(skip_tests, "cutensor")
end
if do_sanitize
# XXX: some library tests fail under compute-sanitizer
append!(skip_tests, ["cutensor", "cusparse"])
# XXX: others take absurdly long
push!(skip_tests, "cusolver")
end
is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
if first(picks).cap < v"7.0"
push!(skip_tests, "device/intrinsics/wmma")
end
@@ -430,15 +434,6 @@ try
p = recycle_worker(p)
else
print_testworker_stats(test, wrkr, resp)

cpu_rss = resp[9]
if CUDA.getenv("CI", false) && cpu_rss > 4*2^30
# XXX: despite resetting the device and collecting garbage
# after each test, we are leaking CPU memory somewhere.
# this is a problem on CI, where we don't have much RAM.
# work around this by periodically recycling the worker.
p = recycle_worker(p)
end
end

# aggregate the snooped compiler invocations
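runtests.jl now extends `skip_tests` under `--sanitize` (cutensor, cusparse, and the slow cusolver suite) and records whether Julia is a debug build. A hypothetical sketch of how such a skip list is typically applied to the collected test names (illustrative only, not the harness's actual code):

```julia
# Hypothetical illustration: names pushed onto skip_tests are filtered out of
# the full test list before workers pick tests up.
skip_tests = String[]
do_sanitize = true
if do_sanitize
    append!(skip_tests, ["cutensor", "cusparse"])   # fail under compute-sanitizer
    push!(skip_tests, "cusolver")                    # takes absurdly long
end

all_tests = ["core", "cutensor", "cusparse", "cusolver", "nvml"]
tests = filter(t -> !(t in skip_tests), all_tests)
@assert tests == ["core", "nvml"]
```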
