fix typos

peggiezhu · Dec 18, 2018 · 382ed4a · 382ed4a
1 parent 803ff20
commit 382ed4a
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 44 deletions.
diff --git a/src/cuda/dcn_v2_cuda.cu b/src/cuda/dcn_v2_cuda.cu
@@ -13,6 +13,31 @@ extern THCState *state;
 // author: Charles Shang
 // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
 
+// [batch gemm]
+// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu
+
+// Fills the per-sample device pointer tables used by the batched GEMM path.
+// One thread handles one sample: entry idx points idx * <stride> elements past each base pointer.
+// weight and bias are shared within a mini-batch, so every weight_b / bias_b entry is the base.
+// Threads with idx >= num_batches write nothing; only the x dimension of the launch is used.
+__global__ void createBatchGemmBuffer(const float ** input_b, float ** output_b,
+    float ** columns_b, const float ** ones_b, const float ** weight_b, const float ** bias_b,
+    float * input, float * output, float * columns, float * ones, float * weight, float * bias,
+    const int input_stride, const int output_stride,
+    const int columns_stride, const int ones_stride, const int num_batches)
+{
+    // 64-bit index: idx * stride in 32-bit int can overflow for large tensors (e.g. columns).
+    const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (idx < num_batches) {
+        input_b[idx] = input + idx * input_stride;
+        output_b[idx] = output + idx * output_stride;
+        columns_b[idx] = columns + idx * columns_stride;
+        ones_b[idx] = ones + idx * ones_stride;
+        weight_b[idx] = weight;
+        bias_b[idx] = bias;
+    }
+}
+
 at::Tensor
 dcn_v2_cuda_forward(const at::Tensor &input,
                     const at::Tensor &weight,
@@ -29,6 +54,7 @@ dcn_v2_cuda_forward(const at::Tensor &input,
                     const int dilation_w,
                     const int deformable_group)
 {
+    using scalar_t = float;
     // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
     AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
     AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
@@ -59,50 +85,91 @@ dcn_v2_cuda_forward(const at::Tensor &input,
     const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
     const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
 
-    auto ones = at::ones({height_out, width_out}, input.options());
-    auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
+    auto ones = at::ones({batch, height_out, width_out}, input.options());
+    auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
     auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
 
-    using scalar_t = float;
-    for (int b = 0; b < batch; b++)
-    {
-        auto input_n = input.select(0, b);
-        auto offset_n = offset.select(0, b);
-        auto mask_n = mask.select(0, b);
-        auto output_n = output.select(0, b);
-
-        // Do Bias first:
-        // M,N,K are dims of matrix A and B
-        // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-        // (N x 1) (1 x M)
-        long m_ = channels_out;
-        long n_ = height_out * width_out;
-        long k_ = 1;
-        THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f,
-                         ones.contiguous().data<scalar_t>(), k_,
-                         bias.contiguous().data<scalar_t>(), k_, 0.0f,
-                         output_n.data<scalar_t>(), n_);
-
-        modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
-                                         input_n.data<scalar_t>(),
-                                         offset_n.data<scalar_t>(),
-                                         mask_n.data<scalar_t>(),
-                                         1, channels, height, width,
-                                         height_out, width_out, kernel_h, kernel_w,
-                                         pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
-                                         deformable_group,
-                                         columns.data<scalar_t>());
-
-        //(k * m)  x  (m * n)
-        // Y = WC
-        long m = channels_out;
-        long n = height_out * width_out;
-        long k = channels * kernel_h * kernel_w;
-        THCudaBlas_Sgemm(state, 'n', 'n', n, m, k, 1.0f,
-                         columns.data<scalar_t>(), n,
-                         weight.data<scalar_t>(), k, 1.0f,
-                         output_n.data<scalar_t>(), n);
-    }
+    // prepare for batch-wise computing, which is significantly faster than instance-wise computing 
+    // when batch size is large. 
+    // launch batch threads
+    int matrices_size = batch * sizeof(float *);
+    auto input_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
+    auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
+    auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
+    auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
+    auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
+    auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
+
+    const int block = 128;
+    const int grid = (batch + block - 1) / block;
+
+    createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+        input_b, output_b,
+        columns_b, ones_b,
+        weight_b, bias_b,
+        input.data<scalar_t>(), 
+        output.data<scalar_t>(),
+        columns.data<scalar_t>(), 
+        ones.data<scalar_t>(),
+        weight.data<scalar_t>(), 
+        bias.data<scalar_t>(),
+        channels * width * height, 
+        channels_out * width_out * height_out,
+        channels * kernel_h * kernel_w * height_out * width_out,
+        height_out * width_out,
+        batch);
+
+    long m_ = channels_out;
+    long n_ = height_out * width_out;
+    long k_ = 1;
+    THCudaBlas_SgemmBatched(state,
+                            't',
+                            'n',
+                            n_,
+                            m_,
+                            k_,
+                            1.0f,
+                            ones_b, k_,
+                            bias_b, k_,
+                            0.0f,
+                            output_b, n_,
+                            batch);
+
+    // NOTE(CharlesShang): in Dai Jifeng's MXNet implementation col_buffer has shape (c*kw*kh, N, oh, ow);
+    // here columns has shape (N, c*kw*kh, oh * ow) instead, so the axes need to be swapped
+    // auto columns_transpose = columns.transpose(0, 1).contiguous();
+    modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
+                                     input.data<scalar_t>(),
+                                     offset.data<scalar_t>(),
+                                     mask.data<scalar_t>(),
+                                     batch, channels, height, width,
+                                     height_out, width_out, kernel_h, kernel_w,
+                                     pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                                     deformable_group,
+                                     columns.data<scalar_t>());
+
+    long m = channels_out;
+    long n = height_out * width_out;
+    long k = channels * kernel_h * kernel_w;
+    THCudaBlas_SgemmBatched(state, 
+                            'n', 
+                            'n', 
+                            n, 
+                            m, 
+                            k, 
+                            1.0f,
+                            (const float **)columns_b, n,
+                            weight_b, k, 
+                            1.0f,
+                            output_b, n,
+                            batch);
+
+    THCudaFree(state, input_b);
+    THCudaFree(state, output_b);
+    THCudaFree(state, columns_b);
+    THCudaFree(state, ones_b);
+    THCudaFree(state, weight_b);
+    THCudaFree(state, bias_b);
     return output;
 }
 

diff --git a/src/cuda/dcn_v2_im2col_cuda.cu b/src/cuda/dcn_v2_im2col_cuda.cu
@@ -321,7 +321,7 @@ __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
 void modulated_deformable_im2col_cuda(cudaStream_t stream,
   const float* data_im, const float* data_offset, const float* data_mask,
   const int batch_size, const int channels, const int height_im, const int width_im, 
-  const int height_col, const int width_col, const int kernel_h, const int kenerl_w,
+  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
   const int pad_h, const int pad_w, const int stride_h, const int stride_w, 
   const int dilation_h, const int dilation_w,
   const int deformable_group, float* data_col) {
@@ -331,7 +331,7 @@ void modulated_deformable_im2col_cuda(cudaStream_t stream,
   modulated_deformable_im2col_gpu_kernel
       <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
           0, stream>>>(
-      num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kenerl_w,
+      num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
       batch_size, channels, deformable_group, height_col, width_col, data_col);