[Feature](bangc-ops): add new op tensor_stride_process (Cambricon#494)
Co-authored-by: tudejiang <[email protected]>
tudejiang79 and tudejiang authored Mar 8, 2023
1 parent 1a0cafb commit 941b4a2
Showing 9 changed files with 1,442 additions and 19 deletions.
79 changes: 79 additions & 0 deletions bangc-ops/kernels/tensor_stride_process/tensor_stride_in_block.mlu
@@ -0,0 +1,79 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu.h"
#include "kernels/tensor_stride_process/tensor_stride_process_common.h"

#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024)
__nram__ char ram[SIZE_NRAM_BUF];

template <typename T>
__mlu_func__ void blockTensorStridedIn(T *input, TensorShape &input_shape,
T *output) {
int total_num = input_shape.total_num;
int data_per_core = total_num / taskDim;
int data_last_core = total_num / taskDim + total_num % taskDim;
// currently SIZE_NRAM_BUF does not exceed 2GB, keep int32 for performance
// reasons
int load_once = SIZE_NRAM_BUF / sizeof(T);
int load_repeat;
int load_remain;
if (taskId < taskDim - 1) {
load_once = load_once > data_per_core ? data_per_core : load_once;
load_repeat = data_per_core / load_once;
load_remain = data_per_core % load_once;
} else {
load_once = load_once > data_last_core ? data_last_core : load_once;
load_repeat = data_last_core / load_once;
load_remain = data_last_core % load_once;
}
for (int i = 0; i < load_repeat; i++) {
tensorStrideLoad<T>((T *)ram, input, i * load_once + taskId * data_per_core,
load_once, sizeof(T), input_shape);
__memcpy(output + i * load_once + taskId * data_per_core, (T *)ram,
load_once * sizeof(T), NRAM2GDRAM);
}
if (load_remain > 0) {
tensorStrideLoad<T>((T *)ram, input,
load_repeat * load_once + taskId * data_per_core,
load_remain, sizeof(T), input_shape);
__memcpy(output + load_repeat * load_once + taskId * data_per_core,
(T *)ram, load_remain * sizeof(T), NRAM2GDRAM);
}
}

template <typename T>
__mlu_global__ void MLUUnionKernelTensorStrideIn(const void *input,
TensorShape input_shape,
void *output) {
blockTensorStridedIn((T *)input, input_shape, (T *)output);
}

template void MLUUnionKernelTensorStrideIn<int8>(const void *input,
TensorShape input_shape,
void *output);
template void MLUUnionKernelTensorStrideIn<half>(const void *input,
TensorShape input_shape,
void *output);
template void MLUUnionKernelTensorStrideIn<float>(const void *input,
TensorShape input_shape,
void *output);
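For reference, the in-kernel above splits total_num elements evenly across the taskDim cores, with the last core absorbing the remainder. Below is a minimal host-side C++ sketch of that partition; it is not part of the commit, and total_num and taskDim are hypothetical values standing in for the runtime-provided ones:

#include <cstdio>

int main() {
  const int total_num = 10;  // hypothetical element count
  const int taskDim = 4;     // hypothetical core count
  for (int taskId = 0; taskId < taskDim; ++taskId) {
    int data_per_core = total_num / taskDim;
    // mirrors data_per_core / data_last_core in blockTensorStridedIn
    int count = (taskId == taskDim - 1) ? data_per_core + total_num % taskDim
                                        : data_per_core;
    int offset = taskId * data_per_core;  // same offset the kernel uses
    printf("core %d: offset %d, count %d\n", taskId, offset, count);
  }
  return 0;
}

With total_num = 10 and taskDim = 4, cores 0-2 each process 2 elements and core 3 processes 4, so the range [0, 10) is covered with the whole remainder landing on the last core.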
@@ -0,0 +1,72 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu.h"
#include "kernels/tensor_stride_process/tensor_stride_process_common.h"

#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024)
__nram__ char ram[SIZE_NRAM_BUF];

template <typename T>
__mlu_func__ void blockTensorStridedOut(T *input, T *output,
TensorShape &output_shape) {
int total_num = output_shape.total_num;
int rem_per_core = total_num % taskDim;
int data_per_core =
taskId < rem_per_core ? total_num / taskDim + 1 : total_num / taskDim;

// currently SIZE_NRAM_BUF does not exceed 2GB, keep int32 for performance
// reasons
int load_once = SIZE_NRAM_BUF / sizeof(T);
int load_repeat = data_per_core / load_once;
int load_remain = data_per_core % load_once;

int gdram_offset = taskId < rem_per_core
? taskId * data_per_core
: taskId * data_per_core + rem_per_core;
for (int i = 0; i < load_repeat; i++) {
__memcpy((T *)ram, input + gdram_offset + i * load_once,
load_once * sizeof(T), GDRAM2NRAM);
tensorStrideStore<T>(output, gdram_offset + i * load_once, (T *)ram,
load_once, sizeof(T), output_shape);
}
if (load_remain > 0) {
__memcpy((T *)ram, input + gdram_offset + load_repeat * load_once,
load_remain * sizeof(T), GDRAM2NRAM);
tensorStrideStore<T>(output, gdram_offset + load_repeat * load_once,
(T *)ram, load_remain, sizeof(T), output_shape);
}
}

template <typename T>
__mlu_global__ void MLUUnionKernelTensorStrideOut(const void *input,
void *output,
TensorShape output_shape) {
blockTensorStridedOut((T *)input, (T *)output, output_shape);
}

template __mlu_global__ void MLUUnionKernelTensorStrideOut<int8>(
const void *input, void *output, TensorShape output_shape);
template __mlu_global__ void MLUUnionKernelTensorStrideOut<half>(
const void *input, void *output, TensorShape output_shape);
template __mlu_global__ void MLUUnionKernelTensorStrideOut<float>(
const void *input, void *output, TensorShape output_shape);
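Unlike the in-kernel, blockTensorStridedOut balances the split: the first total_num % taskDim cores each take one extra element, and gdram_offset compensates for the shift. A minimal sketch of the same arithmetic, not part of the commit, with hypothetical values:

#include <cstdio>

int main() {
  const int total_num = 10;  // hypothetical element count
  const int taskDim = 4;     // hypothetical core count
  const int rem_per_core = total_num % taskDim;
  for (int taskId = 0; taskId < taskDim; ++taskId) {
    int data_per_core = taskId < rem_per_core ? total_num / taskDim + 1
                                              : total_num / taskDim;
    int gdram_offset = taskId < rem_per_core
                           ? taskId * data_per_core
                           : taskId * data_per_core + rem_per_core;
    printf("core %d: offset %d, count %d\n", taskId, gdram_offset,
           data_per_core);
  }
  return 0;
}

This prints the ranges [0,3), [3,6), [6,8), [8,10): a gap-free cover of all elements in which no core does more than one element of extra work.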
134 changes: 115 additions & 19 deletions bangc-ops/kernels/tensor_stride_process/tensor_stride_process.cpp
@@ -35,24 +35,6 @@

using std::vector;

// Check if the tensor needs stride processing.
bool ifNeedTensorStrideProcess(const mluOpTensorDescriptor_t tensor_desc) {
bool needStrideProcess = false;
int tensor_dim = tensor_desc->dim;
int stride_base = 1;
for (int i = tensor_dim - 1; i >= 0; i--) {
if (tensor_desc->dims[i] != 1) {
if (tensor_desc->strides[i] == stride_base) {
stride_base *= tensor_desc->dims[i];
} else {
needStrideProcess = true;
break;
}
}
}
return needStrideProcess;
}

bool isDenseStrideTensor(const mluOpTensorDescriptor_t tensor_desc) {
int tensor_dim = tensor_desc->dim;
std::vector<int> dims;
@@ -90,6 +72,93 @@ bool isDenseStrideTensor(const mluOpTensorDescriptor_t tensor_desc) {
return true;
}

// Check if the tensor needs stride processing.
bool ifNeedTensorStrideProcess(const mluOpTensorDescriptor_t tensor_desc) {
bool needStrideProcess = false;
int tensor_dim = tensor_desc->dim;
int stride_base = 1;
for (int i = tensor_dim - 1; i >= 0; i--) {
if (tensor_desc->dims[i] != 1) {
if (tensor_desc->strides[i] == stride_base) {
stride_base *= tensor_desc->dims[i];
} else {
needStrideProcess = true;
break;
}
}
}
return needStrideProcess;
}
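
The check walks the strides from the innermost dim outward and flags the tensor as soon as a stride breaks row-major contiguity; size-1 dims are skipped because they cannot affect the layout. A standalone C++ sketch of the same loop, not part of the commit, on hypothetical shapes:

#include <cstdio>

static bool needsStrideProcess(const int *dims, const int *strides, int dim) {
  int stride_base = 1;
  for (int i = dim - 1; i >= 0; --i) {
    if (dims[i] == 1) continue;                  // size-1 dims never matter
    if (strides[i] != stride_base) return true;  // contiguity broken
    stride_base *= dims[i];
  }
  return false;
}

int main() {
  int dims[4] = {2, 3, 4, 5};
  int contiguous[4] = {60, 20, 5, 1};  // row-major strides: no processing
  int permuted[4] = {60, 5, 20, 1};    // dims 1 and 2 swapped in memory
  printf("%d %d\n", needsStrideProcess(dims, contiguous, 4),
         needsStrideProcess(dims, permuted, 4));  // prints "0 1"
  return 0;
}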

// Check if the strided store is a 021 transpose with padding on dimension 1
// or 2; for a strided load, the corresponding operation is actually a crop.
// When dims_ptr / strides_ptr are non-null, they are filled with the merged
// dims and strides.
bool isTransPadStride(TensorShape &tensor_shape, int *dims_ptr,
int *strides_ptr) {
// collect valid dims and merge contiguous ones
vector<int> dims;
vector<int> strides;
int begin = 0;
while (begin < MLUOP_DIM_MAX) {
// skip leading dims of size 1
if (tensor_shape.tensor_dims[begin] != 1) {
// start scanning the following dims and strides
int dim = tensor_shape.tensor_dims[begin];
int stride = tensor_shape.tensor_strides[begin];
while (begin + 1 < MLUOP_DIM_MAX) {
// adjacent dims are contiguous and can be merged
if (tensor_shape.tensor_strides[begin] ==
(tensor_shape.tensor_strides[begin + 1] *
tensor_shape.tensor_dims[begin + 1])) {
dim *= tensor_shape.tensor_dims[begin + 1];
stride = tensor_shape.tensor_strides[begin + 1];
begin++;
} else {
break;
}
}
// fill in the merged dim and stride
dims.push_back(dim);
strides.push_back(stride);
}
begin++;
}
// only handle the three-dimension case, with notation nhw
if (dims.size() != 3) {
return false;
}
// the stride of h == 1 means transpose
bool is_trans = false;
if (strides[1] == 1) {
is_trans = true;
}
// there are two kinds of pad: pad ho or pad wo
bool is_pad = false;
// if pad ho, wo should be equal to hi
if (strides[2] == dims[1] && (strides[0] % strides[2]) == 0 &&
(strides[0] / strides[2]) > dims[2]) {
is_pad = true;
}
// if pad wo, ho should be equal to wi
if (strides[2] > dims[1] && strides[0] == (dims[2] * strides[2])) {
is_pad = true;
}
// return the merged dims and strides
if (is_trans && is_pad) {
if (dims_ptr != nullptr) {
for (int i = 0; i < dims.size(); i++) {
dims_ptr[i] = dims[i];
}
}
if (strides_ptr != nullptr) {
for (int i = 0; i < strides.size(); i++) {
strides_ptr[i] = strides[i];
}
}
}
return is_trans && is_pad;
}
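
After merging, the shape is reduced to three dims denoted n, h, w; strides[1] == 1 means h is innermost in memory (the 021 transpose), and the two stride patterns distinguish padding on ho versus wo. A sketch of those post-merge conditions, not part of the commit, on a hypothetical dims/strides pair:

#include <cstdio>

static bool transPad(const int dims[3], const int strides[3]) {
  bool is_trans = (strides[1] == 1);  // h innermost in memory: 021 transpose
  bool pad_ho = strides[2] == dims[1] && strides[0] % strides[2] == 0 &&
                strides[0] / strides[2] > dims[2];
  bool pad_wo = strides[2] > dims[1] && strides[0] == dims[2] * strides[2];
  return is_trans && (pad_ho || pad_wo);
}

int main() {
  int dims[3] = {2, 3, 4};      // n, h, w
  int strides[3] = {18, 1, 3};  // h contiguous; 18 / 3 = 6 > 4, so ho is padded
  printf("%d\n", transPad(dims, strides));  // prints "1"
  return 0;
}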

// Get the tensor's dims and strides from tensor_desc.
void getTensorShape(const mluOpTensorDescriptor_t tensor_desc,
TensorShape *tensor_shape) {
@@ -231,10 +300,26 @@ void getExpandTensorShape(const mluOpTensorDescriptor_t tensor_desc,
}
}

static size_t shapeStrideCount(const mluOpTensorDescriptor_t desc) {
size_t total = 1;
for (int i = 0; i < desc->dim; ++i) {
if (desc->dims[i] == 0) {
total = 0;
break;
}
total += (desc->dims[i] - 1) * desc->strides[i];
}
return total;
}
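
shapeStrideCount returns the number of elements a strided view must be able to address: 1 plus the offset of the farthest reachable element, i.e. 1 + sum over i of (dims[i] - 1) * strides[i], with any zero-size dim collapsing the count to 0. The callers below use it to reject oversized strided views on architectures older than MLU590. A worked example with hypothetical dims and strides, not part of the commit:

#include <cstdio>

int main() {
  const int dims[2] = {2, 3};
  const int strides[2] = {4, 1};
  size_t total = 1;
  for (int i = 0; i < 2; ++i) {
    total += (size_t)(dims[i] - 1) * strides[i];
  }
  printf("%zu\n", total);  // 1 + 1*4 + 2*1 = 7 addressable elements
  return 0;
}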

// Policy function
static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type, int total_num) {
*k_type = CNRT_FUNC_TYPE_UNION1;
if (handle->sram_size <= 0) {
*k_type = CNRT_FUNC_TYPE_BLOCK;
} else {
*k_type = CNRT_FUNC_TYPE_UNION1;
}
uint32_t union_number = mluop::runtime::getClusterLimitCapability(handle);

// Split to different cores according to total_num.
@@ -250,6 +335,11 @@ static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim,
mluOpStatus_t MLUOP_WIN_API mluOpTensorStrideIn(
mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc,
const void *input, void *output) {
if (handle->arch < MLUOP_MLU590) {
size_t num_with_stride = shapeStrideCount(input_desc);
TENSOR_NUM_CHECK("[mluOpTensorStrideIn]", num_with_stride, LARGE_TENSOR_NUM,
"input tensor num with stride is too large. ");
}
TensorShape input_shape;
getTensorShape(input_desc, &input_shape);
mluOpDataType_t data_type = input_desc->dtype;
Expand All @@ -271,6 +361,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpTensorStrideOut(
const void *input, void *output) {
TensorShape output_shape;
getTensorShape(input_desc, &output_shape);
if (handle->arch < MLUOP_MLU590) {
size_t num_with_stride = shapeStrideCount(input_desc);
TENSOR_NUM_CHECK("[mluOpTensorStrideOut]", num_with_stride,
LARGE_TENSOR_NUM,
"input tensor num with stride is too large. ");
}

mluOpDataType_t data_type = input_desc->dtype;
cnrtDim3_t k_dim;
@@ -43,6 +43,8 @@ bool isDenseStrideTensor(const mluOpTensorDescriptor_t tensor_desc);

bool strideCaseWithNotConsistentDense(int tensor_num, ...);

bool isTransPadStride(TensorShape &tensor_shape, int *dims, int *strides);

void getTensorShape(const mluOpTensorDescriptor_t tensor_desc,
TensorShape *tensor_shape);

