[Feature](bangc-ops): add new op tensor_stride_process (Cambricon#494)
Co-authored-by: tudejiang <[email protected]>
tudejiang79 and tudejiang authored Mar 8, 2023
1 parent 1a0cafb commit 941b4a2
Showing 9 changed files with 1,442 additions and 19 deletions.
79 changes: 79 additions & 0 deletions bangc-ops/kernels/tensor_stride_process/tensor_stride_in_block.mlu
@@ -0,0 +1,79 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu.h"
#include "kernels/tensor_stride_process/tensor_stride_process_common.h"

#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024)
__nram__ char ram[SIZE_NRAM_BUF];

template <typename T>
__mlu_func__ void blockTensorStridedIn(T *input, TensorShape &input_shape,
T *output) {
int total_num = input_shape.total_num;
int data_per_core = total_num / taskDim;
int data_last_core = total_num / taskDim + total_num % taskDim;
// currently SIZE_NRAM_BUF does not exceed 2GB, keep int32 for performance
// reasons
int load_once = SIZE_NRAM_BUF / sizeof(T);
int load_repeat;
int load_remain;
if (taskId < taskDim - 1) {
load_once = load_once > data_per_core ? data_per_core : load_once;
load_repeat = data_per_core / load_once;
load_remain = data_per_core % load_once;
} else {
load_once = load_once > data_last_core ? data_last_core : load_once;
load_repeat = data_last_core / load_once;
load_remain = data_last_core % load_once;
}
for (int i = 0; i < load_repeat; i++) {
tensorStrideLoad<T>((T *)ram, input, i * load_once + taskId * data_per_core,
load_once, sizeof(T), input_shape);
__memcpy(output + i * load_once + taskId * data_per_core, (T *)ram,
load_once * sizeof(T), NRAM2GDRAM);
}
if (load_remain > 0) {
tensorStrideLoad<T>((T *)ram, input,
load_repeat * load_once + taskId * data_per_core,
load_remain, sizeof(T), input_shape);
__memcpy(output + load_repeat * load_once + taskId * data_per_core,
(T *)ram, load_remain * sizeof(T), NRAM2GDRAM);
}
}

template <typename T>
__mlu_global__ void MLUUnionKernelTensorStrideIn(const void *input,
TensorShape input_shape,
void *output) {
blockTensorStridedIn((T *)input, input_shape, (T *)output);
}

template void MLUUnionKernelTensorStrideIn<int8>(const void *input,
TensorShape input_shape,
void *output);
template void MLUUnionKernelTensorStrideIn<half>(const void *input,
TensorShape input_shape,
void *output);
template void MLUUnionKernelTensorStrideIn<float>(const void *input,
TensorShape input_shape,
void *output);
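For reference, the in-kernel above splits total_num elements evenly across the taskDim cores, with the last core absorbing the remainder. Below is a minimal host-side C++ sketch of that partition; it is not part of the commit, and total_num and taskDim are hypothetical values standing in for the runtime-provided ones:

#include <cstdio>

int main() {
  const int total_num = 10;  // hypothetical element count
  const int taskDim = 4;     // hypothetical core count
  for (int taskId = 0; taskId < taskDim; ++taskId) {
    int data_per_core = total_num / taskDim;
    // mirrors data_per_core / data_last_core in blockTensorStridedIn
    int count = (taskId == taskDim - 1) ? data_per_core + total_num % taskDim
                                        : data_per_core;
    int offset = taskId * data_per_core;  // same offset the kernel uses
    printf("core %d: offset %d, count %d\n", taskId, offset, count);
  }
  return 0;
}

With total_num = 10 and taskDim = 4, cores 0-2 each process 2 elements and core 3 processes 4, so the range [0, 10) is covered with the whole remainder landing on the last core.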
@@ -0,0 +1,72 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu.h"
#include "kernels/tensor_stride_process/tensor_stride_process_common.h"

#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024)
__nram__ char ram[SIZE_NRAM_BUF];

template <typename T>
__mlu_func__ void blockTensorStridedOut(T *input, T *output,
TensorShape &output_shape) {
int total_num = output_shape.total_num;
int rem_per_core = total_num % taskDim;
int data_per_core =
taskId < rem_per_core ? total_num / taskDim + 1 : total_num / taskDim;

// currently SIZE_NRAM_BUF does not exceed 2GB, keep int32 for performance
// reasons
int load_once = SIZE_NRAM_BUF / sizeof(T);
int load_repeat = data_per_core / load_once;
int load_remain = data_per_core % load_once;

int gdram_offset = taskId < rem_per_core
? taskId * data_per_core
: taskId * data_per_core + rem_per_core;
for (int i = 0; i < load_repeat; i++) {
__memcpy((T *)ram, input + gdram_offset + i * load_once,
load_once * sizeof(T), GDRAM2NRAM);
tensorStrideStore<T>(output, gdram_offset + i * load_once, (T *)ram,
load_once, sizeof(T), output_shape);
}
if (load_remain > 0) {
__memcpy((T *)ram, input + gdram_offset + load_repeat * load_once,
load_remain * sizeof(T), GDRAM2NRAM);
tensorStrideStore<T>(output, gdram_offset + load_repeat * load_once,
(T *)ram, load_remain, sizeof(T), output_shape);
}
}

template <typename T>
__mlu_global__ void MLUUnionKernelTensorStrideOut(const void *input,
void *output,
TensorShape output_shape) {
blockTensorStridedOut((T *)input, (T *)output, output_shape);
}

template __mlu_global__ void MLUUnionKernelTensorStrideOut<int8>(
const void *input, void *output, TensorShape output_shape);
template __mlu_global__ void MLUUnionKernelTensorStrideOut<half>(
const void *input, void *output, TensorShape output_shape);
template __mlu_global__ void MLUUnionKernelTensorStrideOut<float>(
const void *input, void *output, TensorShape output_shape);
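Unlike the in-kernel, blockTensorStridedOut balances the split: the first total_num % taskDim cores each take one extra element, and gdram_offset compensates for the shift. A minimal sketch of the same arithmetic, not part of the commit, with hypothetical values:

#include <cstdio>

int main() {
  const int total_num = 10;  // hypothetical element count
  const int taskDim = 4;     // hypothetical core count
  const int rem_per_core = total_num % taskDim;
  for (int taskId = 0; taskId < taskDim; ++taskId) {
    int data_per_core = taskId < rem_per_core ? total_num / taskDim + 1
                                              : total_num / taskDim;
    int gdram_offset = taskId < rem_per_core
                           ? taskId * data_per_core
                           : taskId * data_per_core + rem_per_core;
    printf("core %d: offset %d, count %d\n", taskId, gdram_offset,
           data_per_core);
  }
  return 0;
}

This prints the ranges [0,3), [3,6), [6,8), [8,10): a gap-free cover of all elements in which no core does more than one element of extra work.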
134 changes: 115 additions & 19 deletions bangc-ops/kernels/tensor_stride_process/tensor_stride_process.cpp
@@ -35,24 +35,6 @@

using std::vector;

// Check if the tensor needs stride processing.
bool ifNeedTensorStrideProcess(const mluOpTensorDescriptor_t tensor_desc) {
bool needStrideProcess = false;
int tensor_dim = tensor_desc->dim;
int stride_base = 1;
for (int i = tensor_dim - 1; i >= 0; i--) {
if (tensor_desc->dims[i] != 1) {
if (tensor_desc->strides[i] == stride_base) {
stride_base *= tensor_desc->dims[i];
} else {
needStrideProcess = true;
break;
}
}
}
return needStrideProcess;
}

bool isDenseStrideTensor(const mluOpTensorDescriptor_t tensor_desc) {
int tensor_dim = tensor_desc->dim;
std::vector<int> dims;
@@ -90,6 +72,93 @@ bool isDenseStrideTensor(const mluOpTensorDescriptor_t tensor_desc) {
return true;
}

// Check if the tensor needs stride processing.
bool ifNeedTensorStrideProcess(const mluOpTensorDescriptor_t tensor_desc) {
bool needStrideProcess = false;
int tensor_dim = tensor_desc->dim;
int stride_base = 1;
for (int i = tensor_dim - 1; i >= 0; i--) {
if (tensor_desc->dims[i] != 1) {
if (tensor_desc->strides[i] == stride_base) {
stride_base *= tensor_desc->dims[i];
} else {
needStrideProcess = true;
break;
}
}
}
return needStrideProcess;
}
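
The check walks the strides from the innermost dim outward and flags the tensor as soon as a stride breaks row-major contiguity; size-1 dims are skipped because they cannot affect the layout. A standalone C++ sketch of the same loop, not part of the commit, on hypothetical shapes:

#include <cstdio>

static bool needsStrideProcess(const int *dims, const int *strides, int dim) {
  int stride_base = 1;
  for (int i = dim - 1; i >= 0; --i) {
    if (dims[i] == 1) continue;                  // size-1 dims never matter
    if (strides[i] != stride_base) return true;  // contiguity broken
    stride_base *= dims[i];
  }
  return false;
}

int main() {
  int dims[4] = {2, 3, 4, 5};
  int contiguous[4] = {60, 20, 5, 1};  // row-major strides: no processing
  int permuted[4] = {60, 5, 20, 1};    // dims 1 and 2 swapped in memory
  printf("%d %d\n", needsStrideProcess(dims, contiguous, 4),
         needsStrideProcess(dims, permuted, 4));  // prints "0 1"
  return 0;
}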

// Check if the strided store is a 021 transpose with padding on dimension 1
// or 2; for a strided load, the corresponding operation is actually a crop.
// When dims_ptr / strides_ptr are non-null, they are filled with the merged
// dims and strides.
bool isTransPadStride(TensorShape &tensor_shape, int *dims_ptr,
int *strides_ptr) {
// collect valid dims and merge contiguous ones
vector<int> dims;
vector<int> strides;
int begin = 0;
while (begin < MLUOP_DIM_MAX) {
// skip leading dims of size 1
if (tensor_shape.tensor_dims[begin] != 1) {
// start scanning the following dims and strides
int dim = tensor_shape.tensor_dims[begin];
int stride = tensor_shape.tensor_strides[begin];
while (begin + 1 < MLUOP_DIM_MAX) {
// adjacent dims are contiguous and can be merged
if (tensor_shape.tensor_strides[begin] ==
(tensor_shape.tensor_strides[begin + 1] *
tensor_shape.tensor_dims[begin + 1])) {
dim *= tensor_shape.tensor_dims[begin + 1];
stride = tensor_shape.tensor_strides[begin + 1];
begin++;
} else {
break;
}
}
// fill in the merged dim and stride
dims.push_back(dim);
strides.push_back(stride);
}
begin++;
}
// only handle the three-dimension case, with notation nhw
if (dims.size() != 3) {
return false;
}
// the stride of h == 1 means transpose
bool is_trans = false;
if (strides[1] == 1) {
is_trans = true;
}
// there are two kinds of pad: pad ho or pad wo
bool is_pad = false;
// if pad ho, wo should be equal to hi
if (strides[2] == dims[1] && (strides[0] % strides[2]) == 0 &&
(strides[0] / strides[2]) > dims[2]) {
is_pad = true;
}
// if pad wo, ho should be equal to wi
if (strides[2] > dims[1] && strides[0] == (dims[2] * strides[2])) {
is_pad = true;
}
// return the merged dims and strides
if (is_trans && is_pad) {
if (dims_ptr != nullptr) {
for (int i = 0; i < dims.size(); i++) {
dims_ptr[i] = dims[i];
}
}
if (strides_ptr != nullptr) {
for (int i = 0; i < strides.size(); i++) {
strides_ptr[i] = strides[i];
}
}
}
return is_trans && is_pad;
}
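
After merging, the shape is reduced to three dims denoted n, h, w; strides[1] == 1 means h is innermost in memory (the 021 transpose), and the two stride patterns distinguish padding on ho versus wo. A sketch of those post-merge conditions, not part of the commit, on a hypothetical dims/strides pair:

#include <cstdio>

static bool transPad(const int dims[3], const int strides[3]) {
  bool is_trans = (strides[1] == 1);  // h innermost in memory: 021 transpose
  bool pad_ho = strides[2] == dims[1] && strides[0] % strides[2] == 0 &&
                strides[0] / strides[2] > dims[2];
  bool pad_wo = strides[2] > dims[1] && strides[0] == dims[2] * strides[2];
  return is_trans && (pad_ho || pad_wo);
}

int main() {
  int dims[3] = {2, 3, 4};      // n, h, w
  int strides[3] = {18, 1, 3};  // h contiguous; 18 / 3 = 6 > 4, so ho is padded
  printf("%d\n", transPad(dims, strides));  // prints "1"
  return 0;
}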

// Get the tensor's dims and strides from tensor_desc.
void getTensorShape(const mluOpTensorDescriptor_t tensor_desc,
TensorShape *tensor_shape) {
@@ -231,10 +300,26 @@ void getExpandTensorShape(const mluOpTensorDescriptor_t tensor_desc,
}
}

static size_t shapeStrideCount(const mluOpTensorDescriptor_t desc) {
size_t total = 1;
for (int i = 0; i < desc->dim; ++i) {
if (desc->dims[i] == 0) {
total = 0;
break;
}
total += (desc->dims[i] - 1) * desc->strides[i];
}
return total;
}
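
shapeStrideCount returns the number of elements a strided view must be able to address: 1 plus the offset of the farthest reachable element, i.e. 1 + sum over i of (dims[i] - 1) * strides[i], with any zero-size dim collapsing the count to 0. The callers below use it to reject oversized strided views on architectures older than MLU590. A worked example with hypothetical dims and strides, not part of the commit:

#include <cstdio>

int main() {
  const int dims[2] = {2, 3};
  const int strides[2] = {4, 1};
  size_t total = 1;
  for (int i = 0; i < 2; ++i) {
    total += (size_t)(dims[i] - 1) * strides[i];
  }
  printf("%zu\n", total);  // 1 + 1*4 + 2*1 = 7 addressable elements
  return 0;
}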

// Policy function
static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type, int total_num) {
*k_type = CNRT_FUNC_TYPE_UNION1;
if (handle->sram_size <= 0) {
*k_type = CNRT_FUNC_TYPE_BLOCK;
} else {
*k_type = CNRT_FUNC_TYPE_UNION1;
}
uint32_t union_number = mluop::runtime::getClusterLimitCapability(handle);

// Split to different cores according to total_num.
@@ -250,6 +335,11 @@ static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim,
mluOpStatus_t MLUOP_WIN_API mluOpTensorStrideIn(
mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc,
const void *input, void *output) {
if (handle->arch < MLUOP_MLU590) {
size_t num_with_stride = shapeStrideCount(input_desc);
TENSOR_NUM_CHECK("[mluOpTensorStrideIn]", num_with_stride, LARGE_TENSOR_NUM,
"input tensor num with stride is too large. ");
}
TensorShape input_shape;
getTensorShape(input_desc, &input_shape);
mluOpDataType_t data_type = input_desc->dtype;
Expand All @@ -271,6 +361,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpTensorStrideOut(
const void *input, void *output) {
TensorShape output_shape;
getTensorShape(input_desc, &output_shape);
if (handle->arch < MLUOP_MLU590) {
size_t num_with_stride = shapeStrideCount(input_desc);
TENSOR_NUM_CHECK("[mluOpTensorStrideOut]", num_with_stride,
LARGE_TENSOR_NUM,
"input tensor num with stride is too large. ");
}

mluOpDataType_t data_type = input_desc->dtype;
cnrtDim3_t k_dim;
@@ -43,6 +43,8 @@ bool isDenseStrideTensor(const mluOpTensorDescriptor_t tensor_desc);

bool strideCaseWithNotConsistentDense(int tensor_num, ...);

bool isTransPadStride(TensorShape &tensor_shape, int *dims, int *strides);

void getTensorShape(const mluOpTensorDescriptor_t tensor_desc,
TensorShape *tensor_shape);

