Skip to content

Commit

Permalink
Perception: lint CUDA kernel functions execution configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
mickeyouyou authored and jinghaomiao committed Jan 2, 2020
1 parent e197392 commit fef9a8e
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 141 deletions.
15 changes: 6 additions & 9 deletions modules/perception/inference/utils/cuda_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "modules/perception/inference/utils/cuda_util.h"

#include <cuda_runtime_api.h>
#include <boost/thread.hpp>

#include "boost/thread.hpp"
#include "cyber/common/log.h"
Expand All @@ -27,9 +28,9 @@ namespace inference {

static boost::thread_specific_ptr<CudaUtil> thread_instance_;

// Evaluates a cuBLAS call once and aborts (via CHECK_EQ) when it does not
// return CUBLAS_STATUS_SUCCESS; the raw status code is appended to the
// failure message to aid debugging. Wrapped in do/while(0) so the macro is
// safe as a single statement (e.g. inside an unbraced if).
#define CUBLAS_CHECK(condition)                               \
  do {                                                        \
    cublasStatus_t status = condition;                        \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " << status; \
  } while (0)

Expand All @@ -40,9 +41,7 @@ CudaUtil &CudaUtil::get() {
return *(thread_instance_.get());
}

// Constructs the per-thread utility object: creates its cuBLAS handle and
// aborts (via CUBLAS_CHECK) if cublasCreate fails.
CudaUtil::CudaUtil() { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); }

bool CudaUtil::set_device_id(int device_id) {
int now_device = -1;
Expand All @@ -61,9 +60,7 @@ bool CudaUtil::set_device_id(int device_id) {
}
return true;
}
// Returns the calling thread's cuBLAS handle; get() lazily creates the
// thread-local CudaUtil instance on first use.
cublasHandle_t &CudaUtil::get_handler() { return get().cublas_handle_; }

CudaUtil::~CudaUtil() {
if (get().cublas_handle_) {
Expand Down
79 changes: 29 additions & 50 deletions modules/perception/inference/utils/gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,14 @@

#include <algorithm>

#include "modules/perception/inference/utils/util.h"
#include "modules/perception/inference/utils/cuda_util.h"
#include "modules/perception/inference/utils/util.h"

namespace apollo {
namespace perception {
namespace inference {

__global__ void
sqrt_kernel(float *data, int width, int height) {
__global__ void sqrt_kernel(float *data, int width, int height) {
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

Expand All @@ -40,18 +39,15 @@ sqrt_kernel(float *data, int width, int height) {
}
}
}
// Element-wise product: y[i] = a[i] * b[i] for i in [0, n).
// Grid-stride loop, so any launch configuration covers all n elements.
__global__ void mul_kernel(const int n, const float *a, const float *b,
                           float *y) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);
       i += blockDim.x * gridDim.x) {
    y[i] = a[i] * b[i];
  }
}
__global__ void multi_scale_kernel(const float *data_in,
const float *scale,
float *data_out,
int width,
int height) {
__global__ void multi_scale_kernel(const float *data_in, const float *scale,
float *data_out, int width, int height) {
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

Expand All @@ -61,8 +57,8 @@ __global__ void multi_scale_kernel(const float *data_in,
}
}
// Fills y[0..n) with the constant alpha using a grid-stride loop, so the
// kernel is correct for any grid/block configuration.
__global__ void set_kernel(const int n, const float alpha, float *y) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);
       i += blockDim.x * gridDim.x) {
    y[i] = alpha;
  }
}
Expand All @@ -80,61 +76,44 @@ void GPUL2Norm::L2Norm(base::Blob<float> *input_data) {
GPUMSetFloat(ones_.count(), 1, ones_.mutable_gpu_data());

// x = input^2
GPUMultiFloat(input_data->count(),
input_data->gpu_data(),
input_data->gpu_data(),
square_.mutable_gpu_data());
GPUMultiFloat(input_data->count(), input_data->gpu_data(),
input_data->gpu_data(), square_.mutable_gpu_data());
// scale_ = (numxdim)*(dimx1) = (numx1)
GPUGemmFloat(CblasNoTrans,
CblasTrans,
num,
1,
dim,
1.0,
square_.gpu_data(),
ones_.gpu_data(),
0.0,
scale_.mutable_gpu_data());
GPUGemmFloat(CblasNoTrans, CblasTrans, num, 1, dim, 1.0, square_.gpu_data(),
ones_.gpu_data(), 0.0, scale_.mutable_gpu_data());
dim3 threadsPerBlock(32, 8);
dim3 numBlocks(dim / threadsPerBlock.x + 1, num / threadsPerBlock.y + 1);
sqrt_kernel << < numBlocks, threadsPerBlock >> >
(scale_.mutable_gpu_data(), 1, num);
sqrt_kernel<<<numBlocks, threadsPerBlock>>>(scale_.mutable_gpu_data(), 1,
num);

multi_scale_kernel << < numBlocks, threadsPerBlock >> >
(input_data->gpu_data(), scale_.gpu_data(),
input_data->mutable_gpu_data(), dim, num);
multi_scale_kernel<<<numBlocks, threadsPerBlock>>>(
input_data->gpu_data(), scale_.gpu_data(), input_data->mutable_gpu_data(),
dim, num);
}

// Launches mul_kernel to compute result[i] = a[i] * b[i] for i in [0, n).
// Uses 512 threads per block and a ceil(n / 512) grid so every element is
// covered exactly once by the kernel's grid-stride loop.
void GPUMultiFloat(const int n, const float *a, const float *b, float *result) {
  const int CUDA_THREAD = 512;
  mul_kernel<<<(n + CUDA_THREAD - 1) / CUDA_THREAD, CUDA_THREAD>>>(n, a, b,
                                                                   result);
}
// Launches set_kernel to fill result[0..n) with alpha.
// Uses 512 threads per block and a ceil(n / 512) grid.
void GPUMSetFloat(const int n, const float alpha, float *result) {
  const int CUDA_THREAD = 512;
  set_kernel<<<(n + CUDA_THREAD - 1) / CUDA_THREAD, CUDA_THREAD>>>(n, alpha,
                                                                   result);
}
// Single-precision GEMM: C = alpha * op(A) * op(B) + beta * C, with
// row-major A (MxK after op), B (KxN after op) and C (MxN).
// cuBLAS is column-major (fortran order), so the call computes
// C^T = op(B)^T * op(A)^T by swapping the operand order and dimensions;
// leading dimensions are chosen from the row-major layouts accordingly.
// Aborts (via CHECK) if cublasSgemm does not return CUBLAS_STATUS_SUCCESS.
void GPUGemmFloat(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
                  const int M, const int N, const int K, const float alpha,
                  const float *A, const float *B, const float beta, float *C) {
  // Note that cublas follows fortran order.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CHECK(cublasSgemm(CudaUtil::get_handler(), cuTransB, cuTransA, N, M, K,
                    &alpha, B, ldb, A, lda, &beta, C,
                    N) == CUBLAS_STATUS_SUCCESS);
}

} // namespace inference
Expand Down
93 changes: 32 additions & 61 deletions modules/perception/inference/utils/resize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,16 @@
#include <algorithm>

#include "cyber/common/log.h"
#include "modules/perception/inference/utils/util.h"
#include "modules/perception/inference/utils/cuda_util.h"
#include "modules/perception/inference/utils/util.h"

namespace apollo {
namespace perception {
namespace inference {
__global__ void resize_linear_kernel(const unsigned char *src,
float *dst,
int channel,
int height,
int width,
int stepwidth,
int dst_height,
int dst_width,
float fx,
float fy) {
__global__ void resize_linear_kernel(const unsigned char *src, float *dst,
int channel, int height, int width,
int stepwidth, int dst_height,
int dst_width, float fx, float fy) {
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
if (dst_x < dst_width && dst_y < dst_height) {
Expand Down Expand Up @@ -85,22 +79,13 @@ int divup(int a, int b) {
}
}

template<typename T>
__global__ void resize_linear_kernel_mean(const unsigned char *src,
float *dst,
int channel,
int height,
int width,
int stepwidth,
int dst_height,
int dst_width,
float fx,
float fy,
T mean_b,
T mean_g,
T mean_r,
bool channel_axis,
float scale) {
template <typename T>
__global__ void resize_linear_kernel_mean(const unsigned char *src, float *dst,
int channel, int height, int width,
int stepwidth, int dst_height,
int dst_width, float fx, float fy,
T mean_b, T mean_g, T mean_r,
bool channel_axis, float scale) {
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
if (dst_x < dst_width && dst_y < dst_height) {
Expand Down Expand Up @@ -159,9 +144,8 @@ __global__ void resize_linear_kernel_mean(const unsigned char *src,
}

bool ResizeGPU(const base::Image8U &src,
std::shared_ptr<apollo::perception::base::Blob<float> > dst,
int stepwidth,
int start_axis) {
std::shared_ptr<apollo::perception::base::Blob<float>> dst,
int stepwidth, int start_axis) {
int width = dst->shape(2);
int height = dst->shape(1);
int channel = dst->shape(3);
Expand All @@ -179,22 +163,16 @@ bool ResizeGPU(const base::Image8U &src,

const dim3 grid(divup(width, block.x), divup(height, block.y));

resize_linear_kernel << < grid, block >> >
(src.gpu_data(), dst->mutable_gpu_data(),
origin_channel, origin_height, origin_width,
stepwidth, height, width, fx, fy);
resize_linear_kernel<<<grid, block>>>(
src.gpu_data(), dst->mutable_gpu_data(), origin_channel, origin_height,
origin_width, stepwidth, height, width, fx, fy);
return true;
}

bool ResizeGPU(const apollo::perception::base::Blob<uint8_t> &src_gpu,
std::shared_ptr<apollo::perception::base::Blob<float> > dst,
int stepwidth,
int start_axis,
int mean_b,
int mean_g,
int mean_r,
bool channel_axis,
float scale) {
std::shared_ptr<apollo::perception::base::Blob<float>> dst,
int stepwidth, int start_axis, int mean_b, int mean_g,
int mean_r, bool channel_axis, float scale) {
int width = dst->shape(2);
int height = dst->shape(1);
int channel = dst->shape(3);
Expand Down Expand Up @@ -223,24 +201,18 @@ bool ResizeGPU(const apollo::perception::base::Blob<uint8_t> &src_gpu,
const dim3 block(32, 8);
const dim3 grid(divup(width, block.x), divup(height, block.y));

resize_linear_kernel_mean << < grid, block >> >
((const unsigned char *) src_gpu.gpu_data(),
dst->mutable_gpu_data() + dst->offset(start_axis),
origin_channel, origin_height, origin_width,
stepwidth, height, width, fx, fy, mean_b, mean_g, mean_r,
channel_axis, scale);
resize_linear_kernel_mean<<<grid, block>>>(
(const unsigned char *)src_gpu.gpu_data(),
dst->mutable_gpu_data() + dst->offset(start_axis), origin_channel,
origin_height, origin_width, stepwidth, height, width, fx, fy, mean_b,
mean_g, mean_r, channel_axis, scale);
return true;
}

bool ResizeGPU(const base::Image8U &src,
std::shared_ptr<apollo::perception::base::Blob<float> > dst,
int stepwidth,
int start_axis,
float mean_b,
float mean_g,
float mean_r,
bool channel_axis,
float scale) {
std::shared_ptr<apollo::perception::base::Blob<float>> dst,
int stepwidth, int start_axis, float mean_b, float mean_g,
float mean_r, bool channel_axis, float scale) {
int width = dst->shape(2);
int height = dst->shape(1);
int channel = dst->shape(3);
Expand Down Expand Up @@ -269,11 +241,10 @@ bool ResizeGPU(const base::Image8U &src,
const dim3 block(32, 8);
const dim3 grid(divup(width, block.x), divup(height, block.y));

resize_linear_kernel_mean << < grid, block >> >
(src.gpu_data(), dst->mutable_gpu_data() + dst->offset(start_axis),
origin_channel, origin_height, origin_width,
stepwidth, height, width, fx, fy, mean_b, mean_g, mean_r,
channel_axis, scale);
resize_linear_kernel_mean<<<grid, block>>>(
src.gpu_data(), dst->mutable_gpu_data() + dst->offset(start_axis),
origin_channel, origin_height, origin_width, stepwidth, height, width, fx,
fy, mean_b, mean_g, mean_r, channel_axis, scale);
return true;
}

Expand Down
34 changes: 13 additions & 21 deletions modules/perception/inference/utils/util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,10 @@
namespace apollo {
namespace perception {
namespace inference {
__global__ void resize_linear_kernel(const unsigned char *src,
float *dst,
int channel,
int height,
int width,
int stepwidth,
int dst_height,
int dst_width,
float fx,
float fy) {
__global__ void resize_linear_kernel(const unsigned char *src, float *dst,
int channel, int height, int width,
int stepwidth, int dst_height,
int dst_width, float fx, float fy) {
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
if (dst_x < dst_width && dst_y < dst_height) {
Expand Down Expand Up @@ -83,13 +77,11 @@ int divup(int a, int b) {
}
}

bool resize(int origin_channel,
int origin_height,
int origin_width,
int stepwidth,
std::shared_ptr <apollo::perception::base::Blob<float>> dst,
std::shared_ptr <apollo::perception::base::SyncedMemory> src_gpu,
int start_axis) {
bool resize(int origin_channel, int origin_height, int origin_width,
int stepwidth,
std::shared_ptr<apollo::perception::base::Blob<float>> dst,
std::shared_ptr<apollo::perception::base::SyncedMemory> src_gpu,
int start_axis) {
int width = dst->shape(2);
int height = dst->shape(1);
int channel = dst->shape(3);
Expand All @@ -108,10 +100,10 @@ bool resize(int origin_channel,

const dim3 grid(divup(width, block.x), divup(height, block.y));

resize_linear_kernel << < grid, block >>
> ((const unsigned char *) src_gpu->gpu_data(), dst->mutable_gpu_data(),
origin_channel, origin_height, origin_width,
stepwidth, height, width, fx, fy);
resize_linear_kernel<<<grid, block>>>(
(const unsigned char *)src_gpu->gpu_data(), dst->mutable_gpu_data(),
origin_channel, origin_height, origin_width, stepwidth, height, width, fx,
fy);
return true;
}
} // namespace inference
Expand Down

0 comments on commit fef9a8e

Please sign in to comment.