Improve the BiasGrad for NCHW by using less shared memory and improving memory efficiency.

With GoogleNet V1, time spent in BiasGrad in ms:

              Before    After     Improvement
GoogleNet V1  19.70     13.14     49.93%
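(The improvement figure appears to be reported relative to the new time: (19.70 - 13.14) / 13.14 ≈ 49.9%, i.e. roughly a 1.5x speedup of this kernel.)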
Change: 116901889
zheng-xq authored and tensorflower-gardener committed Mar 10, 2016
1 parent fd839dc commit 7cc6ba7
tensorflow/core/kernels/bias_op_gpu.cu.cc: 85 changes (50 additions, 35 deletions)
@@ -118,31 +118,51 @@ __global__ void BiasGradNHWC_SharedAtomics(int32 nthreads,
 }
 
 template <typename T>
-__global__ void BiasGradNCHW_SharedAtomics(int32 nthreads,
-                                           const T* output_backprop,
-                                           T* bias_backprop, int32 bias_size,
-                                           int32 image_size,
-                                           int32 shared_replicas) {
-  T* s_data = reinterpret_cast<T*>(s_buf);
-  int32 s_data_size = bias_size * shared_replicas;
+__global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
+                                           T* bias_backprop, int32 batch,
+                                           int32 bias_size, int32 image_size,
+                                           int group_size) {
+  // Initialize the shared memory.
+  __shared__ T s_data[32];
+  int32 s_data_size = sizeof(s_data) / sizeof(T);
   for (int32 index = threadIdx.x; index < s_data_size; index += blockDim.x) {
     s_data[index] = 0;
   }
   __syncthreads();
 
-  for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int32 index2 = index / image_size;
-    int32 bias_slot_index = index2 % bias_size;
-    int32 bias_slot_offset = index % shared_replicas;
-    int32 bias_offset = bias_slot_index * shared_replicas + bias_slot_offset;
-    CudaAtomicAdd(s_data + bias_offset, ldg(output_backprop + index));
+  // Accumulate all the values within this thread. They all have the same bias
+  // index.
+  int32 bias_index = blockIdx.x % bias_size;
+  int32 group_index = blockIdx.x / bias_size;
+  int32 total_count = batch * image_size;
+  T sum = 0;
+  for (int32 index = group_index * blockDim.x + threadIdx.x;
+       index < total_count; index += blockDim.x * group_size) {
+    int32 image_offset = index % image_size;
+    int32 batch = index / image_size;
+    T val = ldg(output_backprop +
+                (batch * bias_size + bias_index) * image_size + image_offset);
+    sum += val;
   }
+
+  // Write the accumulated sum in this thread to the shared memory. Each thread
+  // shifts their write location to avoid bank conflict.
+  int bias_offset = threadIdx.x % 32;
+  CudaAtomicAdd(s_data + bias_offset, sum);
   __syncthreads();
 
-  for (int32 index = threadIdx.x; index < s_data_size; index += blockDim.x) {
-    int bias_slot_index = index / shared_replicas;
-    CudaAtomicAdd(bias_backprop + bias_slot_index, s_data[index]);
+  // Accumulate the results in the shared memory into the first element.
+  // No syncthreads is needed since this is only in the same warp.
+  int32 thread_index = threadIdx.x;
+  if (thread_index < 16) s_data[thread_index] += s_data[thread_index + 16];
+  if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
+  if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
+  if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
+  if (thread_index < 1) s_data[thread_index] += s_data[thread_index + 1];
+
+  // The first thread writes out the accumulated result to the global location.
+  if (thread_index == 0) {
+    CudaAtomicAdd(bias_backprop + bias_index, s_data[0]);
  }
 }
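For orientation, here is a minimal host-side sketch of the reduction the new kernel parallelizes, assuming the NCHW tensor is laid out row-major as [batch, bias_size, image_size]; the function name below is illustrative and not part of the commit.

// Serial reference: bias_backprop[c] accumulates output_backprop over every
// batch element and spatial position of channel c, using the same linear
// index the kernel computes: (n * bias_size + c) * image_size + i.
template <typename T>
void BiasGradNCHWReference(const T* output_backprop, T* bias_backprop,
                           int batch, int bias_size, int image_size) {
  for (int c = 0; c < bias_size; ++c) {
    T sum = 0;
    for (int n = 0; n < batch; ++n) {
      for (int i = 0; i < image_size; ++i) {
        sum += output_backprop[(n * bias_size + c) * image_size + i];
      }
    }
    bias_backprop[c] = sum;
  }
}

In the kernel itself, each block handles the single channel bias_index = blockIdx.x % bias_size, each thread deposits its partial sum into shared-memory slot threadIdx.x % 32 so the atomic adds are spread across the shared-memory banks, and the warp-level tree reduction collapses those 32 slots into s_data[0] before a single atomic add to global memory.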

@@ -154,24 +174,13 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   const int32 bias_size = channel;
   const int32 image_size = height * width;
   const int32 total_count = batch * bias_size * image_size;
+  static constexpr int32 kWarpSize = 32;
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
 
   const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
-  int32 shared_memory_size = bias_size * sizeof(T);
-  int shared_replicas = 1;
-  if (data_format == FORMAT_NCHW) {
-    // For NCHW, the reduction in the HW dimensions all go to the same locaiton,
-    // which causes a lot of bank conflicts. So having a number of them can
-    // improve the performance. But we also want to limit their usage so the
-    // warp occupancy does not decrease.
-    if (shared_memory_size <= max_shared_memory_size) {
-      // We need enough shared memory to avoid bank conflict. But not too much
-      // so that it would reduce occupancy.
-      static constexpr int kMaxSharedReplicas = 8;
-      shared_replicas = std::min(kMaxSharedReplicas,
-                                 max_shared_memory_size / shared_memory_size);
-      shared_memory_size *= shared_replicas;
-    }
+  int32 shared_memory_size = 0;
+  if (data_format == FORMAT_NHWC) {
+    shared_memory_size = bias_size * sizeof(T);
   }
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
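With this change the dynamic shared-memory request is made only for NHWC; as an illustrative size (not from the commit), bias_size = 64 float channels needs 64 * 4 = 256 bytes, while the NCHW path now passes 0 and relies entirely on the fixed 32-element s_data array declared inside the kernel.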
@@ -181,10 +190,16 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                d.stream()>>>(total_count, output_backprop, bias_backprop,
                              bias_size);
     } else {
+      // Round up the block count to multiple of bias_size.
+      int group_size = (config.block_count + bias_size - 1) / bias_size;
+      config.block_count = group_size * bias_size;
+      if (config.thread_per_block < kWarpSize) {
+        config.thread_per_block = kWarpSize;
+      }
       BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size, image_size, shared_replicas);
+          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          output_backprop, bias_backprop, batch, bias_size, image_size,
+          group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
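To make the rounding concrete (hypothetical numbers, not from the commit): if GetCudaLaunchConfig returned block_count = 100 for a layer with bias_size = 64 channels, then group_size = (100 + 63) / 64 = 2 and block_count is raised to 128, so every channel is covered by exactly two blocks, with bias_index = blockIdx.x % 64 selecting the channel and group_index = blockIdx.x / 64 selecting which of the two blocks it is.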
