Fixed OpenMP bugs for XNOR
AlexeyAB committed Sep 12, 2018
1 parent c0e01fd commit ca43bbd
Showing 3 changed files with 224 additions and 166 deletions.
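A note on the recurring bug fixed here: with #pragma omp parallel for, only the index variable of the parallelized loop is made private automatically; any other counters declared before the pragma are shared by default, so threads race on them and corrupt each other's indexing. The fix applied throughout this commit is to declare the inner counters inside the loop body, which makes them private per thread. A minimal sketch of the bug and the fix (buggy_row_sums/fixed_row_sums are illustrative helpers, not from this repository):

// BUGGY: j is declared before the pragma, so it is shared by default.
// All threads increment the same j, producing garbage row sums.
void buggy_row_sums(const float *a, float *out, int M, int N) {
    int i, j;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        float s = 0;
        for (j = 0; j < N; ++j) s += a[i*N + j];   // data race on j
        out[i] = s;
    }
}

// FIXED: j declared inside the parallel loop body is private to each thread.
void fixed_row_sums(const float *a, float *out, int M, int N) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        int j;
        float s = 0;
        for (j = 0; j < N; ++j) s += a[i*N + j];
        out[i] = s;
    }
}

An equivalent fix is to keep the outer declarations and add private(j) to the pragma; moving the declaration inside the loop is simply harder to get wrong.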
69 changes: 19 additions & 50 deletions src/convolutional_kernels.cu
@@ -141,70 +141,39 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
size_t t_intput_size = new_ldb * n;
size_t t_bit_input_size = t_intput_size / 8;// +1;

/*
int i = 0;
im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
//cudaDeviceSynchronize();
{
int i = 0;
im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
//cudaDeviceSynchronize();

// should be optimized
float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
//cudaDeviceSynchronize();
// should be optimized
float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
//cudaDeviceSynchronize();

//im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);
//im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
//cudaDeviceSynchronize();
transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
//cudaDeviceSynchronize();

// should be optimized
gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
(unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
//cudaDeviceSynchronize();
//check_error(status);
*/
// should be optimized
gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
(unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
//cudaDeviceSynchronize();
//check_error(status);
}

{
//

/*
float *input_cpu = (float *)calloc(input_size, sizeof(float));
status = cudaMemcpy(input_cpu, state.input, input_size* sizeof(float), cudaMemcpyDeviceToHost);
check_error(status);
// swaped(binary_weights <-> l.weights)
convolve_cpu(input_cpu, l.weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad); // CPU
status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice);
check_error(status);
free(input_cpu);
*/

/*
float *input_cpu = (float *)calloc(input_size, sizeof(float));
float *input_bin_cpu = (float *)calloc(input_size, sizeof(char));
//float *weights_bin_cpu = (float *)calloc(l.n*l.c*l.size*l.size, sizeof(char));
status = cudaMemcpy(input_cpu, state.input, input_size * sizeof(float), cudaMemcpyDeviceToHost);
check_error(status);
float_to_bit(input_cpu, (unsigned char *)input_bin_cpu, input_size);
//float_to_bit(l.weights, (unsigned char *)weights_bin_cpu, l.n*l.c*l.size*l.size); // l.align_bit_weights
convolve_bin_cpu(input_bin_cpu, (float *)l.align_bit_weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); // CPU
status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice);
check_error(status);
//free(weights_bin_cpu);
free(input_bin_cpu);
free(input_cpu);
*/

/*
{
float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size);
convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu);

//convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad);
//cudaDeviceSynchronize();
//check_error(status);


}
*/

add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
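The block activated above is the binarized (XNOR) convolution path: im2col_align_ongpu lays out input patches, float_to_bit_gpu packs each float's sign into a single bit, transpose_bin_gpu reorders the packed matrix, and gemm_nn_custom_bin_mean_transposed_gpu multiplies the two bit matrices. The core of such a binary GEMM, sketched on the CPU under the usual XNOR-net convention (an illustration only, not the kernel's actual code):

#include <stdint.h>

// Dot product of two sign-binarized vectors packed 64 values per word.
// Matching bits contribute +1, mismatches -1; the count is rescaled by the
// filter's mean absolute weight. Assumes n_bits == 64 * n_words (no padding).
static float binary_dot(const uint64_t *a, const uint64_t *b,
                        int n_words, int n_bits, float mean_val) {
    int matches = 0, i;
    for (i = 0; i < n_words; ++i)
        matches += __builtin_popcountll(~(a[i] ^ b[i]));  // XNOR, then popcount
    return (2 * matches - n_bits) * mean_val;             // matches - mismatches, scaled
}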
110 changes: 60 additions & 50 deletions src/gemm.c
@@ -204,10 +204,11 @@ void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
{
int *count_arr = calloc(M*N, sizeof(int));
int i, j, k, h;
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
const char a_bit = get_bit(A, i*lda + k);
uint64_t a_bit64 = fill_bit_int64(a_bit);
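In this kernel, a_bit is a single weight bit broadcast across a 64-bit word so that one weight can be XNORed against 64 packed activations at once. Plausible shapes for the two helpers, shown only to make the hunk self-contained (the repository's definitions may differ):

#include <stdint.h>

// Read bit `index` from a byte-packed bit array.
static inline unsigned char get_bit(const unsigned char *src, size_t index) {
    return (src[index / 8] >> (index % 8)) & 1;
}

// Broadcast a 0/1 bit to all 64 bit positions of a word.
static inline uint64_t fill_bit_int64(char src) {
    return src ? ~(uint64_t)0 : (uint64_t)0;
}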
@@ -271,10 +272,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int i, j, k, h;
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
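mean_arr[i] is the XNOR-net scaling factor for filter i: the mean absolute value of the filter's real-valued weights, by which the bit-level result is multiplied to approximate the full-precision convolution. It could be computed like this (an illustrative sketch, not the repository's code; note the counter k declared inside the parallel loop, per the fix in this commit):

#include <math.h>

// mean_arr[i] = (1/K) * sum_k |weights[i*K + k]|, one scale per output filter.
void compute_mean_abs(const float *weights, float *mean_arr, int M, int K) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        int k;                        // private per thread
        float s = 0;
        for (k = 0; k < K; ++k) s += fabsf(weights[i*K + k]);
        mean_arr[i] = s / K;
    }
}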
@@ -365,7 +367,7 @@ void transpose_bin(char *A, char *B, const int n, const int m,
const int lda, const int ldb, const int block_size)
{
int i;
#pragma omp parallel for
#pragma omp parallel for
for (i = 0; i < n; i += 8) {
int j;
for (j = 0; j < m - 8; j += 8) {
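transpose_bin moves the packed bits in 8x8 tiles, the standard approach for bit matrices: gather an 8x8 tile into one 64-bit word, transpose it with three shift/mask exchange steps, and scatter it back. The classic 8x8 bit transpose such a block step can build on (after Hacker's Delight; assumed here for illustration only):

#include <stdint.h>

// Transpose an 8x8 bit matrix packed row-major into a uint64_t
// (bit (r,c) at position 8*r + c). Each step swaps bit pairs at
// distance 7, 14, then 28 (a "delta swap" per index-bit pair).
static uint64_t transpose8(uint64_t x) {
    uint64_t t;
    t = (x ^ (x >> 7))  & 0x00AA00AA00AA00AAULL; x ^= t ^ (t << 7);
    t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCULL; x ^= t ^ (t << 14);
    t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0ULL; x ^= t ^ (t << 28);
    return x;
}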
@@ -617,14 +619,14 @@ void gemm_nn(int M, int N, int K, float ALPHA,
void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
float *weights, float *input, float *output)
{
int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i, f, j;
const int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
const int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1

int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < n; ++fil) {
//int i, f, j;
int chan, y, x, f_y, f_x;
// channel index
for (chan = 0; chan < c; ++chan)
@@ -665,9 +667,9 @@ void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
float *weights, float *input, float *output, float *mean)
{
int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i, f, j;
const int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
const int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i;

#if defined(_OPENMP)
static int max_num_threads = 0;
@@ -684,9 +686,9 @@ void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
*((__m256*)&weights[i]) = _mm256_and_ps(*((__m256*)&weights[i]), _mm256_castsi256_ps(all256_sing1));
}

for (i = 0; i < w*h*c; i += 8) {
//for (i = 0; i < w*h*c; i += 8) {
//*((__m256*)&input[i]) = _mm256_and_ps(*((__m256*)&input[i]), _mm256_castsi256_ps(all256_sing1));
}
//}


//__m256i all256_last_zero = _mm256_set1_epi32(0xFFFFFFFF);
@@ -704,7 +706,7 @@

int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < n; ++fil) {
int chan, y, x, f_y, f_x;
float cur_mean = fabs(mean[fil]);
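The AVX preamble in this function masks every weight down to its sign bit (all256_sing1 presumably holds the sign-bit mask 0x80000000 in each lane), so a weight can later multiply an activation by ±1 with a single XOR instead of a floating-point multiply. The scalar version of the trick, assuming IEEE-754 floats (mul_by_sign is an illustrative name, not repository code):

#include <stdint.h>
#include <string.h>

// Multiply x by the sign of w: flip x's sign bit iff w is negative.
static float mul_by_sign(float x, float w) {
    uint32_t xi, wi;
    memcpy(&xi, &x, sizeof xi);      // type-pun safely via memcpy
    memcpy(&wi, &w, sizeof wi);
    xi ^= (wi & 0x80000000u);
    memcpy(&x, &xi, sizeof x);
    return x;
}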
@@ -914,16 +916,17 @@ void im2col_cpu_custom_transpose(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int ldb_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;
int c;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1)
{
#pragma omp parallel for
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1005,6 +1008,7 @@ void im2col_cpu_custom_transpose(float* data_im,
else {
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
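All the im2col variants in this file decompose the flattened row index c over channels * ksize * ksize the same way, which is why c is the only counter that must live outside the parallel loop (index math shown for reference):

// For row c of the im2col matrix, 0 <= c < channels*ksize*ksize:
int w_offset = c % ksize;             // column inside the kernel window
int h_offset = (c / ksize) % ksize;   // row inside the kernel window
int c_im     = c / ksize / ksize;     // source image channel
// Output pixel (h, w) of row c then reads input pixel
// (h*stride + h_offset - pad, w*stride + w_offset - pad) in channel c_im.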
@@ -1029,17 +1033,17 @@ void im2col_cpu_custom(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col)
{

int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
{
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1121,10 +1125,10 @@ void im2col_cpu_custom_align(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int bit_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1133,6 +1137,7 @@

#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1218,10 +1223,10 @@ void im2col_cpu_custom_bin(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int bit_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1233,6 +1238,7 @@

#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1451,8 +1457,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
int pad, int stride, int batch)
{

int w_offset = -pad / 2;
int h_offset = -pad / 2;
const int w_offset = -pad / 2;
const int h_offset = -pad / 2;
int b, k;

for (b = 0; b < batch; ++b) {
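forward_maxpool_layer_avx centers each pooling window with w_offset = h_offset = -pad/2 and skips taps that fall outside the image. One output cell reduces to the following scalar logic (an illustrative helper mirroring the loop structure, not the AVX path itself):

#include <float.h>

// Max over a ksize x ksize window of an h x w single-channel plane,
// for output cell (i, j); out-of-bounds taps are ignored.
static float pool_one_cell(const float *src, int w, int h, int ksize, int stride,
                           int w_offset, int h_offset, int i, int j) {
    float max = -FLT_MAX;
    int n, m;
    for (n = 0; n < ksize; ++n) {
        for (m = 0; m < ksize; ++m) {
            int cur_h = h_offset + i*stride + n;
            int cur_w = w_offset + j*stride + m;
            if (cur_h >= 0 && cur_h < h && cur_w >= 0 && cur_w < w) {
                float val = src[cur_h*w + cur_w];
                max = (val > max) ? val : max;
            }
        }
    }
    return max;
}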
@@ -1563,13 +1569,13 @@ void gemm_nn(int M, int N, int K, float ALPHA,
void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
float *weights, float *input, float *output, float *mean)
{
int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i, f, j;
const int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
const int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
//int i, f, j;

int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < n; ++fil) {
int chan, y, x, f_y, f_x;
// channel index
@@ -1613,10 +1619,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int i, j, k, h;
int i;

#pragma omp parallel for
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k;
float mean_val = mean_arr[i];

for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
@@ -1660,16 +1667,17 @@ void im2col_cpu_custom(float* data_im,
im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
return;

int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1)
{
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1750,10 +1758,10 @@ void im2col_cpu_custom_bin(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int bit_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1)
@@ -1762,6 +1770,7 @@

#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1906,9 +1915,10 @@ void float_to_bit(float *src, unsigned char *dst, size_t size)

static inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size)
{
int i, j;
int i;
//#pragma omp parallel for
for (i = 0; i<block_size; i++) {
int j;
for (j = 0; j<block_size; j++) {
B[j*ldb + i] = A[i*lda + j];
}
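transpose_scalar_block is the tile kernel of a cache-blocked float transpose: the caller walks the matrix in block_size x block_size tiles so each tile stays resident in cache while it is flipped. A plausible outer driver (illustration only; transpose_blocked is not a function in this repository):

// Transpose an n x m float matrix A (leading dimension lda) into B (leading
// dimension ldb), tile by tile. Assumes n and m are multiples of block_size.
void transpose_blocked(float *A, float *B, int n, int m,
                       int lda, int ldb, int block_size) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < n; i += block_size) {
        int j;   // private inner counter, per the fix in this commit
        for (j = 0; j < m; j += block_size) {
            transpose_scalar_block(&A[i*lda + j], &B[j*ldb + i],
                                   lda, ldb, block_size);
        }
    }
}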
@@ -1938,8 +1948,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
int pad, int stride, int batch)
{
int b, k;
int w_offset = -pad / 2;
int h_offset = -pad / 2;
const int w_offset = -pad / 2;
const int h_offset = -pad / 2;

for (b = 0; b < batch; ++b) {
#pragma omp parallel for