Fixed OpenMP bugs for XNOR
AlexeyAB committed Sep 12, 2018
1 parent c0e01fd commit ca43bbd
Showing 3 changed files with 224 additions and 166 deletions.
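A note on the recurring bug fixed here: with #pragma omp parallel for, only the index variable of the parallelized loop is made private automatically; any other counters declared before the pragma are shared by default, so threads race on them and corrupt each other's indexing. The fix applied throughout this commit is to declare the inner counters inside the loop body, which makes them private per thread. A minimal sketch of the bug and the fix (buggy_row_sums/fixed_row_sums are illustrative helpers, not from this repository):

// BUGGY: j is declared before the pragma, so it is shared by default.
// All threads increment the same j, producing garbage row sums.
void buggy_row_sums(const float *a, float *out, int M, int N) {
    int i, j;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        float s = 0;
        for (j = 0; j < N; ++j) s += a[i*N + j];   // data race on j
        out[i] = s;
    }
}

// FIXED: j declared inside the parallel loop body is private to each thread.
void fixed_row_sums(const float *a, float *out, int M, int N) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        int j;
        float s = 0;
        for (j = 0; j < N; ++j) s += a[i*N + j];
        out[i] = s;
    }
}

An equivalent fix is to keep the outer declarations and add private(j) to the pragma; moving the declaration inside the loop is simply harder to get wrong.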
69 changes: 19 additions & 50 deletions src/convolutional_kernels.cu
@@ -141,70 +141,39 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
size_t t_intput_size = new_ldb * n;
size_t t_bit_input_size = t_intput_size / 8;// +1;

/*
int i = 0;
im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
//cudaDeviceSynchronize();
{
int i = 0;
im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
//cudaDeviceSynchronize();

// should be optimized
float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
//cudaDeviceSynchronize();
// should be optimized
float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
//cudaDeviceSynchronize();

//im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);
//im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
//cudaDeviceSynchronize();
transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
//cudaDeviceSynchronize();

// should be optimized
gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
(unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
//cudaDeviceSynchronize();
//check_error(status);
*/
// should be optimized
gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
(unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
//cudaDeviceSynchronize();
//check_error(status);
}

{
//

/*
float *input_cpu = (float *)calloc(input_size, sizeof(float));
status = cudaMemcpy(input_cpu, state.input, input_size* sizeof(float), cudaMemcpyDeviceToHost);
check_error(status);
// swaped(binary_weights <-> l.weights)
convolve_cpu(input_cpu, l.weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad); // CPU
status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice);
check_error(status);
free(input_cpu);
*/

/*
float *input_cpu = (float *)calloc(input_size, sizeof(float));
float *input_bin_cpu = (float *)calloc(input_size, sizeof(char));
//float *weights_bin_cpu = (float *)calloc(l.n*l.c*l.size*l.size, sizeof(char));
status = cudaMemcpy(input_cpu, state.input, input_size * sizeof(float), cudaMemcpyDeviceToHost);
check_error(status);
float_to_bit(input_cpu, (unsigned char *)input_bin_cpu, input_size);
//float_to_bit(l.weights, (unsigned char *)weights_bin_cpu, l.n*l.c*l.size*l.size); // l.align_bit_weights
convolve_bin_cpu(input_bin_cpu, (float *)l.align_bit_weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); // CPU
status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice);
check_error(status);
//free(weights_bin_cpu);
free(input_bin_cpu);
free(input_cpu);
*/

/*
{
float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size);
convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu);

//convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad);
//cudaDeviceSynchronize();
//check_error(status);


}
*/

add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
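The block activated above is the binarized (XNOR) convolution path: im2col_align_ongpu lays out input patches, float_to_bit_gpu packs each float's sign into a single bit, transpose_bin_gpu reorders the packed matrix, and gemm_nn_custom_bin_mean_transposed_gpu multiplies the two bit matrices. The core of such a binary GEMM, sketched on the CPU under the usual XNOR-net convention (an illustration only, not the kernel's actual code):

#include <stdint.h>

// Dot product of two sign-binarized vectors packed 64 values per word.
// Matching bits contribute +1, mismatches -1; the count is rescaled by the
// filter's mean absolute weight. Assumes n_bits == 64 * n_words (no padding).
static float binary_dot(const uint64_t *a, const uint64_t *b,
                        int n_words, int n_bits, float mean_val) {
    int matches = 0, i;
    for (i = 0; i < n_words; ++i)
        matches += __builtin_popcountll(~(a[i] ^ b[i]));  // XNOR, then popcount
    return (2 * matches - n_bits) * mean_val;             // matches - mismatches, scaled
}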
110 changes: 60 additions & 50 deletions src/gemm.c
@@ -204,10 +204,11 @@ void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
{
int *count_arr = calloc(M*N, sizeof(int));
int i, j, k, h;
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
const char a_bit = get_bit(A, i*lda + k);
uint64_t a_bit64 = fill_bit_int64(a_bit);
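In this kernel, a_bit is a single weight bit broadcast across a 64-bit word so that one weight can be XNORed against 64 packed activations at once. Plausible shapes for the two helpers, shown only to make the hunk self-contained (the repository's definitions may differ):

#include <stdint.h>

// Read bit `index` from a byte-packed bit array.
static inline unsigned char get_bit(const unsigned char *src, size_t index) {
    return (src[index / 8] >> (index % 8)) & 1;
}

// Broadcast a 0/1 bit to all 64 bit positions of a word.
static inline uint64_t fill_bit_int64(char src) {
    return src ? ~(uint64_t)0 : (uint64_t)0;
}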
@@ -271,10 +272,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int i, j, k, h;
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
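mean_arr[i] is the XNOR-net scaling factor for filter i: the mean absolute value of the filter's real-valued weights, by which the bit-level result is multiplied to approximate the full-precision convolution. It could be computed like this (an illustrative sketch, not the repository's code; note the counter k declared inside the parallel loop, per the fix in this commit):

#include <math.h>

// mean_arr[i] = (1/K) * sum_k |weights[i*K + k]|, one scale per output filter.
void compute_mean_abs(const float *weights, float *mean_arr, int M, int K) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        int k;                        // private per thread
        float s = 0;
        for (k = 0; k < K; ++k) s += fabsf(weights[i*K + k]);
        mean_arr[i] = s / K;
    }
}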
@@ -365,7 +367,7 @@ void transpose_bin(char *A, char *B, const int n, const int m,
const int lda, const int ldb, const int block_size)
{
int i;
#pragma omp parallel for
#pragma omp parallel for
for (i = 0; i < n; i += 8) {
int j;
for (j = 0; j < m - 8; j += 8) {
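transpose_bin moves the packed bits in 8x8 tiles, the standard approach for bit matrices: gather an 8x8 tile into one 64-bit word, transpose it with three shift/mask exchange steps, and scatter it back. The classic 8x8 bit transpose such a block step can build on (after Hacker's Delight; assumed here for illustration only):

#include <stdint.h>

// Transpose an 8x8 bit matrix packed row-major into a uint64_t
// (bit (r,c) at position 8*r + c). Each step swaps bit pairs at
// distance 7, 14, then 28 (a "delta swap" per index-bit pair).
static uint64_t transpose8(uint64_t x) {
    uint64_t t;
    t = (x ^ (x >> 7))  & 0x00AA00AA00AA00AAULL; x ^= t ^ (t << 7);
    t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCULL; x ^= t ^ (t << 14);
    t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0ULL; x ^= t ^ (t << 28);
    return x;
}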
@@ -617,14 +619,14 @@ void gemm_nn(int M, int N, int K, float ALPHA,
void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
float *weights, float *input, float *output)
{
int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i, f, j;
const int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
const int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1

int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < n; ++fil) {
//int i, f, j;
int chan, y, x, f_y, f_x;
// channel index
for (chan = 0; chan < c; ++chan)
@@ -665,9 +667,9 @@ void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
float *weights, float *input, float *output, float *mean)
{
int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i, f, j;
const int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
const int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i;

#if defined(_OPENMP)
static int max_num_threads = 0;
@@ -684,9 +686,9 @@ void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
*((__m256*)&weights[i]) = _mm256_and_ps(*((__m256*)&weights[i]), _mm256_castsi256_ps(all256_sing1));
}

for (i = 0; i < w*h*c; i += 8) {
//for (i = 0; i < w*h*c; i += 8) {
//*((__m256*)&input[i]) = _mm256_and_ps(*((__m256*)&input[i]), _mm256_castsi256_ps(all256_sing1));
}
//}


//__m256i all256_last_zero = _mm256_set1_epi32(0xFFFFFFFF);
@@ -704,7 +706,7 @@

int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < n; ++fil) {
int chan, y, x, f_y, f_x;
float cur_mean = fabs(mean[fil]);
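The AVX preamble in this function masks every weight down to its sign bit (all256_sing1 presumably holds the sign-bit mask 0x80000000 in each lane), so a weight can later multiply an activation by ±1 with a single XOR instead of a floating-point multiply. The scalar version of the trick, assuming IEEE-754 floats (mul_by_sign is an illustrative name, not repository code):

#include <stdint.h>
#include <string.h>

// Multiply x by the sign of w: flip x's sign bit iff w is negative.
static float mul_by_sign(float x, float w) {
    uint32_t xi, wi;
    memcpy(&xi, &x, sizeof xi);      // type-pun safely via memcpy
    memcpy(&wi, &w, sizeof wi);
    xi ^= (wi & 0x80000000u);
    memcpy(&x, &xi, sizeof x);
    return x;
}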
@@ -914,16 +916,17 @@ void im2col_cpu_custom_transpose(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int ldb_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;
int c;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1)
{
#pragma omp parallel for
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1005,6 +1008,7 @@ void im2col_cpu_custom_transpose(float* data_im,
else {
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
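All the im2col variants in this file decompose the flattened row index c over channels * ksize * ksize the same way, which is why c is the only counter that must live outside the parallel loop (index math shown for reference):

// For row c of the im2col matrix, 0 <= c < channels*ksize*ksize:
int w_offset = c % ksize;             // column inside the kernel window
int h_offset = (c / ksize) % ksize;   // row inside the kernel window
int c_im     = c / ksize / ksize;     // source image channel
// Output pixel (h, w) of row c then reads input pixel
// (h*stride + h_offset - pad, w*stride + w_offset - pad) in channel c_im.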
@@ -1029,17 +1033,17 @@ void im2col_cpu_custom(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col)
{

int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
{
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1121,10 +1125,10 @@ void im2col_cpu_custom_align(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int bit_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1133,6 +1137,7 @@

#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1218,10 +1223,10 @@ void im2col_cpu_custom_bin(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int bit_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1233,6 +1238,7 @@

#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1451,8 +1457,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
int pad, int stride, int batch)
{

int w_offset = -pad / 2;
int h_offset = -pad / 2;
const int w_offset = -pad / 2;
const int h_offset = -pad / 2;
int b, k;

for (b = 0; b < batch; ++b) {
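forward_maxpool_layer_avx centers each pooling window with w_offset = h_offset = -pad/2 and skips taps that fall outside the image. One output cell reduces to the following scalar logic (an illustrative helper mirroring the loop structure, not the AVX path itself):

#include <float.h>

// Max over a ksize x ksize window of an h x w single-channel plane,
// for output cell (i, j); out-of-bounds taps are ignored.
static float pool_one_cell(const float *src, int w, int h, int ksize, int stride,
                           int w_offset, int h_offset, int i, int j) {
    float max = -FLT_MAX;
    int n, m;
    for (n = 0; n < ksize; ++n) {
        for (m = 0; m < ksize; ++m) {
            int cur_h = h_offset + i*stride + n;
            int cur_w = w_offset + j*stride + m;
            if (cur_h >= 0 && cur_h < h && cur_w >= 0 && cur_w < w) {
                float val = src[cur_h*w + cur_w];
                max = (val > max) ? val : max;
            }
        }
    }
    return max;
}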
@@ -1563,13 +1569,13 @@ void gemm_nn(int M, int N, int K, float ALPHA,
void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
float *weights, float *input, float *output, float *mean)
{
int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
int i, f, j;
const int out_h = (h + 2 * pad - ksize) / stride + 1; // output_height=input_height for stride=1 and pad=1
const int out_w = (w + 2 * pad - ksize) / stride + 1; // output_width=input_width for stride=1 and pad=1
//int i, f, j;

int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < n; ++fil) {
int chan, y, x, f_y, f_x;
// channel index
@@ -1613,10 +1619,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int i, j, k, h;
int i;

#pragma omp parallel for
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k;
float mean_val = mean_arr[i];

for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
@@ -1660,16 +1667,17 @@ void im2col_cpu_custom(float* data_im,
im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
return;

int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1)
{
#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1750,10 +1758,10 @@ void im2col_cpu_custom_bin(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col, int bit_align)
{
int c, h, w;
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
int c;
const int height_col = (height + 2 * pad - ksize) / stride + 1;
const int width_col = (width + 2 * pad - ksize) / stride + 1;
const int channels_col = channels * ksize * ksize;

// optimized version
if (height_col == height && width_col == width && stride == 1 && pad == 1)
@@ -1762,6 +1770,7 @@

#pragma omp parallel for
for (c = 0; c < channels_col; ++c) {
int h, w;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
@@ -1906,9 +1915,10 @@ void float_to_bit(float *src, unsigned char *dst, size_t size)

static inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size)
{
int i, j;
int i;
//#pragma omp parallel for
for (i = 0; i<block_size; i++) {
int j;
for (j = 0; j<block_size; j++) {
B[j*ldb + i] = A[i*lda + j];
}
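transpose_scalar_block is the tile kernel of a cache-blocked float transpose: the caller walks the matrix in block_size x block_size tiles so each tile stays resident in cache while it is flipped. A plausible outer driver (illustration only; transpose_blocked is not a function in this repository):

// Transpose an n x m float matrix A (leading dimension lda) into B (leading
// dimension ldb), tile by tile. Assumes n and m are multiples of block_size.
void transpose_blocked(float *A, float *B, int n, int m,
                       int lda, int ldb, int block_size) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < n; i += block_size) {
        int j;   // private inner counter, per the fix in this commit
        for (j = 0; j < m; j += block_size) {
            transpose_scalar_block(&A[i*lda + j], &B[j*ldb + i],
                                   lda, ldb, block_size);
        }
    }
}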
@@ -1938,8 +1948,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
int pad, int stride, int batch)
{
int b, k;
int w_offset = -pad / 2;
int h_offset = -pad / 2;
const int w_offset = -pad / 2;
const int h_offset = -pad / 2;

for (b = 0; b < batch; ++b) {
#pragma omp parallel for