im2col_kernels.cu

#include <cuda_runtime.h>
#include <curand.h>
#include <cublas_v2.h>
#include <stdint.h>

#include "im2col.h"
#include "dark_cuda.h"

#include <stdio.h>
#include <assert.h>


template<typename T1, typename T2>
__device__ inline T1 __shfl_custom(T1 val, T2 lane) {
#if CUDART_VERSION >= 9000
    return __shfl_sync(FULL_MASK, val, lane);
#else
    return __shfl(val, lane);
#endif
}

template<typename T>
__device__ inline uint32_t __ballot_custom(T val) {
#if CUDART_VERSION >= 9000
    return __ballot_sync(FULL_MASK, val);
#else
    return __ballot(val);
#endif
}


// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE

__global__ void im2col_gpu_kernel(const int n, const float* data_im,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_col) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    for(; index < n; index += blockDim.x*gridDim.x){
        int w_out = index % width_col;
        int h_index = index / width_col;
        int h_out = h_index % height_col;
        int channel_in = h_index / height_col;
        int channel_out = channel_in * ksize * ksize;
        int h_in = h_out * stride - pad;
        int w_in = w_out * stride - pad;
        float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
        const float* data_im_ptr = data_im;
        data_im_ptr += (channel_in * height + h_in) * width + w_in;
        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                int h = h_in + i;
                int w = w_in + j;

                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
                    data_im_ptr[i * width + j] : 0;

                //data_im[(channel_in * height + h_in) * width + w_in + i * width + j];
                //(*data_col_ptr) = data_im_ptr[ii * width + jj];

                data_col_ptr += height_col * width_col;
            }
        }
    }
}

void im2col_ongpu(float *im,
         int channels, int height, int width,
         int ksize, int stride, int pad, float *data_col){
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height_col * width_col;
    im2col_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK, 0, get_cuda_stream()>>>(
                num_kernels, im, height, width, ksize, pad,
                stride, height_col,
                width_col, data_col);

    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

/*
__global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
    const int height, const int width, const int ksize,
    const int pad,
    const int stride,
    const int height_col, const int width_col,
    float *data_col, const int bit_align)
{
    //__shared__ float tmp_s[1];

    int index = blockIdx.x*blockDim.x + threadIdx.x;
    for (; index < n; index += blockDim.x*gridDim.x) {
        int w_out = index % width_col;
        int h_index = index / width_col;
        int h_out = h_index % height_col;
        int channel_in = h_index / height_col;
        int channel_out = channel_in * ksize * ksize;
        int h_in = h_out * stride - pad;
        int w_in = w_out * stride - pad;
        float* data_col_ptr = data_col;
        //data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
        data_col_ptr += channel_out * bit_align + h_out * width_col + w_out;
        float* data_col_ptr_32 = data_col + (channel_out * bit_align + h_out * width_col + w_out)/32;
        const float* data_im_ptr = data_im;
        data_im_ptr += (channel_in * height + h_in) * width + w_in;
        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                int h = h_in + i;
                int w = w_in + j;

                float val = (h >= 0 && w >= 0 && h < height && w < width) ?
                    data_im_ptr[i * width + j] : 0;

                *data_col_ptr = val;
                //tmp_s[0] = val;

                //(*data_col_ptr) = (h >= 0 && w >= 0 && h < height && w < width) ?
                //    data_im_ptr[i * width + j] : 0;

                //float src_val = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0;
                //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
                //if (threadIdx.x % WARP_SIZE == 0) *((unsigned int*)data_col_ptr_32) = bit_mask;
                // use atomicOr() // *dst_ptr |= (mask << (col_index % 8));
                //data_col_ptr_32 += bit_align / 32;

                //data_col_ptr += height_col * width_col;
                data_col_ptr += bit_align;
            }
        }
    }
}
*/

// float 32
__global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
    const int height, const int width, const int ksize,
    const int pad,
    const int stride,
    const int height_col, const int width_col,
    float *data_col, const int bit_align)
{
    //__shared__ float tmp_s[1];


    int index = blockIdx.x*blockDim.x + threadIdx.x;
    for (; index < n; index += blockDim.x*gridDim.x) {
        int w_out = index % width_col;
        int h_index = index / width_col;
        int h_out = h_index % height_col;
        int channel_in = h_index / height_col;
        int channel_out = channel_in * ksize * ksize;
        int h_in = h_out * stride - pad;
        int w_in = w_out * stride - pad;
        //float* data_col_ptr = data_col;
        //float* data_col_ptr_32 = data_col + (channel_out * bit_align + h_out * width_col + w_out) / 32;
        //data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
        //data_col_ptr += channel_out * bit_align + h_out * width_col + w_out;
        float* data_col_ptr = &data_col[channel_out * bit_align + h_out * width_col + w_out];
        const float* data_im_ptr = data_im;
        data_im_ptr += (channel_in * height + h_in) * width + w_in;
        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                int h = h_in + i;
                int w = w_in + j;

                float val = (h >= 0 && w >= 0 && h < height && w < width) ?
                    data_im_ptr[i * width + j] : 0;

                int pre_out_index = index % (width_col*height_col);
                int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index;// h_out * width_col + w_out;
                data_col[out_index] = val;

                //(*data_col_ptr) = val;
                //dst_s[threadIdx.x] = val;
                //tmp_s[0] = val;

                //(*data_col_ptr) = (h >= 0 && w >= 0 && h < height && w < width) ?
                //    data_im_ptr[i * width + j] : 0;

                //float src_val = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0;
                //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
                //if (threadIdx.x % WARP_SIZE == 0) *((unsigned int*)data_col_ptr_32) = bit_mask;
                // use atomicOr() // *dst_ptr |= (mask << (col_index % 8));
                //data_col_ptr_32 += bit_align / 32;

                //data_col_ptr += height_col * width_col;
                data_col_ptr += bit_align;
            }
        }
    }
}

void im2col_align_ongpu(float *im,
    int channels, int height, int width,
    int ksize, int stride, int pad, float *data_col, int bit_align) {
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height_col * width_col;
    im2col_align_gpu_kernel << <(num_kernels + BLOCK - 1) / BLOCK,
        BLOCK, 0, get_cuda_stream() >> >(
            num_kernels, im, height, width, ksize, pad,
            stride, height_col,
            width_col, data_col, bit_align);

    CHECK_CUDA(cudaPeekAtLastError());
}


// --------------------------------


// binary im2col - stride=1
__global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im,
    const int height, const int width, const int ksize, const int channels,
    const int pad,
    const int stride,
    const int height_col, const int width_col,
    float *data_col, const int bit_align)
{
    //__shared__ float tmp_s[1];
    //__shared__ ulonglong4 tmp256_s[1];


    //#define SHRED_VALS ((BLOCK / 169) * )
    //__shared__ float dst_s[1024];
    //__shared__ float dst_s[1024];
    //__shared__ uint32_t bit_s[32];
    //__shared__ uint8_t bit_s[128];

    int index = blockIdx.x*blockDim.x + threadIdx.x;
    //for (; index < n; index += blockDim.x*gridDim.x)
    {
        int c_index = index;
        int channel_in = c_index % channels;

        //int h_out = index % height_col;
        //int c_index = index / height_col;
        //int channel_in = c_index % channels;

        int channel_out = channel_in * ksize * ksize;

        int j_index = c_index / channels;
        int j = j_index % ksize;
        int i = j_index / ksize;

        int pre_out_index = (channel_out + i*ksize + j) * bit_align;
        int j_pad = (j - pad);
        int i_pad = (i - pad);

        for(int wh_index = 0; wh_index < (height_col*width_col); wh_index += 32)
        //for (int h_out = 0; h_out < height_col; ++h_out)
        {

            // the end of padding
            //if(0)
            //for (int w_out = 0; w_out < (width_col); w_out += 32)
            {
                const int w_out = wh_index % width_col;
                const int h_out = wh_index / width_col;

                const int w = w_out + j_pad;
                const int h = h_out + i_pad;

                int pre_in_index = channel_in * height * width;
                int pre_in_wh_index = h * width + w;

                int send_wh_index = wh_index;
                if (i >= ksize) send_wh_index = height_col*width_col;

                #pragma unroll
                for (int t = 0; t < WARP_SIZE; ++t)
                {
                    const int lane_id = threadIdx.x % WARP_SIZE;

                    const int cur_wh_index = __shfl_custom(send_wh_index, t) + lane_id;

                    if (cur_wh_index < (width_col*height_col))// && (cur_i_pad+pad) < ksize)
                    {
                        const int cur_pre_out_index = __shfl_custom(pre_out_index, t);

                        const int cur_pre_in_index = __shfl_custom(pre_in_index, t);
                        const int cur_pre_in_wh_index = __shfl_custom(pre_in_wh_index, t) + lane_id;

                        int w = cur_pre_in_wh_index % width;
                        int h = cur_pre_in_wh_index / width;
                        int in_index = cur_pre_in_index + cur_pre_in_wh_index;

                        int out_index = cur_pre_out_index + cur_wh_index;

                        float val = (w >= 0 && w < width && h >= 0 && h < height) ?
                            data_im[in_index] : float();

                        //data_col[out_index] = val;
                        //tmp_s[0] = val;

                        uint32_t bit_mask = __ballot_custom(val > 0);
                        if (lane_id == 0) {
                            uint8_t *bit8_ptr = &(((uint8_t *)data_col)[out_index / 8]);
                            uint32_t *bit32_ptr = (uint32_t *)bit8_ptr;
                            *bit32_ptr = bit_mask;
                        }
                    }


                }

            }// w_out

        }
    }
}


void im2col_align_bin_ongpu(float *im,
    int channels, int height, int width,
    int ksize, int stride, int pad, float *data_col, int bit_align) {
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    //int num_kernels = channels * height_col * width_col * ksize * ksize;
    //int num_kernels = channels * ksize * ksize * height_col;
    int num_kernels = channels * ksize * ksize;
    int num_blocks = num_kernels / BLOCK + 1;

    //im2col_align_bin_gpu_kernel << <(num_kernels + BLOCK - 1) / BLOCK,
    im2col_align_bin_gpu_kernel << <num_blocks,
        BLOCK, 0, get_cuda_stream() >> >(
            num_kernels, im, height, width, ksize, channels, pad,
            stride, height_col,
            width_col, data_col, bit_align);

    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

/*
__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
{
    //const int size_aligned = size + (WARP_SIZE - size % WARP_SIZE);

    int index = blockIdx.x*blockDim.x + threadIdx.x;
    float src_val;

    //for (; index < size_aligned; index += blockDim.x*gridDim.x)
    {
        //src_val = src[index];
        if(index < size) src_val = src[index];
        else src_val = 0;
        //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
        unsigned int bit_mask = __ballot_custom(src_val > 0);
        if (threadIdx.x % WARP_SIZE == 0) ((unsigned int*)dst)[index / 32] = bit_mask;
    }
}
*/

/*
__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
{
    //const int size_aligned = size + (WARP_SIZE - size % WARP_SIZE);
    __shared__ uint32_t tmp[WARP_SIZE];

    int index = blockIdx.x*blockDim.x + threadIdx.x;
    float src_val;
    uint32_t *dst32_ptr = ((unsigned int*)dst);

    //for (; index < size_aligned; index += blockDim.x*gridDim.x)
    {
        //src_val = src[index];
        if (index < size) src_val = src[index];
        else src_val = 0;
        //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
        const int num_of_warps = blockDim.x / WARP_SIZE;
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;

        uint32_t bit_mask = __ballot_custom(src_val > 0);

        if (lane_id == 0) tmp[warp_id] = bit_mask;

        __syncthreads();
        if (warp_id == 0) {
            if (lane_id < num_of_warps) {
                dst32_ptr[index / 32 + lane_id] = tmp[lane_id];
            }
        }
        __syncthreads();
    }
}
*/

__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
{
    __shared__ uint32_t tmp[WARP_SIZE*32];

    int index = 32*blockIdx.x*blockDim.x + threadIdx.x;
    float src_val;
    uint32_t *dst32_ptr = ((unsigned int*)dst);

    int i;
    for(i = 0; i < 32; ++i)
    {
        if ((index + i * 1024) < size) src_val = src[index + i*1024];
        else src_val = 0;
        //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
        //const int num_of_warps = blockDim.x / WARP_SIZE;
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;

        uint32_t bit_mask = __ballot_custom(src_val > 0);
        if (lane_id == 0) tmp[i * 32 + warp_id] = bit_mask;
    }
    __syncthreads();
    dst32_ptr[blockIdx.x*blockDim.x + threadIdx.x] = tmp[threadIdx.x];
}


void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
{
    //const int num_blocks = size / 1024 + 1;
    //const int num_blocks = size / (32*1024) + 1;
    const int num_blocks = get_number_of_blocks(size, 32 * 1024);
    float_to_bit_gpu_kernel<<<num_blocks, 1024, 0, get_cuda_stream()>>>(src, dst, size);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

/*
__device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) {
    size_t dst_i = index / 8;
    int dst_shift = index % 8;
    dst[dst_i] &= ~(1 << dst_shift);
}

__device__ __host__ static inline void set_bit(unsigned char *const dst, size_t index) {
    size_t dst_i = index / 8;
    int dst_shift = index % 8;
    dst[dst_i] |= 1 << dst_shift;
    //dst[dst_i] |= 1 << (8 - dst_shift);
}
*/

__device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
    size_t src_i = index / 8;
    int src_shift = index % 8;
    unsigned char val = (src[src_i] & (1 << src_shift)) > 0;
    //unsigned char val = (src[src_i] & (1 << (8 - src_shift))) > 0;
    return val;
}

// Intel CPUs and nVidia CUDA GPU are little endian
__device__ __host__ unsigned char reverse_byte(unsigned char a)
{
    return ((a & 0x1) << 7) | ((a & 0x2) << 5) |
        ((a & 0x4) << 3) | ((a & 0x8) << 1) |
        ((a & 0x10) >> 1) | ((a & 0x20) >> 3) |
        ((a & 0x40) >> 5) | ((a & 0x80) >> 7);
}

__device__ __host__ unsigned char reverse_byte_2(unsigned char a)
{
    return ((a * 0x0802LU & 0x22110LU) | (a * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;
}

__device__ unsigned char reverse_byte_CUDA(unsigned char a)
{
    uint32_t tmp = __brev(a);
    return tmp >> 24;
}

__device__ void transpose8rS32_reversed_diagonale(unsigned char* A, unsigned char* B, int m, int n)
{
    unsigned x, y, t;

    // Load the array and pack it into x and y.
    x = (A[0] << 24) | (A[m] << 16) | (A[2 * m] << 8) | A[3 * m];
    y = (A[4 * m] << 24) | (A[5 * m] << 16) | (A[6 * m] << 8) | A[7 * m];

    t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);
    t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);

    t = (x ^ (x >> 14)) & 0x0000CCCC;  x = x ^ t ^ (t << 14);
    t = (y ^ (y >> 14)) & 0x0000CCCC;  y = y ^ t ^ (t << 14);

    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
    x = t;

    B[7 * n] = reverse_byte_CUDA(x >> 24);  B[6 * n] = reverse_byte_CUDA(x >> 16);  B[5 * n] = reverse_byte_CUDA(x >> 8);  B[4 * n] = reverse_byte_CUDA(x);
    B[3 * n] = reverse_byte_CUDA(y >> 24);  B[2 * n] = reverse_byte_CUDA(y >> 16);  B[1 * n] = reverse_byte_CUDA(y >> 8);  B[0 * n] = reverse_byte_CUDA(y);

    //__device__ ​ unsigned int 	__brev(unsigned int  x)
    //Reverse the bit order of a 32 bit unsigned integer.
    // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html
}


// transpose 8x8 bit
__global__ void transpose_bin_gpu_kernel(unsigned char *A, unsigned char *B, const int n, const int m,
    const int lda, const int ldb, const int block_size)
{
    int i;
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    //for (i = 0; i < n; i += 8)
    {
        i = (index*8) % n;
        int j;
        //for (j = 0; j < m - 8; j += 8)
        {
            j = ((index * 8) / n) * 8;
            if (j < m) {
                int a_index = i*lda + j;
                int b_index = j*ldb + i;
                transpose8rS32_reversed_diagonale(&A[a_index / 8], &B[b_index / 8], lda / 8, ldb / 8);
            }
            //else if (j < m) {
            //    for (; j < m; ++j) {
            //        if (get_bit(A, i*lda + j)) set_bit(B, j*ldb + i);
            //        else remove_bit(B, j*ldb + i);
            //    }
            //}
        }
    }
}


__device__ __host__ uint8_t reverse_8_bit(uint8_t a) {
    return ((a * 0x0802LU & 0x22110LU) | (a * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;
}

__device__ uint32_t reverse_32_bit(uint32_t a)
{
    // __device__ ​ unsigned int __brev(unsigned int  x) // CUDA
    // unsigned int __rbit(unsigned int val) // for ARM    //__asm__("rbit %0, %1\n" : "=r"(output) : "r"(input));
    return __brev(a);
    //return (reverse_8_bit(a >> 24) << 0) |
    //    (reverse_8_bit(a >> 16) << 8) |
    //    (reverse_8_bit(a >> 8) << 16) |
    //    (reverse_8_bit(a >> 0) << 24);
}

#define swap(a0, a1, j, m) t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

__device__ void transpose32_optimized(uint32_t A[32]) {
    int j, k;
    unsigned m, t;

    //m = 0x0000FFFF;
    //for (j = 16; j != 0; j = j >> 1, m = m ^ (m << j)) {
    //    for (k = 0; k < 32; k = (k + j + 1) & ~j) {
    //        t = (A[k] ^ (A[k + j] >> j)) & m;
    //        A[k] = A[k] ^ t;
    //        A[k + j] = A[k + j] ^ (t << j);
    //    }
    //}

    j = 16;
    m = 0x0000FFFF;
    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }

    j = 8;
    m = 0x00ff00ff;
    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }

    j = 4;
    m = 0x0f0f0f0f;
    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }

    j = 2;
    m = 0x33333333;
    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }

    j = 1;
    m = 0x55555555;
    for (k = 0; k < 32; k = (k + j + 1) & ~j) { swap(A[k], A[k + j], j, m); }

    // reverse Y
    for (j = 0; j < 16; ++j) {
        uint32_t tmp = A[j];
        A[j] = reverse_32_bit(A[31 - j]);
        A[31 - j] = reverse_32_bit(tmp);
    }
}

extern "C" {
__device__ void transpose_32x32_bits_reversed_diagonale(uint32_t *A, uint32_t *B, int m, int n)
{
    //unsigned A_tmp[32];
    //int i;
    //#pragma unroll
    //for (i = 0; i < 32; ++i) A_tmp[i] = A[i * m];
    //transpose32_optimized(A_tmp);
    //#pragma unroll
    //for (i = 0; i < 32; ++i) B[i*n] = A_tmp[i];

    __shared__ uint32_t A_shared[32 * BLOCK_TRANSPOSE32];
    uint32_t *A_tmp = &A_shared[32 * threadIdx.x];

    int i;
    #pragma unroll 32
    for (i = 0; i < 32; ++i) A_tmp[i] = A[i * m];
    transpose32_optimized(A_tmp);
    #pragma unroll 32
    for (i = 0; i < 32; ++i) B[i*n] = A_tmp[i];
}
}

// transpose 32x32 bit
__global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int n, const int m,
    const int lda, const int ldb, const int block_size)
{
    int i;
    int index = (blockIdx.x*blockDim.x + threadIdx.x) * 32;

    //for (i = 0; i < n; i += 8)
    {
        i = index % n;
        int j;
        //for (j = 0; j < m - 8; j += 8)
        {
            j = (index / n) * 32;
            if (j < m) {
                int a_index = i*lda + j;
                int b_index = j*ldb + i;
                transpose_32x32_bits_reversed_diagonale(&A[a_index / 32], &B[b_index / 32], lda / 32, ldb / 32);
            }
        }
    }
}

void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
    const int lda, const int ldb, const int block_size)
{
    //int size = n*m/ (8*8) + 1;
    int size32 = n*m / (32*32) + 1;
    //const int num_blocks = size / BLOCK + 1;
    const int num_blocks32 = size32 / BLOCK_TRANSPOSE32 + 1;
    transpose_bin_gpu_kernel_32 << <num_blocks32, BLOCK_TRANSPOSE32, 0, get_cuda_stream() >> >((uint32_t *)A, (uint32_t *)B, n, m, lda, ldb, block_size);
    //transpose_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(A, B, n, m, lda, ldb, block_size);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

__global__ void transpose_uint32_kernel(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
{
    //l.bit_align - algined (n) by 32
    //new_ldb - aligned (k) by 256
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    //for (i = 0; i < src_h; i += 1)
    int i = index % src_h;  // l.size*l.size*l.c;
    {
        //for (j = 0; j < src_w; j += 1)
        int j = index / src_h;  // out_h*out_w;
        if(j < src_w)
        {
            ((uint32_t *)dst)[j*dst_align / 32 + i] = ((uint32_t *)src)[i*src_align + j];
        }
    }
}

void transpose_uint32_gpu(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
{
    int size = src_w * src_h;
    const int num_blocks = size / BLOCK + 1;
    transpose_uint32_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(src, dst, src_h, src_w, src_align, dst_align);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

//#define TRANS_LOOP 10

__global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
{
    __shared__ uint32_t tmp[33 * 32];   // misaligned_array[32x32]
    const int w_align = 33;
    //const int shared_size = w_align * 32;

    //l.bit_align - algined (n) by 32
    //new_ldb - aligned (k) by 256

    const int src_w_align = src_w + (32 - src_w % 32);
    //const int src_h_align = src_h + (32 - src_h % 32);

    const int warps_in_width = src_w_align / 32;
    //const int warps_in_height = src_h_align / 32;


    const int local_x = threadIdx.x % 32;   // index % 32;
    const int local_x_index = threadIdx.x / 32; // index / 32;
    const int local_y = local_x_index % 32;

//#pragma unroll TRANS_LOOP
    //for (int i = 0; i < TRANS_LOOP; ++i)
    {
        const int global_index = blockIdx.x;// blockIdx.x*TRANS_LOOP + i;// local_x_index / 32;
        const int global_x_index = global_index % warps_in_width;
        const int global_y_index = global_index / warps_in_width;

        const int global_x = global_x_index * 32 + local_x;
        const int global_y = global_y_index * 32 + local_y;

        uint32_t val = 0;
        if (global_x < src_w && global_y < src_h) {
            val = src[global_y * src_align + global_x];
        }
        //dst[global_x * dst_align / 32 + global_y] = val;
        //tmp[local_y * 32 + local_x] = val;

        tmp[local_x * w_align + local_y] = val;
        __syncthreads();
        val = tmp[local_y * w_align + local_x];

        const int new_global_x = global_y_index * 32 + local_x;
        const int new_global_y = global_x_index * 32 + local_y;

        if (new_global_x < src_h && new_global_y < src_w) {
            dst[new_global_y * (dst_align / 32) + new_global_x] = val;
        }
    }
}

#define TRANS_BLOCK 1024
void transpose_uint32_gpu_2(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
{
    int src_w_align = src_w + (32 - src_w % 32);
    int src_h_align = src_h + (32 - src_h % 32);

    int size = src_w_align * src_h_align;
    int num_blocks = size / TRANS_BLOCK;
    transpose_uint32_kernel_2 << <num_blocks, TRANS_BLOCK, 0, get_cuda_stream() >> >(src, dst, src_h, src_w, src_align, dst_align);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------


// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel(float *input, float *re_packed_input, int w, int h, int c)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    const int items_per_channel = w * h;

    int c_pack = index % 32;
    int chan_index = index / 32;
    int chan = (chan_index * 32) % c;
    int i = (chan_index * 32) / c;

    //for (chan = 0; chan < c; chan += 32)
    {
        //for (i = 0; i < items_per_channel; ++i)
        if(i < items_per_channel)
        {
            //for (c_pack = 0; c_pack < 32; ++c_pack)
            {
                float src = input[(chan + c_pack)*items_per_channel + i];

                re_packed_input[chan*items_per_channel + i * 32 + c_pack] = src;
            }
        }
    }
}

void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c)
{
    int size = w * h * c;
    const int num_blocks = size / BLOCK + 1;
    repack_input_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(input, re_packed_input, w, h, c);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------


// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c)
{
    //__shared__ uint32_t tmp[33 * 32];  // 33x32 is misaligned 32 x 32 to avoid bank conflicts

    int index = blockIdx.x*blockDim.x + threadIdx.x;

    const int items_per_channel = w * h;

    int c_pack = index % 32;
    int chan_index = index / 32;
    int chan = (chan_index * 32) % c;
    int i = (chan_index * 32) / c;

    //for (chan = 0; chan < c; chan += 32)
    {
        //for (i = 0; i < items_per_channel; ++i)
        if (i < items_per_channel)
        {
            //for (c_pack = 0; c_pack < 32; ++c_pack)
            {
                float src = input[(chan + c_pack)*items_per_channel + i];

                re_packed_input[chan*items_per_channel + i * 32 + c_pack] = src;
            }
        }
    }
}

void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int c)
{
    int size = w * h * c;
    const int num_blocks = size / BLOCK + 1;
    repack_input_kernel_2 << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(input, re_packed_input, w, h, c);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------


// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    //__shared__ uint32_t tmp[32];
    const int index = blockIdx.x*blockDim.x + threadIdx.x;

    const int global_warp_id = index / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    const int items_per_channel = w * h;
    const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE);

    int i = 32 * (global_warp_id % (items_per_channel_aligned / WARP_SIZE));
    int chan = 32 * (global_warp_id / (items_per_channel_aligned / WARP_SIZE));

    if (chan < c)
    {
        uint32_t result_bits = 0;

        for (int c_pack = 0; c_pack < 32; ++c_pack)
        {
            float src = 0;
            if ((i + lane_id) < items_per_channel) {
                src = input[(chan + c_pack)*items_per_channel + (i + lane_id)];
            }
            uint32_t bit_mask = __ballot_custom(src > 0);

            uint32_t cur_bit = (bit_mask >> lane_id) & uint32_t(1);

            result_bits |= (cur_bit << c_pack);
        }
        if ((i + lane_id) < items_per_channel) {
            re_packed_input_bin[chan*items_per_channel / 32 + (i + lane_id)] = result_bits;
        }
    }
}

void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    int size = (w * h * c) / 32 + 1;
    const int block_size = BLOCK;
    const int num_blocks = get_number_of_blocks(size, block_size);
    //printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks / 32, block_size);
    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
    CHECK_CUDA(cudaPeekAtLastError());
}

/*
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    //__shared__ uint32_t tmp[32];
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    //const int num_of_warps = blockDim.x / WARP_SIZE;
    //const int warp_id = threadIdx.x / WARP_SIZE;
    //const int lane_id = threadIdx.x % WARP_SIZE;

    const int items_per_channel = w * h;

    int c_pack = index % 32;
    int chan_index = index / 32;
    //int chan = (chan_index * 32) % c;
    //int i = (chan_index * 32) / c;

    int i = (chan_index) % items_per_channel;
    int chan = ((chan_index ) / items_per_channel)*32;


    //for (chan = 0; chan < c; chan += 32)
    if(chan < c)
    {
        //for (i = 0; i < items_per_channel; ++i)
        //if (i < items_per_channel)
        {
            //for (c_pack = 0; c_pack < 32; ++c_pack)
            {
                float src = input[(chan + c_pack)*items_per_channel + i];

                uint32_t bit_mask = __ballot_custom(src > 0);
                if (threadIdx.x % 32 == 0)
                    re_packed_input_bin[chan*items_per_channel / 32 + i] = bit_mask;
            }
        }
    }
}

void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    int size = w * h * c;
    const int block_size = 256;// 128;
    const int num_blocks = get_number_of_blocks(size, block_size);
    printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks/32, block_size);
    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
    CHECK_CUDA(cudaPeekAtLastError());
}
*/


__global__ void fill_int8_gpu_kernel(unsigned char *src, unsigned char val, size_t size) {
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    if(index < size) src[index] = 0;
}

void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
    const int num_blocks = size / BLOCK + 1;
    fill_int8_gpu_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(src, val, size);
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

//typedef unsigned long long int uint64_t;
//typedef unsigned int uint32_t;
//typedef unsigned char uint8_t;
//typedef char int8_t;
/*
__device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
    return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
}
*/
__device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
    return ~(a^b) & 0b1;
}
/*
__device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
    return ~(a^b);
}

__device__ __host__ static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
    return ~(a^b);
}

__device__ __host__ static inline uint4 xnor_int128(uint4 a, uint4 b) {
    uint4 res;
    res.w = ~(a.w^b.w);
    res.x = ~(a.x^b.x);
    res.y = ~(a.y^b.y);
    res.z = ~(a.z^b.z);
    return res;
}

__device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong4 b) {
    ulonglong4 res;
    res.w = ~(a.w^b.w);
    res.x = ~(a.x^b.x);
    res.y = ~(a.y^b.y);
    res.z = ~(a.z^b.z);
    return res;
}
*/
//-------
/*
__device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) {
    return (a^b) & 0b1;
}
*/
__device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
    return (a^b);
}

__device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) {
    return (a^b);
}
/*
__device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
    uint4 res;
    res.w = (a.w^b.w);
    res.x = (a.x^b.x);
    res.y = (a.y^b.y);
    res.z = (a.z^b.z);
    return res;
}
*/
__device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) {
    ulonglong4 res;
    res.w = (a.w^b.w);
    res.x = (a.x^b.x);
    res.y = (a.y^b.y);
    res.z = (a.z^b.z);
    return res;
}

/*
__device__ static inline int popcnt_256(ulonglong4 a) {
    return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
}

__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    //if (index == 0)
    {
        int i, j, k, h;

        //#pragma omp parallel for
        //for (i = 0; i < M; ++i)
        i = index % M;
        //if(i < M)
        {   // l.n - filters [16 - 55 - 1024]
            float mean_val = mean_arr[i];

            //for (j = 0; j < N; ++j)
            j = index / M;
            if(j < N)
            { // out_h*out_w - one channel output size [169 - 173056]
                int count = 0;

                for (k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
                    uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));
                    uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));
                    uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);

                    int tmp_count = __popcll(c_bit64);

                    if (K - k < 64)  tmp_count = tmp_count - (64 - (K - k));    // remove extra bits
                    count += tmp_count;
                    //binary_int64_printf(c_bit64);
                    //printf(", count = %d \n\n", tmp_count);
                }

                C[i*ldc + j] = (2 * count - K) * mean_val;
            }
        }
    }
}
*/


/*
// B (input) in the shared_memory
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr)
{

    __shared__ uint64_t B_s[4096];  // 32 KB // [ldb x N`] // max = 262 144 bits

    int start_j = blockIdx.x*blockDim.x / M;
    {
        int end_j = (blockIdx.x*blockDim.x + blockDim.x) / M + 1;

        size_t shared_size = ldb * (end_j - start_j);

        //float tmp_shared_size = ldb * (blockDim.x / M);
        //int passes = (4096 * 64) / tmp_shared_size - 1;
        //size_t shared_size = tmp_shared_size * passes;

        int k;
        for (int k = threadIdx.x * 256; k < shared_size; k += blockDim.x * 256) {
            int x = start_j*ldb + k;
            if (x < (N*ldb)) *((ulonglong4 *)(B_s + k / 8)) = *((ulonglong4 *)(B + x / 8));
        }

        ////if (j_cur < N && (index % M == 0 || threadIdx.x == 0)) {
          ////  for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
            ////    *((uint64_t *)(B_s + (local_j*ldb + k) / 8)) = *((uint64_t *)(B + (j_cur*ldb + k) / 8));    // input
            ////}
        ////}
    }
    __syncthreads();

    int index = blockIdx.x*blockDim.x + threadIdx.x;


    //if (index == 0)
    //for(int in_tmp = threadIdx.x; in_tmp < 1*blockDim.x; in_tmp += blockDim.x)
    {
        //int index = blockIdx.x*blockDim.x*1 + in_tmp;

        int j_cur = index / M;
        int local_j = j_cur - start_j;

        int i, j, h;

        //#pragma omp parallel for
        //for (i = 0; i < M; ++i)
        i = index % M;
        //if(i < M)
        {   // l.n - filters [16 - 55 - 1024]
            // further improvements: for (l.n == 1024) iterate several (j)
            float mean_val = mean_arr[i];

            //for (j = 0; j < N; ++j)
            j = index / M;
            if (j < N)
            { // out_h*out_w - one channel output size [169 - 173056]
                const int bit_step = 256;
                int count = 0;
                int k = 0;
                for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
                    ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8));    // weights
                    //ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8));
                    ulonglong4 b_bit256 = *((ulonglong4 *)(B_s + (local_j*ldb + k) / 8));    // input
                    ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);

                    count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
                        __popcll(c_bit256.y) + __popcll(c_bit256.z);
                }

                int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
                //C[i*ldc + j] += 2 * count*mean_val;
                //C[i*ldc + j] += -2 * f1*mean_val;
                //C[i*ldc + j] += - K*mean_val;

                count = count - f1;    // remove extra bits (from empty space for align only)
                C[i*ldc + j] = (2 * count - K) * mean_val;

                //B_s[0] = (2 * count - K) * mean_val;
            }
        }
    }
}
*/

/*
// A (weights) in the shared_memory
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    __shared__ uint64_t A_s[6144];  // 48 KB // [lda x M`]
                                    //__shared__ uint8_t A_s[6144*8];  // 48 KB // [lda x M`]

    int start_i = blockIdx.x*blockDim.x / N;
    int end_i = (blockIdx.x*blockDim.x + blockDim.x) / N + 1;

    size_t shared_size = lda * (end_i - start_i);

    int i_cur = index / N;
    int local_i = i_cur - start_i;

    for (int k = threadIdx.x * 64; k < shared_size; k += blockDim.x * 64) {
        int x = start_i*lda + k;
        if (x < (M*lda)) *((uint64_t *)(A_s + k / 8)) = *((uint64_t *)(A + x / 8));
    }

    //if (i_cur < M && (index % N == 0 || threadIdx.x == 0)) {
    //for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
    //(*(uint64_t *)(A_s + (local_i*lda + k) / 8)) = *((uint64_t *)(A + (i_cur*lda + k) / 8));    // weights
    //  }
    //}

    __syncthreads();

    int i, j, k, h;

    j = index % N;
    {    // out_h*out_w - one channel output size [169 - 173056]
        i = index / N;
        if (i < M)  // l.n - filters [16 - 55 - 1024]
        {
            float mean_val = mean_arr[i];
            int count = 0;

            for (k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
                //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));    // weights
                uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8));    // weights
                uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));            // input
                uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);

                int tmp_count = __popcll(c_bit64);

                if (K - k < 64)  tmp_count = tmp_count - (64 - (K - k));    // remove extra bits
                count += tmp_count;
            }

            C[i*ldc + j] = (2 * count - K) * mean_val;
        }
    }
}
*/

__inline__ __device__
int warpAllReduceSum(int val) {
    for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
#if CUDART_VERSION >= 9000
        val += __shfl_xor_sync(FULL_MASK, val, mask);
#else
        val += __shfl_xor(val, mask);
#endif

    return val;
}

// Tensor Cores binary (CC >= 7.3 && CUDA >= 10.0) - __CUDA_SUBBYTE_IMMA__
#if CUDART_VERSION >= 10000
#include <mma.h>

#define WMMA_M 8
#define WMMA_N 8
#define WMMA_K 128
#define WMMA_K32 (WMMA_K/32)

#define WMMA_Nx2 (WMMA_N*2)

// Tensor Cores are used for XOR-GEMM
__global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
    float *shortcut_in_gpu, float *shortcut_out_gpu)
{
    // total 57%
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    __shared__ int C_s[WMMA_N * WMMA_M * 32 * 2];    // 2 * 8 KB - Temprorary result of GEMM WMMA for 32 warps

    const int lane_id = threadIdx.x % 32;
    const int warp_id = threadIdx.x / 32;
    const int global_warp_id = index / 32;

    const int N_aligned = N + WMMA_Nx2 - (N % WMMA_Nx2);

    /*
    __syncthreads();
    __shared__ uint32_t A_s[8 * 512];   // 8x512 = 8 x 16384 bits, instead of 8x4
    const int start_global_warp_id = blockIdx.x*blockDim.x / 32;
    int start_i = start_global_warp_id / (N_aligned / WMMA_N);
    start_i = start_i * WMMA_M;
    if (start_i + WMMA_M > M) start_i = M - WMMA_M;   // must be: i+7 < M
    for (int tmp_index = threadIdx.x; tmp_index < (8 * 512); tmp_index += blockDim.x)
    {
        int k_tmp = tmp_index % 512;
        int local_i = tmp_index / 512;

        uint32_t a_val = ((uint32_t *)(A))[(start_i + local_i)*lda/32 + k_tmp];
        A_s[local_i * 512 + k_tmp] = a_val;
    }
    __syncthreads();
    */


    int i, j, k;//, h;
    // 47% = 29 + 10 + 8
    j = global_warp_id % (N_aligned / WMMA_Nx2);
    j = j * WMMA_Nx2;
    {    // out_h*out_w - one channel output size [169 - 173056]
        i = global_warp_id / (N_aligned / WMMA_Nx2);
        i = i * WMMA_M;

        //int count = 0;
        k = 0;

        if (i < M)  //if (i < M)  // l.n - filters [16 - 55 - 1024]
        {
            if (j + WMMA_Nx2 > N) j = N - WMMA_Nx2;   // must be: j+7 < N
            if (i + WMMA_M > M) i = M - WMMA_M;   // must be: i+7 < M

#if __CUDA_ARCH__ >= 730
            // Tensor Cores
            using namespace nvcuda;

            wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
            wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
            wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c1_frag, c2_frag;
            wmma::fill_fragment(c1_frag, 0); // !!!! XOR isn't XNOR !!!!!!!!!!
            wmma::fill_fragment(c2_frag, 0); // !!!! XOR isn't XNOR !!!!!!!!!!

            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
            {
                int64_t A_cur_index = (i*lda + k) / 8;  // index in bits
                int64_t B1_cur_index = (j*ldb + k) / 8;  // index in bits
                int64_t B2_cur_index = ((j + 8)*ldb + k) / 8;  // index in bits

                // try to use A that is cached in shared memory - poor performance
                //if (i == start_i) wmma::load_matrix_sync(a_frag, &A_s[k / 32], (512 * 32));   // lda = (128*32) bits
                //else wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);   // lda = M

                // lda, ldb - are in bits
                wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);   // lda = M

                wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B1_cur_index), ldb);   // ldb = K
                wmma::bmma_sync(c1_frag, a_frag, b_frag, c1_frag);    // XOR-GEMM

                wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B2_cur_index), ldb);   // ldb = K
                wmma::bmma_sync(c2_frag, a_frag, b_frag, c2_frag);    // XOR-GEMM
            }
            // C[i*ldc + j]
            wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c1_frag, WMMA_N, wmma::mem_row_major);
            wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N + WMMA_M*WMMA_N*32], c2_frag, WMMA_N, wmma::mem_row_major);
#else // __CUDA_ARCH__ >= 730

            // Custom XOR-GEMM
            int k_d = lane_id % 4;
            int i_d = lane_id / 4;
            //int j_d = lane_id / 4;

            int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0);
            for (int local_j = 0; local_j < 8*2; ++local_j) {
                accum_c_val[local_j] = 0;
            }

            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
            {
                //int64_t A_cur_index = (i*lda + k) / 8;
                //int64_t A_cur_index = (local_i*lda + k) / 8;
                //int64_t B_cur_index = (j*ldb + k) / 8;

                // lda, ldb - are in bits
                // 8*4 = 32
                // 8*8 = 64
                int k_d = lane_id % 4;
                int i_d = lane_id / 4;
                int j_d = lane_id / 4;
                uint32_t a_val = *(uint32_t *)(A + ((i + i_d)*lda + (k + k_d*32)) / 8); // wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);

                for (int c_x = 0; c_x < 2; c_x++)
                {
                    uint32_t b_val = *(uint32_t *)(B + ((c_x * 8 + j + j_d)*ldb + (k + k_d * 32)) / 8); // wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B_cur_index), ldb);

                    // wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);
                    int32_t c_val[8];  // 8 x 32 threads = 256
                    #pragma UNROLL
                    for (int local_j = 0; local_j < 8; ++local_j)
                    {
                        uint32_t b_val_cur = __shfl_custom(b_val, local_j * 4 + k_d);
                        c_val[local_j] = __popc(xor_int32(a_val, b_val_cur));
                    }

                    #pragma UNROLL
                    for (int local_j = 0; local_j < 8; ++local_j)
                    {
                        #pragma UNROLL
                        for (int local_k = 0; local_k < 4; ++local_k) {
                            accum_c_val[local_j + c_x*8] += __shfl_custom(c_val[local_j], i_d * 4 + local_k);
                        }
                    }
                }
            }

            // only the first 8 threads (i) contain 8 good values each, in c_val[8] (j) = 8 x 8 =64
            // wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
            if (k_d == 0) {
                for (int c_x = 0; c_x < 2; c_x++)
                {
                    for (int local_j = 0; local_j < 8; ++local_j)
                    {
                        C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + local_j + WMMA_M*WMMA_N*32 * c_x] = accum_c_val[local_j + c_x*8];
                    }
                }
            }
#endif // __CUDA_ARCH__ >= 730

            for(int c_x = 0; c_x < 2; c_x++)
            {
                int j_d = lane_id % WMMA_N;
                {
                    #pragma UNROLL
                    for (int i_d = lane_id / WMMA_N; i_d < WMMA_M; i_d += WMMA_M / 2)
                    {
                        int count = C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + j_d + WMMA_M*WMMA_N*32*c_x];

                        const int bit_step = 128;
                        int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
                        count = count - f1;    // remove extra bits (from empty space for align only)

                        count = (2 * count - K);

                        float mean_val = mean_arr[i + i_d];
                        float bias_val = bias_arr[i + i_d];
                        float dst_val = count *mean_val + bias_val;
                        if (leaky_activation)
                            dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val);    // Leaky activation

                        size_t out_index = (i + i_d)*ldc + (c_x * 8 + j + j_d);
                        C[out_index] = dst_val;

                        if (shortcut_out_gpu) {
                            shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
                        }
                    }

                }
            }
        }
    }
}
#endif  // CUDART_VERSION >= 10000

/*
// Tensor Cores are used for XOR-GEMM
__global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
{
    // total 57%
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    __shared__ int C_s[8*8 * 32];    // Temprorary result of GEMM WMMA

    const int lane_id = threadIdx.x % 32;
    const int warp_id = threadIdx.x / 32;
    const int global_warp_id = index / 32;

    const int N_aligned = N + WMMA_N - (N % WMMA_N);

    int i, j, k, h;
    // 47% = 29 + 10 + 8
    j = global_warp_id % (N_aligned / WMMA_N);
    j = j * WMMA_N;
    {    // out_h*out_w - one channel output size [169 - 173056]
        i = global_warp_id / (N_aligned / WMMA_N);
        i = i * WMMA_M;

        int count = 0;
        k = 0;

        if (i < M)  //if (i < M)  // l.n - filters [16 - 55 - 1024]
        {
            if (j + WMMA_N > N) j = N - WMMA_N;   // must be: j+7 < N
            if (i + WMMA_M > M) i = M - WMMA_M;   // must be: i+7 < M

#if __CUDA_ARCH__ >= 730
            // Tensor Cores
            using namespace nvcuda;

            wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
            wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
            wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c_frag;
            wmma::fill_fragment(c_frag, 0); // !!!! XOR isn't XNOR !!!!!!!!!!

            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
            {
                int64_t A_cur_index = (i*lda + k) / 8;
                //int64_t A_cur_index = (local_i*lda + k) / 8;
                int64_t B_cur_index = (j*ldb + k) / 8;

                // lda, ldb - are in bits
                wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);   // lda = M
                wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B_cur_index), ldb);   // ldb = K

                wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);    // XOR-GEMM
            }
            // C[i*ldc + j]
            wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
#else // __CUDA_ARCH__ >= 730

            // Custom XOR-GEMM
            int k_d = lane_id % 4;
            int i_d = lane_id / 4;
            int j_d = lane_id / 4;

            int32_t accum_c_val[8]; // wmma::fill_fragment(c_frag, 0);
            for (int local_j = 0; local_j < 8; ++local_j) {
                accum_c_val[local_j] = 0;
            }

            // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
            for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
            {
                int64_t A_cur_index = (i*lda + k) / 8;
                //int64_t A_cur_index = (local_i*lda + k) / 8;
                int64_t B_cur_index = (j*ldb + k) / 8;

                // lda, ldb - are in bits
                // 8*4 = 32
                // 8*8 = 64
                int k_d = lane_id % 4;
                int i_d = lane_id / 4;
                int j_d = lane_id / 4;
                uint32_t a_val = *(uint32_t *)(A + ((i + i_d)*lda + (k + k_d*32)) / 8); // wmma::load_matrix_sync(a_frag, (uint32_t *)(A + A_cur_index), lda);
                uint32_t b_val = *(uint32_t *)(B + ((j + j_d)*ldb + (k + k_d*32)) / 8); // wmma::load_matrix_sync(b_frag, (uint32_t *)(B + B_cur_index), ldb);

                // wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);
                int32_t c_val[8];  // 8 x 32 threads = 256
                #pragma UNROLL
                for (int local_j = 0; local_j < 8; ++local_j)
                {
                    uint32_t b_val_cur = __shfl_custom(b_val, local_j *4 + k_d);
                    c_val[local_j] = __popc(xor_int32(a_val, b_val_cur));
                }

                #pragma UNROLL
                for (int local_j = 0; local_j < 8; ++local_j)
                {
                    #pragma UNROLL
                    for (int local_k = 0; local_k < 4; ++local_k) {
                        accum_c_val[local_j] += __shfl_custom(c_val[local_j], i_d * 4 + local_k);
                    }
                }
            }

            // only the first 8 threads (i) contain 8 good values each, in c_val[8] (j) = 8 x 8 =64
            // wmma::store_matrix_sync(&C_s[warp_id*WMMA_M*WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
            if (k_d == 0) {
                for (int local_j = 0; local_j < 8; ++local_j)
                {
                    C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + local_j] = accum_c_val[local_j];
                }
            }
#endif // __CUDA_ARCH__ >= 730

            {
                int i_d = lane_id % WMMA_M;
                {

                    for (int j_d = lane_id / WMMA_M; j_d < WMMA_N; j_d += WMMA_N / 2)
                    {
                        int count = C_s[warp_id*WMMA_M*WMMA_N + i_d*WMMA_N + j_d];

                        const int bit_step = 128;
                        int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
                        count = count - f1;    // remove extra bits (from empty space for align only)

                        count = (2 * count - K);

                        float mean_val = mean_arr[i + i_d];
                        float bias_val = bias_arr[i + i_d];
                        float dst_val = count *mean_val + bias_val;
                        if (leaky_activation)
                            dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val);    // Leaky activation

                        C[(i + i_d)*ldc + (j + j_d)] = dst_val;
                    }

                }
            }
        }
    }
}
*/


// Coalescing
// A (weights) in the shared_memory - GOOD
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
    float *shortcut_in_gpu, float *shortcut_out_gpu)
{
    // total 57%
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    __shared__ uint8_t A_s[6144*8/4];
    //__shared__ uint64_t A_s[6144];  // 48 KB // [lda x M`]
    //__shared__ uint8_t A_s[6144*8];  // 48 KB // [lda x M`]

    int start_i = blockIdx.x*blockDim.x / N;
    int end_i = (blockIdx.x*blockDim.x + blockDim.x) / N + 1;

    size_t shared_size = lda * (end_i - start_i);

    int i_cur = index / N;
    int local_i = i_cur - start_i;
    // ~10%
    for (int k = threadIdx.x * 64; k < shared_size; k += blockDim.x * 64) {
        int x = start_i*lda + k;
        if (x < (M*lda)) *((uint64_t *)(A_s + k / 8)) = *((uint64_t *)(A + x / 8));
    }
    __syncthreads();

    int i, j, k; //, h;
    // 47% = 29 + 10 + 8
    j = index % N;
    {    // out_h*out_w - one channel output size [169 - 173056]
        i = index / N;
        //if (i < M)  // l.n - filters [16 - 55 - 1024]
        {
            int count = 0;
            k = 0;

#ifdef NOT_USED
            // 32 thread X 256 bit = 8192 bit
            for (; k < (K - 8192); k += 8192) {   // l.size*l.size*l.c - one filter size [27 - 9216]
                ulonglong4 c_bit256;

                //int64_t A_cur_index = (i*lda + k) / 8;
                int64_t A_cur_index = (local_i*lda + k) / 8;
                int64_t B_cur_index = (j*ldb + k) / 8;
                if (i >= M) A_cur_index = 0;

#pragma unroll
                for (int t = 0; t < WARP_SIZE; ++t) {
                    const int lane_id = threadIdx.x % WARP_SIZE;

                    const int64_t A_i = __shfl_custom(A_cur_index, t) + 32 * lane_id;
                    const int64_t B_i = __shfl_custom(B_cur_index, t) + 32 * lane_id;

                    {
                        //ulonglong4 a_bit256 = *((ulonglong4 *)(A + A_i));    // weights
                        ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + A_i));    // weights
                        ulonglong4 b_bit256 = *((ulonglong4 *)(B + B_i));    // input
                        c_bit256 = xor_int256(a_bit256, b_bit256);
                        int tmp_count = __popcll(c_bit256.w) + __popcll(c_bit256.x) +
                            __popcll(c_bit256.y) + __popcll(c_bit256.z);

                        int sum_count = warpAllReduceSum(tmp_count);
                        if (lane_id == t) count += sum_count;
                    }
                }
            }
#endif


//#ifdef NOT_USED
            // 32 thread X 64 bit = 2048 bit // 29%
            for (; k < (K - 2048); k += 2048) {   // l.size*l.size*l.c - one filter size [27 - 9216]
                uint64_t c_bit64;

                //int64_t A_cur_index = (i*lda + k) / 8;
                int64_t A_cur_index = (local_i*lda + k) / 8;
                int64_t B_cur_index = (j*ldb + k) / 8;
                if (i >= M) A_cur_index = 0;

                #pragma unroll
                for (int t = 0; t < WARP_SIZE; ++t) {
                    const int lane_id = threadIdx.x % WARP_SIZE;

                    const int64_t A_i = __shfl_custom(A_cur_index, t) + 8 * lane_id;
                    const int64_t B_i = __shfl_custom(B_cur_index, t) + 8 * lane_id;

                    {
                        //uint64_t a_bit64 = *((uint64_t *)(A + A_i));    // weights
                        uint64_t a_bit64 = *((uint64_t *)(A_s + A_i));    // weights
                        uint64_t b_bit64 = *((uint64_t *)(B + B_i));    // input
                        c_bit64 = xor_int64(a_bit64, b_bit64);
                        int tmp_count = __popcll(c_bit64);

                        int sum_count = warpAllReduceSum(tmp_count);
                        if (lane_id == t) count += sum_count;
                    }
                }
            }
//#endif

//#ifdef NOT_USED
            // 32 thread X 32 bit = 1024 bit // 10%
            for (; k < (K - 1024); k += 1024) {   // l.size*l.size*l.c - one filter size [27 - 9216]

                //int64_t A_cur_index = (i*lda + k) / 8;
                int64_t A_cur_index = (local_i*lda + k) / 8;
                int64_t B_cur_index = (j*ldb + k) / 8;
                if (i >= M) A_cur_index = 0;

                #pragma unroll
                for (int t = 0; t < WARP_SIZE; ++t) {
                    const int lane_id = threadIdx.x % WARP_SIZE;

                    const int64_t A_i = __shfl_custom(A_cur_index, t) + 4 * lane_id;
                    const int64_t B_i = __shfl_custom(B_cur_index, t) + 4 * lane_id;

                    {
                        //uint64_t a_bit64 = *((uint64_t *)(A + A_i));    // weights
                        uint32_t a_bit32 = *((uint32_t *)(A_s + A_i));    // weights
                        uint32_t b_bit32 = *((uint32_t *)(B + B_i));    // input
                        uint32_t c_bit32 = xor_int32(a_bit32, b_bit32);
                        int tmp_count = __popc(c_bit32);

                        int sum_count = warpAllReduceSum(tmp_count);
                        if (lane_id == t) count += sum_count;
                    }
                }
            }
//#endif

            if (i < M)
            {
                float mean_val = mean_arr[i];
                float bias_val = bias_arr[i];

//#ifdef NOT_USED
                // 8%
                for (; k < K; k += 256) {   // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
                    //ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8));    // weights
                    ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8));    // weights
                    ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8));    // input
                    ulonglong4 c_bit256 = xor_int256(a_bit256, b_bit256);

                    count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
                        __popcll(c_bit256.y) + __popcll(c_bit256.z);
                }
//#endif

#ifdef NOT_USED
                for (; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
                    //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));    // weights
                    uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8));    // weights
                    uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));            // input
                    uint64_t c_bit64 = xor_int64(a_bit64, b_bit64);

                    count += __popcll(c_bit64);
                }
#endif

                const int bit_step = 256;
                int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
                count = count - f1;    // remove extra bits (from empty space for align only)
                float dst_val = (2 * count - K) *mean_val + bias_val;
                if(leaky_activation)
                    dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val);    // Leaky activation
                size_t out_index = i*ldc + j;
                C[out_index] = dst_val;

                if (shortcut_out_gpu) {
                    shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
                }
            }
        }
    }
}


// further optimization - use WMMA GEMM for using Tensor Cores
// https://github.com/NVIDIA-developer-blog/code-samples/blob/master/posts/tensor-cores/simpleTensorCoreGEMM.cu
// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-subbyte
// nvcuda::wmma::col_major ->  cutlass::MatrixLayout::kColumnMajor (matrix is not transposed)

// Matrix A	Matrix B	Accumulator	Matrix Size (m-n-k)
// precision::b1	precision::b1	int	8x8x128

// The only dimensions currently supported by WMMA for XNOR
// const int WMMA_M = 8;
// const int WMMA_N = 8;
// const int WMMA_K = 128;


// GOOD
void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
    float *shortcut_in_gpu, float *shortcut_out_gpu)
{
    int size = M*N;
    const int num_blocks = get_number_of_blocks(size, BLOCK);

    //printf("\n M = %d, N = %d, M %% 8 = %d, N %% 8 = %d \n", M, N, M % 8, N % 8);

    /*
    printf("\n gemm_bin size = %d, num_blocks = %d, M*K = %d KB, N*K = %d KB \n (w) M*K/num_blocks = %d KB, (i) N*K/num_blocks = %d KB \n",
        size, num_blocks, M*K / 1024, N*K / 1024, M*lda / num_blocks / 1024, N*ldb / num_blocks / 1024);
    printf(" M / 512 = %d, N / 512 = %d, M*lda / 512 = %d, N*ldb / 512 = %d \n", M / 512, N / 512, M*lda/512, N*ldb/512);
    */
    //printf(" shared_memory: (w) lda*BLOCK/N = %d, (i) ldb*BLOCK/M = %d, \t lda = %d \n\n", lda*BLOCK / N, ldb*BLOCK / M, lda);


    //if (M % 8 == 0 && N % 8 == 0 && M == 128)
    //if (M >= 32)    // l.n >= 32
#if CUDART_VERSION >= 10000
    if (1)
    {
        const int M_aligned = M + (8 - (M % 8));
        const int N_aligned = N + (16 - (N % 16));
        int size = (M_aligned / 8)*(N_aligned / 16)*WARP_SIZE;
        const int num_blocks = get_number_of_blocks(size, BLOCK);

        //printf(" lda = %d, ldb = %d, ldc = %d, lda/32 = %d, ldb/32 = %d, ldc/32 = %d \n", lda, ldb, ldc, lda / 32, ldb / 32, ldc / 32);
        //printf("  l.c (K/9) = %d, M (l.n) = %d \n", (K%9 == 0)? K / 9: K, M);
        gemm_nn_custom_bin_mean_transposed_tensor_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (
            M, N, K,
            A, lda,
            B, ldb,
            C, ldc,
            mean_arr, bias, leaky_activation,
            shortcut_in_gpu, shortcut_out_gpu);

        //cudaDeviceSynchronize();
        //getchar();
    }
    else
#endif  //# CUDART_VERSION >= 10000
    {
        gemm_nn_custom_bin_mean_transposed_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (
            M, N, K,
            A, lda,
            B, ldb,
            C, ldc,
            mean_arr, bias, leaky_activation,
            shortcut_in_gpu, shortcut_out_gpu);
    }
    CHECK_CUDA(cudaPeekAtLastError());
}
// --------------------------------

/*
void convolve_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
{
    int fil;
    // filter index
#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
    for (fil = 0; fil < n; ++fil) {
        int chan, y, x, f_y, f_x;
        // channel index
        for (chan = 0; chan < in_c; ++chan)
            // input - y
            for (y = 0; y < in_h; ++y)
                // input - x
                for (x = 0; x < in_w; ++x)
                {
                    int const output_index = fil*in_w*in_h + y*in_w + x;
                    int const weights_pre_index = fil*in_c*size*size + chan*size*size;
                    int const input_pre_index = chan*in_w*in_h;
                    float sum = 0;

                    // filter - y
                    for (f_y = 0; f_y < size; ++f_y)
                    {
                        int input_y = y + f_y - pad;
                        // filter - x
                        for (f_x = 0; f_x < size; ++f_x)
                        {
                            int input_x = x + f_x - pad;
                            if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;

                            int input_index = input_pre_index + input_y*in_w + input_x;
                            int weights_index = weights_pre_index + f_y*size + f_x;

                            sum += input[input_index] * weights[weights_index];
                        }
                    }
                    // l.output[filters][width][height] +=
                    //        state.input[channels][width][height] *
                    //        l.weights[filters][channels][filter_width][filter_height];
                    output[output_index] += sum;
                }
    }


}
// --------------------------------


void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
    int size, int pad, int new_lda, float *mean_arr_gpu)
{
    int fil;
    // filter index
#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
    for (fil = 0; fil < n; ++fil) {
        float mean_val = mean_arr_gpu[fil];
        int chan, y, x, f_y, f_x;
        // channel index
        for (chan = 0; chan < in_c; ++chan)
            // input - y
            for (y = 0; y < in_h; ++y)
                // input - x
                for (x = 0; x < in_w; ++x)
                {
                    int const output_index = fil*in_w*in_h + y*in_w + x;
                    int const weights_pre_index = fil*in_c*size*size + chan*size*size;
                    int const input_pre_index = chan*in_w*in_h;
                    int sum = 0;
                    int good_val = 0;

                    // filter - y
                    for (f_y = 0; f_y < size; ++f_y)
                    {
                        int input_y = y + f_y - pad;
                        // filter - x
                        for (f_x = 0; f_x < size; ++f_x)
                        {
                            int input_x = x + f_x - pad;
                            if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;

                            int input_index = input_pre_index + input_y*in_w + input_x;
                            //int weights_index = weights_pre_index + f_y*size + f_x;
                            //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
                            int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;

                            //sum += input[input_index] * weights[weights_index];

                            int8_t in_bit = get_bit((uint8_t *)input, input_index);
                            int8_t w_bit = get_bit((uint8_t *)weights, weights_index);
                            int res = xnor_bit1(in_bit, w_bit);
                            sum += res;
                            good_val++;
                            //sum += (res > 0) ? 1 : -1;
                            //in_bit = (in_bit > 0) ? 1 : -1;
                            //w_bit = (w_bit > 0) ? 1 : -1;
                            //int8_t res = in_bit*w_bit;
                            //sum += res;
                            //printf("\n i: %d x w: %d = res: %d \t sum: %d \t mean = %f \n", in_bit, w_bit, res, sum, mean_val);
                        }
                    }
                    //printf("sum = %d, ", sum);
                    sum = sum - (good_val - sum);
                    //printf(" size = %d, sum = %d \n", size, sum);

                    // l.output[filters][width][height] +=
                    //        state.input[channels][width][height] *
                    //        l.weights[filters][channels][filter_width][filter_height];
                    output[output_index] += sum*mean_val;
                }
    }
}
*/
// --------------------------------

__global__ void convolve_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    int fil;
    // filter index
    //for (fil = 0; fil < n; ++fil)
    int chan, y, x, f_y, f_x;
    // channel index
    //for (chan = 0; chan < in_c; ++chan)
    // input - y
    //for (y = 0; y < in_h; ++y)
    // input - x
    //for (x = 0; x < in_w; ++x)
    x = index % in_w;
    int index2 = index / in_w;
    y = index2 % in_h;
    fil = index2 / in_h;
    if (fil < n)
    {

        int const output_index = fil*in_w*in_h + y*in_w + x;
        float sum = 0;

        for (chan = 0; chan < in_c; ++chan)
        {
            int const weights_pre_index = fil*in_c*size*size + chan*size*size;
            int const input_pre_index = chan*in_w*in_h;

            // filter - y
            for (f_y = 0; f_y < size; ++f_y)
            {
                int input_y = y + f_y - pad;
                // filter - x
                for (f_x = 0; f_x < size; ++f_x)
                {
                    int input_x = x + f_x - pad;
                    if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;

                    int input_index = input_pre_index + input_y*in_w + input_x;
                    int weights_index = weights_pre_index + f_y*size + f_x;

                    sum += input[input_index] * weights[weights_index];

                }
            }
            // l.output[filters][width][height] +=
            //        state.input[channels][width][height] *
            //        l.weights[filters][channels][filter_width][filter_height];
            //output[output_index] += sum;
        }
        output[output_index] = sum;
    }

}

void convolve_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
{
    int array_size = in_w*in_h*n;    // width X height X filters
    const int num_blocks = array_size / BLOCK + 1;
    //printf("\n array_size = %d, num_blocks = %d, w = %d, h = %d, n = %d, c = %d, pad = %d \n", array_size, num_blocks, in_w, in_h, n, in_c, pad);

    convolve_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (input, weights, output, in_w, in_h, in_c, n, size, pad);
    CHECK_CUDA(cudaPeekAtLastError());
}

// --------------------------------

/*
__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
    int size, int pad, int new_lda, float *mean_arr_gpu)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    int fil;
    // filter index
    //for (fil = 0; fil < n; ++fil)
    int chan, y, x, f_y, f_x;
    // channel index
    //for (chan = 0; chan < in_c; ++chan)
    // input - y
    //for (y = 0; y < in_h; ++y)
    // input - x
    //for (x = 0; x < in_w; ++x)
    x = index % in_w;
    int index2 = index / in_w;
    y = index2 % in_h;
    fil = index2 / in_h;
    if (fil < n)    // (1-6 for one BLOCK)
    {
                //float mean_val = mean_arr_gpu[fil];
                int const output_index = fil*in_w*in_h + y*in_w + x;
                int sum = 0;
                int good_val = 0;

                for (chan = 0; chan < in_c; ++chan)
                {
                    //int const weights_pre_index = fil*in_c*size*size + chan*size*size;
                    int const weights_pre_index = fil*new_lda + chan*size*size;
                    int const input_pre_index = chan*in_w*in_h;

                    // filter - y
                    for (f_y = 0; f_y < size; ++f_y)
                    {
                        int input_y = y + f_y - pad;
                        // filter - x
                        for (f_x = 0; f_x < size; ++f_x)
                        {
                            int input_x = x + f_x - pad;
                            if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;

                            int input_index = input_pre_index + input_y*in_w + input_x;
                            int weights_index = weights_pre_index + f_y*size + f_x;
                            //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
                            //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;

                            uint8_t in_bit = get_bit((uint8_t *)input, input_index);
                            uint8_t w_bit = get_bit((uint8_t *)weights, weights_index);
                            int res = xnor_bit1(in_bit, w_bit);
                            sum += res;
                            good_val++;

                            //sum += input[input_index] *weights[weights_index];

                        }
                    }
                    // l.output[filters][width][height] +=
                    //        state.input[channels][width][height] *
                    //        l.weights[filters][channels][filter_width][filter_height];
                    //output[output_index] += sum;
                }
                sum = sum - (good_val - sum);
                output[output_index] = sum * mean_arr_gpu[fil]; // atoimcAdd for inter-BLOCK sum
    }

}
*/

__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
    int size, int pad, int new_lda, float *mean_arr_gpu)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    int fil;
    // filter index
    //for (fil = 0; fil < n; ++fil)
    int chan, y, x, f_y, f_x;
    // channel index
    //for (chan = 0; chan < in_c; ++chan)
    // input - y
    //for (y = 0; y < in_h; ++y)
    // input - x
    //for (x = 0; x < in_w; ++x)
    x = index % in_w;
    int index2 = index / in_w;
    y = index2 % in_h;
    fil = index2 / in_h;
    //if (fil < n)    // (1-6 for one BLOCK)
    {
        //float mean_val = mean_arr_gpu[fil];
        int const output_index = fil*in_w*in_h + y*in_w + x;
        int sum = 0;
        int good_val = 0;

        int min_index = blockIdx.x*blockDim.x;
        int min_fil = (min_index / in_w) / in_h;
        int max_index = (blockIdx.x+1)*blockDim.x - 1;
        int max_fil = (max_index / in_w) / in_h;

        __shared__ uint32_t weights_shared[3*3*1024*6/32 + 1];  // 7 KB (6 filters) - use (new_lda) for size calculation
        //const int weights_size = size*size*in_c/8;
        const int weights_size = size*size*in_c / 32 + 1;

        for (int tmp_fil = min_fil; tmp_fil <= max_fil; tmp_fil++) {
            for (int s = threadIdx.x; s < weights_size; s += blockDim.x) {
                //weights_shared[s + (tmp_fil - min_fil)*new_lda / 8] = ((uint8_t *)weights)[tmp_fil*new_lda / 8 + s];
                weights_shared[s + (tmp_fil - min_fil)*new_lda/32] = ((uint32_t *)weights)[tmp_fil*new_lda / 32 + s];
            }
        }
        __syncthreads();

        for (chan = 0; chan < in_c; ++chan)
        {
            //int const weights_pre_index = fil*in_c*size*size + chan*size*size;
            //int const weights_pre_index = fil*new_lda + chan*size*size;
            int const input_pre_index = chan*in_w*in_h;

            __shared__ uint32_t input_shared[416*416/32 + 1];   // 21.2 KB bytes (for input size 832x832)
            const int input_shared_size = in_w*in_h / 32 + 1;
            const int add_input_index = input_pre_index % 32;
            __syncthreads();    // why??? but is required

            for (int s = threadIdx.x; s < input_shared_size; s += blockDim.x) {
                input_shared[s] = ((uint32_t *)input)[input_pre_index / 32 + s];
            }
            __syncthreads();

            /*
            __shared__ uint8_t input_shared[208 * 208 / 8 + 1];   // 5.4 KB bytes (for input size 416x416)
            const int input_shared_size = in_w*in_h / 8 + 1;
            const int add_input_index = input_pre_index % 8;
            __syncthreads();

            for (int s = threadIdx.x; s < input_shared_size; s += blockDim.x) {
                ((uint8_t *)input_shared)[s] = ((uint8_t *)input)[input_pre_index / 8 + s];
            }
            __syncthreads();
            */
            //int src_index = -1;
            //uint32_t input_byte;

            if (fil < n)    // (1-6 for one BLOCK)
            {
                // filter - y
                for (f_y = 0; f_y < size; ++f_y)
                {
                    int input_y = y + f_y - pad;
                    // filter - x
                    for (f_x = 0; f_x < size; ++f_x)
                    {
                        int input_x = x + f_x - pad;
                        if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;

                        //int input_index = input_pre_index + input_y*in_w + input_x;
                        //int weights_index = weights_pre_index + f_y*size + f_x;
                        //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
                        //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;

                        //uint8_t in_bit = get_bit((uint8_t *)input, input_index);
                        //uint8_t w_bit = get_bit((uint8_t *)weights, weights_index);

                        //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
                        int weights_shared_index = (fil - min_fil)*new_lda + chan*size*size + f_y*size + f_x;
                        //uint8_t in_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);
                        uint8_t w_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);

                        //int input_index = input_pre_index + input_y*in_w + input_x;
                        int input_shared_index = /*input_pre_index +*/ input_y*in_w + input_x + add_input_index;
                        uint8_t in_bit = get_bit((uint8_t *)input_shared, input_shared_index);
                        /*
                        int new_src_index = input_shared_index / 32;
                        int src_shift = input_shared_index % 32;
                        //if (new_src_index != src_index)
                        {
                            src_index = new_src_index;
                            input_byte = ((uint32_t *)input_shared)[src_index];
                        }
                        uint8_t in_bit = (input_byte & (1 << src_shift)) >> src_shift;
                        */

                        int res = xnor_bit1(in_bit, w_bit);
                        sum += res;
                        good_val++;

                        //sum += input[input_index] *weights[weights_index];

                    }
                }
            }
            // l.output[filters][width][height] +=
            //        state.input[channels][width][height] *
            //        l.weights[filters][channels][filter_width][filter_height];
            //output[output_index] += sum;
        }
        sum = sum - (good_val - sum);
        //output[output_index] = sum * mean_arr_gpu[fil]; // atoimcAdd for inter-BLOCK sum
        atomicAdd(&output[output_index], sum * mean_arr_gpu[fil]);
    }

}

void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n,
    int size, int pad, int new_lda, float *mean_arr_gpu)
{
    int array_size = in_w*in_h*n;    // width X height X filters
    const int num_blocks = array_size / BLOCK + 1;
    //printf("\n array_size = %d, num_blocks = %d, w = %d, h = %d, n = %d, c = %d, pad = %d \n", array_size, num_blocks, in_w, in_h, n, in_c, pad);

    convolve_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (input, weights, output, in_w, in_h, in_c, n, size, pad, new_lda, mean_arr_gpu);
    CHECK_CUDA(cudaPeekAtLastError());
}

// --------------------------------

// CUDA: use 512 threads per block
const int CAFFE_CUDA_NUM_THREADS = 512;

// CUDA: number of blocks for threads.
inline int CAFFE_GET_BLOCKS(const int N) {
    return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}

// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)

// https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
__global__ void im2col_gpu_kernel_ext(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    float* data_col) {
    CUDA_KERNEL_LOOP(index, n) {
        const int h_index = index / width_col;
        const int h_col = h_index % height_col;
        const int w_col = index % width_col;
        const int c_im = h_index / height_col;
        const int c_col = c_im * kernel_h * kernel_w;
        const int h_offset = h_col * stride_h - pad_h;
        const int w_offset = w_col * stride_w - pad_w;
        float* data_col_ptr = data_col;
        data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
        const float* data_im_ptr = data_im;
        data_im_ptr += (c_im * height + h_offset) * width + w_offset;
        for (int i = 0; i < kernel_h; ++i) {
            for (int j = 0; j < kernel_w; ++j) {
                int h_im = h_offset + i * dilation_h;
                int w_im = w_offset + j * dilation_w;
                *data_col_ptr =
                    (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
                    data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
                data_col_ptr += height_col * width_col;
            }
        }
    }
}


void im2col_gpu_ext(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    float* data_col)
{
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad_h -
        (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    int width_col = (width + 2 * pad_w -
        (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
    int num_kernels = channels * height_col * width_col;
    // NOLINT_NEXT_LINE(whitespace/operators)
    im2col_gpu_kernel_ext << <CAFFE_GET_BLOCKS(num_kernels),
        CAFFE_CUDA_NUM_THREADS >> >(
            num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
            pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
            width_col, data_col);

    CHECK_CUDA(cudaPeekAtLastError());
}