From 73a4f28d0b7a825da107079a5722563108ebac1e Mon Sep 17 00:00:00 2001
From: Andrey Kalinin <andrey.kalinin@intel.com>
Date: Thu, 21 Dec 2017 12:15:52 -0800
Subject: [PATCH] style: consistent use of trailing underscore in jit_1x1_conv_conf_t

---
 src/cpu/jit_avx512_common_1x1_conv_kernel.cpp | 30 ++++-----
 src/cpu/jit_avx512_common_1x1_convolution.cpp | 62 +++++++++----------
 src/cpu/jit_primitive_conf.hpp                |  2 +-
 3 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp b/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp
index 676b311ce19..f56e73f544f 100644
--- a/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp
+++ b/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp
@@ -1132,16 +1132,16 @@ void jit_avx512_common_1x1_conv_kernel::balance(jit_1x1_conv_conf_t &jcp,
 {
     if (nthreads < jcp.ngroups) {
         /* simplification... fortunately it doesn't hurt much */
-        jcp.nthr_ = jcp.nthr_mb_ = jcp.nthr_g_ =
-            jcp.nthr_oc_b_ = jcp.nthr_ic_b_ = 1;
+        jcp.nthr = jcp.nthr_mb = jcp.nthr_g =
+            jcp.nthr_oc_b = jcp.nthr_ic_b = 1;
         return;
     }
 
     const int nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block);
     const int nb_load = div_up(jcp.load_dim, jcp.load_block);
     const int nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block);
 
-    jcp.nthr_g_ = jcp.ngroups;
-    const int nthr = nthreads / jcp.nthr_g_;
+    jcp.nthr_g = jcp.ngroups;
+    const int nthr = nthreads / jcp.nthr_g;
     auto calc_mem_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) {
         /* calculate per thread memory cost (read/write). high level
@@ -1162,14 +1162,14 @@ void jit_avx512_common_1x1_conv_kernel::balance(jit_1x1_conv_conf_t &jcp,
         }
         return 0
             + bcast_koeff * div_up(jcp.mb * nb_reduce, nthr_mb)
-            * div_up(jcp.ngroups, jcp.nthr_g_)
+            * div_up(jcp.ngroups, jcp.nthr_g)
             * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block * jcp.reduce_block
             / jcp.stride_h / jcp.stride_w /* (n1) */
             + load_koeff * div_up(jcp.mb * nb_reduce, nthr_mb)
-            * div_up(jcp.ngroups, jcp.nthr_g_)
+            * div_up(jcp.ngroups, jcp.nthr_g)
             * div_up(nb_load, nthr_oc_b) * jcp.oc_block * jcp.reduce_block
             + output_koeff /* (n2) */
-            * div_up(jcp.ngroups, jcp.nthr_g_) * div_up(nb_load, nthr_oc_b)
+            * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b)
             * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block * jcp.oc_block;
     };
 
@@ -1184,22 +1184,22 @@
         const int nthr_oc_b_max = nstl::min(nthr_par, nb_load);
         for (nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) {
             nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, nb_bcast);
-            if (nthr_mb * jcp.nthr_g_ * nthr_oc_b * nthr_ic_b < nthreads)
+            if (nthr_mb * jcp.nthr_g * nthr_oc_b * nthr_ic_b < nthreads)
                 continue;
             int mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b);
             if (mem_cost <= best_mem_cost) {
                 best_mem_cost = mem_cost;
-                jcp.nthr_mb_ = nthr_mb;
-                jcp.nthr_oc_b_ = nthr_oc_b;
-                jcp.nthr_ic_b_ = nthr_ic_b;
+                jcp.nthr_mb = nthr_mb;
+                jcp.nthr_oc_b = nthr_oc_b;
+                jcp.nthr_ic_b = nthr_ic_b;
             }
         }
     }
 
-    if (jcp.nthr_mb_ > nthreads / 2 && jcp.nthr_mb_ < nthreads)
-        jcp.nthr_mb_ = nstl::min(jcp.mb, nthreads);
+    if (jcp.nthr_mb > nthreads / 2 && jcp.nthr_mb < nthreads)
+        jcp.nthr_mb = nstl::min(jcp.mb, nthreads);
 
-    jcp.nthr_ = jcp.nthr_mb_ * jcp.nthr_g_ * jcp.nthr_oc_b_ * jcp.nthr_ic_b_;
-    assert(jcp.nthr_ <= nthreads);
+    jcp.nthr = jcp.nthr_mb * jcp.nthr_g * jcp.nthr_oc_b * jcp.nthr_ic_b;
+    assert(jcp.nthr <= nthreads);
 }
 }
diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp
index dfd4e804cab..a29c9f1779b 100644
--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp
+++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp
@@ -417,24 +417,24 @@ jit_avx512_common_1x1_convolution_bwd_weights_t ::
     const auto &jcp = kernel_->jcp;
 
     bctx_ = (simple_barrier::ctx_t *)malloc(
-            jcp.nthr_ * sizeof(simple_barrier::ctx_t), 64);
-    for (int i = 0; i < jcp.nthr_; ++i)
+            jcp.nthr * sizeof(simple_barrier::ctx_t), 64);
+    for (int i = 0; i < jcp.nthr; ++i)
         simple_barrier::ctx_init(&bctx_[i]);
 
     const int wei_size = jcp.ngroups * jcp.oc * jcp.ic;
     ws_reduction_ =
-        (data_t *)malloc((jcp.nthr_mb_ - 1) * wei_size * sizeof(data_t), 64);
+        (data_t *)malloc((jcp.nthr_mb - 1) * wei_size * sizeof(data_t), 64);
     acc_ker_ = new cpu_accumulator_1d_t<data_type::f32>();
     if (conf_.with_bias()) {
-        const size_t max_buffer_size = jcp.nthr_ * 3 * 5 * 5 * 16 * 16;
+        const size_t max_buffer_size = jcp.nthr * 3 * 5 * 5 * 16 * 16;
         reducer_bias_ = new cpu_reducer_t<data_type::f32>(
-                reduce_balancer_t(jcp.nthr_, jcp.oc_block,
+                reduce_balancer_t(jcp.nthr, jcp.oc_block,
                     jcp.ngroups * jcp.nb_load, jcp.mb, max_buffer_size));
     }
 
     if (jcp.transpose_src) {
         const size_t tr_src_size =
-            jcp.nthr_mb_ * jcp.ngroups * jcp.ic * jcp.tr_is;
+            jcp.nthr_mb * jcp.ngroups * jcp.ic * jcp.tr_is;
         tr_src_ = (data_t *)malloc(tr_src_size * sizeof(data_t), 64);
 #       pragma omp parallel for
         for (size_t i = 0; i < tr_src_size; i++)
@@ -544,26 +544,26 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
     };
 
     auto ker = [&](const int ithr, const int nthr) {
-        const int ithr_ic_b = ithr % jcp.nthr_ic_b_;
-        const int ithr_oc_b = ithr / jcp.nthr_ic_b_ % jcp.nthr_oc_b_;
-        const int ithr_g = ithr / jcp.nthr_ic_b_ / jcp.nthr_oc_b_ % jcp.nthr_g_;
-        const int ithr_mb = ithr / jcp.nthr_ic_b_ / jcp.nthr_oc_b_ /
-            jcp.nthr_g_;
+        const int ithr_ic_b = ithr % jcp.nthr_ic_b;
+        const int ithr_oc_b = ithr / jcp.nthr_ic_b % jcp.nthr_oc_b;
+        const int ithr_g = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b % jcp.nthr_g;
+        const int ithr_mb = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b /
+            jcp.nthr_g;
 
         const int ithr_but_oc
-            = (ithr_mb * jcp.nthr_g_ + ithr_g) * jcp.nthr_ic_b_ + ithr_ic_b;
+            = (ithr_mb * jcp.nthr_g + ithr_g) * jcp.nthr_ic_b + ithr_ic_b;
 
         /* reduction dimension */
        int mb_sp_b_start{ 0 }, mb_sp_b_end{ 0 };
-        if (jcp.transpose_src && jcp.nthr_mb_ < jcp.mb / 2) {
+        if (jcp.transpose_src && jcp.nthr_mb < jcp.mb / 2) {
             // it's preferable to parallelize by mb if possible
             int img_start{ 0 }, img_end{ 0 };
-            balance211(jcp.mb, jcp.nthr_mb_, ithr_mb, img_start, img_end);
+            balance211(jcp.mb, jcp.nthr_mb, ithr_mb, img_start, img_end);
             mb_sp_b_start = img_start * sp_nb;
             mb_sp_b_end = img_end * sp_nb;
         } else {
-            balance211(mb_sp_work, jcp.nthr_mb_, ithr_mb, mb_sp_b_start,
+            balance211(mb_sp_work, jcp.nthr_mb, ithr_mb, mb_sp_b_start,
                 mb_sp_b_end);
         }
 
@@ -571,10 +571,10 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
 
         int g_start{ 0 }, oc_b_start{ 0 }, ic_b_start{ 0 };
         int g_end{ 0 }, oc_b_end{ 0 }, ic_b_end{ 0 };
-        balance211(jcp.ngroups, jcp.nthr_g_, ithr_g, g_start, g_end);
-        balance211(jcp.nb_load, jcp.nthr_oc_b_, ithr_oc_b, oc_b_start,
+        balance211(jcp.ngroups, jcp.nthr_g, ithr_g, g_start, g_end);
+        balance211(jcp.nb_load, jcp.nthr_oc_b, ithr_oc_b, oc_b_start,
                 oc_b_end);
-        balance211(jcp.nb_bcast, jcp.nthr_ic_b_, ithr_ic_b, ic_b_start,
+        balance211(jcp.nb_bcast, jcp.nthr_ic_b, ithr_ic_b, ic_b_start,
                 ic_b_end);
 
         const int g_work = g_end - g_start;
@@ -602,17 +602,17 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
                 bcast_step = step(nb_ic_blocking, ic_b_end - ic_b,
                         jcp.nb_bcast_blocking_max);
                 if (jcp.transpose_src) {
-                    if (jcp.nthr_oc_b_ > 1)
+                    if (jcp.nthr_oc_b > 1)
                         simple_barrier::barrier(
-                                &bctx_[ithr_but_oc], jcp.nthr_oc_b_);
+                                &bctx_[ithr_but_oc], jcp.nthr_oc_b);
 
                     const int sp_size = nstl::min(sp_b_step * jcp.reduce_block,
                             jcp.is - sp_b * jcp.reduce_block);
                     uker_trans(ithr_mb, img, sp_b, sp_size, g, 1, ic_b,
-                            bcast_step, ithr_oc_b, jcp.nthr_oc_b_, ic_b_start);
-                    if (jcp.nthr_oc_b_ > 1)
+                            bcast_step, ithr_oc_b, jcp.nthr_oc_b, ic_b_start);
+                    if (jcp.nthr_oc_b > 1)
                         simple_barrier::barrier(
-                                &bctx_[ithr_but_oc], jcp.nthr_oc_b_);
+                                &bctx_[ithr_but_oc], jcp.nthr_oc_b);
                 }
 
                 for (int oc_b = oc_b_start; oc_b < oc_b_end;
@@ -687,15 +687,15 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
         }
 
         /* diff_weights[:] += sum(ws_reduction_[thr_mb][:]) */
-        if (jcp.nthr_mb_ > 1) {
-            simple_barrier::barrier(&reduction_barrier, jcp.nthr_);
+        if (jcp.nthr_mb > 1) {
+            simple_barrier::barrier(&reduction_barrier, jcp.nthr);
 
             const int work = g_work * oc_b_work * ic_b_work;
             int start{ 0 }, end{ 0 };
-            balance211(work, jcp.nthr_mb_, ithr_mb, start, end);
+            balance211(work, jcp.nthr_mb, ithr_mb, start, end);
             if (start == end) return;
 
-            for (int thr_mb = 1; thr_mb < jcp.nthr_mb_; ++thr_mb) {
+            for (int thr_mb = 1; thr_mb < jcp.nthr_mb; ++thr_mb) {
                 int w = start;
                 int sub_g_start{ 0 }, sub_oc_b_start{ 0 },
                         sub_ic_b_start{ 0 };
@@ -771,13 +771,13 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
         rb->reduce(ithr, diff_bias);
     };
 
-#pragma omp parallel num_threads(jcp.nthr_)
+#pragma omp parallel num_threads(jcp.nthr)
     {
         int ithr = omp_get_thread_num();
-        assert(jcp.nthr_ == omp_get_num_threads());
-        ker(ithr, jcp.nthr_);
+        assert(jcp.nthr == omp_get_num_threads());
+        ker(ithr, jcp.nthr);
         if (conf_.with_bias())
-            ker_bias(ithr, jcp.nthr_);
+            ker_bias(ithr, jcp.nthr);
     }
 }
 
diff --git a/src/cpu/jit_primitive_conf.hpp b/src/cpu/jit_primitive_conf.hpp
index eb66cb9563a..7d3a9802b2d 100644
--- a/src/cpu/jit_primitive_conf.hpp
+++ b/src/cpu/jit_primitive_conf.hpp
@@ -227,7 +227,7 @@ struct jit_1x1_conv_conf_t {
     /* 4fma */
     bool transpose_src;
     int tr_is;
-    int nthr_, nthr_mb_, nthr_g_, nthr_oc_b_, nthr_ic_b_;
+    int nthr, nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b;
 };
 
 struct jit_gemm_conv_conf_t {