diff --git a/src/FbgemmI8Depthwise2DAvx2-inl.h b/src/FbgemmI8Depthwise2DAvx2-inl.h index 1dda5556a5..3a9bc76d4c 100644 --- a/src/FbgemmI8Depthwise2DAvx2-inl.h +++ b/src/FbgemmI8Depthwise2DAvx2-inl.h @@ -9,6 +9,7 @@ #include // for tie #include "src/FbgemmI8DepthwiseAvx2-inl.h" +#include "src/MaskAvx2.h" namespace fbgemm { @@ -44,8 +45,8 @@ static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( _mm256_set1_epi8(static_cast(A_zero_point)); __m256i mask_v = _mm256_setzero_si256(); if (REMAINDER) { - mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4])); + mask_v = _mm256_load_si256(reinterpret_cast( + internal::avx2_ps_or_epi32_masks[remainder / 4])); } // The code below can be written as a simple R*S loop but the compiler @@ -156,8 +157,8 @@ static inline __attribute__((always_inline)) void inner_prod_5x5_packed_( _mm256_set1_epi8(static_cast(A_zero_point)); __m256i mask_v = _mm256_setzero_si256(); if (REMAINDER) { - mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4])); + mask_v = _mm256_load_si256(reinterpret_cast( + internal::avx2_ps_or_epi32_masks[remainder / 4])); } // The code below can be written as a simple R*S loop but the compiler @@ -1600,24 +1601,24 @@ FBGEMM_API void depthwise_3x3_pad_1( template FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int32_t* B_zero_point, - const PackedDepthWiseConvMatrix& Bp, - const float* C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const BIAS_TYPE* bias, - bool fuse_relu = false, - const float* act_times_w_scale = nullptr, - int thread_id = 0, - int num_threads = 1); + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + const float* act_times_w_scale = nullptr, + int thread_id = 0, + int num_threads = 1); } // namespace fbgemm diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc index 0851722056..04fd5d1503 100644 --- a/src/FbgemmI8Depthwise3DAvx2.cc +++ b/src/FbgemmI8Depthwise3DAvx2.cc @@ -10,6 +10,7 @@ #include // for tie #include "FbgemmI8DepthwiseAvx2-inl.h" +#include "MaskAvx2.h" using namespace std; @@ -37,8 +38,8 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( __m256i A_zero_point_v = _mm256_set1_epi8(static_cast(A_zero_point)); __m256i mask_v = _mm256_setzero_si256(); if (REMAINDER) { - mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4])); + mask_v = _mm256_load_si256(reinterpret_cast( + internal::avx2_ps_or_epi32_masks[remainder / 4])); } // The code below can be written as a simple R*S loop but the compiler diff --git a/src/FbgemmI8DepthwiseAvx2-inl.h b/src/FbgemmI8DepthwiseAvx2-inl.h index 7ad39fc6d9..cf6f259a7c 100644 --- a/src/FbgemmI8DepthwiseAvx2-inl.h +++ b/src/FbgemmI8DepthwiseAvx2-inl.h @@ -16,21 +16,6 @@ namespace fbgemm { -// clang-format off -static int masks[8][8] = { - // NOTE: clang-format wants to use a different formatting but the current - // formatting should be easier to read. - { 0, 0, 0, 0, 0, 0, 0, 0, }, - { -1, 0, 0, 0, 0, 0, 0, 0, }, - { -1, -1, 0, 0, 0, 0, 0, 0, }, - { -1, -1, -1, 0, 0, 0, 0, 0, }, - { -1, -1, -1, -1, 0, 0, 0, 0, }, - { -1, -1, -1, -1, -1, 0, 0, 0, }, - { -1, -1, -1, -1, -1, -1, 0, 0, }, - { -1, -1, -1, -1, -1, -1, -1, 0, }, -}; -// clang-format on - // c = a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3 // A is in uint8_t // B is in int8_t and pre-interleaved diff --git a/src/MaskAvx2.h b/src/MaskAvx2.h new file mode 100644 index 0000000000..86e6a5cf2f --- /dev/null +++ b/src/MaskAvx2.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +namespace fbgemm { + +namespace internal { + +// A constant array to initialize an AVX2 register to be used as a 32-bit +// granularity mask. +// clang-format off +alignas(64) static const int avx2_ps_or_epi32_masks[8][8] = { + // NOTE: clang-format wants to use a different formatting but the current + // formatting should be easier to read. + { 0, 0, 0, 0, 0, 0, 0, 0, }, + { -1, 0, 0, 0, 0, 0, 0, 0, }, + { -1, -1, 0, 0, 0, 0, 0, 0, }, + { -1, -1, -1, 0, 0, 0, 0, 0, }, + { -1, -1, -1, -1, 0, 0, 0, 0, }, + { -1, -1, -1, -1, -1, 0, 0, 0, }, + { -1, -1, -1, -1, -1, -1, 0, 0, }, + { -1, -1, -1, -1, -1, -1, -1, 0, }, +}; +// clang-format on + +} // namespace internal + +} // namespace fbgemm diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index 0e17bcd491..651871aed9 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -8,25 +8,12 @@ #include +#include "MaskAvx2.h" + using namespace std; namespace fbgemm { -// clang-format off -static int masks[8][8] = { - // NOTE: clang-format wants to use a different formatting but the current - // formatting should be easier to read. - { 0, 0, 0, 0, 0, 0, 0, 0, }, - { -1, 0, 0, 0, 0, 0, 0, 0, }, - { -1, -1, 0, 0, 0, 0, 0, 0, }, - { -1, -1, -1, 0, 0, 0, 0, 0, }, - { -1, -1, -1, -1, 0, 0, 0, 0, }, - { -1, -1, -1, -1, -1, 0, 0, 0, }, - { -1, -1, -1, -1, -1, -1, 0, 0, }, - { -1, -1, -1, -1, -1, -1, -1, 0, }, -}; -// clang-format on - PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( int K, int kernel_prod, @@ -105,8 +92,8 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( __m256i b_v[kernel_prod]; int remainder = K - k1; if (remainder < 32) { - __m256i mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4])); + __m256i mask_v = _mm256_load_si256(reinterpret_cast( + internal::avx2_ps_or_epi32_masks[remainder / 4])); for (int i = 0; i < kernel_prod; ++i) { b_v[i] = _mm256_maskload_epi32( reinterpret_cast(smat_transposed + i * K + k1), mask_v); diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index c50f6d9c2d..8810e9bc1b 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -10,6 +10,7 @@ #include //for std::min/std::max #include //for nearbyint #include //for numeric_limits +#include "MaskAvx2.h" #include "fbgemm/Fbgemm.h" //for ReQuantizeOutput namespace fbgemm { @@ -99,20 +100,17 @@ void QuantizeAvx2( } // Instantiate QuantizeAvx2 for known datatypes -template -void QuantizeAvx2( +template void QuantizeAvx2( const float* src, uint8_t* dst, int len, const TensorQuantizationParams& qparams); -template -void QuantizeAvx2( +template void QuantizeAvx2( const float* src, int8_t* dst, int len, const TensorQuantizationParams& qparams); - void FindMinMax(const float* a, float* min, float* max, int len) { if (len <= 0) { *min = 0.0f; @@ -160,7 +158,7 @@ void RequantizeAvx2( int len, const RequantizationParams& params) { DoNothing<> doNothingObj{}; - int32_t Bq_zero_point[] = { 0 }; + int32_t Bq_zero_point[] = {0}; ReQuantizeOutput requantizeObj( doNothingObj, ¶ms.real_multiplier, @@ -670,29 +668,14 @@ void requantizeOutputProcessingAvx2( int remainder = block.col_start + block.col_size - j; if (remainder > 0) { - // clang-format off - alignas(64) const int masks[8][8] = { - // NOTE: clang-format wants to use a different formatting but the - // current formatting should be easier to read. - { 0, 0, 0, 0, 0, 0, 0, 0, }, - { -1, 0, 0, 0, 0, 0, 0, 0, }, - { -1, -1, 0, 0, 0, 0, 0, 0, }, - { -1, -1, -1, 0, 0, 0, 0, 0, }, - { -1, -1, -1, -1, 0, 0, 0, 0, }, - { -1, -1, -1, -1, -1, 0, 0, 0, }, - { -1, -1, -1, -1, -1, -1, 0, 0, }, - { -1, -1, -1, -1, -1, -1, -1, 0, }, - }; - // clang-format on - __m256i mask_v = _mm256_load_si256( - reinterpret_cast(masks[remainder])); + __m256i mask_v = _mm256_load_si256(reinterpret_cast( + internal::avx2_ps_or_epi32_masks[remainder])); __m256i x_v = _mm256_maskload_epi32( - inp + (i - block.row_start) * ld_in + (j - block.col_start), - mask_v); + inp + (i - block.row_start) * ld_in + (j - block.col_start), mask_v); if (!A_SYMMETRIC) { - __m256i col_off_v = _mm256_mullo_epi32( + __m256i col_off_v = _mm256_mullo_epi32( A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v)); x_v = _mm256_sub_epi32(x_v, col_off_v); } @@ -880,29 +863,14 @@ void requantizeForFloatAvx2( int remainder = block.col_start + block.col_size - j; if (remainder > 0) { - // clang-format off - alignas(64) const int masks[8][8] = { - // NOTE: clang-format wants to use a different formatting but the - // current formatting should be easier to read. - { 0, 0, 0, 0, 0, 0, 0, 0, }, - { -1, 0, 0, 0, 0, 0, 0, 0, }, - { -1, -1, 0, 0, 0, 0, 0, 0, }, - { -1, -1, -1, 0, 0, 0, 0, 0, }, - { -1, -1, -1, -1, 0, 0, 0, 0, }, - { -1, -1, -1, -1, -1, 0, 0, 0, }, - { -1, -1, -1, -1, -1, -1, 0, 0, }, - { -1, -1, -1, -1, -1, -1, -1, 0, }, - }; - // clang-format on - __m256i mask_v = _mm256_load_si256( - reinterpret_cast(masks[remainder])); + __m256i mask_v = _mm256_load_si256(reinterpret_cast( + internal::avx2_ps_or_epi32_masks[remainder])); __m256i x_v = _mm256_maskload_epi32( - inp + (i - block.row_start) * ld_in + (j - block.col_start), - mask_v); + inp + (i - block.row_start) * ld_in + (j - block.col_start), mask_v); if (!A_SYMMETRIC) { - __m256i col_off_v = _mm256_mullo_epi32( + __m256i col_off_v = _mm256_mullo_epi32( A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v)); x_v = _mm256_sub_epi32(x_v, col_off_v); }