Zero point addition after rounding in quantization routines (pytorch#362)

Summary:
Pull Request resolved: pytorch#362

We want to add the zero point after rounding to unify numerics with PyTorch quantization. However, we also keep the original version to maintain backward compatibility for C2.

Reviewed By: jspark1105

Differential Revision: D21188721

fbshipit-source-id: daaefabd7eafb39ca99eb2d9d90a4db7a8c26c32
dskhudia authored and facebook-github-bot committed Apr 23, 2020
1 parent be54f47 commit 91063b0
Showing 4 changed files with 128 additions and 100 deletions.
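
For context, the two variants only disagree when src / scale lands exactly halfway between two integers, because std::nearbyint rounds ties to even under the default rounding mode. A minimal standalone sketch of the difference (not part of this patch; the values are chosen purely for illustration):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const float src = 1.25f;
  const float scale = 0.5f;
  const int zero_point = 1;
  const float inv_scale = 1.0f / scale;

  // Legacy (C2) order: add zero_point, then round.
  // 1.25 * 2 + 1 = 3.5, and nearbyint(3.5) rounds to the even value 4.
  const float legacy = std::nearbyint(zero_point + src * inv_scale);

  // New order: round, then add zero_point.
  // nearbyint(1.25 * 2) = nearbyint(2.5) rounds to the even value 2, so 2 + 1 = 3.
  const float updated = zero_point + std::nearbyint(src * inv_scale);

  std::printf("legacy = %.0f, new = %.0f\n", legacy, updated);  // legacy = 4, new = 3
  return 0;
}
```
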
26 changes: 19 additions & 7 deletions include/fbgemm/QuantUtils.h
@@ -48,7 +48,7 @@ T2 clamp(T1 src, int precision, bool is_signed = false) {

/// Quantize src using zero_point and scale, clamp to the specified precision,
/// and convert it to type T
template <typename T>
template <typename T, bool LEGACY = true>
T Quantize(
float src,
std::int32_t zero_point,
@@ -65,20 +65,32 @@ T Quantize(
// transformed_val is 127.499992 for src / scale.
// Eventually 127.5 gets rounded to 128 while 127.499992 gets rounded to 127.
float inv_scale = 1.0f / scale;
const float transformed_val = zero_point + src * inv_scale;

float transformed_val = src * inv_scale;
// nearbyint here performs round-to-nearest-ties-to-even with
// default rounding mode.
// For example, nearbyint(1.4) is 1.0, nearbyint(1.5) is 2.0
// and nearbyint(2.5) is 2.0
// Adding zero_point before or after rounding can make a difference
// in exactly halfway cases.
if (LEGACY) {
transformed_val = std::nearbyint(zero_point + transformed_val);
} else {
transformed_val = zero_point + std::nearbyint(transformed_val);
}
// Please note the use of double. Unlike float, a double can represent
// all int32 values exactly. Using a float results in a float value >
// INT32_MAX conversion to int32 in clamp function and hence an UBSAN error.
return clamp<double, T>(
std::nearbyint(transformed_val), result_precision, result_is_signed);
return clamp<double, T>(transformed_val, result_precision, result_is_signed);
}

template <typename T>
template <typename T, bool LEGACY = true>
T Quantize(float src, const TensorQuantizationParams& qparams) {
return Quantize<T>(src, qparams.zero_point, qparams.scale, qparams.precision);
return Quantize<T, LEGACY>(
src, qparams.zero_point, qparams.scale, qparams.precision);
}

template <typename T>
template <typename T, bool LEGACY = true>
FBGEMM_API void Quantize(
const float* src,
T* dst,
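
The LEGACY template parameter introduced above defaults to true, so existing callers keep the old numerics. A hypothetical usage sketch of the single-value overload (not from this patch, assuming the fbgemm namespace and the TensorQuantizationParams fields used in the diff):

```cpp
#include <cstdint>
#include "fbgemm/QuantUtils.h"

void example() {
  fbgemm::TensorQuantizationParams qparams;
  qparams.scale = 0.5f;
  qparams.zero_point = 1;
  qparams.precision = 8;

  // Default template argument (LEGACY = true): zero_point is added before rounding.
  std::uint8_t q_old = fbgemm::Quantize<std::uint8_t>(1.25f, qparams);

  // Opt into the unified numerics: zero_point is added after rounding.
  std::uint8_t q_new = fbgemm::Quantize<std::uint8_t, /*LEGACY=*/false>(1.25f, qparams);

  (void)q_old;
  (void)q_new;
}
```
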
2 changes: 1 addition & 1 deletion include/fbgemm/QuantUtilsAvx2.h
@@ -40,7 +40,7 @@ struct FBGEMM_API RequantizationParams {
////////////////////////////////////////////////////////////////////////////////
// Utility functions

template <typename T = std::uint8_t>
template <typename T = std::uint8_t, bool LEGACY = true>
void QuantizeAvx2(
const float* src,
T* dst,
64 changes: 35 additions & 29 deletions src/QuantUtils.cc
@@ -161,9 +161,9 @@ void ChooseRequantizationMultiplier(
////////////////////////////////////////////////////////////////////////////////
// Utility functions

#define FBGEMM_SPECIALIZED_QUANTIZE(T) \
#define FBGEMM_SPECIALIZED_QUANTIZE(T, LEGACY) \
template <> \
FBGEMM_API void Quantize<T>( \
FBGEMM_API void Quantize<T, LEGACY>( \
const float* src, \
T* dst, \
const int len, \
@@ -173,39 +173,45 @@ void ChooseRequantizationMultiplier(
int i_begin, i_end; \
fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end); \
for (int i = i_begin; i < i_end; ++i) { \
dst[i] = Quantize<T>(src[i], qparams); \
dst[i] = Quantize<T, LEGACY>(src[i], qparams); \
} \
}
FBGEMM_SPECIALIZED_QUANTIZE(uint16_t)
FBGEMM_SPECIALIZED_QUANTIZE(int16_t)
FBGEMM_SPECIALIZED_QUANTIZE(int32_t)
FBGEMM_SPECIALIZED_QUANTIZE(uint16_t, true)
FBGEMM_SPECIALIZED_QUANTIZE(int16_t, true)
FBGEMM_SPECIALIZED_QUANTIZE(int32_t, true)
FBGEMM_SPECIALIZED_QUANTIZE(uint16_t, false)
FBGEMM_SPECIALIZED_QUANTIZE(int16_t, false)
FBGEMM_SPECIALIZED_QUANTIZE(int32_t, false)
#undef FBGEMM_SPECIALIZED_QUANTIZE

#define FBGEMM_SPECIALIZED_QUANTIZE_AVX2(T) \
template <> \
FBGEMM_API void Quantize<T>( \
const float* src, \
T* dst, \
int len, \
const TensorQuantizationParams& qparams, \
int thread_id, \
int num_threads) { \
bool avx2_support = cpuinfo_initialize() && fbgemmHasAvx2Support(); \
bool fma_support = cpuinfo_has_x86_fma3(); \
int i_begin, i_end; \
fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end); \
if (avx2_support && fma_support && qparams.precision == 8) { \
/* fast path */ \
QuantizeAvx2<T>(&src[i_begin], &dst[i_begin], i_end - i_begin, qparams); \
} else { \
for (std::size_t i = i_begin; i < i_end; ++i) { \
dst[i] = Quantize<T>(src[i], qparams); \
} \
} \
#define FBGEMM_SPECIALIZED_QUANTIZE_AVX2(T, LEGACY) \
template <> \
FBGEMM_API void Quantize<T, LEGACY>( \
const float* src, \
T* dst, \
int len, \
const TensorQuantizationParams& qparams, \
int thread_id, \
int num_threads) { \
bool avx2_support = cpuinfo_initialize() && fbgemmHasAvx2Support(); \
bool fma_support = cpuinfo_has_x86_fma3(); \
int i_begin, i_end; \
fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end); \
if (avx2_support && fma_support && qparams.precision == 8) { \
/* fast path */ \
QuantizeAvx2<T, LEGACY>( \
&src[i_begin], &dst[i_begin], i_end - i_begin, qparams); \
} else { \
for (std::size_t i = i_begin; i < i_end; ++i) { \
dst[i] = Quantize<T, LEGACY>(src[i], qparams); \
} \
} \
}

FBGEMM_SPECIALIZED_QUANTIZE_AVX2(int8_t)
FBGEMM_SPECIALIZED_QUANTIZE_AVX2(uint8_t)
FBGEMM_SPECIALIZED_QUANTIZE_AVX2(int8_t, true)
FBGEMM_SPECIALIZED_QUANTIZE_AVX2(uint8_t, true)
FBGEMM_SPECIALIZED_QUANTIZE_AVX2(int8_t, false)
FBGEMM_SPECIALIZED_QUANTIZE_AVX2(uint8_t, false)
#undef FBGEMM_SPECIALIZED_QUANTIZE_AVX2

#define FBGEMM_SPECIALIZED_QUANTIZEGROUPWISEKCX(T) \
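
The specializations above cover both values of LEGACY for each element type, with the 8-bit variants dispatching to the AVX2 kernel when the CPU supports it. A hypothetical sketch of a caller of the buffer-level entry point (not from this patch; single-threaded arguments assumed):

```cpp
#include <cstdint>
#include <vector>
#include "fbgemm/QuantUtils.h"

void quantize_buffer(const std::vector<float>& src,
                     std::vector<std::uint8_t>& dst,
                     const fbgemm::TensorQuantizationParams& qparams) {
  dst.resize(src.size());
  // thread_id = 0, num_threads = 1: quantize the whole range on this thread.
  // Passing LEGACY = false selects the zero-point-after-rounding numerics.
  fbgemm::Quantize<std::uint8_t, /*LEGACY=*/false>(
      src.data(), dst.data(), static_cast<int>(src.size()), qparams,
      /*thread_id=*/0, /*num_threads=*/1);
}
```
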
136 changes: 73 additions & 63 deletions src/QuantUtilsAvx2.cc
@@ -20,68 +20,65 @@ using namespace std;
// Utility functions

// ASAN seems to have a false-positive for _mm_maskmoveu_si128
template <typename T>
template <typename T, bool LEGACY>
void NO_SANITIZE("address") QuantizeAvx2(
const float* src,
T* dst,
int len,
const TensorQuantizationParams& qparams) {
#if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER))
constexpr int VLEN = 8;
constexpr float min_val = std::numeric_limits<T>::min();
constexpr float max_val = std::numeric_limits<T>::max();
constexpr int32_t min_val = std::numeric_limits<T>::min();
constexpr int32_t max_val = std::numeric_limits<T>::max();
// This is the largest int32 value less than int32_max
// that is exactly representable in float
constexpr int32_t int32_float_max_val =
std::numeric_limits<int32_t>::max() - 127;
std::size_t i = 0;
float inverse_scale = 1.f / qparams.scale;
__m256 inverse_scale_v = _mm256_set1_ps(inverse_scale);
// clang-format off
__m256i shuffle_mask_v = _mm256_set_epi8(
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0x0c,
0x08,
0x04,
0x00,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0xff,
0x0c,
0x08,
0x04,
0x00);
0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff,
0x0c, 0x08, 0x04, 0x00,
0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff,
0x0c, 0x08, 0x04, 0x00);
// clang-format on
__m256i permute_mask_v =
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
for (; i < len / VLEN * VLEN; i += VLEN) {
__m256 src_v = _mm256_loadu_ps(src + i);
__m256 transformed_v = _mm256_fmadd_ps(
src_v, inverse_scale_v, _mm256_set1_ps(qparams.zero_point));
__m256 clipped_v = _mm256_min_ps(
_mm256_max_ps(transformed_v, _mm256_set1_ps(min_val)),
_mm256_set1_ps(max_val));
__m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
__m256 transformed_v;
if (LEGACY) { // static if
transformed_v = _mm256_fmadd_ps(
src_v, inverse_scale_v, _mm256_set1_ps(qparams.zero_point));
} else {
transformed_v = _mm256_mul_ps(src_v, inverse_scale_v);
}
// If the floating point value is greater than int32_max,
// _mm256_cvtps_epi32 converts them to negative. Clip at int32_float_max_val
// to avoid this.
transformed_v =
_mm256_min_ps(transformed_v, _mm256_set1_ps(int32_float_max_val));

__m256i rounded_v = _mm256_cvtps_epi32(transformed_v);
if (!LEGACY) {
rounded_v =
_mm256_add_epi32(rounded_v, _mm256_set1_epi32(qparams.zero_point));
}
__m256i clipped_v = _mm256_min_epi32(
_mm256_max_epi32(rounded_v, _mm256_set1_epi32(min_val)),
_mm256_set1_epi32(max_val));

// An instruction sequence to save 8 32-bit integers as 8 8-bit integers
rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
clipped_v = _mm256_shuffle_epi8(clipped_v, shuffle_mask_v);
clipped_v = _mm256_permutevar8x32_epi32(clipped_v, permute_mask_v);
_mm_storel_epi64(
reinterpret_cast<__m128i*>(dst + i), _mm256_castsi256_si128(rounded_v));
reinterpret_cast<__m128i*>(dst + i), _mm256_castsi256_si128(clipped_v));
}

// Handle remainder using mask instructions so that
@@ -93,36 +90,49 @@ void NO_SANITIZE("address") QuantizeAvx2(
__m128i store_mask_v = _mm_load_si128(
reinterpret_cast<const __m128i*>(internal::sse_epi8_masks[rem]));
__m256 src_v = _mm256_maskload_ps(src + i, mask_v);
__m256 transformed_v = _mm256_fmadd_ps(
src_v, inverse_scale_v, _mm256_set1_ps(qparams.zero_point));
__m256 clipped_v = _mm256_min_ps(
_mm256_max_ps(transformed_v, _mm256_set1_ps(min_val)),
_mm256_set1_ps(max_val));
__m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
__m256 transformed_v;
if (LEGACY) {
transformed_v = _mm256_fmadd_ps(
src_v, inverse_scale_v, _mm256_set1_ps(qparams.zero_point));
} else {
transformed_v = _mm256_mul_ps(src_v, inverse_scale_v);
}
transformed_v =
_mm256_min_ps(transformed_v, _mm256_set1_ps(int32_float_max_val));

__m256i rounded_v = _mm256_cvtps_epi32(transformed_v);
if (!LEGACY) {
rounded_v =
_mm256_add_epi32(rounded_v, _mm256_set1_epi32(qparams.zero_point));
}
__m256i clipped_v = _mm256_min_epi32(
_mm256_max_epi32(rounded_v, _mm256_set1_epi32(min_val)),
_mm256_set1_epi32(max_val));

// An instruction sequence to save "rem" number of 32-bit integers
// as "rem" number of 8-bit integers
rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
clipped_v = _mm256_shuffle_epi8(clipped_v, shuffle_mask_v);
clipped_v = _mm256_permutevar8x32_epi32(clipped_v, permute_mask_v);
_mm_maskmoveu_si128(
_mm256_castsi256_si128(rounded_v),
_mm256_castsi256_si128(clipped_v),
store_mask_v,
reinterpret_cast<char*>(dst + i));
}
#endif
}

// Instantiate QuantizeAvx2 for known datatypes
template void QuantizeAvx2<uint8_t>(
const float* src,
uint8_t* dst,
int len,
const TensorQuantizationParams& qparams);
template void QuantizeAvx2<int8_t>(
const float* src,
int8_t* dst,
int len,
const TensorQuantizationParams& qparams);
#define SPECIALIZE_QUANTIZEAVX2(T, LEGACY) \
template void QuantizeAvx2<T, LEGACY>( \
const float* src, \
T* dst, \
int len, \
const TensorQuantizationParams& qparams);
SPECIALIZE_QUANTIZEAVX2(uint8_t, true)
SPECIALIZE_QUANTIZEAVX2(int8_t, true)
SPECIALIZE_QUANTIZEAVX2(uint8_t, false)
SPECIALIZE_QUANTIZEAVX2(int8_t, false)
#undef SPECIALIZE_QUANTIZEAVX2

void FindMinMax(const float* a, float* min, float* max, int len) {
if (len <= 0) {
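
The overflow guard before _mm256_cvtps_epi32 and the integer clamp are easier to follow in scalar form. Below is a rough per-element equivalent of the new (LEGACY == false) path for a uint8 target; this is an illustrative sketch, not code from this patch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

std::uint8_t quantize_one_lane(float src, float scale, std::int32_t zero_point) {
  // Largest int32 value below INT32_MAX that is exactly representable as a
  // float; this mirrors the _mm256_min_ps guard, since _mm256_cvtps_epi32
  // maps out-of-range positive floats to negative int32 values.
  constexpr float int32_float_max_val =
      static_cast<float>(std::numeric_limits<std::int32_t>::max() - 127);

  float transformed = src * (1.0f / scale);                    // _mm256_mul_ps
  transformed = std::min(transformed, int32_float_max_val);    // _mm256_min_ps
  std::int32_t rounded =
      static_cast<std::int32_t>(std::nearbyint(transformed));  // _mm256_cvtps_epi32
  rounded += zero_point;                                       // _mm256_add_epi32
  // _mm256_max_epi32 / _mm256_min_epi32 clamp to the uint8 range [0, 255].
  rounded = std::max(rounded, std::int32_t{0});
  rounded = std::min(rounded, std::int32_t{255});
  return static_cast<std::uint8_t>(rounded);
}
```
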
