From 31905d2dd614f51d6e2d57ce74b2cba1745bec51 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 1 Jul 2021 08:14:10 -0700 Subject: [PATCH] Wextra pedantic fbgemm's (#642) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/642 Enable compilation flags to enforce a variety of safety measures Reviewed By: jianyuh Differential Revision: D29408318 fbshipit-source-id: 04ad39ae45fb9664aa1e05262f87b8f11e716e31 --- include/fbgemm/FbgemmFPCommon.h | 13 +++++------ include/fbgemm/FbgemmPackMatrixB.h | 4 ++-- include/fbgemm/Utils.h | 2 +- src/FbgemmI64.cc | 1 + src/FbgemmSparseDenseInt8Avx2.cc | 1 + src/GenerateKernelU8S8S32ACC16.cc | 6 ++++-- src/GenerateKernelU8S8S32ACC16Avx512.cc | 5 +++-- src/GenerateKernelU8S8S32ACC32.cc | 5 +++-- src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc | 5 +++-- src/QuantUtils.cc | 24 ++++++++++----------- src/QuantUtilsAvx2.cc | 23 ++++++++++---------- src/RowWiseSparseAdagradFused.cc | 6 +++--- src/SparseAdagrad.cc | 4 ++-- src/TransposeUtils.h | 7 ++++-- src/Utils.cc | 14 ++++++------ src/UtilsAvx512.cc | 8 +++---- 16 files changed, 70 insertions(+), 58 deletions(-) diff --git a/include/fbgemm/FbgemmFPCommon.h b/include/fbgemm/FbgemmFPCommon.h index 50625fc56c..83241e9859 100644 --- a/include/fbgemm/FbgemmFPCommon.h +++ b/include/fbgemm/FbgemmFPCommon.h @@ -111,7 +111,7 @@ void cblas_gemm_compute( i_end = m; for (auto m0 = i_begin; m0 < i_end; m0 += mb_max) { int mb = std::min(mb_max, i_end - m0); - assert(mb < partition.size()); + assert(mb < static_cast(partition.size())); for (auto k_ind = 0; k_ind < k; k_ind += Bp.blockRowSize()) { // set up proper accumulation to avoid "Nan" problem float beta_; @@ -128,13 +128,13 @@ void cblas_gemm_compute( auto m1 = m0; auto const num_cycles = partition[mb].size(); - for (auto c = 0; c < num_cycles; ++c) { + for (size_t c = 0; c < num_cycles; ++c) { auto kernel_nrows = partition[mb][c][0]; auto nkernel_nrows = partition[mb][c][1]; auto m_start = m1; auto m_end = m1 + kernel_nrows * nkernel_nrows; for (auto m2 = m_start; m2 < m_end; m2 += kernel_nrows) { - assert(kernel_nrows * kb < scratchpad->size()); + assert(kernel_nrows * kb < static_cast(scratchpad->size())); if (m != 1) { PackA(kernel_nrows, kb, &A[m2 * k + k_ind], k, scratchpad->data()); gp.A = scratchpad->data(); @@ -190,14 +190,15 @@ void cblas_gemm_compute( // use one thread to handle the fringe cases if (thread_id == num_threads - 1) { // leftover - int rem = n - last_blk_col; + const int rem = n - last_blk_col; + (void)rem; // Suppress unused variable warning assert(rem < Bp.blockColSize()); // small temporary buffer: the size should be larger than the // required kernel_nrow x kernel_ncols elements computed in the // registers. 
std::array c_tmp{0.f}; - assert(c_tmp.size() >= kernel_nrows * Bp.blockColSize()); + assert(static_cast(c_tmp.size()) >= kernel_nrows * Bp.blockColSize()); gp.B = &(Bp(k_ind, last_blk_col)); gp.C = c_tmp.data(); @@ -213,7 +214,7 @@ void cblas_gemm_compute( for (int j = last_blk_col; j < n; j++) { assert( i * Bp.blockColSize() + (j - last_blk_col) < - sizeof(c_tmp) / sizeof(c_tmp[0])); + static_cast(sizeof(c_tmp) / sizeof(c_tmp[0]))); if (beta_ == 0.f) { C[(m2 + i) * ldc + j] = c_tmp[i * Bp.blockColSize() + (j - last_blk_col)]; diff --git a/include/fbgemm/FbgemmPackMatrixB.h b/include/fbgemm/FbgemmPackMatrixB.h index 6e1c093607..c99852ba09 100644 --- a/include/fbgemm/FbgemmPackMatrixB.h +++ b/include/fbgemm/FbgemmPackMatrixB.h @@ -202,10 +202,10 @@ class PackedGemmMatrixB { } const T& operator()(const int r, const int c) const { - uint64_t a = addr(r, c); + const auto a = addr(r, c); assert(r < numRows()); assert(c < numCols()); - assert(a < this->matSize()); + assert(static_cast(a) < this->matSize()); return pmat_[a]; } diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index ec6115c02d..3ce19cc32d 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -129,7 +129,7 @@ FBGEMM_API int compare_buffers( int m, int n, int ld, - int max_mismatches_to_report, + size_t max_mismatches_to_report, float atol = 1e-3); /** diff --git a/src/FbgemmI64.cc b/src/FbgemmI64.cc index 474b6c0051..05a47e7132 100644 --- a/src/FbgemmI64.cc +++ b/src/FbgemmI64.cc @@ -155,6 +155,7 @@ CodeGenBase::getOrCreate( #endif const int maxMRegs = mRegBlockSize; + (void)maxMRegs; // Suppress unused variable warning const int maxNRegs = nRegBlockSize / vectorLen; assert( maxMRegs * maxNRegs <= 30 && diff --git a/src/FbgemmSparseDenseInt8Avx2.cc b/src/FbgemmSparseDenseInt8Avx2.cc index 4964d735a9..9a14b2d8b1 100644 --- a/src/FbgemmSparseDenseInt8Avx2.cc +++ b/src/FbgemmSparseDenseInt8Avx2.cc @@ -66,6 +66,7 @@ void SparseDenseInt8MMAvx2( constexpr int VLEN_INT8 = 32; constexpr int VLEN_INT32 = 8; constexpr int rowBlockSize = BCSRMatrix<>::RB; + (void)rowBlockSize; // Suppress unused variable warning constexpr int colBlockSize = BCSRMatrix<>::CB; constexpr int colTileSize = BCSRMatrix<>::COLTILE; diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc index 3b12f2276a..37cc1df1e6 100644 --- a/src/GenerateKernelU8S8S32ACC16.cc +++ b/src/GenerateKernelU8S8S32ACC16.cc @@ -160,8 +160,10 @@ getOrCreate( assert( kc % row_interleave == 0 && "kc must be a multiple of row_interleave"); assert(nc % nRegBlockSizeMin == 0 && "nc must be a multiple of NR_MIN"); - int maxMRegs = mRegBlockSize; - int maxNRegs = nRegBlockSize * row_interleave / vectorLen; + const int maxMRegs = mRegBlockSize; + const int maxNRegs = nRegBlockSize * row_interleave / vectorLen; + (void)maxMRegs; // Suppress unused variable warning + (void)maxNRegs; // Suppress unused variable warning assert( maxMRegs * maxNRegs <= 13 && "MR*(NR*ROW_INTERLEAVE*8/256" diff --git a/src/GenerateKernelU8S8S32ACC16Avx512.cc b/src/GenerateKernelU8S8S32ACC16Avx512.cc index 924cbbd1ad..580b6c608f 100644 --- a/src/GenerateKernelU8S8S32ACC16Avx512.cc +++ b/src/GenerateKernelU8S8S32ACC16Avx512.cc @@ -126,8 +126,9 @@ CodeGenBase::getOrCreate( assert( kc % row_interleave == 0 && "kc must be a multiple of row_interleave"); assert(nc % nRegBlockSizeMin == 0 && "nc must be a multiple of NR_MIN"); - int maxMRegs = mRegBlockSize; - int maxNRegs = nRegBlockSize * row_interleave / vectorLen; + const int maxMRegs = mRegBlockSize; + (void)maxMRegs; // 
Suppress unused variable warning + const int maxNRegs = nRegBlockSize * row_interleave / vectorLen; assert( (maxMRegs + 1) * maxNRegs <= 29 && "number of zmm registers for C + one row for loading B: \ diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc index 2601b01b0f..9bbb02f3d9 100644 --- a/src/GenerateKernelU8S8S32ACC32.cc +++ b/src/GenerateKernelU8S8S32ACC32.cc @@ -161,8 +161,9 @@ CodeGenBase::getOrCreate( assert( kc % row_interleave == 0 && "kc must be a multiple of row_interleave"); assert(nc % nRegBlockSizeMin == 0 && "nc must be a multiple of NR_MIN"); - int maxMRegs = mRegBlockSize; - int maxNRegs = nRegBlockSize * row_interleave / vectorLen; + const int maxMRegs = mRegBlockSize; + (void)maxMRegs; // Suppress unused variable warning + const int maxNRegs = nRegBlockSize * row_interleave / vectorLen; assert( maxMRegs * maxNRegs <= numRegs - 4 && "MRegs x NRegs is above available registers (MAX_REGS - 4)"); diff --git a/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc b/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc index 1ac6770fae..b521c8b36e 100644 --- a/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc +++ b/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc @@ -108,8 +108,9 @@ CodeGenBase::getOrCreate( assert( kc % row_interleave == 0 && "kc must be a multiple of row_interleave"); assert(nc % nRegBlockSizeMin == 0 && "nc must be a multiple of NR_MIN"); - int maxMRegs = mRegBlockSize; - int maxNRegs = nRegBlockSize * row_interleave / vectorLen; + const int maxMRegs = mRegBlockSize; + const int maxNRegs = nRegBlockSize * row_interleave / vectorLen; + (void)maxMRegs; // Suppress unused variable warning assert( maxMRegs * maxNRegs <= 30 && "MR*(NR*ROW_INTERLEAVE*8/512) \ diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index 6eff87976d..70a90e9a6d 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -230,7 +230,7 @@ FBGEMM_SPECIALIZED_QUANTIZE(int32_t, false) QuantizeAvx2( \ &src[i_begin], &dst[i_begin], i_end - i_begin, qparams); \ } else { \ - for (std::size_t i = i_begin; i < i_end; ++i) { \ + for (int i = i_begin; i < i_end; ++i) { \ dst[i] = Quantize(src[i], qparams); \ } \ } \ @@ -261,7 +261,7 @@ FBGEMM_SPECIALIZED_QUANTIZE_AVX2(uint8_t, false) FusedQuantizeDequantizeAvx2( \ &src[i_begin], &dst[i_begin], i_end - i_begin, qparams); \ } else if (noise_ratio <= 0.0f) { \ - for (std::size_t i = i_begin; i < i_end; ++i) { \ + for (int i = i_begin; i < i_end; ++i) { \ dst[i] = FusedQuantizeDequantize(src[i], qparams); \ } \ } else { \ @@ -510,7 +510,7 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( (input_columns + num_elem_per_byte - 1) / num_elem_per_byte + 2 * sizeof(float16); std::vector input_row_float(input_columns); - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const InputType* input_row = input + row * input_columns; std::uint8_t* output_row = output + row * output_columns; float16* output_row_scale_bias = reinterpret_cast( @@ -519,7 +519,7 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( // NOTE: this can be optimized, however we don't care much about performance // for reference implementation. 
- for (std::size_t col = 0; col < input_columns; ++col) { + for (int col = 0; col < input_columns; ++col) { if (std::is_same()) { input_row_float[col] = input_row[col]; } else { @@ -553,7 +553,7 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef( output_row_scale_bias[0] = cpu_float2half_rn(scale); output_row_scale_bias[1] = minimum_element_fp16; - for (std::size_t col = 0; col < input_columns; ++col) { + for (int col = 0; col < input_columns; ++col) { float X = input_row_float[col]; std::uint8_t quantized = std::max( 0, @@ -619,13 +619,13 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( int output_columns = input_columns + 2 * sizeof(float); std::vector input_row_float(input_columns); - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const InputType* input_row = input + row * input_columns; std::uint8_t* output_row = output + row * output_columns; float* output_row_scale_bias = reinterpret_cast(output_row + input_columns); - for (std::size_t col = 0; col < input_columns; ++col) { + for (int col = 0; col < input_columns; ++col) { if (std::is_same()) { input_row_float[col] = input_row[col]; } else { @@ -642,7 +642,7 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef( output_row_scale_bias[0] = range / 255.0f; output_row_scale_bias[1] = minimum_element; const auto inverse_scale = 255.0f / (range + kEpsilon); - for (std::size_t col = 0; col < input_columns; ++col) { + for (int col = 0; col < input_columns; ++col) { output_row[col] = std::lrintf((input_row_float[col] - minimum_element) * inverse_scale); } @@ -678,7 +678,7 @@ void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef( int output_columns = (input_columns - 2 * sizeof(float16)) * num_elem_per_byte; - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float16* input_row_scale_bias = reinterpret_cast( input_row + @@ -687,7 +687,7 @@ void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef( float bias = cpu_half2float(input_row_scale_bias[1]); OutputType* output_row = output + row * output_columns; - for (std::size_t col = 0; col < output_columns; ++col) { + for (int col = 0; col < output_columns; ++col) { std::uint8_t quantized = input_row[col / num_elem_per_byte]; quantized >>= (col % num_elem_per_byte) * bit_rate; quantized &= (1 << bit_rate) - 1; @@ -740,13 +740,13 @@ void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef( OutputType* output) { int output_columns = input_columns - 2 * sizeof(float); - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float* input_row_scale_bias = reinterpret_cast(input_row + output_columns); OutputType* output_row = output + row * output_columns; - for (std::size_t col = 0; col < output_columns; ++col) { + for (int col = 0; col < output_columns; ++col) { float output_value = input_row[col] * input_row_scale_bias[0] + input_row_scale_bias[1]; if (std::is_same()) { diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index 96171c07ec..0cb9be0954 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -36,7 +36,7 @@ void QuantizeAvx2( // that is exactly representable in float constexpr int32_t int32_float_max_val = std::numeric_limits::max() - 127; - std::size_t i = 0; + int i = 0; float inverse_scale = 1.f / qparams.scale; __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); // clang-format 
off @@ -170,7 +170,7 @@ void NO_SANITIZE("address") FusedQuantizeDequantizeAvx2( // that is exactly representable in float constexpr int32_t int32_float_max_val = std::numeric_limits::max() - 127; - std::size_t i = 0; + int i = 0; uint32_t rand; __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); __m256 scale_v = _mm256_set1_ps(qparams.scale); @@ -1356,7 +1356,8 @@ void requantizeOutputProcessingGConvAvx2( _mm256_castsi256_si128(x_clamped_v)); } // j loop vectorized - int remainder = block.col_start + block.col_size - j; + const int remainder = block.col_start + block.col_size - j; + (void)remainder; // Suppress unused variable warning assert(remainder == 0); } // i loop } @@ -1505,7 +1506,7 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2( fbgemmAlignedAlloc(64, input_columns * sizeof(float))); } - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const InputType* input_row = input + row * input_columns; const float* input_row_float; if (std::is_same()) { @@ -1527,7 +1528,7 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2( __m256 min_v = _mm256_set1_ps(minimum_element); __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; + int col; for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { __m256 in_v; if (std::is_same()) { @@ -1707,7 +1708,7 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2( input_row_float_for_fp16 = static_cast( fbgemmAlignedAlloc(64, input_columns * sizeof(float))); } - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const InputType* input_row = input + row * input_columns; const float* input_row_float; if (std::is_same()) { @@ -1726,7 +1727,7 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2( float maximum_element = -FLT_MAX; __m256 min_v = _mm256_set1_ps(minimum_element); __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; + int col; for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { __m256 in_v; if (std::is_same()) { @@ -1888,7 +1889,7 @@ void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfAvx2( (VLEN + 1)))); } - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const uint16_t* input_row_scale_bias = reinterpret_cast( input_row + @@ -1904,7 +1905,7 @@ void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfAvx2( output_row_float = reinterpret_cast(output_row); } - std::size_t col = 0; + int col = 0; if (BIT_RATE == 4 || BIT_RATE == 2) { __m256 vscale = _mm256_set1_ps(scale); __m256 vbias = _mm256_set1_ps(bias); @@ -2060,7 +2061,7 @@ void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfAvx2( constexpr int VLEN = 8; int output_columns = input_columns - 2 * sizeof(float); - for (std::size_t row = 0; row < input_rows; ++row) { + for (int row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float* input_row_scale_bias = reinterpret_cast(input_row + output_columns); @@ -2069,7 +2070,7 @@ void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfAvx2( __m256 scale_v = _mm256_set1_ps(input_row_scale_bias[0]); __m256 bias_v = _mm256_set1_ps(input_row_scale_bias[1]); - std::size_t col; + int col; for (col = 0; col < output_columns / VLEN * VLEN; col += VLEN) { __m256 in_v = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( _mm_loadl_epi64(reinterpret_cast(input_row + col)))); diff --git a/src/RowWiseSparseAdagradFused.cc b/src/RowWiseSparseAdagradFused.cc index 
1b74cb3438..00994d73f4 100644 --- a/src/RowWiseSparseAdagradFused.cc +++ b/src/RowWiseSparseAdagradFused.cc @@ -634,7 +634,7 @@ typename ReturnFunctionSignature:: x86::rsp, x86::ptr( x86::rsp, static_cast(-vlen * sizeof(float16)))); - for (size_t r = 0; r < remainder; ++r) { + for (int r = 0; r < remainder; ++r) { a->mov( h.r16(), x86::word_ptr( @@ -652,7 +652,7 @@ typename ReturnFunctionSignature:: // Truncate rounding to 'counterwork' the random added part a->vcvtps2ph(x86::word_ptr(x86::rsp), out_vreg, 11); // Copy results back - for (size_t r = 0; r < remainder; ++r) { + for (int r = 0; r < remainder; ++r) { a->mov(h.r16(), x86::ptr(x86::rsp, sizeof(dataType) * r)); a->mov( x86::word_ptr( @@ -788,7 +788,7 @@ void rand_initialize() { for (auto i = 0; i < 4; ++i) { g_rnd128v_buffer[i * VLEN_MAX] = rnd128_init_next(h0); uint64_t h1 = g_rnd128v_buffer[i * VLEN_MAX]; - for (auto v = 1; v < VLEN_MAX; ++v) { + for (size_t v = 1; v < VLEN_MAX; ++v) { g_rnd128v_buffer[i * VLEN_MAX + v] = rnd128_init_next(h1); } } diff --git a/src/SparseAdagrad.cc b/src/SparseAdagrad.cc index 27cac4bff4..9499367ada 100644 --- a/src/SparseAdagrad.cc +++ b/src/SparseAdagrad.cc @@ -803,7 +803,7 @@ int SparseAdaGradBlockSize1_( if (weight_decay != 0.0f) { for (int i = 0; i < num_rows; ++i) { IndexType idx = indices[i]; - if (idx >= param_size) { + if (idx >= static_cast(param_size)) { return i; } @@ -821,7 +821,7 @@ int SparseAdaGradBlockSize1_( } else { for (int i = 0; i < num_rows; ++i) { IndexType idx = indices[i]; - if (idx >= param_size) { + if (idx >= static_cast(param_size)) { return i; } float gi = g[i]; diff --git a/src/TransposeUtils.h b/src/TransposeUtils.h index 86027a87b2..26f9c7c108 100644 --- a/src/TransposeUtils.h +++ b/src/TransposeUtils.h @@ -5,8 +5,11 @@ * LICENSE file in the root directory of this source tree. */ #pragma once + #include "fbgemm/FbgemmBuild.h" +#include + namespace fbgemm { /** @@ -39,8 +42,8 @@ void transpose_avx2(unsigned M, unsigned N, const T* src, unsigned ld_src, T* ds */ template void transpose_avx512( - unsigned M, - unsigned N, + int64_t M, + int64_t N, const T* src, unsigned ld_src, T* dst, diff --git a/src/Utils.cc b/src/Utils.cc index b8b627ee05..b3a00a38da 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -43,7 +43,7 @@ int compare_buffers( int m, int n, int ld, - int max_mismatches_to_report, + size_t max_mismatches_to_report, float atol /*=1e-3*/) { size_t mismatches = 0; for (int i = 0; i < m; ++i) { @@ -91,8 +91,8 @@ void printMatrix( std::cout << name << ":" << "[" << R << ", " << C << "]" << std::endl; bool tr = (op == matrix_op_t::Transpose); - for (auto r = 0; r < R; ++r) { - for (auto c = 0; c < C; ++c) { + for (size_t r = 0; r < R; ++r) { + for (size_t c = 0; c < C; ++c) { T res = tr ? 
inp[c * ld + r] : inp[r * ld + c]; if (std::is_integral::value) { std::cout << std::setw(5) << static_cast(res) << " "; @@ -110,7 +110,7 @@ template int compare_buffers( int m, int n, int ld, - int max_mismatches_to_report, + size_t max_mismatches_to_report, float atol); template int compare_buffers( @@ -119,7 +119,7 @@ template int compare_buffers( int m, int n, int ld, - int max_mismatches_to_report, + size_t max_mismatches_to_report, float atol); template int compare_buffers( @@ -128,7 +128,7 @@ template int compare_buffers( int m, int n, int ld, - int max_mismatches_to_report, + size_t max_mismatches_to_report, float atol); template int compare_buffers( @@ -137,7 +137,7 @@ template int compare_buffers( int m, int n, int ld, - int max_mismatches_to_report, + size_t max_mismatches_to_report, float atol); template void printMatrix( diff --git a/src/UtilsAvx512.cc b/src/UtilsAvx512.cc index 6d33b43a44..0230a37b70 100644 --- a/src/UtilsAvx512.cc +++ b/src/UtilsAvx512.cc @@ -307,8 +307,8 @@ namespace internal { template <> void transpose_avx512( - unsigned M, - unsigned N, + int64_t M, + int64_t N, const float* src, unsigned ld_src, float* dst, @@ -1040,8 +1040,8 @@ void transpose_16x32_block( template <> void transpose_avx512( - unsigned M, - unsigned N, + const int64_t M, + const int64_t N, const uint8_t* src, unsigned ld_src, uint8_t* dst,
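
The recurring change in this patch is the pair of warning fixes required once -Wall -Wextra -pedantic are enforced: a `(void)` cast for values that are consumed only by `assert()` (and therefore become unused in NDEBUG builds), and explicit casts so signed/unsigned comparisons compile cleanly. The sketch below is a minimal, self-contained illustration of both idioms; the function, its parameters, and the shapes involved are hypothetical and are not part of FBGEMM.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical routine (not an FBGEMM API) showing the two idioms applied
// throughout this patch when building with -Wall -Wextra -pedantic:
//   1. (void)var for values consumed only by assert(), which compiles away
//      under NDEBUG and would otherwise trigger -Wunused-variable.
//   2. explicit casts so signed/unsigned comparisons do not trigger
//      -Wsign-compare.
void scale_rows(std::vector<float>& data, std::int64_t n_rows,
                std::int64_t n_cols, float factor) {
  const std::int64_t expected = n_rows * n_cols;
  (void)expected; // Suppress unused variable warning (assert-only use)
  assert(expected == static_cast<std::int64_t>(data.size()) &&
         "data must hold exactly n_rows * n_cols elements");

  for (std::int64_t r = 0; r < n_rows; ++r) {
    for (std::int64_t c = 0; c < n_cols; ++c) {
      // Index derived from signed loop counters; the cast keeps the
      // comparison against data.size() warning-free.
      const std::size_t idx = static_cast<std::size_t>(r * n_cols + c);
      assert(idx < data.size());
      data[idx] *= factor;
    }
  }
}

Keeping the named variable plus the `(void)` cast, rather than inlining the expression into the `assert()`, documents the invariant while staying warning-clean in release builds; that is the same trade-off made in the hunks above.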