enable -Wall -Wextra -Werror (pytorch#868)
Summary:
Be more strict about warnings in OSS builds, and fix the less-than-best-practice code the new warnings reveal.

Pull Request resolved: pytorch#868

Reviewed By: jianyuh

Differential Revision: D33619867

fbshipit-source-id: da510c2412661537ca2005ed1ceba00fd159ad8c
jspark1105 authored and facebook-github-bot committed Jan 20, 2022
1 parent 9e8820b commit b047231
Showing 46 changed files with 151 additions and 128 deletions.
21 changes: 16 additions & 5 deletions CMakeLists.txt
@@ -118,17 +118,28 @@ if(MSVC)
   target_compile_options(fbgemm_avx2 PRIVATE "/arch:AVX2")
   target_compile_options(fbgemm_avx512 PRIVATE "/arch:AVX512")
 else(MSVC)
+  string(APPEND CMAKE_CXX_FLAGS " -Wall")
+  string(APPEND CMAKE_CXX_FLAGS " -Wextra")
+  string(APPEND CMAKE_CXX_FLAGS " -Werror")
+  string(APPEND CMAKE_CXX_FLAGS " -Wno-deprecated-declarations")
   target_compile_options(fbgemm_avx2 PRIVATE
     "-m64" "-mavx2" "-mf16c" "-mfma")
   target_compile_options(fbgemm_avx512 PRIVATE
     "-m64" "-mavx2" "-mfma" "-mavx512f" "-mavx512bw" "-mavx512dq"
     "-mavx512vl")
-  set_source_files_properties(src/FbgemmFP16UKernelsAvx2.cc
-    PROPERTIES COMPILE_FLAGS "-masm=intel")
-  set_source_files_properties(src/FbgemmFP16UKernelsAvx512.cc
-    PROPERTIES COMPILE_FLAGS "-masm=intel")
-  set_source_files_properties(src/FbgemmFP16UKernelsAvx512_256.cc
+  set_source_files_properties(
+    src/FbgemmFP16UKernelsAvx2.cc
+    src/FbgemmFP16UKernelsAvx512.cc
+    src/FbgemmFP16UKernelsAvx512_256.cc
     PROPERTIES COMPILE_FLAGS "-masm=intel")
+  set_source_files_properties(
+    src/FbgemmI64.cc
+    src/FbgemmI8Depthwise3DAvx2.cc
+    src/FbgemmI8DepthwiseAvx2.cc
+    src/UtilsAvx2.cc
+    PROPERTIES COMPILE_FLAGS "-Wno-uninitialized")
+  set_source_files_properties(src/PackMatrix.cc
+    PROPERTIES COMPILE_FLAGS "-Wno-infinite-recursion")
 endif(MSVC)

if(USE_SANITIZER)
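
For context on what the new flags reject, a minimal sketch (a hypothetical demo.cc, not part of this commit) of the -Wsign-compare diagnostic that most of the loop-index changes below exist to fix:

```cpp
// demo.cc (hypothetical). Built with -Wall -Wextra -Werror, g++ and clang++
// reject the signed loop index shown in the comment with
// -Werror=sign-compare; this commit fixes that class of warning by switching
// index variables to size_t.
#include <cstdio>
#include <vector>

int main() {
  std::vector<char> buf(1024);
  // for (int i = 0; i < buf.size(); ++i)    // error: comparison of integer
  //                                         // expressions of different signedness
  for (size_t i = 0; i < buf.size(); ++i) {  // OK: index matches size()'s type
    buf[i] = static_cast<char>(i);
  }
  std::printf("%zu bytes touched\n", buf.size());
  return 0;
}
```
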
2 changes: 1 addition & 1 deletion bench/BenchUtils.cc
@@ -53,7 +53,7 @@ void randFill(aligned_vector<int64_t>& vec, int64_t low, int64_t high) {

void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
-  for (auto i = 0; i < llc.size(); i++) {
+  for (size_t i = 0; i < llc.size(); i++) {
data[i]++;
}
}
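
The function touched here is the benchmarks' cache-flush helper. A self-contained sketch of the idiom (the 128 MiB size is an assumption for illustration; the buffer only needs to exceed the last-level cache):

```cpp
// Sketch of the llc_flush idiom above: write through a volatile pointer so
// the compiler cannot elide the loop, evicting hot data from the caches
// between benchmark iterations.
#include <vector>

void llc_flush(std::vector<char>& llc) {
  volatile char* data = llc.data();
  for (size_t i = 0; i < llc.size(); i++) { // size_t index: no -Wsign-compare
    data[i]++;
  }
}

int main() {
  std::vector<char> llc(128 * 1024 * 1024); // assumed size, larger than LLC
  llc_flush(llc);
  return 0;
}
```
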
18 changes: 7 additions & 11 deletions bench/BenchUtils.h
@@ -117,14 +117,9 @@ double measureWithWarmup(
{
#endif
for (int i = 0; i < measuredIterations; ++i) {
-    int thread_id = 0;
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;

-#ifdef _OPENMP
-    if (useOpenMP) {
-      thread_id = omp_get_thread_num();
-    }
-#endif
+    const auto thread_id = useOpenMP ? fbgemm_get_thread_num() : 0;

if (thread_id == 0) {
fe();
@@ -193,7 +188,7 @@ void transpose_matrix(T* ref, int n, int k) {
memcpy(ref, local.data(), n * k * sizeof(T));
}

-#if defined(USE_MKL)
+#ifdef USE_MKL
void test_xerbla(char* srname, const int* info, int);
#endif

@@ -205,9 +200,10 @@ void performance_test(
bool flush,
int repetitions,
bool is_mkl) {
-#if defined(USE_MKL)
+#ifdef USE_MKL
mkl_set_xerbla((XerblaEntry)test_xerbla);
#endif
+  (void)is_mkl; // Suppress unused variable warning

float alpha = 1.f, beta = 1.f;
matrix_op_t btran = matrix_op_t::Transpose;
@@ -361,11 +357,11 @@

#if defined(USE_MKL) || defined(USE_BLAS)
// Compare results
-    for (auto i = 0; i < C_ref[0].size(); i++) {
+    for (size_t i = 0; i < C_ref[0].size(); i++) {
if (std::abs(C_ref[0][i] - C_fb[0][i]) > 1e-3) {
fprintf(
stderr,
"Error: too high diff between fp32 ref %f and fp16 %f at %d\n",
"Error: too high diff between fp32 ref %f and fp16 %f at %ld\n",
C_ref[0][i],
C_fb[0][i],
i);
@@ -375,7 +371,7 @@
#endif
}

-#if defined(USE_MKL)
+#ifdef USE_MKL
if (is_mkl) {
// Gold via MKL sgemm
type = "MKL_FP32";
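
Two idioms from this file's hunks are worth spelling out in a hedged sketch (report() and its arguments are illustrative, not FBGEMM's API): casting a conditionally used parameter to void silences the unused-parameter warning when the #ifdef branch compiles away, and %zu is the portable printf conversion for size_t (the diff's %ld also matches on LP64 targets).

```cpp
#include <cstdio>

void report(bool is_mkl, size_t index, float ref, float actual) {
#ifdef USE_MKL
  if (is_mkl) {
    std::fprintf(stderr, "comparing against MKL reference\n");
  }
#endif
  (void)is_mkl; // deliberately unused when USE_MKL is off; silences -Wunused
  // %zu is the standard conversion for size_t arguments.
  std::fprintf(
      stderr, "diff between ref %f and actual %f at %zu\n", ref, actual, index);
}

int main() {
  report(false, 42, 1.0f, 1.0f);
  return 0;
}
```
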
1 change: 0 additions & 1 deletion bench/EmbeddingIndexRemappingBenchmark.cc
@@ -60,7 +60,6 @@ int run_benchmark(
weights,
batch_size,
num_rows,
-      64, // embedding_dim (not used)
average_len, // average number of indices in a batch
EmbeddingSpMDMCornerCase::NONE);

2 changes: 1 addition & 1 deletion bench/EmbeddingSpMDM8BitBenchmark.cc
@@ -249,7 +249,7 @@ int run_benchmark(
assert(
false && "ERROR: refernce impl and JIT imp did not both succeed");
} else if (success) {
-      for (int i = 0; i < output.size(); ++i) {
+      for (size_t i = 0; i < output.size(); ++i) {
assert(fabs(output[i] - output_ref[i]) < 1e-3);
if (fabs(output[i] - output_ref[i]) >= 1e-3) {
cout << i << " " << output[i] << " " << output_ref[i] << endl;
4 changes: 2 additions & 2 deletions bench/EmbeddingSpMDMBenchmark.cc
@@ -58,7 +58,7 @@ void run_benchmark(

vector<float> embedding_table(num_rows * embedding_dim);
normal_distribution<float> embedding_distribution;
-  for (int i = 0; i < embedding_table.size(); ++i) {
+  for (size_t i = 0; i < embedding_table.size(); ++i) {
embedding_table[i] = embedding_distribution(generator);
}
vector<float16> embedding_table_fp16;
@@ -273,7 +273,7 @@ void run_benchmark(
assert(
false && "ERROR: refernce impl and JIT imp did not both succeed");
} else if (success) {
-      for (int i = 0; i < output.size(); ++i) {
+      for (size_t i = 0; i < output.size(); ++i) {
assert(output[i] == output_ref[i]);
if (output[i] != output_ref[i]) {
cout << i << " " << output[i] << " " << output_ref[i] << endl;
2 changes: 1 addition & 1 deletion bench/EmbeddingSpMDMNBitBenchmark.cc
@@ -248,7 +248,7 @@ int run_benchmark(
assert(
false && "ERROR: refernce impl and JIT imp did not both succeed");
} else if (success) {
-      for (int i = 0; i < output.size(); ++i) {
+      for (size_t i = 0; i < output.size(); ++i) {
assert(fabs(output[i] - output_ref[i]) < 1e-3);
if (fabs(output[i] - output_ref[i]) >= 1e-3) {
cout << i << " " << output[i] << " " << output_ref[i] << endl;
2 changes: 1 addition & 1 deletion bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc
@@ -284,7 +284,7 @@ int run_benchmark(
assert(
false && "ERROR: refernce impl and JIT imp did not both succeed");
} else if (success) {
-      for (int i = 0; i < output.size(); ++i) {
+      for (size_t i = 0; i < output.size(); ++i) {
assert(fabs(output[i] - output_ref[i]) < 1e-3);
if (fabs(output[i] - output_ref[i]) >= 1e-3) {
cout << i << " " << output[i] << " " << output_ref[i] << endl;
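
The same compare loop recurs in each of the four embedding benchmarks above. A refactoring sketch, not part of this commit (outputs_match and the default tolerance are assumptions for illustration):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Compare reference and JIT outputs within an absolute tolerance, using a
// size_t index as in the fixes above.
bool outputs_match(
    const std::vector<float>& ref,
    const std::vector<float>& out,
    float atol = 1e-3f) {
  for (size_t i = 0; i < ref.size(); ++i) {
    if (std::fabs(out[i] - ref[i]) >= atol) {
      std::printf("%zu %f %f\n", i, out[i], ref[i]);
      return false;
    }
  }
  return true;
}

int main() {
  std::vector<float> ref = {1.0f, 2.0f};
  return outputs_match(ref, ref) ? 0 : 1;
}
```
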
13 changes: 9 additions & 4 deletions bench/GEMMsBenchmark.cc
@@ -64,12 +64,11 @@ void performance_test() {
#endif

chrono::time_point<chrono::high_resolution_clock> start, end;
-  for (auto shape : shapes) {
+  for (const auto& shape : shapes) {
int m = shape[0];
int n = shape[1];
int k = shape[2];

-    float alpha = 1.f, beta = 0.f;
aligned_vector<uint8_t> Aint8(m * k);

aligned_vector<int8_t> Bint8(k * n);
@@ -93,6 +92,8 @@ void performance_test() {
double ttot = 0.0;
string runType;
#ifdef USE_MKL
+    const float alpha = 1.f;
+    const float beta = 0.f;
runType = "MKL_fp32";
ttot = measureWithWarmup(
[&]() {
@@ -212,7 +213,9 @@
#endif
}
}
-    ((volatile char*)(llc.data()));
+    if (flush) {
+      ((volatile char*)(llc.data()))[0] += 1;
+    }
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -294,7 +297,9 @@ void performance_test() {
#endif
}
}
-    ((volatile char*)(llc.data()));
+    if (flush) {
+      ((volatile char*)(llc.data()))[0] += 1;
+    }
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
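
The `((volatile char*)(llc.data()));` fix above deserves a note: a cast expression used as a statement computes a value and discards it, so the old line touched no memory at all, and -Wall flags it (GCC's "value computed is not used"; the exact diagnostic varies by compiler). A standalone sketch contrasting the before and after:

```cpp
#include <vector>

int main() {
  std::vector<char> llc(64);
  bool flush = true;
  ((volatile char*)(llc.data()));           // no-op: value discarded, warns
  if (flush) {
    ((volatile char*)(llc.data()))[0] += 1; // real volatile read-modify-write
  }
  return 0;
}
```
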
4 changes: 3 additions & 1 deletion bench/GEMMsTunableBenchmark.cc
@@ -157,7 +157,9 @@ void performance_test(
#endif
}
}
-  ((volatile char*)(llc.data()));
+  if (flush) {
+    ((volatile char*)(llc.data()))[0] += 1;
+  }

#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
cout << ", " << setw(16) << total_packing_time / (double)NITER / 1e3 << ", "
6 changes: 4 additions & 2 deletions bench/GroupwiseConvRequantizeBenchmark.cc
@@ -25,7 +25,7 @@ using namespace fbgemm;

void performance_test() {
// clang-format off
-  vector<conv_param_t<>> shapes = {
+  const vector<conv_param_t<>> shapes = {
// MB, IC, OC, {IH, IW}, G, {KH, KW}, {stride_h, stride_w}, pad_t, pad_l,
// pad_b, pad_r
// conv_param_t<>(1, 16, 16, {16, 14}, 4, {3, 3}, {1, 1}, {1, 1, 1, 1}),
@@ -479,7 +479,9 @@ void performance_test() {
}
}

-  ((volatile char*)(llc.data()));
+  if (flush) {
+    ((volatile char*)(llc.data()))[0] += 1;
+  }

// packedB.printPackedMatrix("bench B Packed");
// printMatrix(matrix_op_t::NoTranspose, Cint32_fb_fused.data(), MDim, NDim,
4 changes: 3 additions & 1 deletion bench/Im2ColFusedRequantizeBenchmark.cc
@@ -316,7 +316,9 @@ void performance_test() {
}
}

-  ((volatile char*)(llc.data()));
+  if (flush) {
+    ((volatile char*)(llc.data()))[0] += 1;
+  }

// packedB.printPackedMatrix("bench B Packed");
// printMatrix(matrix_op_t::NoTranspose, Cint32_fb.data(), MDim, NDim, NDim,
25 changes: 13 additions & 12 deletions bench/PackedFloatInOutBenchmark.cc
@@ -29,7 +29,7 @@ using namespace fbgemm;

void performance_test() {
// clang-format off
-  vector<vector<int>> shapes = {
+  const vector<vector<int>> shapes = {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
{1, 128, 512},
@@ -96,7 +96,6 @@ void performance_test() {
int n = shape[1];
int k = shape[2];

-    float alpha = 1.f, beta = 0.f;
aligned_vector<float> Afp32(m * k);
aligned_vector<uint8_t> Aint8(Afp32.size());

@@ -113,7 +112,7 @@
randFill<uint8_t>(Aint8, 0, 255);
float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
-    for (auto i = 0; i < Afp32.size(); ++i) {
+    for (size_t i = 0; i < Afp32.size(); ++i) {
Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
}

@@ -122,7 +121,7 @@

float Bint8_scale = 0.49;
int32_t Bint8_zero_point = -30;
-    for (auto i = 0; i < Bfp32.size(); ++i) {
+    for (size_t i = 0; i < Bfp32.size(); ++i) {
Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
}

@@ -135,6 +134,8 @@
std::string type;
double nops = 2.0 * m * n * k;
#ifdef USE_MKL
+    const float alpha = 1.f;
+    const float beta = 0.f;
type = "MKL_FP32";
ttot = measureWithWarmup(
[&]() {
@@ -163,7 +164,9 @@
});
ttot *= 1e9; // convert to ns

-    ((volatile char*)(llc.data()));
+    if (flush) {
+      ((volatile char*)(llc.data()))[0] += 1;
+    }

cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k
<< ", ";
@@ -176,10 +179,6 @@
<< nops / ttot << endl;
#endif

-    int32_t C_multiplier = 16544;
-    int32_t C_right_shift = 35;
-    int32_t C_zero_pt = 5;
-
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -267,7 +266,9 @@
#endif
}
}
-    ((volatile char*)(llc.data()));
+    if (flush) {
+      ((volatile char*)(llc.data()))[0] += 1;
+    }
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -294,12 +295,12 @@
cout << endl;
// cout << "total time: " << ttot << " ns" << endl;

-#ifdef USE_MKL
-    // correctness check
float maximum = *max_element(Cfp32_mkl.begin(), Cfp32_mkl.end());
float minimum = *min_element(Cfp32_mkl.begin(), Cfp32_mkl.end());
float atol = (maximum - minimum) / 255 / 1.9;

+#ifdef USE_MKL
+    // correctness check
compare_buffers(Cfp32_mkl.data(), Cfp32_fb.data(), m, n, n, 5, atol);
#endif
}
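
Several benchmarks above move alpha/beta into the #ifdef USE_MKL block. The reason: the scalars feed only the MKL sgemm call, so in builds without MKL they trip -Wunused-variable, which -Werror turns into a build failure. A sketch of the pattern (run_fp32_ref and its arguments are illustrative; cblas_sgemm is the standard CBLAS entry point):

```cpp
#include <vector>
#ifdef USE_MKL
#include <mkl.h>
#endif

void run_fp32_ref(int m, int n, int k, const std::vector<float>& A,
                  const std::vector<float>& B, std::vector<float>& C) {
#ifdef USE_MKL
  const float alpha = 1.f; // declared only where they are consumed
  const float beta = 0.f;
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
              A.data(), k, B.data(), n, beta, C.data(), n);
#else
  (void)m; (void)n; (void)k; (void)A; (void)B; (void)C; // non-MKL builds
#endif
}
```
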
11 changes: 8 additions & 3 deletions bench/PackedRequantizeAcc16Benchmark.cc
@@ -103,7 +103,6 @@ void performance_test() {
int n = shape[1];
int k = shape[2];

-    float alpha = 1.0f, beta = 0.0f;
aligned_vector<uint8_t> Aint8(m * k);
aligned_vector<int8_t> Bint8(k * n);

@@ -129,6 +128,8 @@
string runType;

#ifdef USE_MKL
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
ttot = 0.0;
runType = "MKL_fp32";
cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k
@@ -163,7 +164,9 @@
});
ttot *= 1e9; // convert to ns

-    ((volatile char*)(llc.data()));
+    if (flush) {
+      ((volatile char*)(llc.data()))[0] += 1;
+    }
cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
<< nops / ttot << endl;

@@ -415,7 +418,9 @@
}
}

-    ((volatile char*)(llc.data()));
+    if (flush) {
+      ((volatile char*)(llc.data()))[0] += 1;
+    }
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
(diff truncated; remaining changed files not shown)
