Apply clang format to fbgemm (gpu) (pytorch#799)
Summary: Pull Request resolved: pytorch#799

Reviewed By: rweyrauch

Differential Revision: D32866248

fbshipit-source-id: 7620840fdc8eda79572471420b21dc0ab74348c6
jianyuh authored and facebook-github-bot committed Dec 11, 2021
1 parent 7730397 commit b9e66ca
Showing 91 changed files with 775 additions and 691 deletions.
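The changes below are mechanical: clang-format re-wraps long call sites, brace-initializer lists, and trailing comments to the column limit of the repository's formatting configuration (presumably its .clang-format file, which is not part of this listing). A minimal before/after sketch of the dominant rewrite pattern, using a hypothetical helper rather than code taken from the diff:

// Hypothetical helper, for illustration only -- not code from this commit.
#include <numeric>
#include <vector>

inline int accumulate_blocks(
    const std::vector<int>& sizes,
    int mcb,
    int ncb,
    int kcb,
    int mr,
    int nr,
    int row_interleave) {
  return std::accumulate(sizes.begin(), sizes.end(), 0) + mcb + ncb + kcb +
      mr + nr + row_interleave;
}

// Pre-format style: arguments packed together and aligned to the opening
// parenthesis of the call.
int before_formatting(const std::vector<int>& sizes) {
  return accumulate_blocks(sizes, /*mcb=*/96, /*ncb=*/192, /*kcb=*/256,
                           /*mr=*/12, /*nr=*/16, /*row_interleave=*/4);
}

// Post-format style (as in the hunks below): when a call does not fit on one
// line, break right after the opening parenthesis and indent each argument.
int after_formatting(const std::vector<int>& sizes) {
  return accumulate_blocks(
      sizes,
      /*mcb=*/96,
      /*ncb=*/192,
      /*kcb=*/256,
      /*mr=*/12,
      /*nr=*/16,
      /*row_interleave=*/4);
}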
1 change: 0 additions & 1 deletion bench/BenchUtils.cc
@@ -230,5 +230,4 @@ template aligned_vector<int32_t> getRandomBlockSparseMatrix(
int32_t low,
int32_t high);


} // namespace fbgemm
14 changes: 10 additions & 4 deletions bench/ConvUnifiedBenchmark.cc
@@ -212,7 +212,10 @@ vector<conv_param_t<3>> shapes_3d = {
// clang-format on

template <int SPATIAL_DIM, typename Acc_t>
void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes, bool flush, int repetitions) {
void performance_test(
const vector<conv_param_t<SPATIAL_DIM>>& shapes,
bool flush,
int repetitions) {
std::vector<char> llc;

if (flush) {
@@ -503,9 +506,12 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes, bool flus
}

typedef struct {
bool no_flush; /* if true, llc won't be flushed inbetween benchmark iterations */
bool run_extended_shapes; /* if true, runs additional shapes on top of the default set */
int benchmark_repetitions; /* specified number of timed benchmark iterations */
bool no_flush; /* if true, llc won't be flushed inbetween benchmark iterations
*/
bool run_extended_shapes; /* if true, runs additional shapes on top of the
default set */
int benchmark_repetitions; /* specified number of timed benchmark iterations
*/
} user_args_t;

int main(int argc, const char* argv[]) {
3 changes: 2 additions & 1 deletion bench/EmbeddingSpMDM8BitBenchmark.cc
@@ -279,7 +279,8 @@ int run_benchmark(
}

double max_time = *std::max_element(
benchmarkTimes.begin(), benchmarkTimes.begin() + fbgemm_get_num_threads());
benchmarkTimes.begin(),
benchmarkTimes.begin() + fbgemm_get_num_threads());
double avg_time = std::accumulate(
benchmarkTimes.begin(),
benchmarkTimes.begin() + fbgemm_get_num_threads(),
2 changes: 1 addition & 1 deletion bench/FP16Benchmark.cc
@@ -13,8 +13,8 @@
#include <mkl.h>
#endif

#include "fbgemm/FbgemmFP16.h"
#include "bench/BenchUtils.h"
#include "fbgemm/FbgemmFP16.h"

using namespace fbgemm;

26 changes: 14 additions & 12 deletions bench/GEMMsTunableBenchmark.cc
@@ -167,12 +167,13 @@ void performance_test(
#endif

if (compare_buffers(Cint32_ref.data(), Cint32_fb_acc32.data(), m, n, n, 5)) {
vector<int> config = {tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
vector<int> config = {
tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
incorrect_configs.insert(config);
} else {
cout << setw(5) << "MCB, " << setw(5) << "NCB, " << setw(5) << "KCB, "
@@ -191,12 +192,13 @@
<< nops / ttot << endl;
if ((nops / ttot) > giga_ops) {
giga_ops = nops / ttot;
best_config = {tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
best_config = {
tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
}
}
}
2 changes: 1 addition & 1 deletion bench/PackedFloatInOutBenchmark.cc
@@ -231,7 +231,7 @@ void performance_test() {
double total_run_time = 0.0;
#endif
cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k
<< ", ";
<< ", ";

for (auto i = 0; i < NWARMUP + NITER; ++i) {
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
11 changes: 6 additions & 5 deletions bench/RequantizeBenchmark.cc
@@ -54,11 +54,12 @@ void performance_test() {

aligned_vector<uint8_t> output(len);

for (BenchmarkType bench_type : {BenchmarkType::BARE_BONE,
BenchmarkType::BIAS,
BenchmarkType::A_ASYMMETRIC,
BenchmarkType::B_ASYMMETRIC,
BenchmarkType::PER_CHANNEL}) {
for (BenchmarkType bench_type :
{BenchmarkType::BARE_BONE,
BenchmarkType::BIAS,
BenchmarkType::A_ASYMMETRIC,
BenchmarkType::B_ASYMMETRIC,
BenchmarkType::PER_CHANNEL}) {
int32_t Aint8_zero_point =
bench_type < BenchmarkType::A_ASYMMETRIC ? 0 : -3;
if (bench_type < BenchmarkType::B_ASYMMETRIC) {
61 changes: 35 additions & 26 deletions bench/RowwiseAdagradBenchmark.cc
@@ -86,21 +86,23 @@ void run_benchmark(
double data_moved = num_rows * (3 * sizeof(float) * block_size + 2 * 64);

if (isIndex64b) {
auto fn_indices_64 = GenerateSparseAdaGrad<int64_t>(block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);
auto fn_indices_64 = GenerateSparseAdaGrad<int64_t>(
block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);

t = measureWithWarmup(
[&]() {
fn_indices_64(num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
fn_indices_64(
num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
},
NUM_WARMUP,
NUM_ITER,
Expand All @@ -123,21 +125,23 @@ void run_benchmark(
counter_halflife); // counter halflife value for adjustments
}
} else {
auto fn_indices_32 = GenerateSparseAdaGrad<int32_t>(block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);
auto fn_indices_32 = GenerateSparseAdaGrad<int32_t>(
block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);

t = measureWithWarmup(
[&]() {
fn_indices_32(num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices_32.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
fn_indices_32(
num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices_32.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
},
NUM_WARMUP,
NUM_ITER,
@@ -193,14 +197,19 @@ int main() {

for (auto isIndex64b : vector<bool>{true, false}) {
for (auto adjust_weight_decay : vector<bool>{true, false}) {
for (auto prefetch: prefetch_distances) {
for (auto prefetch : prefetch_distances) {
for (auto& input : inputs) {
assert(input.size() >= 2);
num_rows = input[0];
block_size = input[1];
param_size = num_rows * block_size;
run_benchmark(
num_rows, block_size, param_size, isIndex64b, prefetch, adjust_weight_decay);
num_rows,
block_size,
param_size,
isIndex64b,
prefetch,
adjust_weight_decay);
}
}
}
19 changes: 8 additions & 11 deletions bench/SparseDenseMMFP32Benchmark.cc
@@ -10,8 +10,8 @@
#include "fbgemm/spmmUtils.h"
#include "src/RefImplementations.h"

#include <iostream>
#include <iomanip>
#include <iostream>

using namespace std;
using namespace fbgemm;
@@ -23,10 +23,9 @@ int main(int, char**) {
// A is MxK -> AT is KxM
// B is KxN -> BT is NxK

cout << setw(7) << "index"
<< setw(7) << "m" << setw(7) << "n" << setw(7) << "k"
<< setw(7) << "fnz" << setw(15) << "eff_GFLOPS"
<< setw(15) << "real_GFLOPS" << endl;
cout << setw(7) << "index" << setw(7) << "m" << setw(7) << "n" << setw(7)
<< "k" << setw(7) << "fnz" << setw(15) << "eff_GFLOPS" << setw(15)
<< "real_GFLOPS" << endl;

int index = 0;
// for (int s = 64; s <= 128; s *= 2)
@@ -122,12 +121,10 @@
}

double effective_gflops_intrin = effective_flop / secs_intrin / 1e9;
cout << "[" << setw(5) << index << "]"
<< setw(7) << m << setw(7) << n << setw(7) << k
<< fixed << setw(7) << setprecision(2) << fnz
<< setw(15) << setprecision(5) << effective_gflops_intrin
<< setw(15) << setprecision(5) << fnz * effective_gflops_intrin
<< endl;
cout << "[" << setw(5) << index << "]" << setw(7) << m << setw(7) << n
<< setw(7) << k << fixed << setw(7) << setprecision(2) << fnz
<< setw(15) << setprecision(5) << effective_gflops_intrin << setw(15)
<< setprecision(5) << fnz * effective_gflops_intrin << endl;
++index;
}
}
17 changes: 9 additions & 8 deletions bench/SparseDenseMMInt8Benchmark.cc
@@ -69,14 +69,15 @@ int main(int, char**) {
aligned_vector<float> act_times_w_scale(n);
randFill<float>(act_times_w_scale, -8.0f, 8.0f);

trRequantizationParams_t reqParams = {act_zero_point,
weight_zero_point.data(),
zero_point,
scale,
bcsr->row_offsets.data(),
nullptr,
nullptr,
act_times_w_scale.data()};
trRequantizationParams_t reqParams = {
act_zero_point,
weight_zero_point.data(),
zero_point,
scale,
bcsr->row_offsets.data(),
nullptr,
nullptr,
act_times_w_scale.data()};

// printMatrix(matrix_op_t::NoTranspose, btData.data(), n, k, k,
// "btData"); printMatrix( matrix_op_t::NoTranspose, bcsr->rowBPtr.data(),
5 changes: 3 additions & 2 deletions fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu
@@ -9,9 +9,9 @@
#include <curand.h>
#include <curand_kernel.h>

#include <unistd.h>
#include <chrono>
#include <iostream>
#include <unistd.h>
#include <vector>

__device__ half float_to_sto_half_direct(float w) {
@@ -56,7 +56,8 @@ __device__ half float_to_sto_half_assemblefloat(float w, uint8_t rand) {
const unsigned w_int = __float_as_uint(w);
const unsigned assmebles = (w_int & 0xff800000) | (rand << 5);
const unsigned subtract = (w_int & 0xff800000);
const float assmeble_float = __uint_as_float(assmebles) - __uint_as_float(subtract);
const float assmeble_float =
__uint_as_float(assmebles) - __uint_as_float(subtract);
return __float2half_rz(w + assmeble_float);
}

24 changes: 12 additions & 12 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
@@ -367,18 +367,18 @@ at::Tensor split_embedding_codegen_lookup_dense_function(
offsets)[0];
} else {
return SplitLookupFunction_Dense_Op::apply(
dev_weights,
weights_offsets,
D_offsets,
total_D,
max_D,
hash_size_cumsum,
total_hash_size_bits,
indices,
offsets,
pooling_mode,
indice_weights,
feature_requires_grad)[0];
dev_weights,
weights_offsets,
D_offsets,
total_D,
max_D,
hash_size_cumsum,
total_hash_size_bits,
indices,
offsets,
pooling_mode,
indice_weights,
feature_requires_grad)[0];
}
}

2 changes: 1 addition & 1 deletion fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp
@@ -8,8 +8,8 @@
#include <ATen/core/op_registration/op_registration.h>
#include <torch/script.h>

#include "codegen/embedding_forward_split_cpu.h"
#include "codegen/embedding_common.h"
#include "codegen/embedding_forward_split_cpu.h"

using namespace at;

@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <map>
#include <tuple>

@@ -216,3 +217,4 @@ split_embedding_backward_codegen_{{ optimizer }}_cpu(

return;
}
// clang-format on
2 changes: 2 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <map>
#include <tuple>
#include <utility>
@@ -375,3 +376,4 @@ void split_embedding_backward_exact_cpu_dense_kernel(
return grad;
{% endif %}
}
// clang-format on
@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <ATen/ATen.h>
#include <ATen/core/op_registration/op_registration.h>
#include <torch/script.h>
@@ -212,3 +213,4 @@ }
}

} // namespace
// clang-format on
2 changes: 2 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <ATen/ATen.h>
#include <ATen/TypeDefault.h>
#include <ATen/core/op_registration/op_registration.h>
@@ -484,3 +485,4 @@ TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def("split_embedding_codegen_lookup_{{ optimizer }}_function(Tensor placeholder_autograd_tensor, Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, Tensor lxu_cache_locations, bool gradient_clipping, float max_gradient, bool stochastic_rounding, {{ args.split_function_schemas | join(", ") }}, int output_dtype=0) -> Tensor");
m.impl("split_embedding_codegen_lookup_{{ optimizer }}_function", torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(split_embedding_codegen_lookup_{{ optimizer }}_function)));
}
// clang-format on
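The codegen sources above (the embedding_backward_split_*_template.cpp files and related hunks) are fenced with // clang-format off at the top and // clang-format on at the bottom rather than reformatted, presumably so the formatter leaves the Jinja-style {{ ... }} and {% ... %} placeholders visible in those hunks untouched. A minimal, self-contained sketch of the fencing pattern on ordinary C++ (hypothetical code, not from this commit):

#include <cstdio>

// clang-format off
// Everything between the two markers keeps its hand-written layout when
// clang-format runs over the file.
static const int kIdentity[4][4] = {
    {1, 0, 0, 0},
    {0, 1, 0, 0},
    {0, 0, 1, 0},
    {0, 0, 0, 1},
};
// clang-format on

int main() {
  std::printf("%d\n", kIdentity[3][3]); // prints 1
  return 0;
}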