Apply clang format to fbgemm (gpu) (pytorch#799)
Summary: Pull Request resolved: pytorch#799

Reviewed By: rweyrauch

Differential Revision: D32866248

fbshipit-source-id: 7620840fdc8eda79572471420b21dc0ab74348c6
jianyuh authored and facebook-github-bot committed Dec 11, 2021
1 parent 7730397 commit b9e66ca
Showing 91 changed files with 775 additions and 691 deletions.
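The changes below are mechanical: clang-format re-wraps long call sites, brace-initializer lists, and trailing comments to the column limit of the repository's formatting configuration (presumably its .clang-format file, which is not part of this listing). A minimal before/after sketch of the dominant rewrite pattern, using a hypothetical helper rather than code taken from the diff:

// Hypothetical helper, for illustration only -- not code from this commit.
#include <numeric>
#include <vector>

inline int accumulate_blocks(
    const std::vector<int>& sizes,
    int mcb,
    int ncb,
    int kcb,
    int mr,
    int nr,
    int row_interleave) {
  return std::accumulate(sizes.begin(), sizes.end(), 0) + mcb + ncb + kcb +
      mr + nr + row_interleave;
}

// Pre-format style: arguments packed together and aligned to the opening
// parenthesis of the call.
int before_formatting(const std::vector<int>& sizes) {
  return accumulate_blocks(sizes, /*mcb=*/96, /*ncb=*/192, /*kcb=*/256,
                           /*mr=*/12, /*nr=*/16, /*row_interleave=*/4);
}

// Post-format style (as in the hunks below): when a call does not fit on one
// line, break right after the opening parenthesis and indent each argument.
int after_formatting(const std::vector<int>& sizes) {
  return accumulate_blocks(
      sizes,
      /*mcb=*/96,
      /*ncb=*/192,
      /*kcb=*/256,
      /*mr=*/12,
      /*nr=*/16,
      /*row_interleave=*/4);
}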
1 change: 0 additions & 1 deletion bench/BenchUtils.cc
@@ -230,5 +230,4 @@ template aligned_vector<int32_t> getRandomBlockSparseMatrix(
int32_t low,
int32_t high);


} // namespace fbgemm
14 changes: 10 additions & 4 deletions bench/ConvUnifiedBenchmark.cc
@@ -212,7 +212,10 @@ vector<conv_param_t<3>> shapes_3d = {
// clang-format on

template <int SPATIAL_DIM, typename Acc_t>
void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes, bool flush, int repetitions) {
void performance_test(
const vector<conv_param_t<SPATIAL_DIM>>& shapes,
bool flush,
int repetitions) {
std::vector<char> llc;

if (flush) {
@@ -503,9 +506,12 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes, bool flus
}

typedef struct {
bool no_flush; /* if true, llc won't be flushed inbetween benchmark iterations */
bool run_extended_shapes; /* if true, runs additional shapes on top of the default set */
int benchmark_repetitions; /* specified number of timed benchmark iterations */
bool no_flush; /* if true, llc won't be flushed inbetween benchmark iterations
*/
bool run_extended_shapes; /* if true, runs additional shapes on top of the
default set */
int benchmark_repetitions; /* specified number of timed benchmark iterations
*/
} user_args_t;

int main(int argc, const char* argv[]) {
3 changes: 2 additions & 1 deletion bench/EmbeddingSpMDM8BitBenchmark.cc
@@ -279,7 +279,8 @@ int run_benchmark(
}

double max_time = *std::max_element(
benchmarkTimes.begin(), benchmarkTimes.begin() + fbgemm_get_num_threads());
benchmarkTimes.begin(),
benchmarkTimes.begin() + fbgemm_get_num_threads());
double avg_time = std::accumulate(
benchmarkTimes.begin(),
benchmarkTimes.begin() + fbgemm_get_num_threads(),
2 changes: 1 addition & 1 deletion bench/FP16Benchmark.cc
@@ -13,8 +13,8 @@
#include <mkl.h>
#endif

#include "fbgemm/FbgemmFP16.h"
#include "bench/BenchUtils.h"
#include "fbgemm/FbgemmFP16.h"

using namespace fbgemm;

26 changes: 14 additions & 12 deletions bench/GEMMsTunableBenchmark.cc
@@ -167,12 +167,13 @@ void performance_test(
#endif

if (compare_buffers(Cint32_ref.data(), Cint32_fb_acc32.data(), m, n, n, 5)) {
vector<int> config = {tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
vector<int> config = {
tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
incorrect_configs.insert(config);
} else {
cout << setw(5) << "MCB, " << setw(5) << "NCB, " << setw(5) << "KCB, "
@@ -191,12 +192,13 @@
<< nops / ttot << endl;
if ((nops / ttot) > giga_ops) {
giga_ops = nops / ttot;
best_config = {tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
best_config = {
tuning_params->MCB,
tuning_params->NCB,
tuning_params->KCB,
tuning_params->MR,
tuning_params->NR,
tuning_params->ROW_INTERLEAVE};
}
}
}
2 changes: 1 addition & 1 deletion bench/PackedFloatInOutBenchmark.cc
@@ -231,7 +231,7 @@ void performance_test() {
double total_run_time = 0.0;
#endif
cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k
<< ", ";
<< ", ";

for (auto i = 0; i < NWARMUP + NITER; ++i) {
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
11 changes: 6 additions & 5 deletions bench/RequantizeBenchmark.cc
@@ -54,11 +54,12 @@ void performance_test() {

aligned_vector<uint8_t> output(len);

for (BenchmarkType bench_type : {BenchmarkType::BARE_BONE,
BenchmarkType::BIAS,
BenchmarkType::A_ASYMMETRIC,
BenchmarkType::B_ASYMMETRIC,
BenchmarkType::PER_CHANNEL}) {
for (BenchmarkType bench_type :
{BenchmarkType::BARE_BONE,
BenchmarkType::BIAS,
BenchmarkType::A_ASYMMETRIC,
BenchmarkType::B_ASYMMETRIC,
BenchmarkType::PER_CHANNEL}) {
int32_t Aint8_zero_point =
bench_type < BenchmarkType::A_ASYMMETRIC ? 0 : -3;
if (bench_type < BenchmarkType::B_ASYMMETRIC) {
61 changes: 35 additions & 26 deletions bench/RowwiseAdagradBenchmark.cc
@@ -86,21 +86,23 @@ void run_benchmark(
double data_moved = num_rows * (3 * sizeof(float) * block_size + 2 * 64);

if (isIndex64b) {
auto fn_indices_64 = GenerateSparseAdaGrad<int64_t>(block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);
auto fn_indices_64 = GenerateSparseAdaGrad<int64_t>(
block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);

t = measureWithWarmup(
[&]() {
fn_indices_64(num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
fn_indices_64(
num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
},
NUM_WARMUP,
NUM_ITER,
Expand All @@ -123,21 +125,23 @@ void run_benchmark(
counter_halflife); // counter halflife value for adjustments
}
} else {
auto fn_indices_32 = GenerateSparseAdaGrad<int32_t>(block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);
auto fn_indices_32 = GenerateSparseAdaGrad<int32_t>(
block_size, /*rowwise=*/true, prefetch, adjust_weight_decay);

t = measureWithWarmup(
[&]() {
fn_indices_32(num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices_32.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
fn_indices_32(
num_rows, // number of rows reading
param_size, // total number of parameters
w.data(), // input parameters
g.data(), // input gradients
h.data(), // input momentums
indices_32.data(), // indices of each row
epsilon,
lr,
weight_decay, // weight_decay
adjust_weight_decay ? counter.data() : nullptr, // counters
counter_halflife); // counter_halflife
},
NUM_WARMUP,
NUM_ITER,
@@ -193,14 +197,19 @@ int main() {

for (auto isIndex64b : vector<bool>{true, false}) {
for (auto adjust_weight_decay : vector<bool>{true, false}) {
for (auto prefetch: prefetch_distances) {
for (auto prefetch : prefetch_distances) {
for (auto& input : inputs) {
assert(input.size() >= 2);
num_rows = input[0];
block_size = input[1];
param_size = num_rows * block_size;
run_benchmark(
num_rows, block_size, param_size, isIndex64b, prefetch, adjust_weight_decay);
num_rows,
block_size,
param_size,
isIndex64b,
prefetch,
adjust_weight_decay);
}
}
}
19 changes: 8 additions & 11 deletions bench/SparseDenseMMFP32Benchmark.cc
@@ -10,8 +10,8 @@
#include "fbgemm/spmmUtils.h"
#include "src/RefImplementations.h"

#include <iostream>
#include <iomanip>
#include <iostream>

using namespace std;
using namespace fbgemm;
@@ -23,10 +23,9 @@ int main(int, char**) {
// A is MxK -> AT is KxM
// B is KxN -> BT is NxK

cout << setw(7) << "index"
<< setw(7) << "m" << setw(7) << "n" << setw(7) << "k"
<< setw(7) << "fnz" << setw(15) << "eff_GFLOPS"
<< setw(15) << "real_GFLOPS" << endl;
cout << setw(7) << "index" << setw(7) << "m" << setw(7) << "n" << setw(7)
<< "k" << setw(7) << "fnz" << setw(15) << "eff_GFLOPS" << setw(15)
<< "real_GFLOPS" << endl;

int index = 0;
// for (int s = 64; s <= 128; s *= 2)
@@ -122,12 +121,10 @@
}

double effective_gflops_intrin = effective_flop / secs_intrin / 1e9;
cout << "[" << setw(5) << index << "]"
<< setw(7) << m << setw(7) << n << setw(7) << k
<< fixed << setw(7) << setprecision(2) << fnz
<< setw(15) << setprecision(5) << effective_gflops_intrin
<< setw(15) << setprecision(5) << fnz * effective_gflops_intrin
<< endl;
cout << "[" << setw(5) << index << "]" << setw(7) << m << setw(7) << n
<< setw(7) << k << fixed << setw(7) << setprecision(2) << fnz
<< setw(15) << setprecision(5) << effective_gflops_intrin << setw(15)
<< setprecision(5) << fnz * effective_gflops_intrin << endl;
++index;
}
}
17 changes: 9 additions & 8 deletions bench/SparseDenseMMInt8Benchmark.cc
@@ -69,14 +69,15 @@ int main(int, char**) {
aligned_vector<float> act_times_w_scale(n);
randFill<float>(act_times_w_scale, -8.0f, 8.0f);

trRequantizationParams_t reqParams = {act_zero_point,
weight_zero_point.data(),
zero_point,
scale,
bcsr->row_offsets.data(),
nullptr,
nullptr,
act_times_w_scale.data()};
trRequantizationParams_t reqParams = {
act_zero_point,
weight_zero_point.data(),
zero_point,
scale,
bcsr->row_offsets.data(),
nullptr,
nullptr,
act_times_w_scale.data()};

// printMatrix(matrix_op_t::NoTranspose, btData.data(), n, k, k,
// "btData"); printMatrix( matrix_op_t::NoTranspose, bcsr->rowBPtr.data(),
5 changes: 3 additions & 2 deletions fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu
@@ -9,9 +9,9 @@
#include <curand.h>
#include <curand_kernel.h>

#include <unistd.h>
#include <chrono>
#include <iostream>
#include <unistd.h>
#include <vector>

__device__ half float_to_sto_half_direct(float w) {
@@ -56,7 +56,8 @@ __device__ half float_to_sto_half_assemblefloat(float w, uint8_t rand) {
const unsigned w_int = __float_as_uint(w);
const unsigned assmebles = (w_int & 0xff800000) | (rand << 5);
const unsigned subtract = (w_int & 0xff800000);
const float assmeble_float = __uint_as_float(assmebles) - __uint_as_float(subtract);
const float assmeble_float =
__uint_as_float(assmebles) - __uint_as_float(subtract);
return __float2half_rz(w + assmeble_float);
}

24 changes: 12 additions & 12 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
@@ -367,18 +367,18 @@ at::Tensor split_embedding_codegen_lookup_dense_function(
offsets)[0];
} else {
return SplitLookupFunction_Dense_Op::apply(
dev_weights,
weights_offsets,
D_offsets,
total_D,
max_D,
hash_size_cumsum,
total_hash_size_bits,
indices,
offsets,
pooling_mode,
indice_weights,
feature_requires_grad)[0];
dev_weights,
weights_offsets,
D_offsets,
total_D,
max_D,
hash_size_cumsum,
total_hash_size_bits,
indices,
offsets,
pooling_mode,
indice_weights,
feature_requires_grad)[0];
}
}

2 changes: 1 addition & 1 deletion fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp
@@ -8,8 +8,8 @@
#include <ATen/core/op_registration/op_registration.h>
#include <torch/script.h>

#include "codegen/embedding_forward_split_cpu.h"
#include "codegen/embedding_common.h"
#include "codegen/embedding_forward_split_cpu.h"

using namespace at;

@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <map>
#include <tuple>

@@ -216,3 +217,4 @@ split_embedding_backward_codegen_{{ optimizer }}_cpu(

return;
}
// clang-format on
2 changes: 2 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <map>
#include <tuple>
#include <utility>
@@ -375,3 +376,4 @@ void split_embedding_backward_exact_cpu_dense_kernel(
return grad;
{% endif %}
}
// clang-format on
@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <ATen/ATen.h>
#include <ATen/core/op_registration/op_registration.h>
#include <torch/script.h>
@@ -212,3 +213,4 @@ }
}

} // namespace
// clang-format on
2 changes: 2 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
@@ -4,6 +4,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// clang-format off
#include <ATen/ATen.h>
#include <ATen/TypeDefault.h>
#include <ATen/core/op_registration/op_registration.h>
@@ -484,3 +485,4 @@ TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def("split_embedding_codegen_lookup_{{ optimizer }}_function(Tensor placeholder_autograd_tensor, Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, Tensor lxu_cache_locations, bool gradient_clipping, float max_gradient, bool stochastic_rounding, {{ args.split_function_schemas | join(", ") }}, int output_dtype=0) -> Tensor");
m.impl("split_embedding_codegen_lookup_{{ optimizer }}_function", torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(split_embedding_codegen_lookup_{{ optimizer }}_function)));
}
// clang-format on
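The codegen sources above (the embedding_backward_split_*_template.cpp files and related hunks) are fenced with // clang-format off at the top and // clang-format on at the bottom rather than reformatted, presumably so the formatter leaves the Jinja-style {{ ... }} and {% ... %} placeholders visible in those hunks untouched. A minimal, self-contained sketch of the fencing pattern on ordinary C++ (hypothetical code, not from this commit):

#include <cstdio>

// clang-format off
// Everything between the two markers keeps its hand-written layout when
// clang-format runs over the file.
static const int kIdentity[4][4] = {
    {1, 0, 0, 0},
    {0, 1, 0, 0},
    {0, 0, 1, 0},
    {0, 0, 0, 1},
};
// clang-format on

int main() {
  std::printf("%d\n", kIdentity[3][3]); // prints 1
  return 0;
}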