BenchUtils.h

/*
* Copyright (c) Facebook, Inc. and its affiliates.
* All rights reserved.
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once

#include <chrono>
#include <functional>
#include <vector>

#include <immintrin.h>

#ifdef _OPENMP
#include <omp.h>
#endif

#include "./AlignedVec.h"

namespace fbgemm {
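
// Fills `vec` with random values drawn from [low, high]; the exact
// distribution is defined by the out-of-line implementation.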
template <typename T>
void randFill(aligned_vector<T>& vec, T low, T high);
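
// Generates a random vector of `size` floats; judging by the parameter name,
// `fractionNonZeros` controls the fraction of entries that are nonzero.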
aligned_vector<float> getRandomSparseVector(
    unsigned size,
    float fractionNonZeros = 1.0);
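
// Touches the buffer `llc` so that previously cached benchmark data is
// displaced from the last-level cache between measurements.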
void llc_flush(std::vector<char>& llc);

// Same as omp_get_max_threads() when OpenMP is available, otherwise 1
int fbgemm_get_max_threads();
// Same as omp_get_num_threads() when OpenMP is available, otherwise 1
int fbgemm_get_num_threads();
// Same as omp_get_thread_num() when OpenMP is available, otherwise 0
int fbgemm_get_thread_num();
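
// Evicts the buffer backing `vec` from the cache hierarchy by flushing each
// 64-byte cache line with _mm_clflush.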
template <typename T>
void cache_evict(const T& vec) {
  auto const size = vec.size();
  auto const elemSize = sizeof(typename T::value_type);
  auto const dataSize = size * elemSize;

  const char* data = reinterpret_cast<const char*>(vec.data());
  constexpr int CACHE_LINE_SIZE = 64;
  for (std::size_t i = 0; i < dataSize; i += CACHE_LINE_SIZE) {
    _mm_clflush(&data[i]);
  }
}

/**
 * Parse application command line arguments
 */
int parseArgumentInt(
    int argc,
    const char* argv[],
    const char* arg,
    int non_exist_val,
    int def_val);

bool parseArgumentBool(
    int argc,
    const char* argv[],
    const char* arg,
    bool def_val);

namespace {
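// No-op data-eviction functor, used as the default `fe` argument of
// measureWithWarmup below.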
struct empty_flush {
  void operator()() const {}
};
} // namespace

/**
 * Runs fn for warmupIterations untimed iterations, then times
 * measuredIterations iterations and returns the average time per iteration
 * in seconds. fe is called before each iteration to evict data from cache;
 * when useOpenMP is true, the measured loop runs inside an OpenMP parallel
 * region and only thread 0 accumulates timings.
 *
 * @param Fn functor to execute
 * @param Fe data eviction functor
 */
template <class Fn, class Fe = std::function<void()>>
double measureWithWarmup(
    Fn&& fn,
    int warmupIterations,
    int measuredIterations,
    const Fe& fe = empty_flush(),
    bool useOpenMP = false) {
  for (int i = 0; i < warmupIterations; ++i) {
    // Evict data first
    fe();
    fn();
  }

  double ttot = 0.0;

#ifdef _OPENMP
#pragma omp parallel if (useOpenMP)
#endif
  for (int i = 0; i < measuredIterations; ++i) {
    int thread_id = 0;
    std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
#ifdef _OPENMP
    if (useOpenMP) {
      thread_id = omp_get_thread_num();
    }
#endif
    if (thread_id == 0) {
      fe();
    }
#ifdef _OPENMP
    if (useOpenMP) {
#pragma omp barrier
    }
#endif
    start = std::chrono::high_resolution_clock::now();
    fn();
    end = std::chrono::high_resolution_clock::now();

    auto dur =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

    if (thread_id == 0) {
      // TODO: measure load imbalance
      ttot += dur.count();
    }
  }

  return ttot / 1e9 / measuredIterations;
}
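
// Example usage sketch (illustration only; `runKernel` and the 128 MiB buffer
// size are hypothetical):
//
//   std::vector<char> llc(128 * 1024 * 1024);
//   double avgSeconds = measureWithWarmup(
//       []() { runKernel(); },
//       /*warmupIterations=*/10,
//       /*measuredIterations=*/100,
//       [&llc]() { llc_flush(llc); });
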
} // namespace fbgemm