-
Notifications
You must be signed in to change notification settings - Fork 166
/
Copy pathtime_resumable_tasks.cpp
323 lines (282 loc) · 12 KB
/
time_resumable_tasks.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
/*
Copyright (c) 2005-2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#define TBB_PREVIEW_RESUMABLE_TASKS 1
#include "tbb/tbb_config.h"
#include "tbb/task.h"
#include "tbb/task_group.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
#include "tbb/parallel_for.h"
#include <vector>
#include <stack>
#include <functional>
#include <numeric>
#include <algorithm>
/************************************************************************/
/* SETTINGS */
/************************************************************************/
//! Base number of timed repetitions per serial benchmark: BenchCoroutinesAllocation
//! performs DEF_BENCH_RUNS runs and BenchReusage performs DEF_BENCH_RUNS * 10.
const int DEF_BENCH_RUNS = 1000;
/************************************************************************/
/* HELPERS */
/************************************************************************/
#include "harness_perf.h" // harness_perf::median
//! Returns the median of the collected samples, as computed by harness_perf::median
//! over the whole [begin, end) range of the vector.
/** The vector is taken by non-const reference and handed to harness_perf::median through
    mutable iterators. NOTE(review): the helper may reorder the elements - confirm against
    harness_perf.h before relying on the sample order afterwards. **/
template<typename T>
T get_median(std::vector<T>& times) {
    typename std::vector<T>::iterator first_sample = times.begin();
    typename std::vector<T>::iterator past_last_sample = times.end();
    return harness_perf::median(first_sample, past_last_sample);
}
/************************************************************************/
/* SERIAL BENCHMARKS */
/************************************************************************/
//! Allocate COROUTINES_NUM fibers in a row (suspend) in a recursive manner
//! and then swith back (resume) unwinding the ctx_stack.
void BenchCoroutinesAllocation() {
tbb::task_scheduler_init init(1);
const int COROUTINES_NUM = 100;
std::stack<tbb::task::suspend_point> ctx_stack;
tbb::task_group tg;
std::function<void(int)> recursive_f;
recursive_f = [=, &ctx_stack, &tg, &recursive_f](int i) {
if (i < COROUTINES_NUM) {
tg.run([&recursive_f, i]() {
recursive_f(i + 1);
});
tbb::task::suspend([&ctx_stack](tbb::task::suspend_point ctx) {
ctx_stack.push(ctx);
});
}
if (ctx_stack.size() != 0) {
tbb::task::suspend_point ctx = ctx_stack.top(); ctx_stack.pop();
tbb::task::resume(ctx);
}
};
tg.run([=, &recursive_f]() {
std::vector<double> times;
for (int i = 0; i < DEF_BENCH_RUNS; i++) {
tbb::tick_count tick = tbb::tick_count::now();
recursive_f(1);
double interval = (tbb::tick_count::now() - tick).seconds() * 1e6;
times.push_back(interval);
}
// COROUTINES_NUM suspend and resume operations in each run
double median = get_median(times) / double(COROUTINES_NUM);
printf("Test 1 (Coroutines alloc/dealloc): Median time (microseconds): %.4f\n", median);
});
tg.wait();
}
//! Create a task, which suspends and resumes itself, thus reusing once created coroutine
/** Runs on a single-threaded scheduler. Every suspend/resume round trip is timed separately
    (DEF_BENCH_RUNS * 10 samples); the median, in microseconds, is printed. **/
void BenchReusage() {
    tbb::task_scheduler_init init(1);
    tbb::task_group tg;
    std::vector<double> times;
    // The samples are written by the task and read after tg.wait(), so the vector
    // has to be captured by reference.
    tg.run([&times]() {
        for (int i = 0; i < DEF_BENCH_RUNS * 10; i++) {
            tbb::tick_count tick = tbb::tick_count::now();
            // Resume the just-captured suspend point right away
            tbb::task::suspend([](tbb::task::suspend_point ctx) {
                tbb::task::resume(ctx);
            });
            // Seconds -> microseconds
            double diff = (tbb::tick_count::now() - tick).seconds() * 1e6;
            times.push_back(diff);
        }
    });
    tg.wait();
    double median = get_median(times);
    printf("Test 2 (Coroutine reusage): Median time (microseconds): %.4f\n", median);
}
//! Create two tasks and switch between them (suspend current and resume previously suspended coroutine)
//! Measure an average time of the context switch
/** Runs on a single-threaded scheduler. The experiment is repeated 100 times; each repetition
    performs N switches shared between the two tasks and contributes one sample
    (elapsed time / N, in microseconds). The median of the samples is printed. **/
void BenchContextSwitch() {
    tbb::task_scheduler_init init(1);
    tbb::task_group tg;
    const int N = 10000; // number of switches
    const int tasks_num = 2;
    std::vector<double> times;
    for (int i = 0; i < 100; ++i) {
        // Both tasks share the remaining-switches counter and the suspend point
        // of whichever task is currently parked, hence the by-reference captures below.
        int switch_counter = N;
        tbb::task::suspend_point current_ctx = nullptr;
        tbb::tick_count tick = tbb::tick_count::now();
        for (int j = 0; j < tasks_num; ++j) {
            tg.run([=, &switch_counter, &current_ctx]() {
                while (switch_counter-- > 0) {
                    tbb::task::suspend([=, &switch_counter, &current_ctx](tbb::task::suspend_point ctx) {
                        if (switch_counter == N - 1) {
                            // Very first suspension: nobody is parked yet, just remember ourselves
                            current_ctx = ctx;
                        } else {
                            // Park ourselves and wake up the task that was parked before
                            tbb::task::suspend_point ctx_to_resume = current_ctx;
                            current_ctx = ctx;
                            tbb::task::resume(ctx_to_resume);
                        }
                    });
                }
                // The task that observes the counter right after it ran out
                // wakes up the other, still parked, task so that it can finish too
                if (switch_counter == -1) {
                    tbb::task::resume(current_ctx);
                }
            });
        }
        tg.wait();
        // To get an average context switch time divide the bench time by the number of context switches
        double diff = ((tbb::tick_count::now() - tick).seconds() / double(N)) * 1e6;
        times.push_back(diff);
    }
    printf("Test 3 (Context Switch): Median time (microseconds): %.4f\n", get_median(times));
}
/************************************************************************/
/* PARALLEL BENCHMARKS */
/************************************************************************/
//! Strong scaling benchmark with predefined number of iterations (N), each parallel_for task
//! suspends and resumes itself with a predefined busy-waiting iterations (work size).
//! Reports the serial, half of the machine, and full available concurrency
/** @tparam UseResumableTasks when true every iteration does a suspend/resume round trip
                              before the busy-wait loop
    @param work_size number of busy-wait loop iterations executed by every parallel_for iteration
    For each concurrency level 100 parallel_for invocations are timed and the median,
    in milliseconds, is printed. **/
template <bool UseResumableTasks>
void ScalabilityBenchmark(const size_t work_size) {
    const int N = 1000;
    const int NUM_THREADS = tbb::task_scheduler_init::default_num_threads();
    const int STEP_RATIO = 2;
    // NUM_THREADS / STEP_RATIO is zero when only one thread is available; a zero step
    // would never advance the loop below, so it is clamped to at least 1.
    const int step = std::max(1, NUM_THREADS / STEP_RATIO);
    // Count scalability metrics: the serial, half and full machine concurrency
    for (int i = 0; i <= NUM_THREADS; i += step) {
        const int concurrency = (i == 0) ? 1 : i; // just to make step loop nice looking
        tbb::task_scheduler_init init(concurrency);
        std::vector<double> times;
        for (int j = 0; j < 100; j++) {
            tbb::tick_count tick = tbb::tick_count::now();
            tbb::parallel_for(0, N, [&work_size](const int /*j*/) {
                if (UseResumableTasks) {
                    tbb::task::suspend([](tbb::task::suspend_point ctx) {
                        tbb::task::resume(ctx);
                    });
                }
                // volatile counter keeps the empty busy-wait loop from being optimized away
                for (volatile size_t k = 0; k < work_size; ++k);
            }, tbb::simple_partitioner());
            // Seconds -> milliseconds
            double diff = (tbb::tick_count::now() - tick).seconds() * 1e3;
            times.push_back(diff);
        }
        printf("Test 4 (Scalability): Work Size: %zu, With RT-feature: %s, Concurrency: %d, Time (milliseconds): %.4f\n",
            work_size, (UseResumableTasks ? "true" : "false"), concurrency, get_median(times));
    }
}
/************************************************************************/
/* NATIVE IMPLEMENTATION */
/************************************************************************/
// Dependencies section for co_context.h
#if _WIN32
#include <windows.h> // GetSystemInfo
#else
#include <unistd.h> // sysconf(_SC_PAGESIZE)
#endif
namespace tbb {
namespace internal {
//! Asks the operating system for the size of a regular memory page
inline size_t GetDefaultSystemPageSize() {
#if _WIN32
    SYSTEM_INFO system_info;
    GetSystemInfo(&system_info);
    return system_info.dwPageSize;
#else
    return sysconf(_SC_PAGESIZE);
#endif
}
//! Minimal stand-in exposing only the page size accessor
class governor {
    //! Cached size of OS regular memory page; zero until the first query
    static size_t DefaultPageSize;
public:
    //! Static accessor for OS regular memory page size
    /** The system is queried on the first call (while the cached value is still zero);
        later calls return the cached value. **/
    static size_t default_page_size () {
        if (DefaultPageSize == 0) {
            DefaultPageSize = GetDefaultSystemPageSize();
        }
        return DefaultPageSize;
    }
};
size_t governor::DefaultPageSize;
} // namespace internal
} // namespace tbb
// No-op versions of __TBB_ASSERT/EX for co_context.h header
// __TBB_ASSERT drops its predicate entirely (the predicate is never evaluated).
#define __TBB_ASSERT(predicate,comment) ((void)0)
// __TBB_ASSERT_EX still evaluates the predicate, but discards the result.
#define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate)))
// TBB coroutines implementation
// Disable governor header to remove the dependency
// NOTE(review): _TBB_governor_H is presumably the include guard of governor.h, so that
// co_context.h picks up the stand-in governor class defined earlier in this file - confirm.
#define _TBB_governor_H
#include "../tbb/co_context.h"
using namespace tbb::internal;
// The guard macro is only needed while co_context.h is being included
#undef _TBB_governor_H
// NOTE(review): presumably tells harness.h not to provide its own main(),
// since this file defines main() itself - confirm against harness.h.
#define HARNESS_CUSTOM_MAIN 1
#include "../test/harness.h" // NativeParallelFor
namespace tbb {
namespace internal {
// Our native coroutine function
#if _WIN32
/* [[noreturn]] */ inline void __stdcall co_local_wait_for_all(void* arg) {
#else
/* [[noreturn]] */ inline void co_local_wait_for_all(void* arg) {
#endif
coroutine_type next = *static_cast<coroutine_type*>(arg);
coroutine_type current; current_coroutine(current);
swap_coroutine(current, next);
}
} // namespace internal
} // namespace tbb
// The same scalability benchmark as for TBB, but written with native OS fibers implementation
/** @param work_size number of busy-wait loop iterations executed per emulated task
    For each concurrency level (serial, half and full machine) the N emulated tasks are split
    evenly between NativeParallelFor threads; every emulated task creates a coroutine, switches
    to it, busy-waits and destroys the coroutine. 100 repetitions are timed and the median,
    in milliseconds, is printed. **/
void BenchNativeImpl(const size_t work_size) {
    const int N = 1000;
    const int NUM_THREADS = tbb::task_scheduler_init::default_num_threads();
    const int STEP_RATIO = 2;
    const size_t STACK_SIZE = 4 * 1024 * 1024; // Just like default TBB worker thread stack size
    // NUM_THREADS / STEP_RATIO is zero when only one thread is available; a zero step
    // would never advance the loop below, so it is clamped to at least 1.
    const int step = std::max(1, NUM_THREADS / STEP_RATIO);
    // Count scalability metrics: the serial, half and full machine concurrency
    for (int i = 0; i <= NUM_THREADS; i += step) {
        const int concurrency = (i == 0) ? 1 : i; // just to make step loop nice looking
        // Number of emulated tasks handled by each thread
        const int sub_range = N / concurrency;
        std::vector<double> times;
        for (int r = 0; r < 100; r++) {
            tbb::tick_count tick = tbb::tick_count::now();
            NativeParallelFor(concurrency, [=, &work_size, &sub_range](int /*idx*/) {
                // Each iteration of sub-range emulates a single TBB task
                for (int j = 0; j < sub_range; j++) {
                    coroutine_type co_next;
                    coroutine_type co_current; current_coroutine(co_current);
                    // The new coroutine receives the address of the current one as its argument
                    create_coroutine(co_next, STACK_SIZE, &co_current);
                    swap_coroutine(co_current, co_next);
                    // Busy-wait for a while emulating some work
                    for (volatile size_t k = 0; k < work_size; ++k);
                    destroy_coroutine(co_next);
                }
            });
            // Seconds -> milliseconds
            double diff = (tbb::tick_count::now() - tick).seconds() * 1e3;
            times.push_back(diff);
        }
        printf("Test 5 (Native Implementation): Work size: %zu, Concurrency: %d, Time (milliseconds): %.4f\n",
            work_size, concurrency, get_median(times));
    }
}
/************************************************************************/
/* MAIN DRIVER */
/************************************************************************/
//! Runs the serial microbenchmarks, then the TBB scalability benchmarks,
//! then the native coroutine implementation of the scalability benchmark.
int main() {
    // Work sizes (busy-wait iterations) used by the scalability runs
    const size_t BIG_WORK = 100000;
    const size_t SMALL_WORK = 1000;
    const size_t NO_WORK = 0;

    // Serial microbenchmarks
    BenchCoroutinesAllocation();
    BenchReusage();
    BenchContextSwitch();

    // Scalability benchmarks: for the big and then the small work size, first without
    // the resumable tasks feature (false) and then with it (true)
    const size_t tbb_work_sizes[] = { BIG_WORK, SMALL_WORK };
    for (const size_t work : tbb_work_sizes) {
        ScalabilityBenchmark<false>(work);
        ScalabilityBenchmark<true>(work);
    }
    // No work at all, just the resumable tasks feature (true)
    ScalabilityBenchmark<true>(NO_WORK);

    // Native implementation: big work, small work, and bare coroutine/fiber switching
    const size_t native_work_sizes[] = { BIG_WORK, SMALL_WORK, NO_WORK };
    for (const size_t work : native_work_sizes) {
        BenchNativeImpl(work);
    }
    return 0;
}