forked from notaz/mesa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintel_pps_driver.cc
336 lines (267 loc) · 9.84 KB
/
intel_pps_driver.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
/*
* Copyright © 2020-2021 Collabora, Ltd.
* Author: Antonio Caggiano <[email protected]>
* Author: Corentin Noël <[email protected]>
*
* SPDX-License-Identifier: MIT
*/
#include "intel_pps_driver.h"
#include <dirent.h>
#include <fcntl.h>
#include <math.h>
#include <poll.h>
#include <strings.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "drm-uapi/i915_drm.h"
#include "common/intel_gem.h"
#include "dev/intel_device_info.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_query.h"
#include <pps/pps.h>
#include <pps/pps_algorithm.h>
#include "intel_pps_perf.h"
#include "intel_pps_priv.h"
namespace pps
{
// The HW sampling period is programmed using period_exponent following this
// formula:
// sample_period = timestamp_period * 2^(period_exponent + 1)
// So our minimum sampling period is twice the timestamp period
uint64_t IntelDriver::get_min_sampling_period_ns()
{
   // timestamp_frequency is in Hz, so one timestamp tick lasts
   // 1e9 / timestamp_frequency nanoseconds; the minimum sampling period is
   // two ticks. (The previous expression had the division inverted,
   // multiplying the frequency instead of dividing into nanoseconds.)
   return (2.f * 1000000000) / perf->devinfo.timestamp_frequency;
}
// Default construction; all state is set up lazily in init_perfcnt().
IntelDriver::IntelDriver() = default;
// Out-of-line destructor so std::unique_ptr<IntelPerf> can be destroyed
// where IntelPerf is a complete type.
IntelDriver::~IntelDriver() = default;
void IntelDriver::enable_counter(uint32_t counter_id)
{
auto &counter = counters[counter_id];
enabled_counters.emplace_back(counter);
}
void IntelDriver::enable_all_counters()
{
// We should only have one group
assert(groups.size() == 1);
for (uint32_t counter_id : groups[0].counters) {
auto &counter = counters[counter_id];
enabled_counters.emplace_back(counter);
}
}
/// @brief Initialize the perf backend, pick a metric set and build the
/// counter/group tables exposed to the PPS layer.
/// The metric set defaults to "RenderBasic" and can be overridden through
/// the INTEL_PERFETTO_METRIC_SET environment variable.
/// @return True on success (asserts on fatal setup failures).
bool IntelDriver::init_perfcnt()
{
/* Note: clock_id's below 128 are reserved.. for custom clock sources,
* using the hash of a namespaced string is the recommended approach.
* See: https://perfetto.dev/docs/concepts/clock-sync
*/
this->clock_id = intel_pps_clock_id(drm_device.gpu_num);
assert(!perf && "Intel perf should not be initialized at this point");
perf = std::make_unique<IntelPerf>(drm_device.fd);
// Optional user override of the metric set to sample.
const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");
struct intel_perf_query_info *default_query = nullptr;
selected_query = nullptr;
// Single pass over the available queries: remember the fallback
// ("RenderBasic") and the user-requested set, if any.
for (auto &query : perf->get_queries()) {
if (!strcmp(query->symbol_name, "RenderBasic"))
default_query = query;
if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
selected_query = query;
}
assert(default_query);
if (!selected_query) {
// A requested-but-missing metric set is fatal; list what is available
// to help the user correct the env var.
if (metric_set_name) {
PPS_LOG_ERROR("Available metric sets:");
for (auto &query : perf->get_queries())
PPS_LOG_ERROR(" %s", query->symbol_name);
PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
}
selected_query = default_query;
}
PPS_LOG("Using metric set '%s': %s",
selected_query->symbol_name, selected_query->name);
// Create group
CounterGroup group = {};
group.id = groups.size();
group.name = selected_query->symbol_name;
for (int i = 0; i < selected_query->n_counters; ++i) {
intel_perf_query_counter &counter = selected_query->counters[i];
// Create counter
Counter counter_desc = {};
counter_desc.id = counters.size();
counter_desc.name = counter.symbol_name;
counter_desc.group = group.id;
// NOTE: `counter` is captured by value so the getter stays valid after
// this loop; `this` is captured to read the accumulated perf->result.
counter_desc.getter = [counter, this](
const Counter &c, const Driver &dri) -> Counter::Value {
switch (counter.data_type) {
case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
selected_query,
&perf->result);
break;
case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
return counter.oa_counter_read_float(perf->cfg,
selected_query,
&perf->result);
break;
}
return {};
};
// Add counter id to the group
group.counters.emplace_back(counter_desc.id);
// Store counter
counters.emplace_back(std::move(counter_desc));
}
// Store group
groups.emplace_back(std::move(group));
assert(counters.size() && "Failed to query counters");
// Clear accumulations
intel_perf_query_result_clear(&perf->result);
return true;
}
/// @brief Start OA sampling with the previously selected metric set.
/// @param sampling_period_ns Requested sampling period in nanoseconds.
void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
{
this->sampling_period_ns = sampling_period_ns;
// Capture the upper bits of the GPU timestamp now (the OA reports only
// carry the lower bits, masked by oa_timestamp_mask); parse_perf_records()
// combines these with each report's lower bits.
gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & ~perf->cfg->oa_timestamp_mask;
if (!perf->open(sampling_period_ns, selected_query)) {
PPS_LOG_FATAL("Failed to open intel perf");
}
}
/// @brief Tear down the perf stream and drop all counter bookkeeping.
void IntelDriver::disable_perfcnt()
{
   gpu_timestamp_udw = 0;
   // Destroying the IntelPerf instance closes the OA stream.
   perf.reset();
   enabled_counters.clear();
   counters.clear();
   groups.clear();
}
/// @brief Some perf record durations can be really short
/// @param duration Observed distance between two reports, in nanoseconds.
/// @param sampling_period Configured sampling period, in nanoseconds.
/// @return True if the duration is at least close to the sampling period
/// (within a 100us tolerance below it).
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   // Mathematically `duration > sampling_period - 100000`, rewritten to
   // avoid unsigned underflow when sampling_period < 100000ns, which would
   // otherwise reject nearly every record for short sampling periods.
   return duration + 100000 > sampling_period;
}
/// @brief Transforms the raw data received in from the driver into records
/// @param data Raw bytes read from the i915 perf stream.
/// @param byte_count Number of valid bytes at the front of `data`.
/// @return Records whose spacing is close to the sampling period; each one
/// carries a reconstructed 36-bit GPU timestamp and a copy of the raw report.
std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
const size_t byte_count)
{
std::vector<PerfRecord> records;
records.reserve(128);
PerfRecord record;
record.data.reserve(512);
const uint8_t *iter = data.data();
const uint8_t *end = iter + byte_count;
uint64_t prev_gpu_timestamp = last_gpu_timestamp;
while (iter < end) {
// Iterate a record at a time
auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
// Report is next to the header
const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
uint64_t gpu_timestamp_ldw =
intel_perf_report_timestamp(selected_query, report);
/* Our HW only provides us with the lower 32 bits of the 36bits
* timestamp counter value. If we haven't captured the top bits yet,
* do it now. If we see a roll over the lower 32bits capture it
* again.
*/
if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw | gpu_timestamp_ldw) < last_gpu_timestamp)
gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & ~perf->cfg->oa_timestamp_mask;
// Full timestamp = upper bits (read from the GPU) OR'd with the
// report's lower bits.
uint64_t gpu_timestamp = gpu_timestamp_udw | gpu_timestamp_ldw;
auto duration = intel_device_info_timebase_scale(&perf->devinfo,
gpu_timestamp - prev_gpu_timestamp);
// Skip perf-records that are too short by checking
// the distance between last report and this one
if (close_enough(duration, sampling_period_ns)) {
prev_gpu_timestamp = gpu_timestamp;
// Add the new record to the list
record.timestamp = gpu_timestamp;
record.data.resize(header->size); // Possibly 264?
memcpy(record.data.data(), iter, header->size);
records.emplace_back(record);
}
}
// Go to the next record
iter += header->size;
}
return records;
}
/// @brief Read all the available data from the metric set currently in use
/// Appends into `metric_buffer` at offset `total_bytes_read`, growing the
/// buffer as needed; `total_bytes_read` persists across calls until a parse
/// succeeds (see dump_perfcnt()).
void IntelDriver::read_data_from_metric_set()
{
assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
ssize_t bytes_read = 0;
// Keep reading until the stream is drained; retry on EINTR rather than
// treating an interrupted read as end-of-data.
while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
metric_buffer.size() - total_bytes_read)) > 0 ||
errno == EINTR) {
total_bytes_read += std::max(ssize_t(0), bytes_read);
// Increase size of the buffer for the next read
if (metric_buffer.size() / 2 < total_bytes_read) {
metric_buffer.resize(metric_buffer.size() * 2);
}
}
assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
}
/// @brief Pull new data from the OA stream and parse it into records.
/// @return True when at least two records are queued (enough to accumulate).
bool IntelDriver::dump_perfcnt()
{
   if (!perf->oa_stream_ready())
      return false;

   read_data_from_metric_set();

   auto parsed = parse_perf_records(metric_buffer, total_bytes_read);
   if (parsed.empty()) {
      // No new records from the GPU yet.
      return false;
   }

   // Parsing succeeded, so the buffered bytes are consumed; the next read
   // starts filling the metric buffer from the beginning.
   total_bytes_read = 0;

   APPEND(records, parsed);

   // Accumulation needs a pair of consecutive records.
   return records.size() >= 2;
}
/// @brief Accumulate counter deltas between the two oldest queued records
/// and consume the oldest one.
/// @return The second record's GPU timestamp scaled to nanoseconds, or 0 if
/// fewer than two records are queued.
uint64_t IntelDriver::gpu_next()
{
if (records.size() < 2) {
// Not enough records to accumulate
return 0;
}
// Get first and second
auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
// `header + 1` points at the raw OA report stored right after each header.
intel_perf_query_result_accumulate_fields(&perf->result,
selected_query,
record_a + 1,
record_b + 1,
false /* no_oa_accumulate */);
// Get last timestamp
auto gpu_timestamp = records[1].timestamp;
// Consume first record
records.erase(std::begin(records), std::begin(records) + 1);
return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
}
/// @brief Start a fresh accumulation window and advance to the next sample.
/// @return Timestamp of the consumed sample in nanoseconds, or 0 if none.
uint64_t IntelDriver::next()
{
   // Drop the previous accumulation before folding in the next record pair.
   intel_perf_query_result_clear(&perf->result);
   const uint64_t sample_timestamp = gpu_next();
   return sample_timestamp;
}
uint32_t IntelDriver::gpu_clock_id() const
{
return this->clock_id;
}
/// @brief Read the current GPU timestamp, scaled to nanoseconds.
uint64_t IntelDriver::gpu_timestamp() const
{
   const uint64_t raw_ticks = intel_read_gpu_timestamp(drm_device.fd);
   return intel_device_info_timebase_scale(&perf->devinfo, raw_ticks);
}
} // namespace pps