[Profiler] Prefer TSC to wall clock when available (pytorch#73855)
Summary:
Pull Request resolved: pytorch#73855

Calling the clock is one of the most expensive parts of profiling. We can reduce the profiling overhead by using `rdtsc` instead. The tradeoff is that we have to measure the relationship between the two clocks and convert back to wall-clock time (a shift and a scale) during post-processing.
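
The fit itself is linear: pair up readings from both clocks, estimate nanoseconds per tick, and anchor the line at a reference point. A minimal standalone sketch of the "measure and convert" idea (illustrative only: a single-pair fit with made-up names; the implementation below uses 1001 replicated pairs and medians):

#include <chrono>
#include <cstdint>
#include <x86intrin.h> // __rdtsc; x86 gcc/clang only

// Nanoseconds since the Unix epoch from the wall clock.
int64_t wall_ns() {
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::system_clock::now().time_since_epoch())
      .count();
}

int main() {
  // Sample both clocks at (approximately) the same two instants.
  int64_t w0 = wall_ns();
  uint64_t c0 = __rdtsc();
  // ... profiled workload runs here, recording raw __rdtsc() counts ...
  int64_t w1 = wall_ns();
  uint64_t c1 = __rdtsc();

  // Scale: nanoseconds per TSC tick. Shift: anchor the line at (c0, w0).
  double ns_per_tick =
      static_cast<double>(w1 - w0) / static_cast<double>(c1 - c0);
  auto count_to_ns = [=](uint64_t c) {
    return w0 + static_cast<int64_t>(static_cast<double>(c - c0) * ns_per_tick);
  };
  (void)count_to_ns; // applied to recorded counts during post-processing
  return 0;
}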

Test Plan: I added a cpp unit test with *very* aggressive anti-flake measures. I also ran the overhead benchmark (9 replicates) with `--stressTestKineto` (0.94 -> 0.89 us) and `--stressTestKineto --kinetoProfileMemory` (1.27 -> 1.17 us)

Reviewed By: chaekit

Differential Revision: D34231071

fbshipit-source-id: e3b3dd7580d93bcc783e87c7f2fc726cb74f4df8
(cherry picked from commit e8be9f8)
Taylor Robie authored and pytorchmergebot committed Mar 13, 2022
1 parent 198d727 commit 0b1f3bd
Showing 4 changed files with 222 additions and 35 deletions.
33 changes: 33 additions & 0 deletions test/cpp/profiler/containers.cpp
@@ -1,10 +1,13 @@
#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

#include <gtest/gtest.h>

#include <c10/util/irange.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/util.h>

TEST(ProfilerTest, AppendOnlyList) {
const int n = 4096;
@@ -41,3 +44,33 @@ TEST(ProfilerTest, AppendOnlyList_ref) {
ASSERT_EQ(i.first, expected++);
}
}

// Test that we can convert TSC measurements back to wall clock time.
TEST(ProfilerTest, clock_converter) {
const int n = 10001;
torch::profiler::impl::ApproximateClockToUnixTimeConverter converter;
std::vector<torch::profiler::impl::ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair> pairs;
for (const auto i : c10::irange(n)) {
pairs.push_back(torch::profiler::impl::ApproximateClockToUnixTimeConverter::measurePair());
}
auto count_to_ns = converter.makeConverter();
std::vector<int64_t> deltas;
for (const auto& i : pairs) {
deltas.push_back(i.t_ - count_to_ns(i.approx_t_));
}
std::sort(deltas.begin(), deltas.end());

// In general it's not a good idea to put clocks in unit tests as it leads
// to flakiness. We mitigate this by:
// 1) Testing the clock itself. While the time to complete a task may
// vary, two clocks measuring the same time should be much more
// consistent.
// 2) Only testing the interquartile range. Context switches between
// calls to the two timers do occur and can result in hundreds of
// nanoseconds of noise, but such switches are only a few percent
// of cases.
// 3) We're willing to accept a somewhat large bias which can emerge from
// differences in the cost of calling each clock.
EXPECT_LT(std::abs(deltas[n / 2]), 200);
EXPECT_LT(deltas[n * 3 / 4] - deltas[n / 4], 50);
}
66 changes: 42 additions & 24 deletions torch/csrc/autograd/profiler_kineto.cpp
@@ -118,11 +118,18 @@ namespace {
using torch::profiler::impl::ProfilerThreadLocalStateBase;
using torch::profiler::impl::ActiveProfilerType;

// `reportBackendEventToActiveKinetoProfiler` reports times rather than
// counts, so `OpEventData` has to be able to store both representations.
union TimeStamp {
int64_t us_; // Backend event.
torch::profiler::impl::approx_time_t count_;
};

// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct OpEventData {
// POD members
int64_t start_us_;
int64_t end_us_;
TimeStamp start_time_;
TimeStamp end_time_;
uint64_t correlation_id_;
uint64_t start_thread_id_;
uint64_t end_thread_id_;
@@ -157,7 +164,7 @@ struct OpEventData {
};

struct MemoryEventData {
int64_t start_time;
torch::profiler::impl::approx_time_t start_time;
void* ptr;
int64_t alloc_size;
int64_t total_allocated;
@@ -219,7 +226,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
std::lock_guard<std::mutex> guard(state_mutex_);
memory_events_.emplace_back(
getTimeUs(),
torch::profiler::impl::getApproximateTime(),
ptr,
alloc_size,
total_allocated,
@@ -265,22 +272,24 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {

void materializeOpEvents() {
std::lock_guard<std::mutex> guard(state_mutex_);
auto converter = clock_converter_.makeConverter();

for (const auto& e : memory_events_) {
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
e.kineto_info,
e.start_time,
c10::Device(e.device_type, e.device_index),
e.ptr,
e.alloc_size,
e.total_allocated,
e.total_reserved);
auto start_time_us = converter(e.start_time) / 1000;
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
e.kineto_info,
start_time_us,
c10::Device(e.device_type, e.device_index),
e.ptr,
e.alloc_size,
e.total_allocated,
e.total_reserved);

kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
evt.name(kMemoryEventName)
.startUs(e.start_time)
.startUs(start_time_us)
.deviceIndex(e.device_index)
.deviceType(e.device_type)
.nBytes(e.alloc_size)
@@ -289,7 +298,15 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
memory_events_.clear();

for (const auto& e : op_events_) {
if (e.end_us_ < e.start_us_) {
int64_t start_us = e.backend_.has_value()
? e.start_time_.us_
: converter(e.start_time_.count_) / 1000;

int64_t end_us = e.backend_.has_value()
? e.end_time_.us_
: converter(e.end_time_.count_) / 1000;

if (end_us < start_us) {
// We initialize end_time_.count_ to the smallest representable value, so
// this means that the op did not finish before we stopped profiling.
continue;
@@ -299,14 +316,14 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
e.name_,
e.kineto_info_,
e.correlation_id_,
e.start_us_,
e.end_us_);
start_us,
end_us);

kineto_events_.emplace_back();
kineto_events_.back()
.name(e.name_)
.startUs(e.start_us_)
.durationUs(e.end_us_ - e.start_us_)
.startUs(start_us)
.durationUs(end_us - start_us)
.correlationId(e.correlation_id_)
.deviceType(c10::DeviceType::CPU)
.startThreadId(e.start_thread_id_)
@@ -613,6 +630,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
}

uint64_t start_time_;
torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_;
std::set<torch::profiler::impl::ActivityType> activities_;
torch::profiler::impl::AppendOnlyList<OpEventData, 1024> op_events_;
torch::profiler::impl::AppendOnlyList<MemoryEventData, 1024> memory_events_;
@@ -640,7 +658,7 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
auto ctx_ptr = state_ptr->newOpEvent();
auto data_ptr = ctx_ptr->data_;

data_ptr->end_us_ = std::numeric_limits<int64_t>::min();
data_ptr->end_time_.count_ = std::numeric_limits<torch::profiler::impl::approx_time_t>::min();
data_ptr->correlation_id_ = corr_id;
data_ptr->start_thread_id_ = fn.threadId();
data_ptr->sequence_number_ = fn.seqNr();
@@ -670,7 +688,7 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
if (config.with_flops) {
data_ptr->extra_args_ = torch::profiler::impl::saveExtraArgs(fn);
}
data_ptr->start_us_ = getTimeUs();
data_ptr->start_time_.count_ = torch::profiler::impl::getApproximateTime();

if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
try {
@@ -692,7 +710,7 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
static_cast<KinetoObserverContext*>(ctx_ptr);
TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
auto data_ptr = kineto_ctx_ptr->data_;
data_ptr->end_us_ = getTimeUs();
data_ptr->end_time_.count_ = torch::profiler::impl::getApproximateTime();
data_ptr->end_thread_id_ = at::RecordFunction::currentThreadId();

if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
@@ -728,8 +746,8 @@ void reportBackendEventToActiveKinetoProfiler(

auto ctx_ptr = state_ptr->newOpEvent();
auto data_ptr = ctx_ptr->data_;
data_ptr->start_us_ = start_time_us;
data_ptr->end_us_ = end_time_us;
data_ptr->start_time_.us_ = start_time_us;
data_ptr->end_time_.us_ = end_time_us;
data_ptr->correlation_id_ = std::numeric_limits<uint64_t>::max();
data_ptr->start_thread_id_ = at::RecordFunction::currentThreadId();
data_ptr->end_thread_id_ = data_ptr->start_thread_id_;
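Taken together, the two paths above populate opposite members of the `TimeStamp` union: backend events arrive with wall-clock microseconds (`us_`), while ops recorded by the callbacks hold raw counts (`count_`), and `materializeOpEvents` dispatches on `backend_.has_value()`. A condensed, self-contained sketch of that dispatch (types simplified and renamed from the diff):

#include <cstdint>
#include <functional>
#include <optional>
#include <string>

using approx_time_t = uint64_t; // TSC count; see util.h

union TimeStamp {
  int64_t us_;          // backend events arrive as wall-clock microseconds
  approx_time_t count_; // regular ops record a raw TSC count
};

struct Event {
  TimeStamp start_;
  std::optional<std::string> backend_; // engaged only for backend events
};

// Resolve an event's start time to microseconds, converting raw counts
// through the fitted clock converter only when necessary.
int64_t startUs(const Event& e,
                const std::function<int64_t(approx_time_t)>& count_to_ns) {
  return e.backend_.has_value() ? e.start_.us_
                                : count_to_ns(e.start_.count_) / 1000;
}
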
70 changes: 70 additions & 0 deletions torch/csrc/profiler/util.cpp
@@ -13,6 +13,76 @@ namespace torch {
namespace profiler {
namespace impl {

ApproximateClockToUnixTimeConverter::ApproximateClockToUnixTimeConverter()
: start_times_(measurePairs()) {}

ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair
ApproximateClockToUnixTimeConverter::measurePair() {
// Take a measurement on either side to avoid an ordering bias.
auto fast_0 = getApproximateTime();
auto wall = std::chrono::system_clock::now();
auto fast_1 = getApproximateTime();

TORCH_INTERNAL_ASSERT(fast_1 >= fast_0, "getApproximateTime is non-monotonic.");
auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(
wall.time_since_epoch());

// `x + (y - x) / 2` is a more numerically stable average than `(x + y) / 2`.
return {t.count(), fast_0 + (fast_1 - fast_0) / 2};
}
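
For unsigned 64-bit counts the concrete hazard is wrap-around rather than rounding; a small demonstration of why the midpoint is computed this way (illustrative, not part of the commit):

#include <cstdint>
#include <iostream>

int main() {
  uint64_t x = UINT64_MAX - 10; // a count near the wrap-around point
  uint64_t y = UINT64_MAX - 4;
  std::cout << (x + y) / 2 << "\n";     // x + y overflows: off by 2^63
  std::cout << x + (y - x) / 2 << "\n"; // correct midpoint
  return 0;
}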

ApproximateClockToUnixTimeConverter::time_pairs
ApproximateClockToUnixTimeConverter::measurePairs() {
static constexpr auto n_warmup = 5;
for (const auto _ : c10::irange(n_warmup)) {
getApproximateTime();
steady_clock_t::now();
}

time_pairs out;
for (const auto i : c10::irange(out.size())) {
out[i] = measurePair();
}
return out;
}

std::function<time_t(approx_time_t)>
ApproximateClockToUnixTimeConverter::makeConverter() {
auto end_times = measurePairs();

// Compute the real time that passes for each tick of the approximate clock.
std::array<long double, replicates> scale_factors{};
for (const auto i : c10::irange(replicates)) {
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
scale_factors[i] = (double)delta_ns / (double)delta_approx;
}
std::sort(scale_factors.begin(), scale_factors.end());
long double scale_factor = scale_factors[replicates / 2 + 1];

// We shift all times by `t0` for better numerics. Double precision only has
// 16 decimal digits of accuracy, so if we blindly multiply times by
// `scale_factor` we may suffer from precision loss. The choice of `t0` is
// mostly arbitrary; we just need a factor that is the correct order of
// magnitude to bring the intermediate values closer to zero. We are not,
// however, guaranteed that `t0_approx` is *exactly* the getApproximateTime
// equivalent of `t0`; it is only an estimate that we have to fine-tune.
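// (Concretely: Unix time in nanoseconds is ~1.6e18, while a double is only
// exact up to 2^53 ≈ 9.0e15; the representable spacing near 1.6e18 is 256 ns,
// so multiplying raw epoch-scale values could shift times by hundreds of
// nanoseconds.)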
auto t0 = start_times_[0].t_;
auto t0_approx = start_times_[0].approx_t_;
std::array<double, replicates> t0_correction{};
for (const auto i : c10::irange(replicates)) {
auto dt = start_times_[i].t_ - t0;
auto dt_approx = (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
t0_correction[i] = dt - (time_t)dt_approx;
}
t0 += t0_correction[t0_correction.size() / 2 + 1];

return [=](approx_time_t t_approx) {
// See above for why this is more stable than `A * t_approx + B`.
return (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0;
};
}
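
End to end, the converter is meant to bracket a profiling session: construct it when profiling starts (capturing `start_times_`), record raw `getApproximateTime()` counts on the hot path, and call `makeConverter()` once at the end to translate them. A hedged usage sketch (the session shape is illustrative, not an API from this commit):

#include <torch/csrc/profiler/util.h>
#include <cstdint>
#include <vector>

void profileSession() {
  using namespace torch::profiler::impl;
  ApproximateClockToUnixTimeConverter converter; // start-of-session pairs
  std::vector<approx_time_t> events;

  for (int i = 0; i < 1000; ++i) {
    events.push_back(getApproximateTime()); // cheap hot-path reads
  }

  auto count_to_ns = converter.makeConverter(); // end-of-session fit
  for (const auto c : events) {
    int64_t unix_ns = count_to_ns(c); // nanoseconds since the Unix epoch
    (void)unix_ns;
  }
}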

// ----------------------------------------------------------------------------
// -- NVTX --------------------------------------------------------------------
// ----------------------------------------------------------------------------
88 changes: 77 additions & 11 deletions torch/csrc/profiler/util.h
@@ -18,26 +18,50 @@
#include <sys/time.h> // for gettimeofday()
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
#define C10_RDTSC
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__CUDACC__) || defined(__HIPCC__)
#undef C10_RDTSC
#elif defined(__clang__)
// `__rdtsc` is available by default.
// NB: This has to be first, because Clang will also define `__GNUC__`
#elif defined(__GNUC__)
#include <x86intrin.h>
#else
#undef C10_RDTSC
#endif
#endif

namespace torch {
namespace profiler {
namespace impl {

inline int64_t getTime(bool allow_monotonic = false) {
using time_t = int64_t;
using steady_clock_t = std::conditional<
std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;

inline time_t getTimeSinceEpoch() {
auto now = std::chrono::system_clock::now().time_since_epoch();
return std::chrono::duration_cast<std::chrono::nanoseconds>(now).count();
}

inline time_t getTime(bool allow_monotonic = false) {
#if defined(C10_IOS) && defined(C10_MOBILE)
// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS
// can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime
// is implemented or not
struct timeval now;
gettimeofday(&now, NULL);
return static_cast<int64_t>(now.tv_sec) * 1000000000 +
static_cast<int64_t>(now.tv_usec) * 1000;
return static_cast<time_t>(now.tv_sec) * 1000000000 +
static_cast<time_t>(now.tv_usec) * 1000;
#elif defined(_WIN32) || defined(__MACH__)
using namespace std::chrono;
using clock = std::conditional<
high_resolution_clock::is_steady,
high_resolution_clock,
steady_clock>::type;
return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
return std::chrono::duration_cast<std::chrono::nanoseconds>(
steady_clock_t::now().time_since_epoch())
.count();
#else
// clock_gettime is *much* faster than std::chrono implementation on Linux
struct timespec t {};
@@ -46,11 +70,53 @@ inline int64_t getTime(bool allow_monotonic = false) {
mode = CLOCK_MONOTONIC;
}
clock_gettime(mode, &t);
return static_cast<int64_t>(t.tv_sec) * 1000000000 +
static_cast<int64_t>(t.tv_nsec);
return static_cast<time_t>(t.tv_sec) * 1000000000 +
static_cast<time_t>(t.tv_nsec);
#endif
}

// We often do not need to capture true wall times. If a fast mechanism such
// as TSC is available we can use that instead and convert back to epoch time
// during post-processing. This greatly reduces the clock's contribution to
// profiling overhead.
// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/
// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io
// TODO: We should use
// `https://github.com/google/benchmark/blob/main/src/cycleclock.h`
inline auto getApproximateTime() {
#if defined(C10_RDTSC)
return static_cast<uint64_t>(__rdtsc());
#else
return getTime();
#endif
}

using approx_time_t = decltype(getApproximateTime());
static_assert(
std::is_same<approx_time_t, int64_t>::value ||
std::is_same<approx_time_t, uint64_t>::value,
"Expected either int64_t (`getTime`) or uint64_t (some TSC reads).");

// Convert `getApproximateTime` results to nanoseconds since the Unix epoch.
class ApproximateClockToUnixTimeConverter final {
public:
ApproximateClockToUnixTimeConverter();
std::function<time_t(approx_time_t)> makeConverter();

struct UnixAndApproximateTimePair {
time_t t_;
approx_time_t approx_t_;
};
static UnixAndApproximateTimePair measurePair();

private:
static constexpr size_t replicates = 1001;
using time_pairs = std::array<UnixAndApproximateTimePair, replicates>;
time_pairs measurePairs();

time_pairs start_times_;
};

std::string getNvtxStr(
const char* name,
int64_t sequence_nr,
