[Profiler] Prefer TSC to wall clock when available (pytorch#73855)
Summary:
Pull Request resolved: pytorch#73855

Calling the clock is one of the most expensive parts of profiling. We can reduce the profiling overhead by using `rdtsc` instead. The tradeoff is that we have to measure the relationship between the two clocks and convert back to wall-clock time (a shift and a scale) during post-processing.
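
The fit itself is linear: pair up readings from both clocks, estimate nanoseconds per tick, and anchor the line at a reference point. A minimal standalone sketch of the "measure and convert" idea (illustrative only: a single-pair fit with made-up names; the implementation below uses 1001 replicated pairs and medians):

#include <chrono>
#include <cstdint>
#include <x86intrin.h> // __rdtsc; x86 gcc/clang only

// Nanoseconds since the Unix epoch from the wall clock.
int64_t wall_ns() {
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::system_clock::now().time_since_epoch())
      .count();
}

int main() {
  // Sample both clocks at (approximately) the same two instants.
  int64_t w0 = wall_ns();
  uint64_t c0 = __rdtsc();
  // ... profiled workload runs here, recording raw __rdtsc() counts ...
  int64_t w1 = wall_ns();
  uint64_t c1 = __rdtsc();

  // Scale: nanoseconds per TSC tick. Shift: anchor the line at (c0, w0).
  double ns_per_tick =
      static_cast<double>(w1 - w0) / static_cast<double>(c1 - c0);
  auto count_to_ns = [=](uint64_t c) {
    return w0 + static_cast<int64_t>(static_cast<double>(c - c0) * ns_per_tick);
  };
  (void)count_to_ns; // applied to recorded counts during post-processing
  return 0;
}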

Test Plan: I added a cpp unit test with *very* aggressive anti-flake measures. I also ran the overhead benchmark (9 replicates) with `--stressTestKineto` (0.94 -> 0.89 us) and `--stressTestKineto --kinetoProfileMemory` (1.27 -> 1.17 us)

Reviewed By: chaekit

Differential Revision: D34231071

fbshipit-source-id: e3b3dd7580d93bcc783e87c7f2fc726cb74f4df8
(cherry picked from commit e8be9f8)
Taylor Robie authored and pytorchmergebot committed Mar 13, 2022
1 parent 198d727 commit 0b1f3bd
Showing 4 changed files with 222 additions and 35 deletions.
33 changes: 33 additions & 0 deletions test/cpp/profiler/containers.cpp
@@ -1,10 +1,13 @@
#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

#include <gtest/gtest.h>

#include <c10/util/irange.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/util.h>

TEST(ProfilerTest, AppendOnlyList) {
const int n = 4096;
@@ -41,3 +44,33 @@ TEST(ProfilerTest, AppendOnlyList_ref) {
ASSERT_EQ(i.first, expected++);
}
}

// Test that we can convert TSC measurements back to wall clock time.
TEST(ProfilerTest, clock_converter) {
const int n = 10001;
torch::profiler::impl::ApproximateClockToUnixTimeConverter converter;
std::vector<torch::profiler::impl::ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair> pairs;
for (const auto i : c10::irange(n)) {
pairs.push_back(torch::profiler::impl::ApproximateClockToUnixTimeConverter::measurePair());
}
auto count_to_ns = converter.makeConverter();
std::vector<int64_t> deltas;
for (const auto& i : pairs) {
deltas.push_back(i.t_ - count_to_ns(i.approx_t_));
}
std::sort(deltas.begin(), deltas.end());

// In general it's not a good idea to put clocks in unit tests as it leads
// to flakiness. We mitigate this by:
// 1) Testing the clock itself. While the time to complete a task may
// vary, two clocks measuring the same time should be much more
// consistent.
// 2) Only testing the interquartile range. Context switches between
// calls to the two timers do occur and can result in hundreds of
// nanoseconds of noise, but such switches are only a few percent
// of cases.
// 3) We're willing to accept a somewhat large bias which can emerge from
// differences in the cost of calling each clock.
EXPECT_LT(std::abs(deltas[n / 2]), 200);
EXPECT_LT(deltas[n * 3 / 4] - deltas[n / 4], 50);
}
66 changes: 42 additions & 24 deletions torch/csrc/autograd/profiler_kineto.cpp
@@ -118,11 +118,18 @@ namespace {
using torch::profiler::impl::ProfilerThreadLocalStateBase;
using torch::profiler::impl::ActiveProfilerType;

// `reportBackendEventToActiveKinetoProfiler` reports times rather than
// counts, so `OpEventData` has to be able to store both representations.
union TimeStamp {
int64_t us_; // Backend event.
torch::profiler::impl::approx_time_t count_;
};

// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct OpEventData {
// POD members
int64_t start_us_;
int64_t end_us_;
TimeStamp start_time_;
TimeStamp end_time_;
uint64_t correlation_id_;
uint64_t start_thread_id_;
uint64_t end_thread_id_;
@@ -157,7 +164,7 @@ struct OpEventData {
};

struct MemoryEventData {
int64_t start_time;
torch::profiler::impl::approx_time_t start_time;
void* ptr;
int64_t alloc_size;
int64_t total_allocated;
@@ -219,7 +226,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
std::lock_guard<std::mutex> guard(state_mutex_);
memory_events_.emplace_back(
getTimeUs(),
torch::profiler::impl::getApproximateTime(),
ptr,
alloc_size,
total_allocated,
@@ -265,22 +272,24 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {

void materializeOpEvents() {
std::lock_guard<std::mutex> guard(state_mutex_);
auto converter = clock_converter_.makeConverter();

for (const auto& e : memory_events_) {
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
e.kineto_info,
e.start_time,
c10::Device(e.device_type, e.device_index),
e.ptr,
e.alloc_size,
e.total_allocated,
e.total_reserved);
auto start_time_us = converter(e.start_time) / 1000;
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
e.kineto_info,
start_time_us,
c10::Device(e.device_type, e.device_index),
e.ptr,
e.alloc_size,
e.total_allocated,
e.total_reserved);

kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
evt.name(kMemoryEventName)
.startUs(e.start_time)
.startUs(start_time_us)
.deviceIndex(e.device_index)
.deviceType(e.device_type)
.nBytes(e.alloc_size)
@@ -289,7 +298,15 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
memory_events_.clear();

for (const auto& e : op_events_) {
if (e.end_us_ < e.start_us_) {
int64_t start_us = e.backend_.has_value()
? e.start_time_.us_
: converter(e.start_time_.count_) / 1000;

int64_t end_us = e.backend_.has_value()
? e.end_time_.us_
: converter(e.end_time_.count_) / 1000;

if (end_us < start_us) {
// We initialize end_time_.count_ to the smallest representable value, so
// this means that the op did not finish before we stopped profiling.
continue;
@@ -299,14 +316,14 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
e.name_,
e.kineto_info_,
e.correlation_id_,
e.start_us_,
e.end_us_);
start_us,
end_us);

kineto_events_.emplace_back();
kineto_events_.back()
.name(e.name_)
.startUs(e.start_us_)
.durationUs(e.end_us_ - e.start_us_)
.startUs(start_us)
.durationUs(end_us - start_us)
.correlationId(e.correlation_id_)
.deviceType(c10::DeviceType::CPU)
.startThreadId(e.start_thread_id_)
@@ -613,6 +630,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
}

uint64_t start_time_;
torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_;
std::set<torch::profiler::impl::ActivityType> activities_;
torch::profiler::impl::AppendOnlyList<OpEventData, 1024> op_events_;
torch::profiler::impl::AppendOnlyList<MemoryEventData, 1024> memory_events_;
@@ -640,7 +658,7 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
auto ctx_ptr = state_ptr->newOpEvent();
auto data_ptr = ctx_ptr->data_;

data_ptr->end_us_ = std::numeric_limits<int64_t>::min();
data_ptr->end_time_.count_ = std::numeric_limits<torch::profiler::impl::approx_time_t>::min();
data_ptr->correlation_id_ = corr_id;
data_ptr->start_thread_id_ = fn.threadId();
data_ptr->sequence_number_ = fn.seqNr();
@@ -670,7 +688,7 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
if (config.with_flops) {
data_ptr->extra_args_ = torch::profiler::impl::saveExtraArgs(fn);
}
data_ptr->start_us_ = getTimeUs();
data_ptr->start_time_.count_ = torch::profiler::impl::getApproximateTime();

if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
try {
@@ -692,7 +710,7 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
static_cast<KinetoObserverContext*>(ctx_ptr);
TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
auto data_ptr = kineto_ctx_ptr->data_;
data_ptr->end_us_ = getTimeUs();
data_ptr->end_time_.count_ = torch::profiler::impl::getApproximateTime();
data_ptr->end_thread_id_ = at::RecordFunction::currentThreadId();

if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
@@ -728,8 +746,8 @@ void reportBackendEventToActiveKinetoProfiler(

auto ctx_ptr = state_ptr->newOpEvent();
auto data_ptr = ctx_ptr->data_;
data_ptr->start_us_ = start_time_us;
data_ptr->end_us_ = end_time_us;
data_ptr->start_time_.us_ = start_time_us;
data_ptr->end_time_.us_ = end_time_us;
data_ptr->correlation_id_ = std::numeric_limits<uint64_t>::max();
data_ptr->start_thread_id_ = at::RecordFunction::currentThreadId();
data_ptr->end_thread_id_ = data_ptr->start_thread_id_;
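Taken together, the two paths above populate opposite members of the `TimeStamp` union: backend events arrive with wall-clock microseconds (`us_`), while ops recorded by the callbacks hold raw counts (`count_`), and `materializeOpEvents` dispatches on `backend_.has_value()`. A condensed, self-contained sketch of that dispatch (types simplified and renamed from the diff):

#include <cstdint>
#include <functional>
#include <optional>
#include <string>

using approx_time_t = uint64_t; // TSC count; see util.h

union TimeStamp {
  int64_t us_;          // backend events arrive as wall-clock microseconds
  approx_time_t count_; // regular ops record a raw TSC count
};

struct Event {
  TimeStamp start_;
  std::optional<std::string> backend_; // engaged only for backend events
};

// Resolve an event's start time to microseconds, converting raw counts
// through the fitted clock converter only when necessary.
int64_t startUs(const Event& e,
                const std::function<int64_t(approx_time_t)>& count_to_ns) {
  return e.backend_.has_value() ? e.start_.us_
                                : count_to_ns(e.start_.count_) / 1000;
}
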
70 changes: 70 additions & 0 deletions torch/csrc/profiler/util.cpp
@@ -13,6 +13,76 @@ namespace torch {
namespace profiler {
namespace impl {

ApproximateClockToUnixTimeConverter::ApproximateClockToUnixTimeConverter()
: start_times_(measurePairs()) {}

ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair
ApproximateClockToUnixTimeConverter::measurePair() {
// Take a measurement on either side to avoid an ordering bias.
auto fast_0 = getApproximateTime();
auto wall = std::chrono::system_clock::now();
auto fast_1 = getApproximateTime();

TORCH_INTERNAL_ASSERT(fast_1 >= fast_0, "getApproximateTime is non-monotonic.");
auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(
wall.time_since_epoch());

// `x + (y - x) / 2` is a more numerically stable average than `(x + y) / 2`.
return {t.count(), fast_0 + (fast_1 - fast_0) / 2};
}
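
For unsigned 64-bit counts the concrete hazard is wrap-around rather than rounding; a small demonstration of why the midpoint is computed this way (illustrative, not part of the commit):

#include <cstdint>
#include <iostream>

int main() {
  uint64_t x = UINT64_MAX - 10; // a count near the wrap-around point
  uint64_t y = UINT64_MAX - 4;
  std::cout << (x + y) / 2 << "\n";     // x + y overflows: off by 2^63
  std::cout << x + (y - x) / 2 << "\n"; // correct midpoint
  return 0;
}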

ApproximateClockToUnixTimeConverter::time_pairs
ApproximateClockToUnixTimeConverter::measurePairs() {
static constexpr auto n_warmup = 5;
for (const auto _ : c10::irange(n_warmup)) {
getApproximateTime();
steady_clock_t::now();
}

time_pairs out;
for (const auto i : c10::irange(out.size())) {
out[i] = measurePair();
}
return out;
}

std::function<time_t(approx_time_t)>
ApproximateClockToUnixTimeConverter::makeConverter() {
auto end_times = measurePairs();

// Compute the real time that passes for each tick of the approximate clock.
std::array<long double, replicates> scale_factors{};
for (const auto i : c10::irange(replicates)) {
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
scale_factors[i] = (double)delta_ns / (double)delta_approx;
}
std::sort(scale_factors.begin(), scale_factors.end());
long double scale_factor = scale_factors[replicates / 2 + 1];

// We shift all times by `t0` for better numerics. Double precision only has
// 16 decimal digits of accuracy, so if we blindly multiply times by
// `scale_factor` we may suffer from precision loss. The choice of `t0` is
// mostly arbitrary; we just need a factor that is the correct order of
// magnitude to bring the intermediate values closer to zero. We are not,
// however, guaranteed that `t0_approx` is *exactly* the getApproximateTime
// equivalent of `t0`; it is only an estimate that we have to fine-tune.
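// (Concretely: Unix time in nanoseconds is ~1.6e18, while a double is only
// exact up to 2^53 ≈ 9.0e15; the representable spacing near 1.6e18 is 256 ns,
// so multiplying raw epoch-scale values could shift times by hundreds of
// nanoseconds.)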
auto t0 = start_times_[0].t_;
auto t0_approx = start_times_[0].approx_t_;
std::array<double, replicates> t0_correction{};
for (const auto i : c10::irange(replicates)) {
auto dt = start_times_[i].t_ - t0;
auto dt_approx = (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
t0_correction[i] = dt - (time_t)dt_approx;
}
t0 += t0_correction[t0_correction.size() / 2 + 1];

return [=](approx_time_t t_approx) {
// See above for why this is more stable than `A * t_approx + B`.
return (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0;
};
}
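
End to end, the converter is meant to bracket a profiling session: construct it when profiling starts (capturing `start_times_`), record raw `getApproximateTime()` counts on the hot path, and call `makeConverter()` once at the end to translate them. A hedged usage sketch (the session shape is illustrative, not an API from this commit):

#include <torch/csrc/profiler/util.h>
#include <cstdint>
#include <vector>

void profileSession() {
  using namespace torch::profiler::impl;
  ApproximateClockToUnixTimeConverter converter; // start-of-session pairs
  std::vector<approx_time_t> events;

  for (int i = 0; i < 1000; ++i) {
    events.push_back(getApproximateTime()); // cheap hot-path reads
  }

  auto count_to_ns = converter.makeConverter(); // end-of-session fit
  for (const auto c : events) {
    int64_t unix_ns = count_to_ns(c); // nanoseconds since the Unix epoch
    (void)unix_ns;
  }
}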

// ----------------------------------------------------------------------------
// -- NVTX --------------------------------------------------------------------
// ----------------------------------------------------------------------------
88 changes: 77 additions & 11 deletions torch/csrc/profiler/util.h
@@ -18,26 +18,50 @@
#include <sys/time.h> // for gettimeofday()
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
#define C10_RDTSC
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__CUDACC__) || defined(__HIPCC__)
#undef C10_RDTSC
#elif defined(__clang__)
// `__rdtsc` is available by default.
// NB: This has to be first, because Clang will also define `__GNUC__`
#elif defined(__GNUC__)
#include <x86intrin.h>
#else
#undef C10_RDTSC
#endif
#endif

namespace torch {
namespace profiler {
namespace impl {

inline int64_t getTime(bool allow_monotonic = false) {
using time_t = int64_t;
using steady_clock_t = std::conditional<
std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;

inline time_t getTimeSinceEpoch() {
auto now = std::chrono::system_clock::now().time_since_epoch();
return std::chrono::duration_cast<std::chrono::nanoseconds>(now).count();
}

inline time_t getTime(bool allow_monotonic = false) {
#if defined(C10_IOS) && defined(C10_MOBILE)
// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS
// can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime
// is implemented or not
struct timeval now;
gettimeofday(&now, NULL);
return static_cast<int64_t>(now.tv_sec) * 1000000000 +
static_cast<int64_t>(now.tv_usec) * 1000;
return static_cast<time_t>(now.tv_sec) * 1000000000 +
static_cast<time_t>(now.tv_usec) * 1000;
#elif defined(_WIN32) || defined(__MACH__)
using namespace std::chrono;
using clock = std::conditional<
high_resolution_clock::is_steady,
high_resolution_clock,
steady_clock>::type;
return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
return std::chrono::duration_cast<std::chrono::nanoseconds>(
steady_clock_t::now().time_since_epoch())
.count();
#else
// clock_gettime is *much* faster than std::chrono implementation on Linux
struct timespec t {};
@@ -46,11 +70,53 @@ inline int64_t getTime(bool allow_monotonic = false) {
mode = CLOCK_MONOTONIC;
}
clock_gettime(mode, &t);
return static_cast<int64_t>(t.tv_sec) * 1000000000 +
static_cast<int64_t>(t.tv_nsec);
return static_cast<time_t>(t.tv_sec) * 1000000000 +
static_cast<time_t>(t.tv_nsec);
#endif
}

// We often do not need to capture true wall times. If a fast mechanism such
// as TSC is available we can use that instead and convert back to epoch time
// during post-processing. This greatly reduces the clock's contribution to
// profiling overhead.
// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/
// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io
// TODO: We should use
// `https://github.com/google/benchmark/blob/main/src/cycleclock.h`
inline auto getApproximateTime() {
#if defined(C10_RDTSC)
return static_cast<uint64_t>(__rdtsc());
#else
return getTime();
#endif
}

using approx_time_t = decltype(getApproximateTime());
static_assert(
std::is_same<approx_time_t, int64_t>::value ||
std::is_same<approx_time_t, uint64_t>::value,
"Expected either int64_t (`getTime`) or uint64_t (some TSC reads).");

// Convert `getApproximateTime` results to nanoseconds since the Unix epoch.
class ApproximateClockToUnixTimeConverter final {
public:
ApproximateClockToUnixTimeConverter();
std::function<time_t(approx_time_t)> makeConverter();

struct UnixAndApproximateTimePair {
time_t t_;
approx_time_t approx_t_;
};
static UnixAndApproximateTimePair measurePair();

private:
static constexpr size_t replicates = 1001;
using time_pairs = std::array<UnixAndApproximateTimePair, replicates>;
time_pairs measurePairs();

time_pairs start_times_;
};

std::string getNvtxStr(
const char* name,
int64_t sequence_nr,
