Remove tcmalloc_spinlock_contention metric

Spinlock contention was removed from tcmalloc upstream, so this metric has been set to 0 since we upgraded to tcmalloc 2.6. I looked briefly at reverting its removal upstream, but the revert would be fairly large, so I think it's best to just accept the loss of this instrumentation. Change-Id: I2c29b81a2c692d5d876d2dd5381ba2c295684324 Reviewed-on: http://gerrit.cloudera.org:8080/9595 Tested-by: Todd Lipcon <[email protected]> Reviewed-by: Adar Dembo <[email protected]>
cutiechi · Mar 13, 2018 · 4889a91 · 4889a91
1 parent 13155d4
commit 4889a91
Show file tree

Hide file tree

Showing 7 changed files with 7 additions and 103 deletions.
diff --git a/docs/release_notes.adoc b/docs/release_notes.adoc
@@ -40,6 +40,8 @@
 [[rn_1.7.0_obsoletions]]
 == Obsoletions
 
+* The `tcmalloc_contention_time` metric, which previously tracked the amount
+  of time spent in memory allocator lock contention, has been removed.
 
 [[rn_1.7.0_deprecations]]
 == Deprecations

diff --git a/src/kudu/scripts/parse_metrics_log.py b/src/kudu/scripts/parse_metrics_log.py
@@ -63,7 +63,6 @@
   #  ("server.voluntary_context_switches", "vol_cs"),
   #  ("tablet.rows_inserted", "inserts_per_sec"),
   #  ("tablet.rows_upserted", "upserts_per_sec"),
-  #  ("server.tcmalloc_contention_time", "tcmalloc_contention_time")
 ]
 
 # These metrics will be extracted as percentile metrics into the TSV.

diff --git a/src/kudu/util/spinlock_profiling-test.cc b/src/kudu/util/spinlock_profiling-test.cc
@@ -43,10 +43,6 @@ namespace gutil {
 extern void SubmitSpinLockProfileData(const void *, int64);
 } // namespace gutil
 
-namespace base {
-extern void SubmitSpinLockProfileData(const void *, int64);
-} // namespace base
-
 namespace kudu {
 
 class SpinLockProfilingTest : public KuduTest {};
@@ -82,10 +78,4 @@ TEST_F(SpinLockProfilingTest, TestStackCollection) {
   ASSERT_EQ(0, dropped);
 }
 
-TEST_F(SpinLockProfilingTest, TestTcmallocContention) {
-  StartSynchronizationProfiling();
-  base::SubmitSpinLockProfileData(nullptr, 12345);
-  ASSERT_GE(GetTcmallocContentionMicros(), 0);
-}
-
 } // namespace kudu
diff --git a/src/kudu/util/spinlock_profiling.cc b/src/kudu/util/spinlock_profiling.cc
@@ -38,7 +38,6 @@
 #include "kudu/util/metrics.h"
 #include "kudu/util/striped64.h"
 #include "kudu/util/trace.h"
-#include "kudu/util/trace_metrics.h"
 
 DEFINE_int32(lock_contention_trace_threshold_cycles,
              2000000, // 2M cycles should be about 1ms
@@ -53,12 +52,6 @@ METRIC_DEFINE_gauge_uint64(server, spinlock_contention_time,
     "started. If this increases rapidly, it may indicate a performance issue in Kudu "
     "internals triggered by a particular workload and warrant investigation.",
     kudu::EXPOSE_AS_COUNTER);
-METRIC_DEFINE_gauge_uint64(server, tcmalloc_contention_time,
-    "TCMalloc Contention Time", kudu::MetricUnit::kMicroseconds,
-    "Amount of time consumed by contention on tcmalloc's locks since the server "
-    "started. If this increases rapidly, it may indicate a performance issue in Kudu "
-    "internals triggered by a particular workload and warrant investigation.",
-    kudu::EXPOSE_AS_COUNTER);
 
 
 using base::SpinLock;
@@ -72,18 +65,6 @@ static LongAdder* g_contended_cycles = nullptr;
 
 namespace {
 
-// We can't use LongAdder for tcmalloc contention, because its
-// implementation can allocate memory, and doing so is prohibited
-// in the tcmalloc contention callback.
-//
-// We pad and align this struct to two cachelines to avoid any false
-// sharing with the g_contended_cycles counter or any other globals.
-struct PaddedAtomic64 {
-  Atomic64 val;
-  char padding[CACHELINE_SIZE * 2 - sizeof(Atomic64)];
-} CACHELINE_ALIGNED;
-static PaddedAtomic64 g_tcmalloc_contention;
-
 // Implements a very simple linear-probing hashtable of stack traces with
 // a fixed number of entries.
 //
@@ -291,10 +272,6 @@ void RegisterSpinLockContentionMetrics(const scoped_refptr<MetricEntity>& entity
   entity->NeverRetire(
       METRIC_spinlock_contention_time.InstantiateFunctionGauge(
           entity, Bind(&GetSpinLockContentionMicros)));
-  entity->NeverRetire(
-      METRIC_tcmalloc_contention_time.InstantiateFunctionGauge(
-          entity, Bind(&GetTcmallocContentionMicros)));
-
 }
 
 uint64_t GetSpinLockContentionMicros() {
@@ -304,13 +281,6 @@ uint64_t GetSpinLockContentionMicros() {
   return implicit_cast<int64_t>(micros);
 }
 
-uint64_t GetTcmallocContentionMicros() {
-  int64_t wait_cycles = base::subtle::NoBarrier_Load(&g_tcmalloc_contention.val);
-  double micros = static_cast<double>(wait_cycles) / base::CyclesPerSecond()
-    * kMicrosPerSecond;
-  return implicit_cast<int64_t>(micros);
-}
-
 void StartSynchronizationProfiling() {
   InitSpinLockContentionProfiling();
   base::subtle::Barrier_AtomicIncrement(&g_profiling_enabled, 1);
@@ -335,20 +305,3 @@ void SubmitSpinLockProfileData(const void *contendedlock, int64_t wait_cycles) {
   kudu::SubmitSpinLockProfileData(contendedlock, wait_cycles);
 }
 } // namespace gutil
-
-// tcmalloc's internal spinlocks also support submitting contention metrics
-// using the base::SubmitSpinLockProfileData weak symbol. However, this function might be
-// called while tcmalloc's heap lock is held. Thus, we cannot allocate memory here or else
-// we risk a deadlock. So, this implementation just does the bare minimum to expose
-// tcmalloc contention.
-namespace base {
-void SubmitSpinLockProfileData(const void* /* contendedlock */, int64_t wait_cycles) {
-#if !defined(__APPLE__)
-  auto t = kudu::Trace::CurrentTrace();
-  if (t) {
-    t->metrics()->IncrementTcmallocContentionCycles(wait_cycles);
-  }
-#endif // !defined(__APPLE__)
-  base::subtle::NoBarrier_AtomicIncrement(&kudu::g_tcmalloc_contention.val, wait_cycles);
-}
-} // namespace base
diff --git a/src/kudu/util/spinlock_profiling.h b/src/kudu/util/spinlock_profiling.h
@@ -38,10 +38,6 @@ void InitSpinLockContentionProfiling();
 // since the server started.
 uint64_t GetSpinLockContentionMicros();
 
-// Return the total number of microseconds spent in tcmalloc contention
-// since the server started.
-uint64_t GetTcmallocContentionMicros();
-
 // Register metrics in the given server entity which measure the amount of
 // spinlock contention.
 void RegisterSpinLockContentionMetrics(const scoped_refptr<MetricEntity>& entity);

diff --git a/src/kudu/util/trace.h b/src/kudu/util/trace.h
@@ -256,30 +256,10 @@ class ScopedAdoptTrace {
   }
 
   ~ScopedAdoptTrace() {
-    auto t = Trace::threadlocal_trace_;
-    Trace::threadlocal_trace_ = old_trace_;
-
-    // It's critical that we Release() the reference count on 't' only
-    // after we've unset the thread-local variable. Otherwise, we can hit
-    // a nasty interaction with tcmalloc contention profiling. Consider
-    // the following sequence:
-    //
-    //   1. threadlocal_trace_ has refcount = 1
-    //   2. we call threadlocal_trace_->Release() which decrements refcount to 0
-    //   3. this calls 'delete' on the Trace object
-    //   3a. this calls tcmalloc free() on the Trace and various sub-objects
-    //   3b. the free() calls may end up experiencing contention in tcmalloc
-    //   3c. we try to account the contention in threadlocal_trace_'s TraceMetrics,
-    //       but it has already been freed.
-    //
-    // In the best case, we just scribble into some free tcmalloc memory. In the
-    // worst case, tcmalloc would have already re-used this memory for a new
-    // allocation on another thread, and we end up overwriting someone else's memory.
-    //
-    // Waiting to Release() only after 'unpublishing' the trace solves this.
-    if (t) {
-      t->Release();
+    if (Trace::threadlocal_trace_) {
+      Trace::threadlocal_trace_->Release();
     }
+    Trace::threadlocal_trace_ = old_trace_;
     DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_);
   }
 

diff --git a/src/kudu/util/trace_metrics.h b/src/kudu/util/trace_metrics.h
@@ -38,7 +38,7 @@ namespace kudu {
 // are plausible.
 class TraceMetrics {
  public:
-  TraceMetrics() : tcmalloc_contention_cycles_(0) {}
+  TraceMetrics() {}
   ~TraceMetrics() {}
 
   // Internalize the given string by duplicating it into a process-wide
@@ -59,14 +59,6 @@ class TraceMetrics {
   // Return a copy of the current counter map.
   std::map<const char*, int64_t> Get() const;
 
-  // Increment the number of cycles spent in tcmalloc lock contention.
-  //
-  // tcmalloc contention is stored separately from other metrics since
-  // it is incremented from a code path that prohibits allocation.
-  void IncrementTcmallocContentionCycles(int64_t cycles) {
-    tcmalloc_contention_cycles_.IncrementBy(cycles);
-  }
-
   // Return metric's current value.
   //
   // NOTE: the 'name' MUST be the same const char* which is used for
@@ -76,7 +68,6 @@ class TraceMetrics {
  private:
   mutable simple_spinlock lock_;
   std::map<const char*, int64_t> counters_;
-  AtomicInt<int64_t> tcmalloc_contention_cycles_;
 
   DISALLOW_COPY_AND_ASSIGN(TraceMetrics);
 };
@@ -88,14 +79,7 @@ inline void TraceMetrics::Increment(const char* name, int64_t amount) {
 
 inline std::map<const char*, int64_t> TraceMetrics::Get() const {
   std::unique_lock<simple_spinlock> l(lock_);
-  auto m = counters_;
-  l.unlock();
-
-  auto v = tcmalloc_contention_cycles_.Load();
-  if (v > 0) {
-    m["tcmalloc_contention_cycles"] = v;
-  }
-  return m;
+  return counters_;
 }
 
 inline int64_t TraceMetrics::GetMetric(const char* name) const {