add cache mem stats (pytorch#3035)
Summary:
X-link: facebookresearch/FBGEMM#133

Pull Request resolved: pytorch#3035

Add cache utilization related stats: per-TBE L2 cache free memory and total capacity, in bytes.

Reviewed By: q10

Differential Revision: D61755279

fbshipit-source-id: e39ac373c38bf53c14529f148bfad5e2160fdc77
duduyi2013 authored and facebook-github-bot committed Sep 3, 2024
1 parent 583243c commit 225ac16
Showing 3 changed files with 54 additions and 19 deletions.
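For context (not part of the commit): the two new per-TBE series added below, l2_cache.mem.tbe_id{tbe_unique_id}.free_mem_bytes and l2_cache.mem.tbe_id{tbe_unique_id}.capacity_bytes, report the L2 cache's free memory and total capacity in bytes, so a downstream consumer can derive utilization from them. A minimal consumer-side sketch follows; the helper name and sample values are purely illustrative.

# Illustrative sketch only: derive L2 cache utilization from the two new
# per-TBE series (free_mem_bytes and capacity_bytes). Sample values are made up.
def l2_cache_utilization(free_mem_bytes: int, capacity_bytes: int) -> float:
    # Utilization = fraction of the configured capacity currently in use.
    if capacity_bytes <= 0:
        return 0.0
    return 1.0 - free_mem_bytes / capacity_bytes

# e.g. 6 GiB free out of a 16 GiB cache -> 0.625 utilization
print(l2_cache_utilization(free_mem_bytes=6 << 30, capacity_bytes=16 << 30))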
29 changes: 25 additions & 4 deletions fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -596,6 +596,12 @@ def __init__(
self.l2_num_cache_lookups_stats_name: str = (
f"l2_cache.perf.get.tbe_id{tbe_unique_id}.num_lookups"
)
+ self.l2_cache_free_mem_stats_name: str = (
+ f"l2_cache.mem.tbe_id{tbe_unique_id}.free_mem_bytes"
+ )
+ self.l2_cache_capacity_stats_name: str = (
+ f"l2_cache.mem.tbe_id{tbe_unique_id}.capacity_bytes"
+ )
if self.stats_reporter:
self.ssd_prefetch_read_timer = AsyncSeriesTimer(
functools.partial(
@@ -617,6 +623,8 @@ def __init__(
self.stats_reporter.register_stats(self.l2_num_cache_misses_stats_name)
# pyre-ignore
self.stats_reporter.register_stats(self.l2_num_cache_lookups_stats_name)
+ self.stats_reporter.register_stats(self.l2_cache_free_mem_stats_name)
+ self.stats_reporter.register_stats(self.l2_cache_capacity_stats_name)

@torch.jit.ignore
def _report_duration(
@@ -1748,8 +1756,8 @@ def _report_l2_cache_perf_stats(self) -> None:
self.step, stats_reporter.report_interval # pyre-ignore
)

- if len(l2_cache_perf_stats) != 9:
- logging.error("l2 perf stats should have 9 elements")
+ if len(l2_cache_perf_stats) != 11:
+ logging.error("l2 perf stats should have 11 elements")
return

num_cache_misses = l2_cache_perf_stats[0]
@@ -1762,16 +1770,29 @@
get_tensor_copy_for_cache_update_duration = l2_cache_perf_stats[7]
set_tensor_copy_for_cache_update_duration = l2_cache_perf_stats[8]

+ l2_cache_free_bytes = l2_cache_perf_stats[9]
+ l2_cache_capacity = l2_cache_perf_stats[10]
+
stats_reporter.report_data_amount(
iteration_step=self.step,
- event_name=self.l2_num_cache_misses_stats_name, # ods only show integer
+ event_name=self.l2_num_cache_misses_stats_name,
data_bytes=num_cache_misses,
)
stats_reporter.report_data_amount(
iteration_step=self.step,
- event_name=self.l2_num_cache_lookups_stats_name, # ods only show integer
+ event_name=self.l2_num_cache_lookups_stats_name,
data_bytes=num_lookups,
)
+ stats_reporter.report_data_amount(
+ iteration_step=self.step,
+ event_name=self.l2_cache_capacity_stats_name,
+ data_bytes=l2_cache_capacity,
+ )
+ stats_reporter.report_data_amount(
+ iteration_step=self.step,
+ event_name=self.l2_cache_free_mem_stats_name,
+ data_bytes=l2_cache_free_bytes,
+ )

stats_reporter.report_duration(
iteration_step=self.step,
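The Python code above reads the list returned by get_l2cache_perf() purely by position. As a reading aid (not part of the commit), the layout implied by this change is sketched below; the field names simply mirror the variable names used in the code, and the two new slots sit at indices 9 and 10.

# Reading aid only: positional layout of the 11-element list returned by
# get_l2cache_perf(). Slots 0-8 are counters/durations averaged over the
# reporting interval; slots 9-10, added in this commit, are byte values.
L2_CACHE_PERF_FIELDS = (
    "num_cache_misses",                               # [0]
    "num_lookups",                                    # [1]
    "get_total_duration",                             # [2]
    "get_cache_lookup_total_duration",                # [3]
    "get_cache_lookup_wait_filling_thread_duration",  # [4]
    "get_weights_fillup_total_duration",              # [5]
    "total_cache_update_duration",                    # [6]
    "get_tensor_copy_for_cache_update_duration",      # [7]
    "set_tensor_copy_for_cache_update_duration",      # [8]
    "l2_cache_free_bytes",                            # [9]  new
    "l2_cache_capacity",                              # [10] new
)

def unpack_l2_cache_perf_stats(stats):
    # Mirrors the length check in _report_l2_cache_perf_stats above.
    assert len(stats) == len(L2_CACHE_PERF_FIELDS)
    return dict(zip(L2_CACHE_PERF_FIELDS, stats))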
@@ -39,8 +39,8 @@ class CacheLibCache {
};

explicit CacheLibCache(size_t cacheSizeBytes, int64_t num_shards)
- : cacheConfig_(CacheConfig{.cacheSizeBytes = cacheSizeBytes}),
- cache_(initializeCacheLib(cacheConfig_)),
+ : cache_config_(CacheConfig{.cacheSizeBytes = cacheSizeBytes}),
+ cache_(initializeCacheLib(cache_config_)),
admin_(createCacheAdmin(*cache_)) {
for (int i = 0; i < num_shards; i++) {
pool_ids_.push_back(cache_->addPool(
@@ -205,8 +205,19 @@
}
}

+ /// get L2 cache utilization stats
+ std::vector<int64_t> get_cache_usage() {
+ std::vector<int64_t> cache_mem_stats(2, 0); // freeBytes, capacity
+ cache_mem_stats[1] = cache_config_.cacheSizeBytes;
+ for (auto& pool_id : pool_ids_) {
+ auto pool_stats = cache_->getPoolStats(pool_id);
+ cache_mem_stats[0] += pool_stats.freeMemoryBytes();
+ }
+ return cache_mem_stats;
+ }
+
private:
- const CacheConfig cacheConfig_;
+ const CacheConfig cache_config_;
std::unique_ptr<Cache> cache_;
std::vector<facebook::cachelib::PoolId> pool_ids_;
std::unique_ptr<facebook::cachelib::CacheAdmin> admin_;
@@ -88,8 +88,7 @@ void EmbeddingKVDB::set_cuda(
std::vector<double> EmbeddingKVDB::get_l2cache_perf(
const int64_t step,
const int64_t interval) {
- std::vector<double> ret;
- ret.reserve(9); // num metrics
+ std::vector<double> ret(11, 0); // num metrics
if (step > 0 && step % interval == 0) {
int reset_val = 0;
auto num_cache_misses = num_cache_misses_.exchange(reset_val);
@@ -107,16 +106,20 @@
get_tensor_copy_for_cache_update_.exchange(reset_val);
auto set_tensor_copy_for_cache_update_dur =
set_tensor_copy_for_cache_update_.exchange(reset_val);
- ret.push_back(double(num_cache_misses) / interval);
- ret.push_back(double(num_lookups) / interval);
- ret.push_back(double(get_total_duration) / interval);
- ret.push_back(double(get_cache_lookup_total_duration) / interval);
- ret.push_back(
- double(get_cache_lookup_wait_filling_thread_duration) / interval);
- ret.push_back(double(get_weights_fillup_total_duration) / interval);
- ret.push_back(double(total_cache_update_duration) / interval);
- ret.push_back(double(get_tensor_copy_for_cache_update_dur) / interval);
- ret.push_back(double(set_tensor_copy_for_cache_update_dur) / interval);
+ ret[0] = (double(num_cache_misses) / interval);
+ ret[1] = (double(num_lookups) / interval);
+ ret[2] = (double(get_total_duration) / interval);
+ ret[3] = (double(get_cache_lookup_total_duration) / interval);
+ ret[4] = (double(get_cache_lookup_wait_filling_thread_duration) / interval);
+ ret[5] = (double(get_weights_fillup_total_duration) / interval);
+ ret[6] = (double(total_cache_update_duration) / interval);
+ ret[7] = (double(get_tensor_copy_for_cache_update_dur) / interval);
+ ret[8] = (double(set_tensor_copy_for_cache_update_dur) / interval);
+ if (l2_cache_) {
+ auto cache_mem_stats = l2_cache_->get_cache_usage();
+ ret[9] = (cache_mem_stats[0]); // free cache in bytes
+ ret[10] = (cache_mem_stats[1]); // total cache capacity in bytes
+ }
}
return ret;
}
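Note on the new slots: indices 9 and 10 carry instantaneous byte values taken from get_cache_usage(), that is, free memory summed across the per-shard pools and the configured cacheSizeBytes, while the counters and durations in slots 0-8 are averaged over the reporting interval. Because ret is zero-initialized, the whole vector stays at zero on non-reporting steps, and slots 9 and 10 remain zero when no L2 cache is configured (l2_cache_ is null).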
