add cache mem stats (pytorch#3035)
Summary:
X-link: facebookresearch/FBGEMM#133

Pull Request resolved: pytorch#3035

Add cache utilization related stats: per-TBE L2 cache free memory and total capacity, in bytes.

Reviewed By: q10

Differential Revision: D61755279

fbshipit-source-id: e39ac373c38bf53c14529f148bfad5e2160fdc77
duduyi2013 authored and facebook-github-bot committed Sep 3, 2024
1 parent 583243c commit 225ac16
Showing 3 changed files with 54 additions and 19 deletions.
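For context (not part of the commit): the two new per-TBE series added below, l2_cache.mem.tbe_id{tbe_unique_id}.free_mem_bytes and l2_cache.mem.tbe_id{tbe_unique_id}.capacity_bytes, report the L2 cache's free memory and total capacity in bytes, so a downstream consumer can derive utilization from them. A minimal consumer-side sketch follows; the helper name and sample values are purely illustrative.

# Illustrative sketch only: derive L2 cache utilization from the two new
# per-TBE series (free_mem_bytes and capacity_bytes). Sample values are made up.
def l2_cache_utilization(free_mem_bytes: int, capacity_bytes: int) -> float:
    # Utilization = fraction of the configured capacity currently in use.
    if capacity_bytes <= 0:
        return 0.0
    return 1.0 - free_mem_bytes / capacity_bytes

# e.g. 6 GiB free out of a 16 GiB cache -> 0.625 utilization
print(l2_cache_utilization(free_mem_bytes=6 << 30, capacity_bytes=16 << 30))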
29 changes: 25 additions & 4 deletions fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -596,6 +596,12 @@ def __init__(
self.l2_num_cache_lookups_stats_name: str = (
f"l2_cache.perf.get.tbe_id{tbe_unique_id}.num_lookups"
)
+ self.l2_cache_free_mem_stats_name: str = (
+ f"l2_cache.mem.tbe_id{tbe_unique_id}.free_mem_bytes"
+ )
+ self.l2_cache_capacity_stats_name: str = (
+ f"l2_cache.mem.tbe_id{tbe_unique_id}.capacity_bytes"
+ )
if self.stats_reporter:
self.ssd_prefetch_read_timer = AsyncSeriesTimer(
functools.partial(
@@ -617,6 +623,8 @@ def __init__(
self.stats_reporter.register_stats(self.l2_num_cache_misses_stats_name)
# pyre-ignore
self.stats_reporter.register_stats(self.l2_num_cache_lookups_stats_name)
+ self.stats_reporter.register_stats(self.l2_cache_free_mem_stats_name)
+ self.stats_reporter.register_stats(self.l2_cache_capacity_stats_name)

@torch.jit.ignore
def _report_duration(
@@ -1748,8 +1756,8 @@ def _report_l2_cache_perf_stats(self) -> None:
self.step, stats_reporter.report_interval # pyre-ignore
)

- if len(l2_cache_perf_stats) != 9:
- logging.error("l2 perf stats should have 9 elements")
+ if len(l2_cache_perf_stats) != 11:
+ logging.error("l2 perf stats should have 11 elements")
return

num_cache_misses = l2_cache_perf_stats[0]
@@ -1762,16 +1770,29 @@
get_tensor_copy_for_cache_update_duration = l2_cache_perf_stats[7]
set_tensor_copy_for_cache_update_duration = l2_cache_perf_stats[8]

+ l2_cache_free_bytes = l2_cache_perf_stats[9]
+ l2_cache_capacity = l2_cache_perf_stats[10]
+
stats_reporter.report_data_amount(
iteration_step=self.step,
- event_name=self.l2_num_cache_misses_stats_name, # ods only show integer
+ event_name=self.l2_num_cache_misses_stats_name,
data_bytes=num_cache_misses,
)
stats_reporter.report_data_amount(
iteration_step=self.step,
- event_name=self.l2_num_cache_lookups_stats_name, # ods only show integer
+ event_name=self.l2_num_cache_lookups_stats_name,
data_bytes=num_lookups,
)
+ stats_reporter.report_data_amount(
+ iteration_step=self.step,
+ event_name=self.l2_cache_capacity_stats_name,
+ data_bytes=l2_cache_capacity,
+ )
+ stats_reporter.report_data_amount(
+ iteration_step=self.step,
+ event_name=self.l2_cache_free_mem_stats_name,
+ data_bytes=l2_cache_free_bytes,
+ )

stats_reporter.report_duration(
iteration_step=self.step,
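The Python code above reads the list returned by get_l2cache_perf() purely by position. As a reading aid (not part of the commit), the layout implied by this change is sketched below; the field names simply mirror the variable names used in the code, and the two new slots sit at indices 9 and 10.

# Reading aid only: positional layout of the 11-element list returned by
# get_l2cache_perf(). Slots 0-8 are counters/durations averaged over the
# reporting interval; slots 9-10, added in this commit, are byte values.
L2_CACHE_PERF_FIELDS = (
    "num_cache_misses",                               # [0]
    "num_lookups",                                    # [1]
    "get_total_duration",                             # [2]
    "get_cache_lookup_total_duration",                # [3]
    "get_cache_lookup_wait_filling_thread_duration",  # [4]
    "get_weights_fillup_total_duration",              # [5]
    "total_cache_update_duration",                    # [6]
    "get_tensor_copy_for_cache_update_duration",      # [7]
    "set_tensor_copy_for_cache_update_duration",      # [8]
    "l2_cache_free_bytes",                            # [9]  new
    "l2_cache_capacity",                              # [10] new
)

def unpack_l2_cache_perf_stats(stats):
    # Mirrors the length check in _report_l2_cache_perf_stats above.
    assert len(stats) == len(L2_CACHE_PERF_FIELDS)
    return dict(zip(L2_CACHE_PERF_FIELDS, stats))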
@@ -39,8 +39,8 @@ class CacheLibCache {
};

explicit CacheLibCache(size_t cacheSizeBytes, int64_t num_shards)
- : cacheConfig_(CacheConfig{.cacheSizeBytes = cacheSizeBytes}),
- cache_(initializeCacheLib(cacheConfig_)),
+ : cache_config_(CacheConfig{.cacheSizeBytes = cacheSizeBytes}),
+ cache_(initializeCacheLib(cache_config_)),
admin_(createCacheAdmin(*cache_)) {
for (int i = 0; i < num_shards; i++) {
pool_ids_.push_back(cache_->addPool(
@@ -205,8 +205,19 @@
}
}

+ /// get L2 cache utilization stats
+ std::vector<int64_t> get_cache_usage() {
+ std::vector<int64_t> cache_mem_stats(2, 0); // freeBytes, capacity
+ cache_mem_stats[1] = cache_config_.cacheSizeBytes;
+ for (auto& pool_id : pool_ids_) {
+ auto pool_stats = cache_->getPoolStats(pool_id);
+ cache_mem_stats[0] += pool_stats.freeMemoryBytes();
+ }
+ return cache_mem_stats;
+ }
+
private:
- const CacheConfig cacheConfig_;
+ const CacheConfig cache_config_;
std::unique_ptr<Cache> cache_;
std::vector<facebook::cachelib::PoolId> pool_ids_;
std::unique_ptr<facebook::cachelib::CacheAdmin> admin_;
@@ -88,8 +88,7 @@ void EmbeddingKVDB::set_cuda(
std::vector<double> EmbeddingKVDB::get_l2cache_perf(
const int64_t step,
const int64_t interval) {
- std::vector<double> ret;
- ret.reserve(9); // num metrics
+ std::vector<double> ret(11, 0); // num metrics
if (step > 0 && step % interval == 0) {
int reset_val = 0;
auto num_cache_misses = num_cache_misses_.exchange(reset_val);
@@ -107,16 +106,20 @@
get_tensor_copy_for_cache_update_.exchange(reset_val);
auto set_tensor_copy_for_cache_update_dur =
set_tensor_copy_for_cache_update_.exchange(reset_val);
- ret.push_back(double(num_cache_misses) / interval);
- ret.push_back(double(num_lookups) / interval);
- ret.push_back(double(get_total_duration) / interval);
- ret.push_back(double(get_cache_lookup_total_duration) / interval);
- ret.push_back(
- double(get_cache_lookup_wait_filling_thread_duration) / interval);
- ret.push_back(double(get_weights_fillup_total_duration) / interval);
- ret.push_back(double(total_cache_update_duration) / interval);
- ret.push_back(double(get_tensor_copy_for_cache_update_dur) / interval);
- ret.push_back(double(set_tensor_copy_for_cache_update_dur) / interval);
+ ret[0] = (double(num_cache_misses) / interval);
+ ret[1] = (double(num_lookups) / interval);
+ ret[2] = (double(get_total_duration) / interval);
+ ret[3] = (double(get_cache_lookup_total_duration) / interval);
+ ret[4] = (double(get_cache_lookup_wait_filling_thread_duration) / interval);
+ ret[5] = (double(get_weights_fillup_total_duration) / interval);
+ ret[6] = (double(total_cache_update_duration) / interval);
+ ret[7] = (double(get_tensor_copy_for_cache_update_dur) / interval);
+ ret[8] = (double(set_tensor_copy_for_cache_update_dur) / interval);
+ if (l2_cache_) {
+ auto cache_mem_stats = l2_cache_->get_cache_usage();
+ ret[9] = (cache_mem_stats[0]); // free cache in bytes
+ ret[10] = (cache_mem_stats[1]); // total cache capacity in bytes
+ }
}
return ret;
}
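Note on the new slots: indices 9 and 10 carry instantaneous byte values taken from get_cache_usage(), that is, free memory summed across the per-shard pools and the configured cacheSizeBytes, while the counters and durations in slots 0-8 are averaged over the reporting interval. Because ret is zero-initialized, the whole vector stays at zero on non-reporting steps, and slots 9 and 10 remain zero when no L2 cache is configured (l2_cache_ is null).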
