add per-cgroup cpu perf metrics (iopsystems#395)
Adds per-cgroup cpu perf metrics.
brayniac authored Dec 16, 2024
1 parent 21878ac commit 5273650
Showing 6 changed files with 201 additions and 16 deletions.
21 changes: 21 additions & 0 deletions src/common/bpf/builder.rs
@@ -62,6 +62,7 @@ pub struct Builder<T: 'static + SkelBuilder<'static>> {
Vec<&'static CounterGroup>,
)>,
perf_events: Vec<(&'static str, PerfEvent)>,
packed_counters: Vec<(&'static str, &'static CounterGroup)>,
}

impl<T: 'static> Builder<T>
@@ -78,6 +79,7 @@ where
maps: Vec::new(),
cpu_counters: Vec::new(),
perf_events: Vec::new(),
packed_counters: Vec::new(),
}
}

@@ -177,6 +179,12 @@ where
perf_events.len()
);

let mut packed_counters: Vec<PackedCounters> = self
.packed_counters
.into_iter()
.map(|(name, counters)| PackedCounters::new(skel.map(name), counters))
.collect();

// load any data from userspace into BPF maps
for (name, values) in self.maps.into_iter() {
let fd = skel.map(name).as_fd().as_raw_fd();
@@ -223,6 +231,10 @@ where
v.refresh();
}

for v in &mut packed_counters {
v.refresh();
}

// notify that we have finished running
sync.notify();
}
@@ -295,4 +307,13 @@
self.perf_events.push((name, event));
self
}

/// Register a set of packed counters. The `name` is the BPF map name and
/// the `counters` are a set of userspace dynamic counters. The BPF map is
/// expected to be densely packed, meaning there is no padding. The order of
/// the `counters` must exactly match the order in the BPF map.
pub fn packed_counters(mut self, name: &'static str, counters: &'static CounterGroup) -> Self {
self.packed_counters.push((name, counters));
self
}
}
54 changes: 54 additions & 0 deletions src/common/bpf/counters.rs
@@ -184,3 +184,57 @@ impl<'a> CpuCounters<'a> {
}
}
}

/// Represents a set of counters stored densely in a BPF map, meaning there is
/// no padding. No aggregation is performed, and the values are updated into a
/// single `CounterGroup`.
pub struct PackedCounters<'a> {
_map: &'a Map<'a>,
mmap: MmapMut,
counters: &'static CounterGroup,
}

impl<'a> PackedCounters<'a> {
/// Create a new set of counters from the provided BPF map and collection of
/// counter metrics.
///
/// The map layout is not cacheline padded. The ordering of the dynamic
/// counters must exactly match the layout in the BPF map.
pub fn new(map: &'a Map, counters: &'static CounterGroup) -> Self {
let total_bytes = counters.len() * std::mem::size_of::<u64>();

let fd = map.as_fd().as_raw_fd();
let file = unsafe { std::fs::File::from_raw_fd(fd as _) };
let mmap: MmapMut = unsafe {
MmapOptions::new()
.len(total_bytes)
.map_mut(&file)
.expect("failed to mmap() bpf counterset")
};

let (_prefix, values, _suffix) = unsafe { mmap.align_to::<u64>() };

if values.len() != counters.len() {
panic!("mmap region not aligned or width doesn't match");
}

Self {
_map: map,
mmap,
counters,
}
}

/// Refreshes the counters by reading from the BPF map and setting each
/// counter metric to the current value.
pub fn refresh(&mut self) {
let (_prefix, values, _suffix) = unsafe { self.mmap.align_to::<u64>() };

// update all individual counters
for (idx, value) in values.iter().enumerate() {
if *value != 0 {
let _ = self.counters.set(idx, *value);
}
}
}
}
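
The dense layout `refresh()` relies on can be illustrated with a standalone sketch. The buffer below stands in for the mmap'd BPF array (it is backed by a u64 allocation so it is 8-byte aligned, as the page-aligned mmap region always is); the reinterpretation and the skip-zero loop mirror `refresh()` above:

fn main() {
    // Stand-in for the mmap'd BPF array: 4 u64 slots, densely packed.
    let mut slots = vec![0u64; 4];
    slots[0] = 42;
    slots[2] = 7;
    let buf: &[u8] = unsafe {
        std::slice::from_raw_parts(slots.as_ptr() as *const u8, slots.len() * 8)
    };

    // The same reinterpretation refresh() performs on the mmap region.
    let (prefix, values, _suffix) = unsafe { buf.align_to::<u64>() };
    assert!(prefix.is_empty(), "region must be 8-byte aligned");

    // Zero slots are skipped, just as in refresh().
    for (idx, value) in values.iter().enumerate() {
        if *value != 0 {
            println!("counter[{idx}] = {value}"); // prints slots 0 and 2
        }
    }
}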
7 changes: 5 additions & 2 deletions src/common/bpf/mod.rs
@@ -23,7 +23,10 @@ const CACHELINE_SIZE: usize = 64;
const PAGE_SIZE: usize = 4096;

// This is the maximum number of CPUs we track with BPF counters.
const MAX_CPUS: usize = 1024;
pub const MAX_CPUS: usize = 1024;

// This is the maximum number of cgroups we track with BPF counters.
pub const MAX_CGROUPS: usize = 4096;

const COUNTER_SIZE: usize = std::mem::size_of::<u64>();

@@ -37,7 +40,7 @@ fn whole_pages<T>(count: usize) -> usize {
((count * std::mem::size_of::<T>()) + PAGE_SIZE - 1) / PAGE_SIZE
}

use counters::{Counters, CpuCounters};
use counters::{Counters, CpuCounters, PackedCounters};
use histogram::Histogram;
use sync_primitive::SyncPrimitive;
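
To make the sizing above concrete, here is a self-contained restatement of `whole_pages` with a worked example at the constants in this file (MAX_CGROUPS u64 counters fit in exactly eight 4 KiB pages):

const PAGE_SIZE: usize = 4096;

// Restatement of whole_pages() from this file: the number of whole pages
// needed to hold `count` values of type T, rounding up.
fn whole_pages<T>(count: usize) -> usize {
    ((count * std::mem::size_of::<T>()) + PAGE_SIZE - 1) / PAGE_SIZE
}

fn main() {
    // 4096 per-cgroup u64 counters occupy exactly 8 pages (32768 bytes)...
    assert_eq!(whole_pages::<u64>(4096), 8);
    // ...while one extra counter would spill onto a ninth page.
    assert_eq!(whole_pages::<u64>(4097), 9);
}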

98 changes: 87 additions & 11 deletions src/samplers/cpu/linux/perf/mod.bpf.c
@@ -5,18 +5,18 @@
#include "../../../common/bpf/helpers.h"
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define COUNTERS 2
#define COUNTER_GROUP_WIDTH 8
#define MAX_CPUS 1024
#define MAX_CGROUPS 4096

#define TASK_RUNNING 0

// counter positions
#define CYCLES 0
#define INSTRUCTIONS 1
#define TSC 2
#define APERF 3
#define MPERF 4

// counters (see constants defined at top)
struct {
@@ -27,6 +27,38 @@ struct {
__uint(max_entries, MAX_CPUS * COUNTER_GROUP_WIDTH);
} counters SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_MMAPABLE);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CGROUPS);
} cgroup_cycles SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_MMAPABLE);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CGROUPS);
} cgroup_instructions SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_MMAPABLE);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CGROUPS);
} cycles_prev SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_MMAPABLE);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CGROUPS);
} instructions_prev SEC(".maps");

/**
* perf event arrays
*/
@@ -35,14 +67,12 @@ struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, u32);
__type(value, u32);
// __uint(max_entries, MAX_CPUS);
} cycles SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, u32);
__type(value, u32);
// __uint(max_entries, MAX_CPUS);
} instructions SEC(".maps");

/**
@@ -68,6 +98,26 @@ static __always_inline __s64 get_task_state(void *task)
return BPF_CORE_READ((struct task_struct___o *)task, state);
}

// attach a kprobe on cpuacct_account_field() to update the per-CPU counters

SEC("kprobe/cpuacct_account_field")
int BPF_KPROBE(cpuacct_account_field_kprobe, void *task, u32 index, u64 delta)
{
u32 idx;
u32 processor_id = bpf_get_smp_processor_id();

u64 c = bpf_perf_event_read(&cycles, BPF_F_CURRENT_CPU);
u64 i = bpf_perf_event_read(&instructions, BPF_F_CURRENT_CPU);

idx = processor_id * COUNTER_GROUP_WIDTH + CYCLES;
bpf_map_update_elem(&counters, &idx, &c, BPF_ANY);

idx = processor_id * COUNTER_GROUP_WIDTH + INSTRUCTIONS;
bpf_map_update_elem(&counters, &idx, &i, BPF_ANY);
return 0;
}
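
Both BPF programs store readings into the flat `counters` array using the same indexing scheme: each CPU owns a group of COUNTER_GROUP_WIDTH slots, and a counter's slot is cpu * COUNTER_GROUP_WIDTH + position. A small Rust sketch of that indexing (constants copied from the defines above):

const COUNTER_GROUP_WIDTH: usize = 8;
const CYCLES: usize = 0;
const INSTRUCTIONS: usize = 1;

// Flat index into the `counters` BPF array map for one per-CPU slot.
fn counter_index(cpu: usize, counter: usize) -> usize {
    cpu * COUNTER_GROUP_WIDTH + counter
}

fn main() {
    // CPU 2's group starts at slot 16: cycles at 16, instructions at 17.
    assert_eq!(counter_index(2, CYCLES), 16);
    assert_eq!(counter_index(2, INSTRUCTIONS), 17);
}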

// attach a tracepoint on sched_switch for per-cgroup accounting

SEC("tp_btf/sched_switch")
int handle__sched_switch(u64 *ctx)
{
@@ -78,22 +128,48 @@ int handle__sched_switch(u64 *ctx)
struct task_struct *next = (struct task_struct *)ctx[2];

u32 idx;
u64 *elem;
u64 *elem, delta_c, delta_i;

u32 processor_id = bpf_get_smp_processor_id();
u64 ts = bpf_ktime_get_ns();

u64 flags = processor_id & BPF_F_INDEX_MASK;

u64 c = bpf_perf_event_read(&cycles, flags);
u64 i = bpf_perf_event_read(&instructions, flags);
u64 c = bpf_perf_event_read(&cycles, BPF_F_CURRENT_CPU);
u64 i = bpf_perf_event_read(&instructions, BPF_F_CURRENT_CPU);

idx = processor_id * COUNTER_GROUP_WIDTH + CYCLES;
bpf_map_update_elem(&counters, &idx, &c, BPF_ANY);

idx = processor_id * COUNTER_GROUP_WIDTH + INSTRUCTIONS;
bpf_map_update_elem(&counters, &idx, &i, BPF_ANY);

if (bpf_core_field_exists(prev->sched_task_group)) {
int cgroup_id = prev->sched_task_group->css.id;

if (cgroup_id && cgroup_id < MAX_CGROUPS) {
// update cgroup cycles

elem = bpf_map_lookup_elem(&cycles_prev, &processor_id);

if (elem) {
delta_c = c - *elem;

array_add(&cgroup_cycles, cgroup_id, delta_c);
}

// update cgroup instructions

elem = bpf_map_lookup_elem(&instructions_prev, &processor_id);

if (elem) {
delta_i = i - *elem;

array_add(&cgroup_instructions, cgroup_id, delta_i);
}
}
}

bpf_map_update_elem(&cycles_prev, &processor_id, &c, BPF_ANY);
bpf_map_update_elem(&instructions_prev, &processor_id, &i, BPF_ANY);

return 0;
}
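
The per-cgroup accounting above is a snapshot/delta scheme: at every context switch, the current cycle and instruction readings are compared against the per-CPU snapshots in cycles_prev and instructions_prev, the difference is charged to the cgroup of the task being switched out, and the snapshots are then updated. A minimal userspace sketch of the same bookkeeping, with HashMaps standing in for the BPF array maps (only cycles shown; instructions are handled identically):

use std::collections::HashMap;

struct CgroupAccounting {
    cycles_prev: HashMap<u32, u64>,   // per-CPU reading at the last switch
    cgroup_cycles: HashMap<u32, u64>, // per-cgroup accumulated cycles
}

impl CgroupAccounting {
    fn on_sched_switch(&mut self, cpu: u32, prev_cgroup: u32, cycles_now: u64) {
        // Charge everything since the last switch on this CPU to the cgroup
        // of the task being switched out...
        if let Some(prev) = self.cycles_prev.get(&cpu) {
            let delta = cycles_now.wrapping_sub(*prev);
            *self.cgroup_cycles.entry(prev_cgroup).or_insert(0) += delta;
        }
        // ...then snapshot the current reading for the next switch.
        self.cycles_prev.insert(cpu, cycles_now);
    }
}

fn main() {
    let mut acct = CgroupAccounting {
        cycles_prev: HashMap::new(),
        cgroup_cycles: HashMap::new(),
    };
    acct.on_sched_switch(0, 3, 1_000); // first switch on CPU 0: snapshot only
    acct.on_sched_switch(0, 3, 1_750); // cgroup 3 ran for 750 cycles
    acct.on_sched_switch(0, 5, 2_000); // cgroup 5 ran for 250 cycles
    assert_eq!(acct.cgroup_cycles[&3], 750);
    assert_eq!(acct.cgroup_cycles[&5], 250);
}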

8 changes: 8 additions & 0 deletions src/samplers/cpu/linux/perf/mod.rs
@@ -37,6 +37,8 @@ fn init(config: Arc<Config>) -> SamplerResult {
.perf_event("cycles", PerfEvent::cpu_cycles())
.perf_event("instructions", PerfEvent::instructions())
.cpu_counters("counters", totals, individual)
.packed_counters("cgroup_cycles", &CGROUP_CPU_CYCLES)
.packed_counters("cgroup_instructions", &CGROUP_CPU_INSTRUCTIONS)
.build()?;

Ok(Some(Box::new(bpf)))
@@ -45,6 +47,8 @@ fn init(config: Arc<Config>) -> SamplerResult {
impl SkelExt for ModSkel<'_> {
fn map(&self, name: &str) -> &libbpf_rs::Map {
match name {
"cgroup_cycles" => &self.maps.cgroup_cycles,
"cgroup_instructions" => &self.maps.cgroup_instructions,
"counters" => &self.maps.counters,
"cycles" => &self.maps.cycles,
"instructions" => &self.maps.instructions,
@@ -55,6 +59,10 @@ impl SkelExt for ModSkel<'_> {

impl OpenSkelExt for ModSkel<'_> {
fn log_prog_instructions(&self) {
debug!(
"{NAME} cpuacct_account_field() BPF instruction count: {}",
self.progs.cpuacct_account_field_kprobe.insn_cnt()
);
debug!(
"{NAME} handle__sched_switch() BPF instruction count: {}",
self.progs.handle__sched_switch.insn_cnt()
29 changes: 26 additions & 3 deletions src/samplers/cpu/linux/stats.rs
@@ -1,10 +1,8 @@
use crate::common::CounterGroup;
use crate::common::*;
use crate::samplers::cpu::stats::*;

use metriken::*;

pub static MAX_CPUS: usize = 1024;

#[metric(
name = "cpu/usage/total",
description = "The amount of CPU time spent servicing interrupts",
@@ -174,6 +172,22 @@ pub static CPU_MPERF_PERCORE: CounterGroup = CounterGroup::new(MAX_CPUS);
)]
pub static CPU_TSC_PERCORE: CounterGroup = CounterGroup::new(MAX_CPUS);

#[metric(
name = "cgroup/cpu/cycles",
description = "The number of elapsed CPU cycles on a per-cgroup basis",
formatter = cpu_metric_cgroup_formatter,
metadata = { unit = "cycles" }
)]
pub static CGROUP_CPU_CYCLES: CounterGroup = CounterGroup::new(MAX_CGROUPS);

#[metric(
name = "cgroup/cpu/instructions",
description = "The number of elapsed CPU cycles on a per-cgroup basis",
formatter = cpu_metric_cgroup_formatter,
metadata = { unit = "cycles" }
)]
pub static CGROUP_CPU_INSTRUCTIONS: CounterGroup = CounterGroup::new(MAX_CGROUPS);

pub fn cpu_metric_percore_formatter(metric: &MetricEntry, format: Format) -> String {
match format {
Format::Simple => {
@@ -195,3 +209,12 @@ pub fn cpu_usage_percore_formatter(metric: &MetricEntry, format: Format) -> String {
_ => metric.name().to_string(),
}
}

pub fn cpu_metric_cgroup_formatter(metric: &MetricEntry, format: Format) -> String {
match format {
Format::Simple => {
format!("{}/cgroup", metric.name())
}
_ => metric.name().to_string(),
}
}
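
For illustration, in the Simple format the formatter above appends a "/cgroup" suffix, so "cgroup/cpu/cycles" renders as "cgroup/cpu/cycles/cgroup". A self-contained sketch of that behavior (the Format enum here is a stand-in for metriken's, and the formatter is restated over a plain name string rather than a MetricEntry):

enum Format { Simple, Prometheus }

// Mirrors cpu_metric_cgroup_formatter: Simple output gains a "/cgroup" suffix.
fn format_cgroup_metric(name: &str, format: Format) -> String {
    match format {
        Format::Simple => format!("{name}/cgroup"),
        _ => name.to_string(),
    }
}

fn main() {
    assert_eq!(
        format_cgroup_metric("cgroup/cpu/cycles", Format::Simple),
        "cgroup/cpu/cycles/cgroup"
    );
    assert_eq!(
        format_cgroup_metric("cgroup/cpu/cycles", Format::Prometheus),
        "cgroup/cpu/cycles"
    );
}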
