Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Callgrind instead of Cachegrind #26

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion macro/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@ pub fn iai(_attr: TokenStream, item: TokenStream) -> TokenStream {
let span = proc_macro2::Span::call_site();

let function_name = find_name(item.clone());
let wrapper_function_name = Ident::new(&format!("wrap_{}", function_name.to_string()), span);
let wrapper_function_name =
Ident::new(&format!("__iai_bench_{}", function_name.to_string()), span);
let const_name = Ident::new(&format!("IAI_FUNC_{}", function_name.to_string()), span);
let name_literal = function_name.to_string();

let output = quote_spanned!(span=>
#item

#[no_mangle]
#[inline(never)]
fn #wrapper_function_name() {
let _ = iai::black_box(#function_name());
}
Expand Down
208 changes: 94 additions & 114 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pub fn black_box<T>(dummy: T) -> T {

fn check_valgrind() -> bool {
let result = Command::new("valgrind")
.arg("--tool=cachegrind")
.arg("--tool=callgrind")
.arg("--version")
.stdout(Stdio::null())
.stderr(Stdio::null())
Expand All @@ -65,35 +65,36 @@ fn check_valgrind() -> bool {
}
}

fn get_arch() -> String {
let output = Command::new("uname")
.arg("-m")
.stdout(Stdio::piped())
.output()
.expect("Failed to run `uname` to determine CPU architecture.");

String::from_utf8(output.stdout)
.expect("`-uname -m` returned invalid unicode.")
.trim()
.to_owned()
}

/// Creates a bare `valgrind` command with no arguments attached yet.
///
/// Callers append the tool selection and any further flags themselves.
fn basic_valgrind() -> Command {
    let program = "valgrind";
    Command::new(program)
}

// Invoke Valgrind, disabling ASLR if possible because ASLR could noise up the results a bit
cfg_if! {
if #[cfg(target_os = "linux")] {
fn valgrind_without_aslr(arch: &str) -> Command {
/// Returns the machine hardware name printed by `uname -m`, with surrounding
/// whitespace (including the trailing newline) removed.
///
/// # Panics
///
/// Panics if `uname` cannot be spawned, exits unsuccessfully, or prints bytes
/// that are not valid UTF-8.
fn get_arch() -> String {
    let output = Command::new("uname")
        .arg("-m")
        .stdout(Stdio::piped())
        .output()
        .expect("Failed to run `uname` to determine CPU architecture.");

    // A failing `uname` would otherwise hand an empty/garbage architecture to the
    // caller and surface later as a much less obvious error.
    assert!(
        output.status.success(),
        "`uname -m` exited unsuccessfully ({}).",
        output.status
    );

    String::from_utf8(output.stdout)
        .expect("`uname -m` returned invalid unicode.")
        .trim()
        .to_owned()
}

/// Builds the command `setarch <arch> -R valgrind`, where `<arch>` is whatever
/// `get_arch()` reports, so that Valgrind is launched with ASLR disabled.
///
/// Further Valgrind arguments are appended by the caller.
fn valgrind_without_aslr() -> Command {
    let mut launcher = Command::new("setarch");
    launcher.args(&[get_arch().as_str(), "-R", "valgrind"]);
    launcher
}
} else if #[cfg(target_os = "freebsd")] {
fn valgrind_without_aslr(_arch: &str) -> Command {
fn valgrind_without_aslr() -> Command {
let mut cmd = Command::new("proccontrol");
cmd.arg("-m")
.arg("aslr")
Expand All @@ -102,22 +103,23 @@ cfg_if! {
cmd
}
} else {
fn valgrind_without_aslr(_arch: &str) -> Command {
fn valgrind_without_aslr() -> Command {
// Can't disable ASLR on this platform
basic_valgrind()
}
}
}

fn run_bench(
arch: &str,
fn run_benches(
benches: &[&(&'static str, fn())],
executable: &str,
i: isize,
name: &str,
allow_aslr: bool,
) -> (CachegrindStats, Option<CachegrindStats>) {
let output_file = PathBuf::from(format!("target/iai/cachegrind.out.{}", name));
let old_file = output_file.with_file_name(format!("cachegrind.out.{}.old", name));
) -> (
HashMap<String, CallgrindStats>,
HashMap<String, CallgrindStats>,
) {
let output_file = PathBuf::from("target/iai/callgrind.out");
let old_file = output_file.with_file_name("callgrind.out.old");
std::fs::create_dir_all(output_file.parent().unwrap()).expect("Failed to create directory");

if output_file.exists() {
Expand All @@ -128,86 +130,98 @@ fn run_bench(
let mut cmd = if allow_aslr {
basic_valgrind()
} else {
valgrind_without_aslr(arch)
valgrind_without_aslr()
};
let status = cmd
.arg("--tool=cachegrind")

let cmd = cmd
.arg("--tool=callgrind")
// Set some reasonable cache sizes. The exact sizes matter less than having fixed sizes,
// since otherwise cachegrind would take them from the CPU and make benchmark runs
// since otherwise callgrind would take them from the CPU and make benchmark runs
// even more incomparable between machines.
.arg("--I1=32768,8,64")
.arg("--D1=32768,8,64")
.arg("--LL=8388608,16,64")
.arg(format!("--cachegrind-out-file={}", output_file.display()))
.arg("--cache-sim=yes")
.arg(format!("--callgrind-out-file={}", output_file.display()))
.arg("--compress-strings=no")
.arg("--compress-pos=no")
.arg("--collect-atstart=no");

for (name, _func) in benches.iter() {
// cmd.arg(format!("--zero-before=__iai_bench_{name}"));
// cmd.arg(format!("--dump-after=__iai_bench_{name}"));
cmd.arg(format!("--toggle-collect=__iai_bench_{name}"));
}

let status = cmd
.arg(executable)
.arg("--iai-run")
.arg(i.to_string())
.stdout(Stdio::null())
.stderr(Stdio::null())
.status()
.expect("Failed to run benchmark in cachegrind");
.expect("Failed to run benchmark in callgrind");

if !status.success() {
panic!(
"Failed to run benchmark in cachegrind. Exit code: {}",
"Failed to run benchmark in callgrind. Exit code: {}",
status
);
}

let new_stats = parse_cachegrind_output(&output_file);
let new_stats = parse_callgrind_output(&output_file);
let old_stats = if old_file.exists() {
Some(parse_cachegrind_output(&old_file))
parse_callgrind_output(&old_file)
} else {
None
HashMap::new()
};

(new_stats, old_stats)
}

fn parse_cachegrind_output(file: &Path) -> CachegrindStats {
fn parse_callgrind_output(file: &Path) -> HashMap<String, CallgrindStats> {
let mut events_line = None;
let mut summary_line = None;
let mut res = HashMap::new();

let file_in = File::open(file).expect("Unable to open cachegrind output file");
let file_in = File::open(file).expect("Unable to open callgrind output file");

for line in BufReader::new(file_in).lines() {
let mut lines = BufReader::new(file_in).lines();

while let Some(line) = lines.next() {
let line = line.unwrap();
if let Some(line) = line.strip_prefix("events: ") {
events_line = Some(line.trim().to_owned());
}
if let Some(line) = line.strip_prefix("summary: ") {
summary_line = Some(line.trim().to_owned());
}
}

match (events_line, summary_line) {
(Some(events), Some(summary)) => {
let events: HashMap<_, _> = events
if let Some(name) = line.strip_prefix("cfn=__iai_bench_") {
let _calls = lines.next().unwrap().unwrap();
let data = lines.next().unwrap().unwrap();
let data: HashMap<_, _> = events_line
.as_deref()
.expect("Unable to find events in callgrind output file (must appear early)")
.split_whitespace()
.zip(summary.split_whitespace().map(|s| {
.zip(data.trim().split_whitespace().skip(1).map(|s| {
s.parse::<u64>()
.expect("Unable to parse summary line from cachegrind output file")
.expect("Unable to parse summary line from callgrind output file")
}))
.collect();

CachegrindStats {
instruction_reads: events["Ir"],
instruction_l1_misses: events["I1mr"],
instruction_cache_misses: events["ILmr"],
data_reads: events["Dr"],
data_l1_read_misses: events["D1mr"],
data_cache_read_misses: events["DLmr"],
data_writes: events["Dw"],
data_l1_write_misses: events["D1mw"],
data_cache_write_misses: events["DLmw"],
}
res.insert(
name.to_owned(),
CallgrindStats {
instruction_reads: *data.get("Ir").unwrap_or(&0),
instruction_l1_misses: *data.get("I1mr").unwrap_or(&0),
instruction_cache_misses: *data.get("ILmr").unwrap_or(&0),
data_reads: *data.get("Dr").unwrap_or(&0),
data_l1_read_misses: *data.get("D1mr").unwrap_or(&0),
data_cache_read_misses: *data.get("DLmr").unwrap_or(&0),
data_writes: *data.get("Dw").unwrap_or(&0),
data_l1_write_misses: *data.get("D1mw").unwrap_or(&0),
data_cache_write_misses: *data.get("DLmw").unwrap_or(&0),
},
);
}
_ => panic!("Unable to parse cachegrind output file"),
}
res
}

#[derive(Clone, Debug)]
struct CachegrindStats {
struct CallgrindStats {
instruction_reads: u64,
instruction_l1_misses: u64,
instruction_cache_misses: u64,
Expand All @@ -218,11 +232,12 @@ struct CachegrindStats {
data_l1_write_misses: u64,
data_cache_write_misses: u64,
}
impl CachegrindStats {
impl CallgrindStats {
/// Number of accesses counted as going to RAM: the sum of the instruction,
/// data-read and data-write cache-miss counters stored on `self`.
pub fn ram_accesses(&self) -> u64 {
    let instruction_misses = self.instruction_cache_misses;
    let data_misses = self.data_cache_read_misses + self.data_cache_write_misses;
    instruction_misses + data_misses
}
pub fn summarize(&self) -> CachegrindSummary {

pub fn summarize(&self) -> CallgrindSummary {
let ram_hits = self.ram_accesses();
let l3_accesses =
self.instruction_l1_misses + self.data_l1_read_misses + self.data_l1_write_misses;
Expand All @@ -231,36 +246,21 @@ impl CachegrindStats {
let total_memory_rw = self.instruction_reads + self.data_reads + self.data_writes;
let l1_hits = total_memory_rw - (ram_hits + l3_hits);

CachegrindSummary {
CallgrindSummary {
l1_hits,
l3_hits,
ram_hits,
}
}

#[rustfmt::skip]
pub fn subtract(&self, calibration: &CachegrindStats) -> CachegrindStats {
CachegrindStats {
instruction_reads: self.instruction_reads.saturating_sub(calibration.instruction_reads),
instruction_l1_misses: self.instruction_l1_misses.saturating_sub(calibration.instruction_l1_misses),
instruction_cache_misses: self.instruction_cache_misses.saturating_sub(calibration.instruction_cache_misses),
data_reads: self.data_reads.saturating_sub(calibration.data_reads),
data_l1_read_misses: self.data_l1_read_misses.saturating_sub(calibration.data_l1_read_misses),
data_cache_read_misses: self.data_cache_read_misses.saturating_sub(calibration.data_cache_read_misses),
data_writes: self.data_writes.saturating_sub(calibration.data_writes),
data_l1_write_misses: self.data_l1_write_misses.saturating_sub(calibration.data_l1_write_misses),
data_cache_write_misses: self.data_cache_write_misses.saturating_sub(calibration.data_cache_write_misses),
}
}
}

#[derive(Clone, Debug)]
struct CachegrindSummary {
struct CallgrindSummary {
l1_hits: u64,
l3_hits: u64,
ram_hits: u64,
}
impl CachegrindSummary {
impl CallgrindSummary {
fn cycles(&self) -> u64 {
// Uses Itamar Turner-Trauring's formula from https://pythonspeed.com/articles/consistent-benchmarking-in-ci/
self.l1_hits + (5 * self.l3_hits) + (35 * self.ram_hits)
Expand All @@ -274,46 +274,26 @@ pub fn runner(benches: &[&(&'static str, fn())]) {
let executable = args_iter.next().unwrap();

if let Some("--iai-run") = args_iter.next().as_deref() {
// In this branch, we're running under cachegrind, so execute the benchmark as quickly as
// possible and exit
let index: isize = args_iter.next().unwrap().parse().unwrap();

// -1 is used as a signal to do nothing and return. By recording an empty benchmark, we can
// subtract out the overhead from startup and dispatching to the right benchmark.
if index == -1 {
return;
// In this branch, we're running under callgrind
for (_name, func) in benches.iter() {
func();
}

let index = index as usize;

(benches[index].1)();
return;
}

// Otherwise we're running normally, under cargo

if !check_valgrind() {
return;
}

let arch = get_arch();

let allow_aslr = std::env::var_os("IAI_ALLOW_ASLR").is_some();

let (calibration, old_calibration) =
run_bench(&arch, &executable, -1, "iai_calibration", allow_aslr);
let (stats, old_stats) = run_benches(&benches, &executable, allow_aslr);

for (i, (name, _func)) in benches.iter().enumerate() {
for (name, _func) in benches.iter() {
println!("{}", name);
let (stats, old_stats) = run_bench(&arch, &executable, i as isize, name, allow_aslr);
let (stats, old_stats) = (
stats.subtract(&calibration),
match (&old_stats, &old_calibration) {
(Some(old_stats), Some(old_calibration)) => {
Some(old_stats.subtract(old_calibration))
}
_ => None,
},
);
let stats = stats.get(*name).unwrap();
let old_stats = old_stats.get(*name);

fn signed_short(n: f64) -> String {
let n_abs = n.abs();
Expand Down Expand Up @@ -358,7 +338,7 @@ pub fn runner(benches: &[&(&'static str, fn())]) {
}
);
let summary = stats.summarize();
let old_summary = old_stats.map(|stat| stat.summarize());
let old_summary = old_stats.clone().map(|stat| stat.summarize());
println!(
" L1 Accesses: {:>15}{}",
summary.l1_hits,
Expand Down
3 changes: 3 additions & 0 deletions src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ macro_rules! main {
( $( $func_name:ident ),+ $(,)* ) => {
mod iai_wrappers {
$(
#[no_mangle]
#[inline(never)]
#[export_name = concat!("__iai_bench_", stringify!($func_name))]
pub fn $func_name() {
let _ = $crate::black_box(super::$func_name());
}
Expand Down