diff --git a/man/man8/fileslower.8 b/man/man8/fileslower.8 index 716d6d06a9fc..169013bdc0fb 100644 --- a/man/man8/fileslower.8 +++ b/man/man8/fileslower.8 @@ -2,7 +2,7 @@ .SH NAME fileslower \- Trace slow synchronous file reads and writes. .SH SYNOPSIS -.B fileslower [\-h] [\-p PID] [min_ms] +.B fileslower [\-h] [\-p PID] [-a] [min_ms] .SH DESCRIPTION This script uses kernel dynamic tracing of synchronous reads and writes at the VFS interface, to identify slow file reads and writes for any file @@ -30,6 +30,9 @@ CONFIG_BPF and bcc. \-p PID Trace this PID only. .TP +\-a +Include non-regular file types in output (sockets, FIFOs, etc). +.TP min_ms Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms. .SH EXAMPLES diff --git a/man/man8/filetop.8 b/man/man8/filetop.8 index 6a1c15c636b5..1cc6a0417ea0 100644 --- a/man/man8/filetop.8 +++ b/man/man8/filetop.8 @@ -4,11 +4,12 @@ filetop \- File reads and writes by filename and process. Top for files. .SH SYNOPSIS .B filetop [\-h] [\-C] [\-r MAXROWS] [\-p PID] [interval] [count] .SH DESCRIPTION -This is top for files. +This is top for files. This traces file reads and writes, and prints a per-file summary every interval (by default, 1 second). The summary is sorted on the highest read -throughput (Kbytes). +throughput (Kbytes). By default only IO on regular files is shown. The -a +option will list all file types (sokets, FIFOs, etc). This uses in-kernel eBPF maps to store per process summaries for efficiency. @@ -29,6 +30,9 @@ Since this uses BPF, only the root user can use this tool. CONFIG_BPF and bcc. .SH OPTIONS .TP +\-a +Include non-regular file types (sockets, FIFOs, etc). +.TP \-C Don't clear the screen. .TP diff --git a/tools/fileslower.py b/tools/fileslower.py index 20c963500b89..2ae4756e6cb0 100755 --- a/tools/fileslower.py +++ b/tools/fileslower.py @@ -4,7 +4,7 @@ # fileslower Trace slow synchronous file reads and writes. # For Linux, uses BCC, eBPF. # -# USAGE: fileslower [-h] [-p PID] [min_ms] +# USAGE: fileslower [-h] [-p PID] [-a] [min_ms] # # This script uses kernel dynamic tracing of synchronous reads and writes # at the VFS interface, to identify slow file reads and writes for any file @@ -32,7 +32,6 @@ from bcc import BPF import argparse import ctypes as ct -import signal import time # arguments @@ -45,19 +44,17 @@ description="Trace slow synchronous file reads and writes", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) -parser.add_argument("-p", "--pid", +parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", help="trace this PID only") +parser.add_argument("-a", "--all-files", action="store_true", + help="include non-regular file types (sockets, FIFOs, etc)") parser.add_argument("min_ms", nargs="?", default='10', help="minimum I/O duration to trace, in ms (default 10)") args = parser.parse_args() min_ms = int(args.min_ms) -pid = args.pid +tgid = args.tgid debug = 0 -# signal handler -def signal_ignore(signal, frame): - print() - # define BPF program bpf_text = """ #include @@ -72,6 +69,8 @@ def signal_ignore(signal, frame): struct val_t { u32 sz; u64 ts; + u32 name_len; + // de->d_name.name may point to de->d_iname so limit len accordingly char name[DNAME_INLINE_LEN]; char comm[TASK_COMM_LEN]; }; @@ -81,6 +80,7 @@ def signal_ignore(signal, frame): u32 pid; u32 sz; u64 delta_us; + u32 name_len; char name[DNAME_INLINE_LEN]; char comm[TASK_COMM_LEN]; }; @@ -92,22 +92,25 @@ def signal_ignore(signal, frame): static int trace_rw_entry(struct pt_regs *ctx, struct file *file, char __user *buf, size_t count) { - u32 pid; - - pid = bpf_get_current_pid_tgid(); - if (FILTER) + u32 tgid = bpf_get_current_pid_tgid() >> 32; + if (TGID_FILTER) return 0; + u32 pid = bpf_get_current_pid_tgid(); + // skip I/O lacking a filename struct dentry *de = file->f_path.dentry; - if (de->d_iname[0] == 0) + int mode = file->f_inode->i_mode; + if (de->d_name.len == 0 || TYPE_FILTER) return 0; // store size and timestamp by pid struct val_t val = {}; val.sz = count; val.ts = bpf_ktime_get_ns(); - __builtin_memcpy(&val.name, de->d_iname, sizeof(val.name)); + + val.name_len = de->d_name.len; + bpf_probe_read(&val.name, sizeof(val.name), (void *)de->d_name.name); bpf_get_current_comm(&val.comm, sizeof(val.comm)); entryinfo.update(&pid, &val); @@ -153,6 +156,7 @@ def signal_ignore(signal, frame): data.pid = pid; data.sz = valp->sz; data.delta_us = delta_us; + data.name_len = valp->name_len; bpf_probe_read(&data.name, sizeof(data.name), valp->name); bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm); events.perf_submit(ctx, &data, sizeof(data)); @@ -172,15 +176,20 @@ def signal_ignore(signal, frame): """ bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000)) -if args.pid: - bpf_text = bpf_text.replace('FILTER', 'pid != %s' % pid) +if args.tgid: + bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid) else: - bpf_text = bpf_text.replace('FILTER', '0') + bpf_text = bpf_text.replace('TGID_FILTER', '0') +if args.all_files: + bpf_text = bpf_text.replace('TYPE_FILTER', '0') +else: + bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') + if debug: print(bpf_text) # initialize BPF -b = BPF(text=bpf_text) +b = BPF(text=bpf_text,) # I'd rather trace these via new_sync_read/new_sync_write (which used to be # do_sync_read/do_sync_write), but those became static. So trace these from @@ -205,6 +214,7 @@ class Data(ct.Structure): ("pid", ct.c_uint), ("sz", ct.c_uint), ("delta_us", ct.c_ulonglong), + ("name_len", ct.c_uint), ("name", ct.c_char * DNAME_INLINE_LEN), ("comm", ct.c_char * TASK_COMM_LEN), ] @@ -216,7 +226,7 @@ class Data(ct.Structure): # header print("Tracing sync read/writes slower than %d ms" % min_ms) -print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D", +print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D", "BYTES", "LAT(ms)", "FILENAME")) start_ts = time.time() @@ -225,10 +235,13 @@ def print_event(cpu, data, size): event = ct.cast(data, ct.POINTER(Data)).contents ms = float(event.delta_us) / 1000 + name = event.name + if event.name_len > DNAME_INLINE_LEN: + name = name[:-3] + "..." print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( time.time() - start_ts, event.comm, event.pid, mode_s[event.mode], - event.sz, ms, event.name)) + event.sz, ms, name)) b["events"].open_perf_buffer(print_event) while 1: diff --git a/tools/fileslower_example.txt b/tools/fileslower_example.txt index 7c2e12be0d71..0e0c7caf2f7e 100644 --- a/tools/fileslower_example.txt +++ b/tools/fileslower_example.txt @@ -105,7 +105,7 @@ locks, run queue latency, etc. These can be explored using other commands. USAGE message: # ./fileslower -h -usage: fileslower [-h] [-p PID] [min_ms] +usage: fileslower.py [-h] [-p PID] [-a] [min_ms] Trace slow synchronous file reads and writes @@ -115,6 +115,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit -p PID, --pid PID trace this PID only + -a, --all-files include non-regular file types examples: ./fileslower # trace sync file I/O slower than 10 ms (default) diff --git a/tools/filetop.py b/tools/filetop.py index 047dd8c254c5..9f7c58df19e0 100755 --- a/tools/filetop.py +++ b/tools/filetop.py @@ -32,11 +32,13 @@ description="File reads and writes by process", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) +parser.add_argument("-a", "--all-files", action="store_true", + help="include non-regular file types (sockets, FIFOs, etc)") parser.add_argument("-C", "--noclear", action="store_true", help="don't clear the screen") parser.add_argument("-r", "--maxrows", default=20, help="maximum rows to print, default 20") -parser.add_argument("-p", "--pid", +parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", help="trace this PID only") parser.add_argument("interval", nargs="?", default=1, help="output interval, in seconds") @@ -61,13 +63,13 @@ def signal_ignore(signal, frame): #include #include -#define MAX_FILE_LEN 32 - // the key for the output summary struct info_t { u32 pid; - char name[TASK_COMM_LEN]; - char file[MAX_FILE_LEN]; + u32 name_len; + char comm[TASK_COMM_LEN]; + // de->d_name.name may point to de->d_iname so limit len accordingly + char name[DNAME_INLINE_LEN]; char type; }; @@ -84,22 +86,23 @@ def signal_ignore(signal, frame): static int do_entry(struct pt_regs *ctx, struct file *file, char __user *buf, size_t count, int is_read) { - u32 pid; - - pid = bpf_get_current_pid_tgid(); - if (FILTER) + u32 tgid = bpf_get_current_pid_tgid() >> 32; + if (TGID_FILTER) return 0; + u32 pid = bpf_get_current_pid_tgid(); + // skip I/O lacking a filename struct dentry *de = file->f_path.dentry; - if (de->d_iname[0] == 0) + int mode = file->f_inode->i_mode; + if (de->d_name.len == 0 || TYPE_FILTER) return 0; // store counts and sizes by pid & file struct info_t info = {.pid = pid}; - bpf_get_current_comm(&info.name, sizeof(info.name)); - __builtin_memcpy(&info.file, de->d_iname, sizeof(info.file)); - int mode = file->f_inode->i_mode; + bpf_get_current_comm(&info.comm, sizeof(info.comm)); + info.name_len = de->d_name.len; + bpf_probe_read(&info.name, sizeof(info.name), (void *)de->d_name.name); if (S_ISREG(mode)) { info.type = 'R'; } else if (S_ISSOCK(mode)) { @@ -134,17 +137,28 @@ def signal_ignore(signal, frame): } """ -if args.pid: - bpf_text = bpf_text.replace('FILTER', 'pid != %s' % args.pid) +if args.tgid: + bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid) +else: + bpf_text = bpf_text.replace('TGID_FILTER', '0') +if args.all_files: + bpf_text = bpf_text.replace('TYPE_FILTER', '0') else: - bpf_text = bpf_text.replace('FILTER', '0') + bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') + if debug: print(bpf_text) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") -b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") +try: + b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") +except: + # older kernels don't have __vfs_write so try vfs_write instead + b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") + +DNAME_INLINE_LEN = 32 # linux/dcache.h print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval) @@ -163,19 +177,22 @@ def signal_ignore(signal, frame): print() with open(loadavg) as stats: print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read())) - print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("PID", "COMM", + print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("TID", "COMM", "READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE")) - # by-PID output + # by-TID output counts = b.get_table("counts") line = 0 for k, v in reversed(sorted(counts.items(), key=lambda counts: counts[1].rbytes)): + name = k.name + if k.name_len > DNAME_INLINE_LEN: + name = name[:-3] + "..." # print line - print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.name, + print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.comm, v.reads, v.writes, v.rbytes / 1024, v.wbytes / 1024, k.type, - k.file)) + name)) line += 1 if line >= maxrows: diff --git a/tools/filetop_example.txt b/tools/filetop_example.txt index 1fa0b1915726..de1591db0bcf 100644 --- a/tools/filetop_example.txt +++ b/tools/filetop_example.txt @@ -24,11 +24,9 @@ PID COMM READS WRITES R_Kb W_Kb T FILE 26628 ld 18 0 72 0 R hibernate.o 26628 ld 16 0 64 0 R suspend.o 26628 ld 16 0 64 0 R snapshot.o -26630 cat 1 0 64 0 O null 26628 ld 16 0 64 0 R qos.o 26628 ld 13 0 52 0 R main.o 26628 ld 12 0 52 0 R swap.o -12421 sshd 3 0 48 0 O ptmx [...] This shows various files read and written during a Linux kernel build. The @@ -40,7 +38,8 @@ While not printed, the average read and write size can be calculated by dividing R_Kb by READS, and the same for writes. The "T" column indicates the type of the file: "R" for regular files, "S" for -sockets, and "O" for other (including pipes). +sockets, and "O" for other (including pipes). By default only regular files are +shown; use the -a option to show all file types. This script works by tracing the vfs_read() and vfs_write() functions using kernel dynamic tracing, which instruments explicit read and write calls. If @@ -65,7 +64,6 @@ Tracing... Output every 1 secs. Hit Ctrl-C to end PID COMM READS WRITES R_Kb W_Kb T FILE 32672 cksum 5006 0 320384 0 R data1 -12296 sshd 2 0 32 0 O ptmx 809 run 2 0 8 0 R nsswitch.conf 811 run 2 0 8 0 R nsswitch.conf 804 chown 2 0 8 0 R nsswitch.conf @@ -89,13 +87,13 @@ PID COMM READS WRITES R_Kb W_Kb T FILE 848 run 2 0 8 0 R nsswitch.conf [...] -This output shows a cksum command reading data1. Note that +This output shows a cksum command reading data1. An optional interval and optional count can also be added to the end of the command line. For example, for 1 second interval, and 3 summaries in total: -# ./filetop -Cr 5 1 3 +# ./filetop -Cr 5 -a 1 3 Tracing... Output every 1 secs. Hit Ctrl-C to end 08:08:20 loadavg: 0.30 0.42 0.31 3/282 5187 @@ -126,15 +124,15 @@ PID COMM READS WRITES R_Kb W_Kb T FILE 5227 run 2 0 8 0 R nsswitch.conf Detaching... -This example has caught heavy socket I/O from an sshd process, showing up as -non-regular file types (the "O" for other, and "S" for socket, in the type -column: "T"). +This example shows the -a option to include all file types. It caught heavy +socket I/O from an sshd process, showing up as non-regular file types (the "O" +for other, and "S" for socket, in the type column: "T"). USAGE message: # ./filetop -h -usage: filetop [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count] +usage: filetop.py [-h] [-a] [-C] [-r MAXROWS] [-p PID] [interval] [count] File reads and writes by process @@ -144,6 +142,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit + -a, --all-files include non-regular file types (sockets, FIFOs, etc) -C, --noclear don't clear the screen -r MAXROWS, --maxrows MAXROWS maximum rows to print, default 20