#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # fileslower Trace slow synchronous file reads and writes. # For Linux, uses BCC, eBPF. # # USAGE: fileslower [-h] [-p PID] [-a] [min_ms] # # This script uses kernel dynamic tracing of synchronous reads and writes # at the VFS interface, to identify slow file reads and writes for any file # system. # # This works by tracing __vfs_read() and __vfs_write(), and filtering for # synchronous I/O (the path to new_sync_read() and new_sync_write()), and # for I/O with filenames. This approach provides a view of just two file # system request types. There are typically many others: asynchronous I/O, # directory operations, file handle operations, etc, that this tool does not # instrument. # # WARNING: This traces VFS reads and writes, which can be extremely frequent, # and so the overhead of this tool can become severe depending on the # workload. # # By default, a minimum millisecond threshold of 10 is used. # # Copyright 2016 Netflix, Inc. # Licensed under the Apache License, Version 2.0 (the "License") # # 06-Feb-2016 Brendan Gregg Created this. from __future__ import print_function from bcc import BPF import argparse import ctypes as ct import time # arguments examples = """examples: ./fileslower # trace sync file I/O slower than 10 ms (default) ./fileslower 1 # trace sync file I/O slower than 1 ms ./fileslower -p 185 # trace PID 185 only """ parser = argparse.ArgumentParser( description="Trace slow synchronous file reads and writes", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", help="trace this PID only") parser.add_argument("-a", "--all-files", action="store_true", help="include non-regular file types (sockets, FIFOs, etc)") parser.add_argument("min_ms", nargs="?", default='10', help="minimum I/O duration to trace, in ms (default 10)") args = parser.parse_args() min_ms = int(args.min_ms) tgid = args.tgid debug = 0 # define BPF program bpf_text = """ #include #include #include enum trace_mode { MODE_READ, MODE_WRITE }; struct val_t { u32 sz; u64 ts; u32 name_len; // de->d_name.name may point to de->d_iname so limit len accordingly char name[DNAME_INLINE_LEN]; char comm[TASK_COMM_LEN]; }; struct data_t { enum trace_mode mode; u32 pid; u32 sz; u64 delta_us; u32 name_len; char name[DNAME_INLINE_LEN]; char comm[TASK_COMM_LEN]; }; BPF_HASH(entryinfo, pid_t, struct val_t); BPF_PERF_OUTPUT(events); // store timestamp and size on entry static int trace_rw_entry(struct pt_regs *ctx, struct file *file, char __user *buf, size_t count) { u32 tgid = bpf_get_current_pid_tgid() >> 32; if (TGID_FILTER) return 0; u32 pid = bpf_get_current_pid_tgid(); // skip I/O lacking a filename struct dentry *de = file->f_path.dentry; int mode = file->f_inode->i_mode; if (de->d_name.len == 0 || TYPE_FILTER) return 0; // store size and timestamp by pid struct val_t val = {}; val.sz = count; val.ts = bpf_ktime_get_ns(); val.name_len = de->d_name.len; bpf_probe_read(&val.name, sizeof(val.name), (void *)de->d_name.name); bpf_get_current_comm(&val.comm, sizeof(val.comm)); entryinfo.update(&pid, &val); return 0; } int trace_read_entry(struct pt_regs *ctx, struct file *file, char __user *buf, size_t count) { // skip non-sync I/O; see kernel code for __vfs_read() if (!(file->f_op->read_iter)) return 0; return trace_rw_entry(ctx, file, buf, count); } int trace_write_entry(struct pt_regs *ctx, struct file *file, char __user *buf, size_t count) { // skip non-sync I/O; see kernel code for __vfs_write() if (!(file->f_op->write_iter)) return 0; return trace_rw_entry(ctx, file, buf, count); } // output static int trace_rw_return(struct pt_regs *ctx, int type) { struct val_t *valp; u32 pid = bpf_get_current_pid_tgid(); valp = entryinfo.lookup(&pid); if (valp == 0) { // missed tracing issue or filtered return 0; } u64 delta_us = (bpf_ktime_get_ns() - valp->ts) / 1000; entryinfo.delete(&pid); if (delta_us < MIN_US) return 0; struct data_t data = {}; data.mode = type; data.pid = pid; data.sz = valp->sz; data.delta_us = delta_us; data.name_len = valp->name_len; bpf_probe_read(&data.name, sizeof(data.name), valp->name); bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm); events.perf_submit(ctx, &data, sizeof(data)); return 0; } int trace_read_return(struct pt_regs *ctx) { return trace_rw_return(ctx, MODE_READ); } int trace_write_return(struct pt_regs *ctx) { return trace_rw_return(ctx, MODE_WRITE); } """ bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000)) if args.tgid: bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid) else: bpf_text = bpf_text.replace('TGID_FILTER', '0') if args.all_files: bpf_text = bpf_text.replace('TYPE_FILTER', '0') else: bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') if debug: print(bpf_text) # initialize BPF b = BPF(text=bpf_text,) # I'd rather trace these via new_sync_read/new_sync_write (which used to be # do_sync_read/do_sync_write), but those became static. So trace these from # the parent functions, at the cost of more overhead, instead. # Ultimately, we should be using [V]FS tracepoints. b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") try: b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") except: # older kernels don't have __vfs_write so try vfs_write instead b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") TASK_COMM_LEN = 16 # linux/sched.h DNAME_INLINE_LEN = 32 # linux/dcache.h class Data(ct.Structure): _fields_ = [ ("mode", ct.c_int), ("pid", ct.c_uint), ("sz", ct.c_uint), ("delta_us", ct.c_ulonglong), ("name_len", ct.c_uint), ("name", ct.c_char * DNAME_INLINE_LEN), ("comm", ct.c_char * TASK_COMM_LEN), ] mode_s = { 0: 'R', 1: 'W', } # header print("Tracing sync read/writes slower than %d ms" % min_ms) print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D", "BYTES", "LAT(ms)", "FILENAME")) start_ts = time.time() def print_event(cpu, data, size): event = ct.cast(data, ct.POINTER(Data)).contents ms = float(event.delta_us) / 1000 name = event.name.decode() if event.name_len > DNAME_INLINE_LEN: name = name[:-3] + "..." print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( time.time() - start_ts, event.comm.decode(), event.pid, mode_s[event.mode], event.sz, ms, name)) b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll()