Skip to content

Commit

Permalink
biotop
Browse files Browse the repository at this point in the history
  • Loading branch information
brendangregg committed Feb 7, 2016
1 parent 31cd104 commit 6f075b9
Show file tree
Hide file tree
Showing 4 changed files with 489 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ Tools:

- tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt).
- tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
- tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
- tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
- tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
- tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt).
Expand Down
90 changes: 90 additions & 0 deletions man/man8/biotop.8
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
.TH biotop 8 "2016-02-06" "USER COMMANDS"
.SH NAME
biotop \- Block device (disk) I/O by process top.
.SH SYNOPSIS
.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
.SH DESCRIPTION
This is top for disks.

This traces block device I/O (disk I/O), and prints a per-process summary every
interval (by default, 1 second). The summary is sorted on the top disk
consumers by throughput (Kbytes). The PID and process name shown are measured
from when the I/O was first created, which usually identifies the responsible
process.

For efficiency, this uses in-kernel eBPF maps to cache process details (PID and
comm) by I/O request, as well as a starting timestamp for calculating I/O
latency, and the final summary.

This works by tracing various kernel blk_*() functions using dynamic tracing,
and will need updating to match any changes to these functions.

Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH EXAMPLES
.TP
Summarize block device I/O by process, 1 second screen refresh:
#
.B biotop
.TP
Don't clear the screen:
#
.B biotop -C
.TP
5 second summaries, 10 times only:
#
.B biotop 5 10
.SH FIELDS
.TP
loadavg:
The contents of /proc/loadavg
.TP
PID
Cached process ID, if present. This usually (but isn't guaranteed) to identify
the responsible process for the I/O.
.TP
COMM
Cached process name, if present. This usually (but isn't guaranteed) to identify
the responsible process for the I/O.
.TP
D
Direction: R == read, W == write.
.TP
MAJ
Major device number.
.TP
MIN
Minor device number.
.TP
DISK
Disk device name.
.TP
I/O
Number of I/O during the interval.
.TP
Kbytes
Total Kbytes for these I/O, during the interval.
.TP
AVGms
Average time for the I/O (latency) from the issue to the device, to its
completion, in milliseconds.
.SH OVERHEAD
Since block device I/O usually has a relatively low frequency (< 10,000/s),
the overhead for this tool is expected to be low or negligible. For high IOPS
storage systems, test and quantify before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
biosnoop(8), biolatency(8), iostat(1)
211 changes: 211 additions & 0 deletions tools/biotop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biotop block device (disk) I/O by process.
# For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016 Brendan Gregg Created this.

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import signal
from subprocess import call

# arguments
examples = """examples:
./biotop # block device I/O top, 1 second refresh
./biotop -C # don't clear the screen
./biotop 5 # 5 second summaries
./biotop 5 10 # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
description="Block device (disk) I/O by process",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
help="don't clear the screen")
parser.add_argument("-r", "--maxrows", default=20,
help="maximum rows to print, default 20")
parser.add_argument("interval", nargs="?", default=1,
help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
help="number of outputs")
args = parser.parse_args()
interval = int(args.interval)
countdown = int(args.count)
maxrows = int(args.maxrows)
clear = not int(args.noclear)

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"

# signal handler
def signal_ignore(signal, frame):
print()

# load BPF program
b = BPF(text="""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
// a value of one map, and a key for another:
struct who_t {
u32 pid;
char name[TASK_COMM_LEN];
};
struct info_t {
u32 pid;
int type;
int major;
int minor;
char name[TASK_COMM_LEN];
};
struct val_t {
u64 bytes;
u64 us;
u32 io;
};
BPF_HASH(start, struct request *);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
struct who_t who = {};
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
who.pid = bpf_get_current_pid_tgid();
whobyreq.update(&req, &who);
}
return 0;
}
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts;
ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp;
// fetch timestamp and calculate delta
tsp = start.lookup(&req);
if (tsp == 0) {
return 0; // missed tracing issue
}
struct who_t *whop;
struct val_t *valp, zero = {};
u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
// setup info_t key
struct info_t info = {};
info.major = req->rq_disk->major;
info.minor = req->rq_disk->first_minor;
info.type = req->cmd_flags & REQ_WRITE;
whop = whobyreq.lookup(&req);
if (whop == 0) {
// missed pid who, save stats as pid 0
valp = counts.lookup_or_init(&info, &zero);
} else {
info.pid = whop->pid;
__builtin_memcpy(&info.name, whop->name, sizeof(info.name));
valp = counts.lookup_or_init(&info, &zero);
}
// save stats
valp->us += delta_us;
valp->bytes += req->__data_len;
valp->io++;
start.delete(&req);
whobyreq.delete(&req);
return 0;
}
""", debug=0)
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
fn_name="trace_req_completion")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

# cache disk major,minor -> diskname
disklookup = {}
with open(diskstats) as stats:
for line in stats:
a = line.split()
disklookup[a[0] + "," + a[1]] = a[2]

# output
exiting = 0
while 1:
try:
sleep(interval)
except KeyboardInterrupt:
exiting = 1

# header
if clear:
call("clear")
else:
print()
with open(loadavg) as stats:
print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
"D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

# by-PID output
counts = b.get_table("counts")
line = 0
for k, v in reversed(sorted(counts.items(),
key=lambda counts: counts[1].bytes)):

# lookup disk
disk = str(k.major) + "," + str(k.minor)
if disk in disklookup:
diskname = disklookup[disk]
else:
diskname = "?"

# print line
avg_ms = (float(v.us) / 1000) / v.io
print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid, k.name,
"W" if k.type else "R", k.major, k.minor, diskname, v.io,
v.bytes / 1024, avg_ms))

line += 1
if line >= maxrows:
break
counts.clear()

countdown -= 1
if exiting or countdown == 0:
print("Detaching...")
exit()
Loading

0 comments on commit 6f075b9

Please sign in to comment.