#!/bin/bash # # iosnoop - trace block device I/O. # Written using Linux ftrace. # # This traces disk I/O at the block device interface, using the block: # tracepoints. This can help characterize the I/O requested for the storage # devices and their resulting performance. I/O completions can also be studied # event-by-event for debugging disk and controller I/O scheduling issues. # # USAGE: ./iosnoop [-hQst] [-d device] [-i iotype] [-p pid] [-n name] [duration] # # Run "iosnoop -h" for full usage. # # REQUIREMENTS: FTRACE CONFIG, block:block_rq_* tracepoints (you may # already have these on recent kernels). # # OVERHEAD: By default, iosnoop works without buffering, printing I/O events # as they happen (uses trace_pipe), context switching and consuming CPU to do # so. This has a limit of about 10,000 IOPS (depending on your platform), at # which point iosnoop will be consuming 1 CPU. The duration mode uses buffering, # and can handle much higher IOPS rates, however, the buffer has a limit of # about 50,000 I/O, after which events will be dropped. You can tune this with # bufsize_kb, which is per-CPU. Also note that the "-n" option is currently # post-filtered, so all events are traced. # # This was written as a proof of concept for ftrace. It would be better written # using perf_events (after some capabilities are added), which has a better # buffering policy, or a tracer such as SystemTap or ktap. # # From perf-tools: https://github.com/brendangregg/perf-tools # # See the iosnoop(8) man page (in perf-tools) for more info. # # COPYRIGHT: Copyright (c) 2014 Brendan Gregg. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # (http://www.gnu.org/copyleft/gpl.html) # # 12-Jul-2014 Brendan Gregg Created this. ### default variables setglobal tracing = '/sys/kernel/debug/tracing' setglobal flock = '/var/tmp/.ftrace-lock' setglobal bufsize_kb = '4096' setglobal opt_duration = '0'; setglobal duration = ''; setglobal opt_name = '0'; setglobal name = ''; setglobal opt_pid = '0'; setglobal pid = ''; setglobal ftext = '' setglobal opt_start = '0'; setglobal opt_end = '0'; setglobal opt_device = '0'; setglobal device = ''; setglobal opt_iotype = '0'; setglobal iotype = '' setglobal opt_queue = '0' trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section proc usage { cat << """ >&2 USAGE: iosnoop [-hQst] [-d device] [-i iotype] [-p PID] [-n name] [duration] -d device # device string (eg, "202,1) -i iotype # match type (eg, '*R*' for all reads) -n name # process name to match on I/O issue -p PID # PID to match on I/O issue -Q # use queue insert as start time -s # include start time of I/O (s) -t # include completion time of I/O (s) -h # this usage message duration # duration seconds, and use buffers eg, iosnoop # watch block I/O live (unbuffered) iosnoop 1 # trace 1 sec (buffered) iosnoop -Q # include queueing time in LATms iosnoop -ts # include start and end timestamps iosnoop -i '*R*' # trace reads iosnoop -p 91 # show I/O issued when PID 91 is on-CPU iosnoop -Qp 91 # show I/O queued by PID 91, queue time See the man page and example file for more info. """ > !2 USAGE: iosnoop [-hQst] [-d device] [-i iotype] [-p PID] [-n name] [duration] -d device # device string (eg, "202,1) -i iotype # match type (eg, '*R*' for all reads) -n name # process name to match on I/O issue -p PID # PID to match on I/O issue -Q # use queue insert as start time -s # include start time of I/O (s) -t # include completion time of I/O (s) -h # this usage message duration # duration seconds, and use buffers eg, iosnoop # watch block I/O live (unbuffered) iosnoop 1 # trace 1 sec (buffered) iosnoop -Q # include queueing time in LATms iosnoop -ts # include start and end timestamps iosnoop -i '*R*' # trace reads iosnoop -p 91 # show I/O issued when PID 91 is on-CPU iosnoop -Qp 91 # show I/O queued by PID 91, queue time See the man page and example file for more info. END exit } proc warn { if ! eval @Argv { echo >&2 "WARNING: command failed \"$ifsjoin(Argv)> !2 "WARNING: command failed \"$@\"" } } proc end { # disable tracing echo !2 >/dev/null echo "Ending tracing..." !2 >/dev/null cd $tracing warn "echo 0 > events/block/$b_start/enable" warn "echo 0 > events/block/block_rq_complete/enable" if sh-expr ' opt_device || opt_iotype || opt_pid ' { warn "echo 0 > events/block/$b_start/filter" warn "echo 0 > events/block/block_rq_complete/filter" } warn "echo > trace" sh-expr ' wroteflock ' && warn "rm $flock" } proc die { echo >&2 @Argv> !2 "$@" exit 1 } proc edie { # die with a quiet end() echo >&2 @Argv> !2 "$@" exec >/dev/null !2 > !1 end exit 1 } ### process options while getopts d:hi:n:p:Qst opt { match $opt { with d setglobal opt_device = '1'; setglobal device = $OPTARG with i setglobal opt_iotype = '1'; setglobal iotype = $OPTARG with n setglobal opt_name = '1'; setglobal name = $OPTARG with p setglobal opt_pid = '1'; setglobal pid = $OPTARG with Q setglobal opt_queue = '1' with s setglobal opt_start = '1' with t setglobal opt_end = '1' with h|? usage } } shift $shExpr(' $OPTIND - 1 ') if sh-expr ' $# ' { setglobal opt_duration = '1' setglobal duration = $1 shift } if sh-expr ' opt_device ' { setglobal major = $(device%,*) setglobal minor = $(device#*,) setglobal dev = $shExpr(' (major << 20) + minor ') } ### option logic sh-expr ' opt_pid && opt_name ' && die "ERROR: use either -p or -n." sh-expr ' opt_pid ' && setglobal ftext = "" issued by PID $pid"" sh-expr ' opt_name ' && setglobal ftext = "" issued by process name \"$name"\"" if sh-expr ' opt_duration ' { echo "Tracing block I/O$ftext for $duration seconds (buffered)..." } else { echo "Tracing block I/O$ftext. Ctrl-C to end." } if sh-expr ' opt_queue ' { setglobal b_start = 'block_rq_insert' } else { setglobal b_start = 'block_rq_issue' } ### select awk sh-expr ' opt_duration ' && setglobal use = 'mawk' || setglobal use = 'gawk' # workaround for mawk fflush() [[ -x /usr/bin/$use ]] && setglobal awk = $use || setglobal awk = 'awk' setglobal wroteflock = '1' ### check permissions cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" ### ftrace lock [[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $[cat $flock] $flock" echo $Pid > $flock || die "ERROR: unable to write $flock." ### setup and begin tracing echo nop > current_tracer warn "echo $bufsize_kb > buffer_size_kb" setglobal filter = '' if sh-expr ' opt_iotype ' { setglobal filter = ""rwbs ~ \"$iotype"\"" } if sh-expr ' opt_device ' { [[ "$filter" != "" ]] && setglobal filter = ""$filter && "" setglobal filter = ""$(filter)dev == $dev"" } setglobal filter_i = $filter if sh-expr ' opt_pid ' { [[ "$filter_i" != "" ]] && setglobal filter_i = ""$filter_i && "" setglobal filter_i = ""$(filter_i)common_pid == $pid"" [[ "$filter" == "" ]] && setglobal filter = '0' } if sh-expr ' opt_iotype || opt_device || opt_pid ' { if ! echo $filter_i > events/block/$b_start/filter || \ ! echo $filter > events/block/block_rq_complete/filter { edie "ERROR: setting -d or -t filter. Exiting." } } if ! echo 1 > events/block/$b_start/enable || \ ! echo 1 > events/block/block_rq_complete/enable { edie "ERROR: enabling block I/O tracepoints. Exiting." } sh-expr ' opt_start ' && printf "%-15s " "STARTs" sh-expr ' opt_end ' && printf "%-15s " "ENDs" printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n" \ "COMM" "PID" "TYPE" "DEV" "BLOCK" "BYTES" "LATms" # # Determine output format. It may be one of the following (newest first): # TASK-PID CPU# |||| TIMESTAMP FUNCTION # TASK-PID CPU# TIMESTAMP FUNCTION # To differentiate between them, the number of header fields is counted, # and an offset set, to skip the extra column when needed. # setglobal offset = $[$awk 'BEGIN { o = 0; } $1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; } $2 ~ /TASK/ { print o; exit }' trace] ### print trace buffer warn "echo > trace" shell { if sh-expr ' opt_duration ' { # wait then dump buffer sleep $duration cat trace } else { # print buffer live cat trace_pipe } } | $awk -v o=$offset -v opt_name=$opt_name -v name=$name \ -v opt_duration=$opt_duration -v opt_start=$opt_start -v opt_end=$opt_end \ -v b_start=$b_start ' # common fields $1 != "#" { # task name can contain dashes comm = pid = $1 sub(/-[0-9][0-9]*/, "", comm) sub(/.*-/, "", pid) time = $(3+o); sub(":", "", time) dev = $(5+o) } # block I/O request $1 != "#" && $0 ~ b_start { if (opt_name && match(comm, name) == 0) next # # example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar] # The cmd field "()" might contain multiple words (hex), # hence stepping from the right (NF-3). # loc = $(NF-3) starts[dev, loc] = time comms[dev, loc] = comm pids[dev, loc] = pid next } # block I/O completion $1 != "#" && $0 ~ /rq_complete/ { # # example: (fields1..4+o) 202,1 W () 12862256 + 8 [0] # dir = $(6+o) loc = $(NF-3) nsec = $(NF-1) if (starts[dev, loc] > 0) { latency = sprintf("%.2f", 1000 * (time - starts[dev, loc])) comm = comms[dev, loc] pid = pids[dev, loc] if (opt_start) printf "%-15s ", starts[dev, loc] if (opt_end) printf "%-15s ", time printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n", comm, pid, dir, dev, loc, nsec * 512, latency if (!opt_duration) fflush() delete starts[dev, loc] delete comms[dev, loc] delete pids[dev, loc] } next } $0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" } ' ### end tracing end