#!/bin/bash # # iolatency - summarize block device I/O latency as a histogram. # Written using Linux ftrace. # # This shows the distribution of latency, allowing modes and latency outliers # to be identified and studied. # # USAGE: ./iolatency [-hQT] [-d device] [-i iotype] [interval [count]] # # REQUIREMENTS: FTRACE CONFIG and block:block_rq_* tracepoints, which you may # already have on recent kernels. # # OVERHEAD: block device I/O issue and completion events are traced and buffered # in-kernel, then processed and summarized in user space. There may be # measurable overhead with this approach, relative to the block device IOPS. # # This was written as a proof of concept for ftrace. # # From perf-tools: https://github.com/brendangregg/perf-tools # # COPYRIGHT: Copyright (c) 2014 Brendan Gregg. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # (http://www.gnu.org/copyleft/gpl.html) # # 20-Jul-2014 Brendan Gregg Created this. ### default variables setglobal tracing = '/sys/kernel/debug/tracing' setglobal flock = '/var/tmp/.ftrace-lock' setglobal bufsize_kb = '4096' setglobal opt_device = '0'; setglobal device = ''; setglobal opt_iotype = '0'; setglobal iotype = ''; setglobal opt_timestamp = '0' setglobal opt_interval = '0'; setglobal interval = '1'; setglobal opt_count = '0'; setglobal count = '0'; setglobal opt_queue = '0' trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section proc usage { cat << """ >&2 USAGE: iolatency [-hQT] [-d device] [-i iotype] [interval [count]] -d device # device string (eg, "202,1) -i iotype # match type (eg, '*R*' for all reads) -Q # use queue insert as start time -T # timestamp on output -h # this usage message interval # summary interval, seconds (default 1) count # number of summaries eg, iolatency # summarize latency every second iolatency -Q # include block I/O queue time iolatency 5 2 # 2 x 5 second summaries iolatency -i '*R*' # trace reads iolatency -d 202,1 # trace device 202,1 only See the man page and example file for more info. """ > !2 USAGE: iolatency [-hQT] [-d device] [-i iotype] [interval [count]] -d device # device string (eg, "202,1) -i iotype # match type (eg, '*R*' for all reads) -Q # use queue insert as start time -T # timestamp on output -h # this usage message interval # summary interval, seconds (default 1) count # number of summaries eg, iolatency # summarize latency every second iolatency -Q # include block I/O queue time iolatency 5 2 # 2 x 5 second summaries iolatency -i '*R*' # trace reads iolatency -d 202,1 # trace device 202,1 only See the man page and example file for more info. END exit } proc warn { if ! eval @Argv { echo >&2 "WARNING: command failed \"$ifsjoin(Argv)> !2 "WARNING: command failed \"$@\"" } } proc end { # disable tracing echo !2 >/dev/null echo "Ending tracing..." !2 >/dev/null cd $tracing warn "echo 0 > events/block/$b_start/enable" warn "echo 0 > events/block/block_rq_complete/enable" if sh-expr ' opt_device || opt_iotype ' { warn "echo 0 > events/block/$b_start/filter" warn "echo 0 > events/block/block_rq_complete/filter" } warn "echo > trace" sh-expr ' wroteflock ' && warn "rm $flock" } proc die { echo >&2 @Argv> !2 "$@" exit 1 } proc edie { # die with a quiet end() echo >&2 @Argv> !2 "$@" exec >/dev/null !2 > !1 end exit 1 } ### process options while getopts d:hi:QT opt { match $opt { with d setglobal opt_device = '1'; setglobal device = $OPTARG with i setglobal opt_iotype = '1'; setglobal iotype = $OPTARG with Q setglobal opt_queue = '1' with T setglobal opt_timestamp = '1' with h|? usage } } shift $shExpr(' $OPTIND - 1 ') if sh-expr ' $# ' { setglobal opt_interval = '1' setglobal interval = $1 shift } if sh-expr ' $# ' { setglobal opt_count = '1' setglobal count = $1 } if sh-expr ' opt_device ' { setglobal major = $(device%,*) setglobal minor = $(device#*,) setglobal dev = $shExpr(' (major << 20) + minor ') } if sh-expr ' opt_queue ' { setglobal b_start = 'block_rq_insert' } else { setglobal b_start = 'block_rq_issue' } ### select awk [[ -x /usr/bin/mawk ]] && setglobal awk = ''mawk -W interactive'' || setglobal awk = 'awk' ### check permissions cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" ### ftrace lock [[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $[cat $flock] $flock" echo $Pid > $flock || die "ERROR: unable to write $flock." setglobal wroteflock = '1' ### setup and begin tracing warn "echo nop > current_tracer" warn "echo $bufsize_kb > buffer_size_kb" setglobal filter = '' if sh-expr ' opt_iotype ' { setglobal filter = ""rwbs ~ \"$iotype"\"" } if sh-expr ' opt_device ' { [[ "$filter" != "" ]] && setglobal filter = ""$filter && "" setglobal filter = ""$(filter)dev == $dev"" } if sh-expr ' opt_iotype || opt_device ' { if ! echo $filter > events/block/$b_start/filter || \ ! echo $filter > events/block/block_rq_complete/filter { edie "ERROR: setting -d or -t filter. Exiting." } } if ! echo 1 > events/block/$b_start/enable || \ ! echo 1 > events/block/block_rq_complete/enable { edie "ERROR: enabling block I/O tracepoints. Exiting." } setglobal etext = '' sh-expr ' !opt_count ' && setglobal etext = '" Ctrl-C to end.'" echo "Tracing block I/O. Output every $interval seconds.$etext" # # Determine output format. It may be one of the following (newest first): # TASK-PID CPU# |||| TIMESTAMP FUNCTION # TASK-PID CPU# TIMESTAMP FUNCTION # To differentiate between them, the number of header fields is counted, # and an offset set, to skip the extra column when needed. # setglobal offset = $[$awk 'BEGIN { o = 0; } $1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; } $2 ~ /TASK/ { print o; exit }' trace] ### print trace buffer warn "echo > trace" setglobal i = '0' while sh-expr ' !opt_count || (i < count) ' { sh-expr ' i++ ' sleep $interval # snapshots were added in 3.10 if [[ -x snapshot ]] { echo 1 > snapshot echo > trace cat snapshot } else { cat trace echo > trace } sh-expr ' opt_timestamp ' && printf "time %(%H:%M:%S)T:\n" -1 echo "tick" } | \ $awk -v o=$offset -v opt_timestamp=$opt_timestamp -v b_start=$b_start ' function star(sval, smax, swidth) { stars = "" if (smax == 0) return "" for (si = 0; si < (swidth * sval / smax); si++) { stars = stars "#" } return stars } BEGIN { max_i = 0 } # common fields $1 != "#" { time = $(3+o); sub(":", "", time) dev = $(5+o) } # block I/O request $1 != "#" && $0 ~ b_start { # # example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar] # The cmd field "()" might contain multiple words (hex), # hence stepping from the right (NF-3). # loc = $(NF-3) starts[dev, loc] = time next } # block I/O completion $1 != "#" && $0 ~ /rq_complete/ { # # example: (fields1..4+o) 202,1 W () 12862256 + 8 [0] # dir = $(6+o) loc = $(NF-3) if (starts[dev, loc] > 0) { latency_ms = 1000 * (time - starts[dev, loc]) i = 0 for (ms = 1; latency_ms > ms; ms *= 2) { i++ } hist[i]++ if (i > max_i) max_i = i delete starts[dev, loc] } next } # timestamp $1 == "time" { lasttime = $2 } # print summary $1 == "tick" { print "" if (opt_timestamp) print lasttime # find max value max_v = 0 for (i = 0; i <= max_i; i++) { if (hist[i] > max_v) max_v = hist[i] } # print histogram printf "%8s .. %-8s: %-8s |%-38s|\n", ">=(ms)", "<(ms)", "I/O", "Distribution" ms = 1 from = 0 for (i = 0; i <= max_i; i++) { printf "%8d -> %-8d: %-8d |%-38s|\n", from, ms, hist[i], star(hist[i], max_v, 38) from = ms ms *= 2 } fflush() delete hist delete starts # invalid if events missed between snapshots max_i = 0 } $0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" } ' ### end tracing end