You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
337 lines
10 KiB
337 lines
10 KiB
#!/usr/bin/env python
|
|
# @lint-avoid-python-3-compatibility-imports
|
|
#
|
|
# funcslower Trace slow kernel or user function calls.
|
|
# For Linux, uses BCC, eBPF.
|
|
#
|
|
# USAGE: funcslower [-h] [-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS]
|
|
# [-T] [-t] [-v] function [function ...]
|
|
#
|
|
# WARNING: This tool traces function calls by instrumenting the entry and
|
|
# return from each function. For commonly-invoked functions like memory allocs
|
|
# or file writes, this can be extremely expensive. Mind the overhead.
|
|
#
|
|
# NOTE: This tool cannot trace nested functions in the same invocation
|
|
# due to instrumentation specifics, only innermost calls will be visible.
|
|
#
|
|
# By default, a minimum millisecond threshold of 1 is used.
|
|
#
|
|
# Copyright 2017, Sasha Goldshtein
|
|
# Licensed under the Apache License, Version 2.0 (the "License")
|
|
#
|
|
# 30-Mar-2017 Sasha Goldshtein Created this.
|
|
|
|
from __future__ import print_function
|
|
from bcc import BPF
|
|
import argparse
|
|
import ctypes as ct
|
|
import time
|
|
|
|
examples = """examples:
|
|
./funcslower vfs_write # trace vfs_write calls slower than 1ms
|
|
./funcslower -m 10 vfs_write # same, but slower than 10ms
|
|
./funcslower -u 10 c:open # trace open calls slower than 10us
|
|
./funcslower -p 135 c:open # trace pid 135 only
|
|
./funcslower c:malloc c:free # trace both malloc and free slower than 1ms
|
|
./funcslower -a 2 c:open # show first two arguments to open
|
|
./funcslower -UK -m 10 c:open # Show user and kernel stack frame of open calls slower than 10ms
|
|
./funcslower -f -UK c:open # Output in folded format for flame graphs
|
|
"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Trace slow kernel or user function calls.",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog=examples)
|
|
parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
|
|
help="trace this PID only")
|
|
parser.add_argument("-m", "--min-ms", type=float, dest="min_ms",
|
|
help="minimum duration to trace (ms)")
|
|
parser.add_argument("-u", "--min-us", type=float, dest="min_us",
|
|
help="minimum duration to trace (us)")
|
|
parser.add_argument("-a", "--arguments", type=int,
|
|
help="print this many entry arguments, as hex")
|
|
parser.add_argument("-T", "--time", action="store_true",
|
|
help="show HH:MM:SS timestamp")
|
|
parser.add_argument("-t", "--timestamp", action="store_true",
|
|
help="show timestamp in seconds at us resolution")
|
|
parser.add_argument("-v", "--verbose", action="store_true",
|
|
help="print the BPF program for debugging purposes")
|
|
parser.add_argument(metavar="function", nargs="+", dest="functions",
|
|
help="function(s) to trace")
|
|
parser.add_argument("--ebpf", action="store_true",
|
|
help=argparse.SUPPRESS)
|
|
parser.add_argument("-f", "--folded", action="store_true",
|
|
help="output folded format, one line per stack (for flame graphs)")
|
|
parser.add_argument("-U", "--user-stack",
|
|
action="store_true", help="output user stack trace")
|
|
parser.add_argument("-K", "--kernel-stack",
|
|
action="store_true", help="output kernel stack trace")
|
|
|
|
args = parser.parse_args()
|
|
# fractions are allowed, but rounded to an integer nanosecond
|
|
if args.min_ms:
|
|
duration_ns = int(args.min_ms * 1000000)
|
|
elif args.min_us:
|
|
duration_ns = int(args.min_us * 1000)
|
|
else:
|
|
duration_ns = 1000000 # default to 1ms
|
|
|
|
bpf_text = """
|
|
#include <uapi/linux/ptrace.h>
|
|
#include <linux/sched.h> // for TASK_COMM_LEN
|
|
|
|
struct entry_t {
|
|
u64 id;
|
|
u64 start_ns;
|
|
#ifdef GRAB_ARGS
|
|
u64 args[6];
|
|
#endif
|
|
};
|
|
|
|
struct data_t {
|
|
u64 id;
|
|
u64 tgid_pid;
|
|
u64 start_ns;
|
|
u64 duration_ns;
|
|
u64 retval;
|
|
char comm[TASK_COMM_LEN];
|
|
#ifdef GRAB_ARGS
|
|
u64 args[6];
|
|
#endif
|
|
#ifdef USER_STACKS
|
|
int user_stack_id;
|
|
#endif
|
|
#ifdef KERNEL_STACKS
|
|
int kernel_stack_id;
|
|
u64 kernel_ip;
|
|
#endif
|
|
};
|
|
|
|
BPF_HASH(entryinfo, u64, struct entry_t);
|
|
BPF_PERF_OUTPUT(events);
|
|
|
|
#if defined(USER_STACKS) || defined(KERNEL_STACKS)
|
|
BPF_STACK_TRACE(stacks, 2048);
|
|
#endif
|
|
|
|
static int trace_entry(struct pt_regs *ctx, int id)
|
|
{
|
|
u64 tgid_pid = bpf_get_current_pid_tgid();
|
|
u32 tgid = tgid_pid >> 32;
|
|
if (TGID_FILTER)
|
|
return 0;
|
|
|
|
u32 pid = tgid_pid;
|
|
|
|
struct entry_t entry = {};
|
|
entry.start_ns = bpf_ktime_get_ns();
|
|
entry.id = id;
|
|
#ifdef GRAB_ARGS
|
|
entry.args[0] = PT_REGS_PARM1(ctx);
|
|
entry.args[1] = PT_REGS_PARM2(ctx);
|
|
entry.args[2] = PT_REGS_PARM3(ctx);
|
|
entry.args[3] = PT_REGS_PARM4(ctx);
|
|
entry.args[4] = PT_REGS_PARM5(ctx);
|
|
entry.args[5] = PT_REGS_PARM6(ctx);
|
|
#endif
|
|
|
|
entryinfo.update(&tgid_pid, &entry);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int trace_return(struct pt_regs *ctx)
|
|
{
|
|
struct entry_t *entryp;
|
|
u64 tgid_pid = bpf_get_current_pid_tgid();
|
|
|
|
entryp = entryinfo.lookup(&tgid_pid);
|
|
if (entryp == 0) {
|
|
return 0;
|
|
}
|
|
|
|
u64 delta_ns = bpf_ktime_get_ns() - entryp->start_ns;
|
|
entryinfo.delete(&tgid_pid);
|
|
|
|
if (delta_ns < DURATION_NS)
|
|
return 0;
|
|
|
|
struct data_t data = {};
|
|
data.id = entryp->id;
|
|
data.tgid_pid = tgid_pid;
|
|
data.start_ns = entryp->start_ns;
|
|
data.duration_ns = delta_ns;
|
|
data.retval = PT_REGS_RC(ctx);
|
|
|
|
#ifdef USER_STACKS
|
|
data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
|
|
#endif
|
|
|
|
#ifdef KERNEL_STACKS
|
|
data.kernel_stack_id = stacks.get_stackid(ctx, 0);
|
|
|
|
if (data.kernel_stack_id >= 0) {
|
|
u64 ip = PT_REGS_IP(ctx);
|
|
u64 page_offset;
|
|
|
|
// if ip isn't sane, leave key ips as zero for later checking
|
|
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
|
|
// x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
|
|
page_offset = __PAGE_OFFSET_BASE;
|
|
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
|
|
// x64, 4.17, and later
|
|
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
|
|
page_offset = __PAGE_OFFSET_BASE_L5;
|
|
#else
|
|
page_offset = __PAGE_OFFSET_BASE_L4;
|
|
#endif
|
|
#else
|
|
// earlier x86_64 kernels, e.g., 4.6, comes here
|
|
// arm64, s390, powerpc, x86_32
|
|
page_offset = PAGE_OFFSET;
|
|
#endif
|
|
|
|
if (ip > page_offset) {
|
|
data.kernel_ip = ip;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifdef GRAB_ARGS
|
|
bpf_probe_read(&data.args[0], sizeof(data.args), entryp->args);
|
|
#endif
|
|
bpf_get_current_comm(&data.comm, sizeof(data.comm));
|
|
events.perf_submit(ctx, &data, sizeof(data));
|
|
|
|
return 0;
|
|
}
|
|
"""
|
|
|
|
bpf_text = bpf_text.replace('DURATION_NS', str(duration_ns))
|
|
if args.arguments:
|
|
bpf_text = "#define GRAB_ARGS\n" + bpf_text
|
|
if args.user_stack:
|
|
bpf_text = "#define USER_STACKS\n" + bpf_text
|
|
if args.kernel_stack:
|
|
bpf_text = "#define KERNEL_STACKS\n" + bpf_text
|
|
if args.tgid:
|
|
bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
|
|
else:
|
|
bpf_text = bpf_text.replace('TGID_FILTER', '0')
|
|
|
|
for i in range(len(args.functions)):
|
|
bpf_text += """
|
|
int trace_%d(struct pt_regs *ctx) {
|
|
return trace_entry(ctx, %d);
|
|
}
|
|
""" % (i, i)
|
|
|
|
if args.verbose or args.ebpf:
|
|
print(bpf_text)
|
|
if args.ebpf:
|
|
exit()
|
|
|
|
b = BPF(text=bpf_text)
|
|
|
|
for i, function in enumerate(args.functions):
|
|
if ":" in function:
|
|
library, func = function.split(":")
|
|
b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i)
|
|
b.attach_uretprobe(name=library, sym=func, fn_name="trace_return")
|
|
else:
|
|
b.attach_kprobe(event=function, fn_name="trace_%d" % i)
|
|
b.attach_kretprobe(event=function, fn_name="trace_return")
|
|
|
|
TASK_COMM_LEN = 16 # linux/sched.h
|
|
|
|
class Data(ct.Structure):
|
|
_fields_ = [
|
|
("id", ct.c_ulonglong),
|
|
("tgid_pid", ct.c_ulonglong),
|
|
("start_ns", ct.c_ulonglong),
|
|
("duration_ns", ct.c_ulonglong),
|
|
("retval", ct.c_ulonglong),
|
|
("comm", ct.c_char * TASK_COMM_LEN)
|
|
] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \
|
|
([("user_stack_id", ct.c_int)] if args.user_stack else []) + \
|
|
([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else [])
|
|
|
|
time_designator = "us" if args.min_us else "ms"
|
|
time_value = args.min_us or args.min_ms or 1
|
|
time_multiplier = 1000 if args.min_us else 1000000
|
|
time_col = args.time or args.timestamp
|
|
|
|
# Do not print header when folded
|
|
if not args.folded:
|
|
print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
|
|
(time_value, time_designator))
|
|
print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
|
|
("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
|
|
"FUNC" + (" ARGS" if args.arguments else "")))
|
|
|
|
earliest_ts = 0
|
|
|
|
def time_str(event):
|
|
if args.time:
|
|
return "%-10s " % time.strftime("%H:%M:%S")
|
|
if args.timestamp:
|
|
global earliest_ts
|
|
if earliest_ts == 0:
|
|
earliest_ts = event.start_ns
|
|
return "%-10.6f " % ((event.start_ns - earliest_ts) / 1000000000.0)
|
|
return ""
|
|
|
|
def args_str(event):
|
|
if not args.arguments:
|
|
return ""
|
|
return str.join(" ", ["0x%x" % arg for arg in event.args[:args.arguments]])
|
|
|
|
def print_stack(event):
|
|
user_stack = []
|
|
stack_traces = b.get_table("stacks")
|
|
|
|
if args.user_stack and event.user_stack_id > 0:
|
|
user_stack = stack_traces.walk(event.user_stack_id)
|
|
|
|
kernel_stack = []
|
|
if args.kernel_stack and event.kernel_stack_id > 0:
|
|
kernel_tmp = stack_traces.walk(event.kernel_stack_id)
|
|
|
|
# fix kernel stack
|
|
for addr in kernel_tmp:
|
|
kernel_stack.append(addr)
|
|
|
|
do_delimiter = user_stack and kernel_stack
|
|
|
|
if args.folded:
|
|
# print folded stack output
|
|
user_stack = list(user_stack)
|
|
kernel_stack = list(kernel_stack)
|
|
line = [event.comm.decode('utf-8', 'replace')] + \
|
|
[b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
|
|
(do_delimiter and ["-"] or []) + \
|
|
[b.ksym(addr) for addr in reversed(kernel_stack)]
|
|
print("%s %d" % (";".join(line), 1))
|
|
else:
|
|
# print default multi-line stack output.
|
|
for addr in kernel_stack:
|
|
print(" %s" % b.ksym(addr))
|
|
for addr in user_stack:
|
|
print(" %s" % b.sym(addr, event.tgid_pid))
|
|
|
|
def print_event(cpu, data, size):
|
|
event = ct.cast(data, ct.POINTER(Data)).contents
|
|
ts = float(event.duration_ns) / time_multiplier
|
|
if not args.folded:
|
|
print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
|
|
(event.comm.decode('utf-8', 'replace'), event.tgid_pid >> 32,
|
|
ts, event.retval, args.functions[event.id], args_str(event)))
|
|
if args.user_stack or args.kernel_stack:
|
|
print_stack(event)
|
|
|
|
b["events"].open_perf_buffer(print_event, page_cnt=64)
|
|
while True:
|
|
try:
|
|
b.perf_buffer_poll()
|
|
except KeyboardInterrupt:
|
|
exit()
|