#!/usr/bin/env python
#
# deadlock_detector  Detects potential deadlocks (lock order inversions)
#                    on a running process. For Linux, uses BCC, eBPF.
#
# USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH]
#                             [--verbose] [--lock-symbols LOCK_SYMBOLS]
#                             [--unlock-symbols UNLOCK_SYMBOLS]
#                             pid
#
# This traces pthread mutex lock and unlock calls to build a directed graph
# representing the mutex wait graph:
#
# - Nodes in the graph represent mutexes.
# - Edge (A, B) exists if there exists some thread T where lock(A) was called
#   and lock(B) was called before unlock(A) was called.
#
# If the program finds a potential lock order inversion, the program will dump
# the cycle of mutexes and the stack traces where each mutex was acquired, and
# then exit.
#
# This program can only find potential deadlocks that occur while the program
# is tracing the process. It cannot find deadlocks that may have occurred
# before the program was attached to the process.
#
# Since this traces all mutex lock and unlock events and all thread creation
# events on the traced process, the overhead of this bpf program can be very
# high if the process has many threads and mutexes. You should only run this on
# a process where the slowdown is acceptable.
#
# Note: This tool does not work for shared mutexes or recursive mutexes.
#
# For shared (read-write) mutexes, a deadlock requires a cycle in the wait
# graph where at least one of the mutexes in the cycle is acquiring exclusive
# (write) ownership.
#
# For recursive mutexes, lock() is called multiple times on the same mutex.
# However, there is no way to determine if a mutex is a recursive mutex
# after the mutex has been created. As a result, this tool will not find
# potential deadlocks that involve only one mutex.
#
# Copyright 2017 Facebook, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 01-Feb-2017   Kenny Yu   Created this.
from __future__ import (
    absolute_import, division, unicode_literals, print_function
)
from collections import defaultdict
import argparse
import json
import os
import shlex
import subprocess
import sys
import time

from bcc import BPF


class DiGraph(object):
    '''
    Adapted from networkx: http://networkx.github.io/
    Represents a directed graph. Edges can store (key, value) attributes.
    '''

    def __init__(self):
        # node -> set of successor nodes
        self.adjacency_map = {}
        # (node1, node2) -> dict mapping attribute name -> value.
        # Note: subgraph() does not copy these attributes.
        self.attributes_map = {}

    def neighbors(self, node):
        '''Return the set of successors of node (empty set if unknown).'''
        return self.adjacency_map.get(node, set())

    def edges(self):
        '''Return every edge as a list of (source, target) pairs.'''
        return [
            (source, target)
            for source, targets in self.adjacency_map.items()
            for target in targets
        ]

    def nodes(self):
        '''Return a view of all nodes in the graph.'''
        return self.adjacency_map.keys()

    def attributes(self, node1, node2):
        '''Return the attribute dict stored on edge (node1, node2).'''
        return self.attributes_map[(node1, node2)]

    def add_edge(self, node1, node2, **kwargs):
        '''Add edge (node1, node2), storing kwargs as edge attributes.

        Both endpoints are registered as nodes; re-adding an edge
        overwrites its attributes.
        '''
        self.adjacency_map.setdefault(node1, set()).add(node2)
        self.adjacency_map.setdefault(node2, set())
        self.attributes_map[(node1, node2)] = kwargs

    def remove_node(self, node):
        '''Remove node and every edge touching it.

        Stale entries may remain in attributes_map; callers only look up
        attributes for edges that still exist.
        '''
        self.adjacency_map.pop(node, None)
        for successors in self.adjacency_map.values():
            successors.discard(node)

    def subgraph(self, nodes):
        '''Return a new DiGraph induced on `nodes` (attributes dropped).'''
        induced = DiGraph()
        for source in nodes:
            for target in self.neighbors(source):
                if target in nodes:
                    induced.add_edge(source, target)
        return induced

    def node_link_data(self):
        '''
        Returns the graph as a dictionary in a format that can be
        serialized.
        '''
        data = {
            'directed': True,
            'multigraph': False,
            'graph': {},
            'links': [],
            'nodes': [{'id': node} for node in self.adjacency_map],
        }

        # Map each node to its position in data['nodes'].
        index_of = {
            node: position
            for position, node in enumerate(self.adjacency_map)
        }

        # Emit one link record per edge, carrying the edge attributes.
        for source, targets in self.adjacency_map.items():
            for target in targets:
                link = self.attributes_map[(source, target)].copy()
                link['source'] = index_of[source]
                link['target'] = index_of[target]
                data['links'].append(link)
        return data


def strongly_connected_components(G):
    '''
    Adapted from networkx: http://networkx.github.io/
    Non-recursive version of Tarjan's algorithm.

    Parameters
    ----------
    G : DiGraph

    Returns
    -------
    comp : generator of sets
        A generator of sets of nodes, one for each strongly connected
        component of G.
    '''
    preorder = {}   # node -> DFS discovery index
    lowlink = {}    # node -> smallest preorder reachable from node's subtree
    scc_found = {}  # nodes already assigned to an emitted component
    scc_queue = []  # stack of nodes awaiting component assignment
    i = 0  # Preorder counter
    for source in G.nodes():
        if source not in scc_found:
            # Iterative DFS rooted at `source`; `queue` is the DFS stack.
            queue = [source]
            while queue:
                v = queue[-1]
                if v not in preorder:
                    i = i + 1
                    preorder[v] = i
                done = 1
                v_nbrs = G.neighbors(v)
                # Descend into the first not-yet-visited neighbor, if any.
                for w in v_nbrs:
                    if w not in preorder:
                        queue.append(w)
                        done = 0
                        break
                if done == 1:
                    # All neighbors visited: v is finished, so its lowlink
                    # can be computed from its neighbors' values.
                    lowlink[v] = preorder[v]
                    for w in v_nbrs:
                        if w not in scc_found:
                            if preorder[w] > preorder[v]:
                                lowlink[v] = min([lowlink[v], lowlink[w]])
                            else:
                                lowlink[v] = min([lowlink[v], preorder[w]])
                    queue.pop()
                    if lowlink[v] == preorder[v]:
                        # v is the root of a component: pop every node
                        # discovered after v into the same component.
                        scc_found[v] = True
                        scc = {v}
                        while (
                            scc_queue and preorder[scc_queue[-1]] > preorder[v]
                        ):
                            k = scc_queue.pop()
                            scc_found[k] = True
                            scc.add(k)
                        yield scc
                    else:
                        scc_queue.append(v)


def simple_cycles(G):
    '''
    Adapted from networkx: http://networkx.github.io/
    Non-recursive version of Johnson's elementary-circuits algorithm.

    Parameters
    ----------
    G : DiGraph

    Returns
    -------
    cycle_generator: generator
        A generator that produces elementary cycles of the graph.
        Each cycle is represented by a list of nodes along the cycle.
    '''

    def _unblock(thisnode, blocked, B):
        '''Unblock thisnode and, transitively, every node blocked on it.'''
        stack = set([thisnode])
        while stack:
            node = stack.pop()
            if node in blocked:
                blocked.remove(node)
                stack.update(B[node])
                B[node].clear()

    # Johnson's algorithm requires some ordering of the nodes.
    # We assign the arbitrary ordering given by the strongly connected comps
    # There is no need to track the ordering as each node removed as processed.
    # save the actual graph so we can mutate it here
    # We only take the edges because we do not want to
    # copy edge and node attributes here.
    subG = G.subgraph(G.nodes())
    sccs = list(strongly_connected_components(subG))
    while sccs:
        scc = sccs.pop()
        # order of scc determines ordering of nodes
        startnode = scc.pop()
        # Processing node runs 'circuit' routine from recursive version
        path = [startnode]
        blocked = set()  # vertex: blocked from search?
        closed = set()  # nodes involved in a cycle
        blocked.add(startnode)
        B = defaultdict(set)  # graph portions that yield no elementary circuit
        stack = [(startnode, list(subG.neighbors(startnode)))]
        while stack:
            thisnode, nbrs = stack[-1]
            if nbrs:
                nextnode = nbrs.pop()
                if nextnode == startnode:
                    # Completed an elementary cycle back to the start node.
                    yield path[:]
                    closed.update(path)
                elif nextnode not in blocked:
                    # Extend the current search path into nextnode.
                    path.append(nextnode)
                    stack.append((nextnode, list(subG.neighbors(nextnode))))
                    closed.discard(nextnode)
                    blocked.add(nextnode)
                    continue
            # done with nextnode... look for more neighbors
            if not nbrs:  # no more nbrs
                if thisnode in closed:
                    _unblock(thisnode, blocked, B)
                else:
                    for nbr in subG.neighbors(thisnode):
                        if thisnode not in B[nbr]:
                            B[nbr].add(thisnode)
                stack.pop()
                path.pop()
        # done processing this node
        subG.remove_node(startnode)
        H = subG.subgraph(scc)  # make smaller to avoid work in SCC routine
        sccs.extend(list(strongly_connected_components(H)))


def find_cycle(graph):
    '''
    Looks for a cycle in the graph. If found, returns the first cycle.
    If nodes a1, a2, ..., an are in a cycle, then this returns:
        [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
    Otherwise returns an empty list.
    '''
    all_cycles = list(simple_cycles(graph))
    if not all_cycles:
        return []
    cycle = all_cycles[0]
    # Pair each node with its successor, wrapping the last node
    # back around to the first to close the loop.
    return list(zip(cycle, cycle[1:] + cycle[:1]))


def print_cycle(binary, graph, edges, thread_info, print_stack_trace_fn):
    '''
    Prints the cycle in the mutex graph in the following format:

    Potential Deadlock Detected!

    Cycle in lock order graph: M0 => M1 => M2 => M0

    for (m, n) in cycle:
        Mutex n acquired here while holding Mutex m in thread T:
            [ stack trace ]

        Mutex m previously acquired by thread T here:
            [ stack trace ]

    for T in all threads:
        Thread T was created here:
            [ stack trace ]
    '''

    # Cycle order, with the first mutex repeated at the end.
    nodes_in_order = [m for (m, _) in edges]
    # Map mutex address -> readable alias
    node_addr_to_name = {}
    for counter, m in enumerate(nodes_in_order):
        # For global or static variables, try to symbolize the mutex address.
        symbol = symbolize_with_objdump(binary, m)
        if symbol:
            symbol += ' '
        node_addr_to_name[m] = 'Mutex M%d (%s0x%016x)' % (counter, symbol, m)
    nodes_in_order.append(nodes_in_order[0])

    print('----------------\nPotential Deadlock Detected!\n')
    print(
        'Cycle in lock order graph: %s\n' %
        (' => '.join([node_addr_to_name[n] for n in nodes_in_order]))
    )

    # Set of threads involved in the lock inversion
    thread_pids = set()

    # For each edge in the cycle, print where the two mutexes were held
    for (m, n) in edges:
        attrs = graph.attributes(m, n)
        thread_pid = attrs['thread_pid']
        thread_comm = attrs['thread_comm']
        thread_pids.add(thread_pid)
        print(
            '%s acquired here while holding %s in Thread %d (%s):' % (
                node_addr_to_name[n], node_addr_to_name[m], thread_pid,
                thread_comm
            )
        )
        print_stack_trace_fn(attrs['second_mutex_stack_id'])
        print('')
        print(
            '%s previously acquired by the same Thread %d (%s) here:' %
            (node_addr_to_name[m], thread_pid, thread_comm)
        )
        print_stack_trace_fn(attrs['first_mutex_stack_id'])
        print('')

    # Print where the threads were created, if available
    for thread_pid in thread_pids:
        parent_pid, stack_id, parent_comm = thread_info.get(
            thread_pid, (None, None, None)
        )
        if parent_pid:
            print(
                'Thread %d created by Thread %d (%s) here: ' %
                (thread_pid, parent_pid, parent_comm)
            )
            print_stack_trace_fn(stack_id)
        else:
            print(
                'Could not find stack trace where Thread %d was created' %
                thread_pid
            )
        print('')


def symbolize_with_objdump(binary, addr):
    '''
    Searches the binary for the address using objdump. Returns the symbol if
    it is found, otherwise returns empty string.

    binary: path to the object file whose symbol table is searched.
    addr: integer address, matched as lowercase hex against objdump output.
    '''
    try:
        # shlex.quote prevents paths containing spaces or shell
        # metacharacters from breaking (or injecting into) the shell
        # pipeline. Note the pipeline's exit status is c++filt's, so a
        # failed objdump simply yields empty output rather than an error.
        command = (
            'objdump -tT %s | grep %x | awk {\'print $NF\'} | c++filt' %
            (shlex.quote(binary), addr)
        )
        output = subprocess.check_output(command, shell=True)
        return output.decode('utf-8').strip()
    except subprocess.CalledProcessError:
        return ''


def strlist(s):
    '''Given a comma-separated string, returns a list of substrings'''
    # Trim surrounding whitespace first, then split on every comma.
    trimmed = s.strip()
    return trimmed.split(',')


def main():
    '''
    Command-line entry point.

    Parses arguments, attaches the eBPF probes to the target process's
    lock/unlock symbols and the clone syscall, then polls the mutex wait
    graph once per second, reporting (and exiting) if a cycle is found.
    Must be run as root.
    '''
    examples = '''Examples:
    deadlock_detector 181                 # Analyze PID 181

    deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
                          # Analyze PID 181 and locks from this binary.
                          # If tracing a process that is running from
                          # a dynamically-linked binary, this argument
                          # is required and should be the path to the
                          # pthread library.

    deadlock_detector 181 --verbose
                          # Analyze PID 181 and print statistics about
                          # the mutex wait graph.

    deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\
        --unlock-symbols my_mutex_unlock1,my_mutex_unlock2
                          # Analyze PID 181 and trace custom mutex
                          # symbols instead of pthread mutexes.

    deadlock_detector 181 --dump-graph graph.json
                          # Analyze PID 181 and dump the mutex wait
                          # graph to graph.json.
    '''
    parser = argparse.ArgumentParser(
        description=(
            'Detect potential deadlocks (lock inversions) in a running binary.'
            '\nMust be run as root.'
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=examples,
    )
    parser.add_argument('pid', type=int, help='Pid to trace')
    # Binaries with `:` in the path will fail to attach uprobes on kernels
    # running without this patch: https://lkml.org/lkml/2017/1/13/585.
    # Symlinks to the binary without `:` in the path can get around this issue.
    parser.add_argument(
        '--binary',
        type=str,
        default='',
        help='If set, trace the mutexes from the binary at this path. '
        'For statically-linked binaries, this argument is not required. '
        'For dynamically-linked binaries, this argument is required and '
        'should be the path of the pthread library the binary is using. '
        'Example: /lib/x86_64-linux-gnu/libpthread.so.0',
    )
    parser.add_argument(
        '--dump-graph',
        type=str,
        default='',
        help='If set, this will dump the mutex graph to the specified file.',
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Print statistics about the mutex wait graph.',
    )
    parser.add_argument(
        '--lock-symbols',
        type=strlist,
        default=['pthread_mutex_lock'],
        help='Comma-separated list of lock symbols to trace. Default is '
        'pthread_mutex_lock. These symbols cannot be inlined in the binary.',
    )
    parser.add_argument(
        '--unlock-symbols',
        type=strlist,
        default=['pthread_mutex_unlock'],
        help='Comma-separated list of unlock symbols to trace. Default is '
        'pthread_mutex_unlock. These symbols cannot be inlined in the binary.',
    )
    args = parser.parse_args()
    if not args.binary:
        # Default to the executable behind the pid; sufficient for
        # statically-linked binaries.
        try:
            args.binary = os.readlink('/proc/%d/exe' % args.pid)
        except OSError as e:
            print('%s. Is the process (pid=%d) running?' % (str(e), args.pid))
            sys.exit(1)

    # The BPF program lives in a sibling C file.
    bpf = BPF(src_file=b'deadlock_detector.c')

    # Trace where threads are created
    bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone')

    # We must trace unlock first, otherwise in the time we attached the probe
    # on lock() and have not yet attached the probe on unlock(), a thread can
    # acquire mutexes and release them, but the release events will not be
    # traced, resulting in noisy reports.
    for symbol in args.unlock_symbols:
        try:
            bpf.attach_uprobe(
                name=args.binary,
                sym=symbol,
                fn_name='trace_mutex_release',
                pid=args.pid,
            )
        except Exception as e:
            print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
            sys.exit(1)
    for symbol in args.lock_symbols:
        try:
            bpf.attach_uprobe(
                name=args.binary,
                sym=symbol,
                fn_name='trace_mutex_acquire',
                pid=args.pid,
            )
        except Exception as e:
            print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
            sys.exit(1)

    def print_stack_trace(stack_id):
        '''Closure that prints the symbolized stack trace.'''
        for addr in bpf.get_table('stack_traces').walk(stack_id):
            line = bpf.sym(addr, args.pid)
            # Try to symbolize with objdump if we cannot with bpf.
            if line == '[unknown]':
                symbol = symbolize_with_objdump(args.binary, addr)
                if symbol:
                    line = symbol
            print('@ %016x %s' % (addr, line))

    print('Tracing... Hit Ctrl-C to end.')
    while True:
        try:
            # Map of child thread pid -> parent info
            thread_info = {
                child.value: (parent.parent_pid, parent.stack_id, parent.comm)
                for child, parent in bpf.get_table('thread_to_parent').items()
            }

            # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists
            # if there exists some thread T where lock(A) was called and
            # lock(B) was called before unlock(A) was called.
            graph = DiGraph()
            for key, leaf in bpf.get_table('edges').items():
                graph.add_edge(
                    key.mutex1,
                    key.mutex2,
                    thread_pid=leaf.thread_pid,
                    thread_comm=leaf.comm.decode('utf-8'),
                    first_mutex_stack_id=leaf.mutex1_stack_id,
                    second_mutex_stack_id=leaf.mutex2_stack_id,
                )
            if args.verbose:
                print(
                    'Mutexes: %d, Edges: %d' %
                    (len(graph.nodes()), len(graph.edges()))
                )
            if args.dump_graph:
                # Snapshot the wait graph to a JSON file each iteration.
                with open(args.dump_graph, 'w') as f:
                    data = graph.node_link_data()
                    f.write(json.dumps(data, indent=2))

            cycle = find_cycle(graph)
            if cycle:
                # A lock-order inversion was found: report it and stop.
                print_cycle(
                    args.binary, graph, cycle, thread_info, print_stack_trace
                )
                sys.exit(1)

            time.sleep(1)
        except KeyboardInterrupt:
            break


# Run the detector only when executed as a script, not when imported.
if __name__ == '__main__':
    main()