BPF List
 help / color / mirror / Atom feed
* [PATCH 0/4] perf lock: Tracing contention lock owner call stack
@ 2025-01-10  5:11 Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 1/4] perf lock: Add bpf maps for owner stack tracing Chun-Tse Shao
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Chun-Tse Shao @ 2025-01-10  5:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Chun-Tse Shao, peterz, mingo, acme, namhyung, mark.rutland,
	alexander.shishkin, jolsa, irogers, adrian.hunter, kan.liang,
	linux-perf-users, bpf

For perf lock contention, the current owner tracking (-o option) only
works with per-thread mode (-t option). Enabling call stack mode for
the owner can be useful for diagnosing why a system is running slowly
under lock contention.

Example output:
  $ sudo ~/linux/tools/perf/perf lock con -abvo -Y mutex-spin -E16 perf bench sched pipe
   ...
   contended   total wait     max wait     avg wait         type   caller

         171      1.55 ms     20.26 us      9.06 us        mutex   pipe_read+0x57
                          0xffffffffac6318e7  pipe_read+0x57
                          0xffffffffac623862  vfs_read+0x332
                          0xffffffffac62434b  ksys_read+0xbb
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
          36    193.71 us     15.27 us      5.38 us        mutex   pipe_write+0x50
                          0xffffffffac631ee0  pipe_write+0x50
                          0xffffffffac6241db  vfs_write+0x3bb
                          0xffffffffac6244ab  ksys_write+0xbb
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
           4     51.22 us     16.47 us     12.80 us        mutex   do_epoll_wait+0x24d
                          0xffffffffac691f0d  do_epoll_wait+0x24d
                          0xffffffffac69249b  do_epoll_pwait.part.0+0xb
                          0xffffffffac693ba5  __x64_sys_epoll_pwait+0x95
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
           2     20.88 us     11.95 us     10.44 us        mutex   do_epoll_wait+0x24d
                          0xffffffffac691f0d  do_epoll_wait+0x24d
                          0xffffffffac693943  __x64_sys_epoll_wait+0x73
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
           1      7.33 us      7.33 us      7.33 us        mutex   do_epoll_ctl+0x6c1
                          0xffffffffac692e01  do_epoll_ctl+0x6c1
                          0xffffffffac6937e0  __x64_sys_epoll_ctl+0x70
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
           1      6.64 us      6.64 us      6.64 us        mutex   do_epoll_ctl+0x3d4
                          0xffffffffac692b14  do_epoll_ctl+0x3d4
                          0xffffffffac6937e0  __x64_sys_epoll_ctl+0x70
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76

  === owner stack trace ===

           3     31.24 us     15.27 us     10.41 us        mutex   pipe_read+0x348
                          0xffffffffac631bd8  pipe_read+0x348
                          0xffffffffac623862  vfs_read+0x332
                          0xffffffffac62434b  ksys_read+0xbb
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
  ...

Chun-Tse Shao (4):
  perf lock: Add bpf maps for owner stack tracing
  perf lock: Retrieve owner callstack in bpf program
  perf lock: Make rb_tree helper functions generic
  perf lock: Report owner stack in usermode

 tools/perf/builtin-lock.c                     |  55 +++--
 tools/perf/util/bpf_lock_contention.c         |  58 +++++-
 .../perf/util/bpf_skel/lock_contention.bpf.c  | 189 +++++++++++++++++-
 tools/perf/util/bpf_skel/lock_data.h          |   6 +
 tools/perf/util/lock-contention.h             |   2 +
 5 files changed, 290 insertions(+), 20 deletions(-)

--
2.47.1.688.g23fc6f90ad-goog


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v1 1/4] perf lock: Add bpf maps for owner stack tracing
  2025-01-10  5:11 [PATCH 0/4] perf lock: Tracing contention lock owner call stack Chun-Tse Shao
@ 2025-01-10  5:11 ` Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 2/4] perf lock: Retrieve owner callstack in bpf program Chun-Tse Shao
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Chun-Tse Shao @ 2025-01-10  5:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Chun-Tse Shao, peterz, mingo, acme, namhyung, mark.rutland,
	alexander.shishkin, jolsa, irogers, adrian.hunter, kan.liang,
	linux-perf-users, bpf

Add a few bpf maps in order to trace the owner stack.

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 tools/perf/util/bpf_lock_contention.c         | 17 ++++++--
 .../perf/util/bpf_skel/lock_contention.bpf.c  | 40 +++++++++++++++++--
 tools/perf/util/bpf_skel/lock_data.h          |  6 +++
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 41a1ad087895..c9c58f243ceb 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -41,9 +41,20 @@ int lock_contention_prepare(struct lock_contention *con)
 	else
 		bpf_map__set_max_entries(skel->maps.task_data, 1);
 
-	if (con->save_callstack)
-		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
-	else
+	if (con->save_callstack) {
+		bpf_map__set_max_entries(skel->maps.stacks,
+					 con->map_nr_entries);
+		if (con->owner) {
+			bpf_map__set_value_size(skel->maps.owner_stacks_entries,
+						con->max_stack * sizeof(u64));
+			bpf_map__set_value_size(
+				skel->maps.contention_owner_stacks,
+				con->max_stack * sizeof(u64));
+			bpf_map__set_key_size(skel->maps.owner_lock_stat,
+						con->max_stack * sizeof(u64));
+			skel->rodata->max_stack = con->max_stack;
+		}
+	} else
 		bpf_map__set_max_entries(skel->maps.stacks, 1);
 
 	if (target__has_cpu(target)) {
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 1069bda5d733..05da19fdab23 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -19,13 +19,37 @@
 #define LCB_F_PERCPU	(1U << 4)
 #define LCB_F_MUTEX	(1U << 5)
 
-/* callstack storage  */
+ /* tmp buffer for owner callstack */
 struct {
-	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(key_size, sizeof(__u32));
 	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, 1);
+} owner_stacks_entries SEC(".maps");
+
+/* a map for tracing lock address to owner data */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64)); // lock address
+	__uint(value_size, sizeof(cotd));
 	__uint(max_entries, MAX_ENTRIES);
-} stacks SEC(".maps");
+} contention_owner_tracing SEC(".maps");
+
+/* a map for tracing lock address to owner stacktrace */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64)); // lock address
+	__uint(value_size, sizeof(__u64)); // straktrace
+	__uint(max_entries, MAX_ENTRIES);
+} contention_owner_stacks SEC(".maps");
+
+/* owner callstack to contention data storage */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(struct contention_data));
+	__uint(max_entries, MAX_ENTRIES);
+} owner_lock_stat SEC(".maps");
 
 /* maintain timestamp at the beginning of contention */
 struct {
@@ -43,6 +67,14 @@ struct {
 	__uint(max_entries, 1);
 } tstamp_cpu SEC(".maps");
 
+/* callstack storage  */
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, MAX_ENTRIES);
+} stacks SEC(".maps");
+
 /* actual lock contention statistics */
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -126,6 +158,7 @@ const volatile int needs_callstack;
 const volatile int stack_skip;
 const volatile int lock_owner;
 const volatile int use_cgroup_v2;
+const volatile int max_stack;
 
 /* determine the key of lock stat */
 const volatile int aggr_mode;
@@ -436,7 +469,6 @@ int contention_end(u64 *ctx)
 			return 0;
 		need_delete = true;
 	}
-
 	duration = bpf_ktime_get_ns() - pelem->timestamp;
 	if ((__s64)duration < 0) {
 		__sync_fetch_and_add(&time_fail, 1);
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index de12892f992f..1ef0bca9860e 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -3,6 +3,12 @@
 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
+typedef struct contention_owner_tracing_data {
+	u32 pid; // Who has the lock.
+	u64 timestamp; // The time while the owner acquires lock and contention is going on.
+	u32 count; // How many waiters for this lock.
+} cotd;
+
 struct tstamp_data {
 	u64 timestamp;
 	u64 lock;
-- 
2.47.1.688.g23fc6f90ad-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v1 2/4] perf lock: Retrieve owner callstack in bpf program
  2025-01-10  5:11 [PATCH 0/4] perf lock: Tracing contention lock owner call stack Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 1/4] perf lock: Add bpf maps for owner stack tracing Chun-Tse Shao
@ 2025-01-10  5:11 ` Chun-Tse Shao
  2025-01-10 20:07   ` Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 3/4] perf lock: Make rb_tree helper functions generic Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 4/4] perf lock: Report owner stack in usermode Chun-Tse Shao
  3 siblings, 1 reply; 6+ messages in thread
From: Chun-Tse Shao @ 2025-01-10  5:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Chun-Tse Shao, peterz, mingo, acme, namhyung, mark.rutland,
	alexander.shishkin, jolsa, irogers, adrian.hunter, kan.liang,
	linux-perf-users, bpf

Trace the owner callstack in `contention_begin()` and `contention_end()`,
and store it in the `owner_lock_stat` bpf map.

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 .../perf/util/bpf_skel/lock_contention.bpf.c  | 149 +++++++++++++++++-
 1 file changed, 148 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 05da19fdab23..79b641d7beb2 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -7,6 +7,7 @@
 #include <asm-generic/errno-base.h>
 
 #include "lock_data.h"
+#include <time.h>
 
 /* for collect_lock_syms().  4096 was rejected by the verifier */
 #define MAX_CPUS  1024
@@ -178,6 +179,9 @@ int data_fail;
 int task_map_full;
 int data_map_full;
 
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
 static inline __u64 get_current_cgroup_id(void)
 {
 	struct task_struct *task;
@@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
 	pelem->flags = (__u32)ctx[1];
 
 	if (needs_callstack) {
+		u32 i = 0;
+		int owner_pid;
+		unsigned long *entries;
+		struct task_struct *task;
+		cotd *data;
+
+		if (!lock_owner)
+			goto contention_begin_skip_owner_callstack;
+
+		task = get_lock_owner(pelem->lock, pelem->flags);
+		if (!task)
+			goto contention_begin_skip_owner_callstack;
+
+		owner_pid = BPF_CORE_READ(task, pid);
+
+		entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
+		if (!entries)
+			goto contention_begin_skip_owner_callstack;
+		for (i = 0; i < max_stack; i++)
+			entries[i] = 0x0;
+
+		task = bpf_task_from_pid(owner_pid);
+		if (task) {
+			bpf_get_task_stack(task, entries,
+					   max_stack * sizeof(unsigned long),
+					   0);
+			bpf_task_release(task);
+		}
+
+		data = bpf_map_lookup_elem(&contention_owner_tracing,
+					   &(pelem->lock));
+
+		// Contention just happens, or corner case `lock` is owned by
+		// process not `owner_pid`.
+		if (!data || data->pid != owner_pid) {
+			cotd first = {
+				.pid = owner_pid,
+				.timestamp = pelem->timestamp,
+				.count = 1,
+			};
+			bpf_map_update_elem(&contention_owner_tracing,
+					    &(pelem->lock), &first, BPF_ANY);
+			bpf_map_update_elem(&contention_owner_stacks,
+					    &(pelem->lock), entries, BPF_ANY);
+		}
+		// Contention is going on and new waiter joins.
+		else {
+			__sync_fetch_and_add(&data->count, 1);
+			// TODO: Since for owner the callstack would change at
+			// different time, We should check and report if the
+			// callstack is different with the recorded one in
+			// `contention_owner_stacks`.
+		}
+contention_begin_skip_owner_callstack:
 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
 						  BPF_F_FAST_STACK_CMP | stack_skip);
 		if (pelem->stack_id < 0)
@@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
 	struct tstamp_data *pelem;
 	struct contention_key key = {};
 	struct contention_data *data;
+	__u64 timestamp;
 	__u64 duration;
 	bool need_delete = false;
 
@@ -469,12 +528,100 @@ int contention_end(u64 *ctx)
 			return 0;
 		need_delete = true;
 	}
-	duration = bpf_ktime_get_ns() - pelem->timestamp;
+	timestamp = bpf_ktime_get_ns();
+	duration = timestamp - pelem->timestamp;
 	if ((__s64)duration < 0) {
 		__sync_fetch_and_add(&time_fail, 1);
 		goto out;
 	}
 
+	if (needs_callstack && lock_owner) {
+		u64 owner_contention_time;
+		unsigned long *owner_stack;
+		struct contention_data *cdata;
+		cotd *otdata;
+
+		otdata = bpf_map_lookup_elem(&contention_owner_tracing,
+					     &(pelem->lock));
+		owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
+						  &(pelem->lock));
+		if (!otdata || !owner_stack)
+			goto contention_end_skip_owner_callstack;
+
+		owner_contention_time = timestamp - otdata->timestamp;
+
+		// Update `owner_lock_stat` if `owner_stack` is
+		// available.
+		if (owner_stack[0] != 0x0) {
+			cdata = bpf_map_lookup_elem(&owner_lock_stat,
+						    owner_stack);
+			if (!cdata) {
+				struct contention_data first = {
+					.total_time = owner_contention_time,
+					.max_time = owner_contention_time,
+					.min_time = owner_contention_time,
+					.count = 1,
+					.flags = pelem->flags,
+				};
+				bpf_map_update_elem(&owner_lock_stat,
+						    owner_stack, &first,
+						    BPF_ANY);
+			} else {
+				__sync_fetch_and_add(&cdata->total_time,
+						     owner_contention_time);
+				__sync_fetch_and_add(&cdata->count, 1);
+
+				/* FIXME: need atomic operations */
+				if (cdata->max_time < owner_contention_time)
+					cdata->max_time = owner_contention_time;
+				if (cdata->min_time > owner_contention_time)
+					cdata->min_time = owner_contention_time;
+			}
+		}
+
+		// `pid` in `otdata` is not lock owner anymore, delete
+		// this entry.
+		bpf_map_delete_elem(&contention_owner_stacks, &(otdata->pid));
+
+		//  No contention is going on, delete `lock` in
+		//  `contention_owner_tracing` and
+		//  `contention_owner_stacks`
+		if (otdata->count <= 1) {
+			bpf_map_delete_elem(&contention_owner_tracing,
+					    &(pelem->lock));
+			bpf_map_delete_elem(&contention_owner_stacks,
+					    &(pelem->lock));
+		}
+		// Contention is still going on, with a new owner
+		// (current task). `otdata` should be updated accordingly.
+		// If ctx[1] is not 0, the current task terminate lock waiting
+		// without acquiring it.
+		else if (ctx[1] == 0) {
+			unsigned long *entries;
+			u32 i = 0;
+
+			otdata->pid = pid;
+			otdata->timestamp = timestamp;
+			(otdata->count)--;
+			bpf_map_update_elem(&contention_owner_tracing,
+					    &(pelem->lock), otdata, BPF_ANY);
+
+			entries =
+				bpf_map_lookup_elem(&owner_stacks_entries, &i);
+			if (!entries)
+				goto contention_end_skip_owner_callstack;
+			for (i = 0; i < (u32)max_stack; i++)
+				entries[i] = 0x0;
+
+			bpf_get_task_stack(bpf_get_current_task_btf(), entries,
+					   max_stack * sizeof(unsigned long),
+					   0);
+			bpf_map_update_elem(&contention_owner_stacks,
+					    &(pelem->lock), entries, BPF_ANY);
+		}
+	}
+contention_end_skip_owner_callstack:
+
 	switch (aggr_mode) {
 	case LOCK_AGGR_CALLER:
 		key.stack_id = pelem->stack_id;
-- 
2.47.1.688.g23fc6f90ad-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v1 3/4] perf lock: Make rb_tree helper functions generic
  2025-01-10  5:11 [PATCH 0/4] perf lock: Tracing contention lock owner call stack Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 1/4] perf lock: Add bpf maps for owner stack tracing Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 2/4] perf lock: Retrieve owner callstack in bpf program Chun-Tse Shao
@ 2025-01-10  5:11 ` Chun-Tse Shao
  2025-01-10  5:11 ` [PATCH v1 4/4] perf lock: Report owner stack in usermode Chun-Tse Shao
  3 siblings, 0 replies; 6+ messages in thread
From: Chun-Tse Shao @ 2025-01-10  5:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Chun-Tse Shao, peterz, mingo, acme, namhyung, mark.rutland,
	alexander.shishkin, jolsa, irogers, adrian.hunter, kan.liang,
	linux-perf-users, bpf

The rb_tree helper functions can be reused for parsing `owner_lock_stat`
into an rb tree for sorting.

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 tools/perf/builtin-lock.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 062e2b56a2ab..f9b7620444c0 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -438,16 +438,13 @@ static void combine_lock_stats(struct lock_stat *st)
 	rb_insert_color(&st->rb, &sorted);
 }
 
-static void insert_to_result(struct lock_stat *st,
-			     int (*bigger)(struct lock_stat *, struct lock_stat *))
+static void insert_to(struct rb_root *rr, struct lock_stat *st,
+		      int (*bigger)(struct lock_stat *, struct lock_stat *))
 {
-	struct rb_node **rb = &result.rb_node;
+	struct rb_node **rb = &rr->rb_node;
 	struct rb_node *parent = NULL;
 	struct lock_stat *p;
 
-	if (combine_locks && st->combined)
-		return;
-
 	while (*rb) {
 		p = container_of(*rb, struct lock_stat, rb);
 		parent = *rb;
@@ -459,13 +456,21 @@ static void insert_to_result(struct lock_stat *st,
 	}
 
 	rb_link_node(&st->rb, parent, rb);
-	rb_insert_color(&st->rb, &result);
+	rb_insert_color(&st->rb, rr);
 }
 
-/* returns left most element of result, and erase it */
-static struct lock_stat *pop_from_result(void)
+static inline void insert_to_result(struct lock_stat *st,
+				    int (*bigger)(struct lock_stat *,
+						  struct lock_stat *))
+{
+	if (combine_locks && st->combined)
+		return;
+	insert_to(&result, st, bigger);
+}
+
+static inline struct lock_stat *pop_from(struct rb_root *rr)
 {
-	struct rb_node *node = result.rb_node;
+	struct rb_node *node = rr->rb_node;
 
 	if (!node)
 		return NULL;
@@ -473,8 +478,15 @@ static struct lock_stat *pop_from_result(void)
 	while (node->rb_left)
 		node = node->rb_left;
 
-	rb_erase(node, &result);
+	rb_erase(node, rr);
 	return container_of(node, struct lock_stat, rb);
+
+}
+
+/* returns left most element of result, and erase it */
+static struct lock_stat *pop_from_result(void)
+{
+	return pop_from(&result);
 }
 
 struct lock_stat *lock_stat_find(u64 addr)
-- 
2.47.1.688.g23fc6f90ad-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v1 4/4] perf lock: Report owner stack in usermode
  2025-01-10  5:11 [PATCH 0/4] perf lock: Tracing contention lock owner call stack Chun-Tse Shao
                   ` (2 preceding siblings ...)
  2025-01-10  5:11 ` [PATCH v1 3/4] perf lock: Make rb_tree helper functions generic Chun-Tse Shao
@ 2025-01-10  5:11 ` Chun-Tse Shao
  3 siblings, 0 replies; 6+ messages in thread
From: Chun-Tse Shao @ 2025-01-10  5:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Chun-Tse Shao, peterz, mingo, acme, namhyung, mark.rutland,
	alexander.shishkin, jolsa, irogers, adrian.hunter, kan.liang,
	linux-perf-users, bpf

Parse `owner_lock_stat` into an rb tree, and report owner lock stats with
the stack trace in order.

Example output:
  $ sudo ~/linux/tools/perf/perf lock con -abvo -Y mutex-spin -E3 perf bench sched pipe
  ...
   contended   total wait     max wait     avg wait         type   caller

         171      1.55 ms     20.26 us      9.06 us        mutex   pipe_read+0x57
                          0xffffffffac6318e7  pipe_read+0x57
                          0xffffffffac623862  vfs_read+0x332
                          0xffffffffac62434b  ksys_read+0xbb
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
          36    193.71 us     15.27 us      5.38 us        mutex   pipe_write+0x50
                          0xffffffffac631ee0  pipe_write+0x50
                          0xffffffffac6241db  vfs_write+0x3bb
                          0xffffffffac6244ab  ksys_write+0xbb
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
           4     51.22 us     16.47 us     12.80 us        mutex   do_epoll_wait+0x24d
                          0xffffffffac691f0d  do_epoll_wait+0x24d
                          0xffffffffac69249b  do_epoll_pwait.part.0+0xb
                          0xffffffffac693ba5  __x64_sys_epoll_pwait+0x95
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76

  === owner stack trace ===

           3     31.24 us     15.27 us     10.41 us        mutex   pipe_read+0x348
                          0xffffffffac631bd8  pipe_read+0x348
                          0xffffffffac623862  vfs_read+0x332
                          0xffffffffac62434b  ksys_read+0xbb
                          0xfffffffface604b2  do_syscall_64+0x82
                          0xffffffffad00012f  entry_SYSCALL_64_after_hwframe+0x76
  ...

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 tools/perf/builtin-lock.c             | 21 +++++++++++++-
 tools/perf/util/bpf_lock_contention.c | 41 +++++++++++++++++++++++++++
 tools/perf/util/lock-contention.h     |  2 ++
 3 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index f9b7620444c0..b20687325d49 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -42,6 +42,7 @@
 #include <linux/zalloc.h>
 #include <linux/err.h>
 #include <linux/stringify.h>
+#include <linux/rbtree.h>
 
 static struct perf_session *session;
 static struct target target;
@@ -1926,6 +1927,24 @@ static void print_contention_result(struct lock_contention *con)
 			break;
 	}
 
+	if (con->owner && con->save_callstack) {
+		struct rb_root root = RB_ROOT;
+
+
+		if (symbol_conf.field_sep)
+			fprintf(lock_output, "# owner stack trace:\n");
+		else
+			fprintf(lock_output, "\n=== owner stack trace ===\n\n");
+		while ((st = pop_owner_stack_trace(con))) {
+			insert_to(&root, st, compare);
+		}
+
+		while ((st = pop_from(&root))) {
+			print_lock_stat(con, st);
+			zfree(st);
+		}
+	}
+
 	if (print_nr_entries) {
 		/* update the total/bad stats */
 		while ((st = pop_from_result())) {
@@ -2071,7 +2090,7 @@ static int check_lock_contention_options(const struct option *options,
 		}
 	}
 
-	if (show_lock_owner)
+	if (show_lock_owner && !verbose)
 		show_thread_stats = true;
 
 	return 0;
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index c9c58f243ceb..a63d5ffac386 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -414,6 +414,47 @@ static const char *lock_contention_get_name(struct lock_contention *con,
 	return name_buf;
 }
 
+struct lock_stat *pop_owner_stack_trace(struct lock_contention *con)
+{
+	int fd;
+	u64 *stack_trace;
+	struct contention_data data = {};
+	size_t stack_size = con->max_stack * sizeof(*stack_trace);
+	struct lock_stat *st;
+	char name[KSYM_NAME_LEN];
+
+	fd = bpf_map__fd(skel->maps.owner_lock_stat);
+
+	stack_trace = zalloc(stack_size);
+	if (stack_trace == NULL)
+		return NULL;
+
+	if (bpf_map_get_next_key(fd, NULL, stack_trace))
+		return NULL;
+
+	bpf_map_lookup_elem(fd, stack_trace, &data);
+	st = zalloc(sizeof(struct lock_stat));
+
+	strcpy(name, stack_trace[0] ? lock_contention_get_name(con, NULL,
+							       stack_trace, 0) :
+				      "unknown");
+
+	st->name = strdup(name);
+	st->flags = data.flags;
+	st->nr_contended = data.count;
+	st->wait_time_total = data.total_time;
+	st->wait_time_max = data.max_time;
+	st->wait_time_min = data.min_time;
+	st->callstack = memdup(stack_trace, stack_size);
+
+	if (data.count)
+		st->avg_wait_time = data.total_time / data.count;
+
+	bpf_map_delete_elem(fd, stack_trace);
+	free(stack_trace);
+
+	return st;
+}
 int lock_contention_read(struct lock_contention *con)
 {
 	int fd, stack, err = 0;
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index 1a7248ff3889..83b400a36137 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -156,6 +156,8 @@ int lock_contention_stop(void);
 int lock_contention_read(struct lock_contention *con);
 int lock_contention_finish(struct lock_contention *con);
 
+struct lock_stat *pop_owner_stack_trace(struct lock_contention *con);
+
 #else  /* !HAVE_BPF_SKEL */
 
 static inline int lock_contention_prepare(struct lock_contention *con __maybe_unused)
-- 
2.47.1.688.g23fc6f90ad-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v1 2/4] perf lock: Retrieve owner callstack in bpf program
  2025-01-10  5:11 ` [PATCH v1 2/4] perf lock: Retrieve owner callstack in bpf program Chun-Tse Shao
@ 2025-01-10 20:07   ` Chun-Tse Shao
  0 siblings, 0 replies; 6+ messages in thread
From: Chun-Tse Shao @ 2025-01-10 20:07 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, acme, namhyung, mark.rutland, alexander.shishkin,
	jolsa, irogers, adrian.hunter, kan.liang, linux-perf-users, bpf

On Thu, Jan 9, 2025 at 9:14 PM Chun-Tse Shao <ctshao@google.com> wrote:
>
> Tracing owner callstack in `contention_begin()` and `contention_end()`,
> and storing in `owner_lock_stat` bpf map.
>
> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> ---
>  .../perf/util/bpf_skel/lock_contention.bpf.c  | 149 +++++++++++++++++-
>  1 file changed, 148 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 05da19fdab23..79b641d7beb2 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -7,6 +7,7 @@
>  #include <asm-generic/errno-base.h>
>
>  #include "lock_data.h"
> +#include <time.h>
>
>  /* for collect_lock_syms().  4096 was rejected by the verifier */
>  #define MAX_CPUS  1024
> @@ -178,6 +179,9 @@ int data_fail;
>  int task_map_full;
>  int data_map_full;
>
> +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> +void bpf_task_release(struct task_struct *p) __ksym;
> +
>  static inline __u64 get_current_cgroup_id(void)
>  {
>         struct task_struct *task;
> @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
>         pelem->flags = (__u32)ctx[1];
>
>         if (needs_callstack) {
> +               u32 i = 0;
> +               int owner_pid;
> +               unsigned long *entries;
> +               struct task_struct *task;
> +               cotd *data;
> +
> +               if (!lock_owner)
> +                       goto contention_begin_skip_owner_callstack;
> +
> +               task = get_lock_owner(pelem->lock, pelem->flags);
> +               if (!task)
> +                       goto contention_begin_skip_owner_callstack;
> +
> +               owner_pid = BPF_CORE_READ(task, pid);
> +
> +               entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> +               if (!entries)
> +                       goto contention_begin_skip_owner_callstack;
> +               for (i = 0; i < max_stack; i++)
> +                       entries[i] = 0x0;
> +
> +               task = bpf_task_from_pid(owner_pid);
> +               if (task) {
> +                       bpf_get_task_stack(task, entries,
> +                                          max_stack * sizeof(unsigned long),
> +                                          0);
> +                       bpf_task_release(task);
> +               }
> +
> +               data = bpf_map_lookup_elem(&contention_owner_tracing,
> +                                          &(pelem->lock));
> +
> +               // Contention just happens, or corner case `lock` is owned by
> +               // process not `owner_pid`.
> +               if (!data || data->pid != owner_pid) {
> +                       cotd first = {
> +                               .pid = owner_pid,
> +                               .timestamp = pelem->timestamp,
> +                               .count = 1,
> +                       };
> +                       bpf_map_update_elem(&contention_owner_tracing,
> +                                           &(pelem->lock), &first, BPF_ANY);
> +                       bpf_map_update_elem(&contention_owner_stacks,
> +                                           &(pelem->lock), entries, BPF_ANY);
> +               }
> +               // Contention is going on and new waiter joins.
> +               else {
> +                       __sync_fetch_and_add(&data->count, 1);
> +                       // TODO: Since for owner the callstack would change at
> +                       // different time, We should check and report if the
> +                       // callstack is different with the recorded one in
> +                       // `contention_owner_stacks`.
> +               }
> +contention_begin_skip_owner_callstack:
>                 pelem->stack_id = bpf_get_stackid(ctx, &stacks,
>                                                   BPF_F_FAST_STACK_CMP | stack_skip);
>                 if (pelem->stack_id < 0)
> @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
>         struct tstamp_data *pelem;
>         struct contention_key key = {};
>         struct contention_data *data;
> +       __u64 timestamp;
>         __u64 duration;
>         bool need_delete = false;
>
> @@ -469,12 +528,100 @@ int contention_end(u64 *ctx)
>                         return 0;
>                 need_delete = true;
>         }
> -       duration = bpf_ktime_get_ns() - pelem->timestamp;
> +       timestamp = bpf_ktime_get_ns();
> +       duration = timestamp - pelem->timestamp;
>         if ((__s64)duration < 0) {
>                 __sync_fetch_and_add(&time_fail, 1);
>                 goto out;
>         }
>
> +       if (needs_callstack && lock_owner) {
> +               u64 owner_contention_time;
> +               unsigned long *owner_stack;
> +               struct contention_data *cdata;
> +               cotd *otdata;
> +
> +               otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> +                                            &(pelem->lock));
> +               owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> +                                                 &(pelem->lock));
> +               if (!otdata || !owner_stack)
> +                       goto contention_end_skip_owner_callstack;
> +
> +               owner_contention_time = timestamp - otdata->timestamp;
> +
> +               // Update `owner_lock_stat` if `owner_stack` is
> +               // available.
> +               if (owner_stack[0] != 0x0) {
> +                       cdata = bpf_map_lookup_elem(&owner_lock_stat,
> +                                                   owner_stack);
> +                       if (!cdata) {
> +                               struct contention_data first = {
> +                                       .total_time = owner_contention_time,
> +                                       .max_time = owner_contention_time,
> +                                       .min_time = owner_contention_time,
> +                                       .count = 1,
> +                                       .flags = pelem->flags,
> +                               };
> +                               bpf_map_update_elem(&owner_lock_stat,
> +                                                   owner_stack, &first,
> +                                                   BPF_ANY);
> +                       } else {
> +                               __sync_fetch_and_add(&cdata->total_time,
> +                                                    owner_contention_time);
> +                               __sync_fetch_and_add(&cdata->count, 1);
> +
> +                               /* FIXME: need atomic operations */
> +                               if (cdata->max_time < owner_contention_time)
> +                                       cdata->max_time = owner_contention_time;
> +                               if (cdata->min_time > owner_contention_time)
> +                                       cdata->min_time = owner_contention_time;
> +                       }
> +               }
> +
> +               // `pid` in `otdata` is not lock owner anymore, delete
> +               // this entry.
> +               bpf_map_delete_elem(&contention_owner_stacks, &(otdata->pid));

I just realized the above three lines are unnecessary and should be
deleted, please ignore them and I will fix them in my next patch.

> +
> +               //  No contention is going on, delete `lock` in
> +               //  `contention_owner_tracing` and
> +               //  `contention_owner_stacks`
> +               if (otdata->count <= 1) {
> +                       bpf_map_delete_elem(&contention_owner_tracing,
> +                                           &(pelem->lock));
> +                       bpf_map_delete_elem(&contention_owner_stacks,
> +                                           &(pelem->lock));
> +               }
> +               // Contention is still going on, with a new owner
> +               // (current task). `otdata` should be updated accordingly.
> +               // If ctx[1] is not 0, the current task terminates lock waiting
> +               // without acquiring it.
> +               else if (ctx[1] == 0) {
> +                       unsigned long *entries;
> +                       u32 i = 0;
> +
> +                       otdata->pid = pid;
> +                       otdata->timestamp = timestamp;
> +                       (otdata->count)--;
> +                       bpf_map_update_elem(&contention_owner_tracing,
> +                                           &(pelem->lock), otdata, BPF_ANY);
> +
> +                       entries =
> +                               bpf_map_lookup_elem(&owner_stacks_entries, &i);
> +                       if (!entries)
> +                               goto contention_end_skip_owner_callstack;
> +                       for (i = 0; i < (u32)max_stack; i++)
> +                               entries[i] = 0x0;
> +
> +                       bpf_get_task_stack(bpf_get_current_task_btf(), entries,
> +                                          max_stack * sizeof(unsigned long),
> +                                          0);
> +                       bpf_map_update_elem(&contention_owner_stacks,
> +                                           &(pelem->lock), entries, BPF_ANY);
> +               }

The logic above is a bit flawed. It should be:
                // Contention is still going on, with a new owner
                // (current task). `otdata` should be updated accordingly.
                else {
                        (otdata->count)--;

                        // If ctx[1] is not 0, the current task terminates
                        // lock waiting without acquiring it. Owner is not
                        // changed.
                        if (ctx[1] == 0) {
                                u32 i = 0;

                                otdata->pid = pid;
                                otdata->timestamp = timestamp;

                                entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
                                if (!entries) {
                                        bpf_map_update_elem(&contention_owner_tracing,
                                                            &(pelem->lock), otdata, BPF_ANY);
                                        goto contention_end_skip_owner_callstack;
                                }
                                for (i = 0; i < (u32)max_stack; i++)
                                        entries[i] = 0x0;

                                bpf_get_task_stack(bpf_get_current_task_btf(), entries,
                                                   max_stack * sizeof(unsigned long),
                                                   0);
                                bpf_map_update_elem(&contention_owner_stacks,
                                                    &(pelem->lock), entries, BPF_ANY);
                        }

                        bpf_map_update_elem(&contention_owner_tracing,
                                            &(pelem->lock), otdata, BPF_ANY);
                }
I will update this in my next patch.

> +       }
> +contention_end_skip_owner_callstack:
> +
>         switch (aggr_mode) {
>         case LOCK_AGGR_CALLER:
>                 key.stack_id = pelem->stack_id;
> --
> 2.47.1.688.g23fc6f90ad-goog
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-01-10 20:07 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed
-- links below jump to the message on this page --
2025-01-10  5:11 [PATCH 0/4] perf lock: Tracing contention lock owner call stack Chun-Tse Shao
2025-01-10  5:11 ` [PATCH v1 1/4] perf lock: Add bpf maps for owner stack tracing Chun-Tse Shao
2025-01-10  5:11 ` [PATCH v1 2/4] perf lock: Retrieve owner callstack in bpf program Chun-Tse Shao
2025-01-10 20:07   ` Chun-Tse Shao
2025-01-10  5:11 ` [PATCH v1 3/4] perf lock: Make rb_tree helper functions generic Chun-Tse Shao
2025-01-10  5:11 ` [PATCH v1 4/4] perf lock: Report owner stack in usermode Chun-Tse Shao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox