[PATCH] perf lock contention: Symbolize zone->lock using BTF
From: Namhyung Kim @ 2025-04-01 6:30 UTC
To: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang
Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
linux-perf-users, Song Liu, bpf, Stephane Eranian, linux-mm
The struct zone is embedded in struct pglist_data, which is allocated
for each NUMA node early in the boot process. As it's neither a slab
object nor a global lock, zone->lock was not symbolized so far.

Since zone->lock is often contended, it'd be nice to symbolize it. On
NUMA systems, the node_data array holds a pointer to struct pglist_data
for each node. By following those pointers, the tool can calculate the
address of each zone and its lock using BTF. On UMA systems, it can
simply use contig_page_data and its zones.

The following example shows the zone lock contention in the last line
of the output.
$ sudo ./perf lock con -abl -E 5 -- ./perf bench sched messaging
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.038 [sec]
 contended   total wait     max wait     avg wait            address   symbol

      5167     18.17 ms     10.27 us      3.52 us   ffff953340052d00   &kmem_cache_node (spinlock)
        38     11.75 ms    465.49 us    309.13 us   ffff95334060c480   &sock_inode_cache (spinlock)
      3916     10.13 ms     10.43 us      2.59 us   ffff953342aecb40   &kmem_cache_node (spinlock)
      2963     10.02 ms     13.75 us      3.38 us   ffff9533d2344098   &kmalloc-rnd-08-2k (spinlock)
       216      5.05 ms     99.49 us     23.39 us   ffff9542bf7d65d0   zone_lock (spinlock)
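
For reference, the per-zone address arithmetic that the BPF program
below performs boils down to this standalone sketch. It is illustrative
only: the offsets and addresses here are made-up placeholders for the
values the tool actually reads from kallsyms, sysfs and BTF at runtime.

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          /* hypothetical inputs; real values come from kallsyms and BTF */
          uint64_t pgdat_addr  = 0xffff9542bf7d0000ULL; /* a pglist_data */
          uint64_t zone_off    = 0x0;   /* offsetof(pglist_data, node_zones) */
          uint64_t lock_off    = 0x50;  /* offsetof(struct zone, lock) */
          uint64_t sizeof_zone = 0x600; /* btf__resolve_size() of struct zone */
          int nr_zones         = 3;     /* pgdat->nr_zones */

          for (int i = 0; i < nr_zones; i++) {
                  /* zone i starts at node_zones[] plus i zone-sized strides */
                  uint64_t lock_addr = pgdat_addr + zone_off +
                                       (uint64_t)i * sizeof_zone + lock_off;
                  printf("zone %d lock at 0x%llx\n", i,
                         (unsigned long long)lock_addr);
          }
          return 0;
  }
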
Cc: linux-mm@kvack.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
tools/perf/util/bpf_lock_contention.c | 88 +++++++++++++++++--
.../perf/util/bpf_skel/lock_contention.bpf.c | 64 ++++++++++++++
tools/perf/util/bpf_skel/lock_data.h | 1 +
tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 9 ++
tools/perf/util/lock-contention.h | 1 +
5 files changed, 157 insertions(+), 6 deletions(-)
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 5af8f6d1bc952613..98395667220e58ee 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -12,6 +12,7 @@
#include "util/lock-contention.h"
#include <linux/zalloc.h>
#include <linux/string.h>
+#include <api/fs/fs.h>
#include <bpf/bpf.h>
#include <bpf/btf.h>
#include <inttypes.h>
@@ -35,28 +36,26 @@ static bool slab_cache_equal(long key1, long key2, void *ctx __maybe_unused)
 
 static void check_slab_cache_iter(struct lock_contention *con)
 {
-	struct btf *btf = btf__load_vmlinux_btf();
 	s32 ret;
 
 	hashmap__init(&slab_hash, slab_cache_hash, slab_cache_equal, /*ctx=*/NULL);
 
-	if (btf == NULL) {
+	con->btf = btf__load_vmlinux_btf();
+	if (con->btf == NULL) {
 		pr_debug("BTF loading failed: %s\n", strerror(errno));
 		return;
 	}
 
-	ret = btf__find_by_name_kind(btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
+	ret = btf__find_by_name_kind(con->btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
 	if (ret < 0) {
 		bpf_program__set_autoload(skel->progs.slab_cache_iter, false);
 		pr_debug("slab cache iterator is not available: %d\n", ret);
-		goto out;
+		return;
 	}
 
 	has_slab_iter = true;
 
 	bpf_map__set_max_entries(skel->maps.slab_caches, con->map_nr_entries);
-out:
-	btf__free(btf);
 }
 
 static void run_slab_cache_iter(void)
@@ -109,6 +108,75 @@ static void exit_slab_cache_iter(void)
 	hashmap__clear(&slab_hash);
 }
 
+static void init_numa_data(struct lock_contention *con)
+{
+	struct symbol *sym;
+	struct map *kmap;
+	char *buf = NULL, *p;
+	size_t len;
+	long last = -1;
+	int ret;
+
+	/*
+	 * 'struct zone' is embedded in 'struct pglist_data' as an array.
+	 * As we may not have full information about struct zone in the
+	 * (fake) vmlinux.h, let's get the actual size from BTF.
+	 */
+	ret = btf__find_by_name_kind(con->btf, "zone", BTF_KIND_STRUCT);
+	if (ret < 0) {
+		pr_debug("cannot get type of struct zone: %d\n", ret);
+		return;
+	}
+
+	ret = btf__resolve_size(con->btf, ret);
+	if (ret < 0) {
+		pr_debug("cannot get size of struct zone: %d\n", ret);
+		return;
+	}
+	skel->rodata->sizeof_zone = ret;
+
+	/* UMA systems don't have 'node_data[]' - just use contig_page_data. */
+	sym = machine__find_kernel_symbol_by_name(con->machine,
+						  "contig_page_data",
+						  &kmap);
+	if (sym) {
+		skel->rodata->contig_page_data_addr = map__unmap_ip(kmap, sym->start);
+		map__put(kmap);
+		return;
+	}
+
+	/*
+	 * 'node_data' is an array of pointers to struct pglist_data.
+	 * The BPF program needs to follow the pointer for each node to
+	 * get the address of struct pglist_data and its zones.
+	 */
+	sym = machine__find_kernel_symbol_by_name(con->machine,
+						  "node_data",
+						  &kmap);
+	if (sym == NULL)
+		return;
+
+	skel->rodata->node_data_addr = map__unmap_ip(kmap, sym->start);
+	map__put(kmap);
+
+	/* derive the number of nodes as the last online node number + 1 */
+	ret = sysfs__read_str("devices/system/node/online", &buf, &len);
+	if (ret < 0) {
+		pr_debug("failed to read online node: %d\n", ret);
+		return;
+	}
+
+	p = buf;
+	while (p && *p) {
+		last = strtol(p, &p, 0);
+
+		if (p && (*p == ',' || *p == '-' || *p == '\n'))
+			p++;
+	}
+	skel->rodata->nr_nodes = last + 1;
+	free(buf);
+}
+
 int lock_contention_prepare(struct lock_contention *con)
 {
 	int i, fd;
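
As an aside (not part of the patch), here is a standalone sketch of the
online-node parsing above. The input string is a hypothetical
devices/system/node/online value; for "0-1,3\n" the loop leaves
last == 3, so nr_nodes becomes 4:

  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
          char buf[] = "0-1,3\n";   /* made-up sysfs content */
          char *p = buf;
          long last = -1;

          while (p && *p) {
                  last = strtol(p, &p, 0);
                  /* skip range/list separators and the trailing newline */
                  if (p && (*p == ',' || *p == '-' || *p == '\n'))
                          p++;
          }
          printf("nr_nodes = %ld\n", last + 1);   /* prints: nr_nodes = 4 */
          return 0;
  }
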
@@ -218,6 +286,8 @@ int lock_contention_prepare(struct lock_contention *con)
 
 	bpf_map__set_max_entries(skel->maps.slab_filter, nslabs);
 
+	init_numa_data(con);
+
 	if (lock_contention_bpf__load(skel) < 0) {
 		pr_err("Failed to load lock-contention BPF skeleton\n");
 		return -1;
@@ -505,6 +575,11 @@ static const char *lock_contention_get_name(struct lock_contention *con,
return "rq_lock";
}
+ if (!bpf_map_lookup_elem(lock_fd, &key->lock_addr_or_cgroup, &flags)) {
+ if (flags == LOCK_CLASS_ZONE_LOCK)
+ return "zone_lock";
+ }
+
/* look slab_hash for dynamic locks in a slab object */
if (hashmap__find(&slab_hash, flags & LCB_F_SLAB_ID_MASK, &slab_data)) {
snprintf(name_buf, sizeof(name_buf), "&%s", slab_data->name);
@@ -743,6 +818,7 @@ int lock_contention_finish(struct lock_contention *con)
 	}
 
 	exit_slab_cache_iter();
+	btf__free(con->btf);
 
 	return 0;
 }
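
Note that with this change the vmlinux BTF handle loaded in
check_slab_cache_iter() is kept in con->btf and reused by
init_numa_data() instead of being freed right away; it is now released
here in lock_contention_finish().
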
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 69be7a4234e076e8..6f12c7d978a2e015 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -11,6 +11,9 @@
 /* for collect_lock_syms(). 4096 was rejected by the verifier */
 #define MAX_CPUS 1024
 
+/* for collect_zone_lock(). It should be larger than the actual number of zones. */
+#define MAX_ZONES 10
+
 /* lock contention flags from include/trace/events/lock.h */
 #define LCB_F_SPIN	(1U << 0)
 #define LCB_F_READ	(1U << 1)
@@ -801,6 +804,11 @@ int contention_end(u64 *ctx)
 
 extern struct rq runqueues __ksym;
 
+const volatile __u64 contig_page_data_addr;
+const volatile __u64 node_data_addr;
+const volatile int nr_nodes;
+const volatile int sizeof_zone;
+
 struct rq___old {
 	raw_spinlock_t lock;
 } __attribute__((preserve_access_index));
@@ -809,6 +817,59 @@ struct rq___new {
 	raw_spinlock_t __lock;
 } __attribute__((preserve_access_index));
 
+static void collect_zone_lock(void)
+{
+	__u64 nr_zones, zone_off;
+	__u64 lock_addr, lock_off;
+	__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
+
+	zone_off = offsetof(struct pglist_data, node_zones);
+	lock_off = offsetof(struct zone, lock);
+
+	if (contig_page_data_addr) {
+		struct pglist_data *contig_page_data;
+
+		contig_page_data = (void *)(long)contig_page_data_addr;
+		nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
+
+		for (int i = 0; i < MAX_ZONES; i++) {
+			__u64 zone_addr;
+
+			if (i >= nr_zones)
+				break;
+
+			zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
+			lock_addr = zone_addr + lock_off;
+
+			bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+		}
+	} else if (nr_nodes > 0) {
+		struct pglist_data **node_data = (void *)(long)node_data_addr;
+
+		for (int i = 0; i < nr_nodes; i++) {
+			struct pglist_data *pgdat = NULL;
+			int err;
+
+			err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
+			if (err < 0 || pgdat == NULL)
+				break;
+
+			nr_zones = BPF_CORE_READ(pgdat, nr_zones);
+			for (int k = 0; k < MAX_ZONES; k++) {
+				__u64 zone_addr;
+
+				if (k >= nr_zones)
+					break;
+
+				zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
+				lock_addr = zone_addr + lock_off;
+
+				bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+			}
+		}
+	}
+}
+
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
@@ -830,6 +891,9 @@ int BPF_PROG(collect_lock_syms)
lock_flag = LOCK_CLASS_RQLOCK;
bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
}
+
+ collect_zone_lock();
+
return 0;
}
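
A note on the fixed bound: like MAX_CPUS above, MAX_ZONES presumably
exists to give the verifier a provable upper limit on the loop trip
count, since it cannot reason about the runtime nr_zones value; that
value only breaks out of the loop early.
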
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 15f5743bd409f2f9..28c5e5aced7fcc91 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -67,6 +67,7 @@ enum lock_aggr_mode {
 enum lock_class_sym {
 	LOCK_CLASS_NONE,
 	LOCK_CLASS_RQLOCK,
+	LOCK_CLASS_ZONE_LOCK,
 };
struct slab_cache_data {
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
index 7b81d3173917fdb5..a59ce912be18cd0f 100644
--- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -203,4 +203,13 @@ struct bpf_iter__kmem_cache {
 	struct kmem_cache *s;
 } __attribute__((preserve_access_index));
 
+struct zone {
+	spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct pglist_data {
+	struct zone node_zones[6];	/* large enough for all possible configs */
+	int nr_zones;
+} __attribute__((preserve_access_index));
+
 #endif // __VMLINUX_H
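
These minimal definitions work because preserve_access_index makes the
field accesses CO-RE-relocatable: offsetof(struct zone, lock) and
offsetof(struct pglist_data, node_zones) should be fixed up against the
running kernel's BTF at load time. Only the accessed fields need to be
declared, and the declared node_zones[6] size need not match the
kernel's layout, which is also why the real sizeof(struct zone) is
fetched separately from BTF in init_numa_data().
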
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index b5d916aa49df6424..d331ce8e3caad4cb 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -142,6 +142,7 @@ struct lock_contention {
 	struct lock_filter *filters;
 	struct lock_contention_fails fails;
 	struct rb_root cgroups;
+	void *btf;
 	unsigned long map_nr_entries;
 	int max_stack;
 	int stack_skip;
--
2.49.0
Re: [PATCH] perf lock contention: Symbolize zone->lock using BTF
From: Namhyung Kim @ 2025-04-28 4:42 UTC
To: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang
Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
linux-perf-users, Song Liu, bpf, Stephane Eranian, linux-mm
Ping!
On Mon, Mar 31, 2025 at 11:30:55PM -0700, Namhyung Kim wrote:
> The struct zone is embedded in struct pglist_data, which is allocated
> for each NUMA node early in the boot process. As it's neither a slab
> object nor a global lock, zone->lock was not symbolized so far.
> [...]
Re: [PATCH] perf lock contention: Symbolize zone->lock using BTF
From: Arnaldo Carvalho de Melo @ 2025-04-29 15:28 UTC
To: Namhyung Kim
Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf,
Stephane Eranian, linux-mm
On Sun, Apr 27, 2025 at 09:42:10PM -0700, Namhyung Kim wrote:
> Ping!
Thanks!
Applied. :-)
- Arnaldo