[RFC/PATCH] perf lock contention: Add -J/--inject-delay option

linux-perf-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC/PATCH] perf lock contention: Add -J/--inject-delay option
@ 2025-02-25  7:59 Namhyung Kim
  2025-02-25 16:44 ` Ian Rogers
  2025-05-08 14:45 ` Arnaldo Carvalho de Melo
  0 siblings, 2 replies; 6+ messages in thread
From: Namhyung Kim @ 2025-02-25  7:59 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Song Liu, bpf, Stephane Eranian

This is to slow down lock acquistion (on contention locks) deliberately.
A possible use case is to estimate impact on application performance by
optimization of kernel locking behavior.  By delaying the lock it can
simulate the worse condition as a control group, and then compare with
the current behavior as a optimized condition.

The syntax is 'time@function' and the time can have unit suffix like
"us" and "ms".  For example, I ran a simple test like below.

  $ sudo perf lock con -abl -L tasklist_lock -- \
    sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended   total wait     max wait     avg wait            address   symbol

          92      1.18 ms    199.54 us     12.79 us   ffffffff8a806080   tasklist_lock (rwlock)

The contention count was 92 and the average wait time was around 10 us.
But if I add 100 usec of delay to the tasklist_lock,

  $ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
    sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended   total wait     max wait     avg wait            address   symbol

         190     15.67 ms    230.10 us     82.46 us   ffffffff8a806080   tasklist_lock (rwlock)

The contention count increased and the average wait time was up closed
to 100 usec.  If I increase the delay even more,

  $ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
    sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended   total wait     max wait     avg wait            address   symbol

        1002      2.80 s       3.01 ms      2.80 ms   ffffffff8a806080   tasklist_lock (rwlock)

Now every sleep process had contention and the wait time was more than 1
msec.  This is on my 4 CPU laptop so I guess one CPU has the lock while
other 3 are waiting for it mostly.

For simplicity, it only supports global locks for now.

Suggested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-lock.txt        | 11 +++
 tools/perf/builtin-lock.c                     | 74 +++++++++++++++++++
 tools/perf/util/bpf_lock_contention.c         | 28 +++++++
 .../perf/util/bpf_skel/lock_contention.bpf.c  | 43 +++++++++++
 tools/perf/util/lock-contention.h             |  8 ++
 5 files changed, 164 insertions(+)

diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index d3793054f7d35626..151fc837587b216e 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -215,6 +215,17 @@ CONTENTION OPTIONS
 --cgroup-filter=<value>::
 	Show lock contention only in the given cgroups (comma separated list).
 
+-J::
+--inject-delay=<time@function>::
+	Add delays to the given lock.  It's added to the contention-end part so
+	that the (new) owner of the lock will be delayed.  But by slowing down
+	the owner, the waiters will also be delayed as well.  This is working
+	only with -b/--use-bpf.
+
+	The 'time' is specified in nsec but it can have a unit suffix.  Available
+	units are "ms" and "us".  Note that it will busy-wait after it gets the
+	lock.  Please use it at your own risk.
+
 
 SEE ALSO
 --------
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 5d405cd8e696d21b..3ef452d5d9f5679d 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -62,6 +62,8 @@ static const char *output_name = NULL;
 static FILE *lock_output;
 
 static struct lock_filter filters;
+static struct lock_delay *delays;
+static int nr_delays;
 
 static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
 
@@ -1971,6 +1973,8 @@ static int __cmd_contention(int argc, const char **argv)
 		.max_stack = max_stack_depth,
 		.stack_skip = stack_skip,
 		.filters = &filters,
+		.delays = delays,
+		.nr_delays = nr_delays,
 		.save_callstack = needs_callstack(),
 		.owner = show_lock_owner,
 		.cgroups = RB_ROOT,
@@ -2474,6 +2478,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
 	return ret;
 }
 
+static bool add_lock_delay(char *spec)
+{
+	char *at, *pos;
+	struct lock_delay *tmp;
+	unsigned long duration;
+
+	at = strchr(spec, '@');
+	if (at == NULL) {
+		pr_err("lock delay should have '@' sign: %s\n", spec);
+		return false;
+	}
+	if (at == spec) {
+		pr_err("lock delay should have time before '@': %s\n", spec);
+		return false;
+	}
+
+	*at = '\0';
+	duration = strtoul(spec, &pos, 0);
+	if (!strcmp(pos, "ns"))
+		duration *= 1;
+	else if (!strcmp(pos, "us"))
+		duration *= 1000;
+	else if (!strcmp(pos, "ms"))
+		duration *= 1000 * 1000;
+	else if (*pos) {
+		pr_err("invalid delay time: %s@%s\n", spec, at + 1);
+		return false;
+	}
+
+	tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
+	if (tmp == NULL) {
+		pr_err("Memory allocation failure\n");
+		return false;
+	}
+	delays = tmp;
+
+	delays[nr_delays].sym = strdup(at + 1);
+	if (delays[nr_delays].sym == NULL) {
+		pr_err("Memory allocation failure\n");
+		return false;
+	}
+	delays[nr_delays].time = duration;
+
+	nr_delays++;
+	return true;
+}
+
+static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
+			    int unset __maybe_unused)
+{
+	char *s, *tmp, *tok;
+	int ret = 0;
+
+	s = strdup(str);
+	if (s == NULL)
+		return -1;
+
+	for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (!add_lock_delay(tok)) {
+			ret = -1;
+			break;
+		}
+	}
+
+	free(s);
+	return ret;
+}
+
 int cmd_lock(int argc, const char **argv)
 {
 	const struct option lock_options[] = {
@@ -2550,6 +2622,8 @@ int cmd_lock(int argc, const char **argv)
 	OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
 	OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
 		     "Filter specific cgroups", parse_cgroup_filter),
+	OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC",
+		     "Inject delays to specific locks", parse_lock_delay),
 	OPT_PARENT(lock_options)
 	};
 
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index fc8666222399c995..99b64f303e761cbc 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -183,6 +183,27 @@ int lock_contention_prepare(struct lock_contention *con)
 		skel->rodata->has_addr = 1;
 	}
 
+	/* resolve lock name in delays */
+	if (con->nr_delays) {
+		struct symbol *sym;
+		struct map *kmap;
+
+		for (i = 0; i < con->nr_delays; i++) {
+			sym = machine__find_kernel_symbol_by_name(con->machine,
+								  con->delays[i].sym,
+								  &kmap);
+			if (sym == NULL) {
+				pr_warning("ignore unknown symbol: %s\n",
+					   con->delays[i].sym);
+				continue;
+			}
+
+			con->delays[i].addr = map__unmap_ip(kmap, sym->start);
+		}
+		skel->rodata->lock_delay = 1;
+		bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
+	}
+
 	bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
 	bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
 	bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
@@ -272,6 +293,13 @@ int lock_contention_prepare(struct lock_contention *con)
 			bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
 	}
 
+	if (con->nr_delays) {
+		fd = bpf_map__fd(skel->maps.lock_delays);
+
+		for (i = 0; i < con->nr_delays; i++)
+			bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
+	}
+
 	if (con->aggr_mode == LOCK_AGGR_CGROUP)
 		read_all_cgroups(&con->cgroups);
 
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 6533ea9b044c71d1..0ac9ae2f1711a129 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -11,6 +11,9 @@
 /* for collect_lock_syms().  4096 was rejected by the verifier */
 #define MAX_CPUS  1024
 
+/* for do_lock_delay().  Arbitrarily set to 1 million. */
+#define MAX_LOOP  (1U << 20)
+
 /* lock contention flags from include/trace/events/lock.h */
 #define LCB_F_SPIN	(1U << 0)
 #define LCB_F_READ	(1U << 1)
@@ -114,6 +117,13 @@ struct {
 	__uint(max_entries, 1);
 } slab_caches SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, 1);
+} lock_delays SEC(".maps");
+
 struct rw_semaphore___old {
 	struct task_struct *owner;
 } __attribute__((preserve_access_index));
@@ -143,6 +153,7 @@ const volatile int needs_callstack;
 const volatile int stack_skip;
 const volatile int lock_owner;
 const volatile int use_cgroup_v2;
+const volatile int lock_delay;
 
 /* determine the key of lock stat */
 const volatile int aggr_mode;
@@ -348,6 +359,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
 	return 0;
 }
 
+static inline long delay_callback(__u64 idx, void *arg)
+{
+	__u64 target = *(__u64 *)arg;
+
+	if (target <= bpf_ktime_get_ns())
+		return 1;
+
+	/* just to kill time */
+	(void)bpf_get_prandom_u32();
+
+	return 0;
+}
+
+static inline void do_lock_delay(__u64 duration)
+{
+	__u64 target = bpf_ktime_get_ns() + duration;
+
+	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
+}
+
+static inline void check_lock_delay(__u64 lock)
+{
+	__u64 *delay;
+
+	delay = bpf_map_lookup_elem(&lock_delays, &lock);
+	if (delay)
+		do_lock_delay(*delay);
+}
+
 static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
 {
 	__u32 pid;
@@ -566,6 +606,9 @@ int contention_end(u64 *ctx)
 		data->min_time = duration;
 
 out:
+	if (lock_delay)
+		check_lock_delay(pelem->lock);
+
 	pelem->lock = 0;
 	if (need_delete)
 		bpf_map_delete_elem(&tstamp, &pid);
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index a09f7fe877df8184..12f6cb789ada1bc7 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -18,6 +18,12 @@ struct lock_filter {
 	char			**slabs;
 };
 
+struct lock_delay {
+	char			*sym;
+	unsigned long		addr;
+	unsigned long		time;
+};
+
 struct lock_stat {
 	struct hlist_node	hash_entry;
 	struct rb_node		rb;		/* used for sorting */
@@ -140,6 +146,7 @@ struct lock_contention {
 	struct machine *machine;
 	struct hlist_head *result;
 	struct lock_filter *filters;
+	struct lock_delay *delays;
 	struct lock_contention_fails fails;
 	struct rb_root cgroups;
 	unsigned long map_nr_entries;
@@ -148,6 +155,7 @@ struct lock_contention {
 	int aggr_mode;
 	int owner;
 	int nr_filtered;
+	int nr_delays;
 	bool save_callstack;
 };
 
-- 
2.48.1.658.g4767266eb4-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [RFC/PATCH] perf lock contention: Add -J/--inject-delay option
  2025-02-25  7:59 [RFC/PATCH] perf lock contention: Add -J/--inject-delay option Namhyung Kim
@ 2025-02-25 16:44 ` Ian Rogers
  2025-05-08 14:45 ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 6+ messages in thread
From: Ian Rogers @ 2025-02-25 16:44 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: emery, Arnaldo Carvalho de Melo, Kan Liang, Jiri Olsa,
	Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Song Liu, bpf, Stephane Eranian

On Mon, Feb 24, 2025 at 11:59 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> This is to slow down lock acquistion (on contention locks) deliberately.
> A possible use case is to estimate impact on application performance by
> optimization of kernel locking behavior.  By delaying the lock it can
> simulate the worse condition as a control group, and then compare with
> the current behavior as a optimized condition.

This is great! There are similarities to me with coz (Emery Berger
cc-ed for a headsup and possible input):
https://github.com/plasma-umass/coz

> The syntax is 'time@function' and the time can have unit suffix like
> "us" and "ms".  For example, I ran a simple test like below.
>
>   $ sudo perf lock con -abl -L tasklist_lock -- \
>     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
>    contended   total wait     max wait     avg wait            address   symbol
>
>           92      1.18 ms    199.54 us     12.79 us   ffffffff8a806080   tasklist_lock (rwlock)
>
> The contention count was 92 and the average wait time was around 10 us.
> But if I add 100 usec of delay to the tasklist_lock,
>
>   $ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
>     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
>    contended   total wait     max wait     avg wait            address   symbol
>
>          190     15.67 ms    230.10 us     82.46 us   ffffffff8a806080   tasklist_lock (rwlock)
>
> The contention count increased and the average wait time was up closed
> to 100 usec.  If I increase the delay even more,
>
>   $ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
>     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
>    contended   total wait     max wait     avg wait            address   symbol
>
>         1002      2.80 s       3.01 ms      2.80 ms   ffffffff8a806080   tasklist_lock (rwlock)
>
> Now every sleep process had contention and the wait time was more than 1
> msec.  This is on my 4 CPU laptop so I guess one CPU has the lock while
> other 3 are waiting for it mostly.
>
> For simplicity, it only supports global locks for now.
>
> Suggested-by: Stephane Eranian <eranian@google.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/Documentation/perf-lock.txt        | 11 +++
>  tools/perf/builtin-lock.c                     | 74 +++++++++++++++++++
>  tools/perf/util/bpf_lock_contention.c         | 28 +++++++
>  .../perf/util/bpf_skel/lock_contention.bpf.c  | 43 +++++++++++
>  tools/perf/util/lock-contention.h             |  8 ++
>  5 files changed, 164 insertions(+)
>
> diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
> index d3793054f7d35626..151fc837587b216e 100644
> --- a/tools/perf/Documentation/perf-lock.txt
> +++ b/tools/perf/Documentation/perf-lock.txt
> @@ -215,6 +215,17 @@ CONTENTION OPTIONS
>  --cgroup-filter=<value>::
>         Show lock contention only in the given cgroups (comma separated list).
>
> +-J::
> +--inject-delay=<time@function>::
> +       Add delays to the given lock.  It's added to the contention-end part so
> +       that the (new) owner of the lock will be delayed.  But by slowing down
> +       the owner, the waiters will also be delayed as well.  This is working
> +       only with -b/--use-bpf.
> +
> +       The 'time' is specified in nsec but it can have a unit suffix.  Available
> +       units are "ms" and "us".  Note that it will busy-wait after it gets the
> +       lock.  Please use it at your own risk.
> +
>
>  SEE ALSO
>  --------
> diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
> index 5d405cd8e696d21b..3ef452d5d9f5679d 100644
> --- a/tools/perf/builtin-lock.c
> +++ b/tools/perf/builtin-lock.c
> @@ -62,6 +62,8 @@ static const char *output_name = NULL;
>  static FILE *lock_output;
>
>  static struct lock_filter filters;
> +static struct lock_delay *delays;
> +static int nr_delays;
>
>  static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
>
> @@ -1971,6 +1973,8 @@ static int __cmd_contention(int argc, const char **argv)
>                 .max_stack = max_stack_depth,
>                 .stack_skip = stack_skip,
>                 .filters = &filters,
> +               .delays = delays,
> +               .nr_delays = nr_delays,
>                 .save_callstack = needs_callstack(),
>                 .owner = show_lock_owner,
>                 .cgroups = RB_ROOT,
> @@ -2474,6 +2478,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
>         return ret;
>  }
>
> +static bool add_lock_delay(char *spec)
> +{
> +       char *at, *pos;
> +       struct lock_delay *tmp;
> +       unsigned long duration;
> +
> +       at = strchr(spec, '@');
> +       if (at == NULL) {
> +               pr_err("lock delay should have '@' sign: %s\n", spec);
> +               return false;
> +       }
> +       if (at == spec) {
> +               pr_err("lock delay should have time before '@': %s\n", spec);
> +               return false;
> +       }
> +
> +       *at = '\0';
> +       duration = strtoul(spec, &pos, 0);
> +       if (!strcmp(pos, "ns"))
> +               duration *= 1;
> +       else if (!strcmp(pos, "us"))
> +               duration *= 1000;
> +       else if (!strcmp(pos, "ms"))
> +               duration *= 1000 * 1000;
> +       else if (*pos) {
> +               pr_err("invalid delay time: %s@%s\n", spec, at + 1);
> +               return false;
> +       }
> +
> +       tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
> +       if (tmp == NULL) {
> +               pr_err("Memory allocation failure\n");
> +               return false;
> +       }
> +       delays = tmp;
> +
> +       delays[nr_delays].sym = strdup(at + 1);
> +       if (delays[nr_delays].sym == NULL) {
> +               pr_err("Memory allocation failure\n");
> +               return false;
> +       }
> +       delays[nr_delays].time = duration;
> +
> +       nr_delays++;
> +       return true;
> +}
> +
> +static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
> +                           int unset __maybe_unused)
> +{
> +       char *s, *tmp, *tok;
> +       int ret = 0;
> +
> +       s = strdup(str);
> +       if (s == NULL)
> +               return -1;
> +
> +       for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
> +               if (!add_lock_delay(tok)) {
> +                       ret = -1;
> +                       break;
> +               }
> +       }
> +
> +       free(s);
> +       return ret;
> +}
> +
>  int cmd_lock(int argc, const char **argv)
>  {
>         const struct option lock_options[] = {
> @@ -2550,6 +2622,8 @@ int cmd_lock(int argc, const char **argv)
>         OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
>         OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
>                      "Filter specific cgroups", parse_cgroup_filter),
> +       OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC",
> +                    "Inject delays to specific locks", parse_lock_delay),
>         OPT_PARENT(lock_options)
>         };
>
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index fc8666222399c995..99b64f303e761cbc 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -183,6 +183,27 @@ int lock_contention_prepare(struct lock_contention *con)
>                 skel->rodata->has_addr = 1;
>         }
>
> +       /* resolve lock name in delays */
> +       if (con->nr_delays) {
> +               struct symbol *sym;
> +               struct map *kmap;
> +
> +               for (i = 0; i < con->nr_delays; i++) {
> +                       sym = machine__find_kernel_symbol_by_name(con->machine,
> +                                                                 con->delays[i].sym,
> +                                                                 &kmap);
> +                       if (sym == NULL) {
> +                               pr_warning("ignore unknown symbol: %s\n",
> +                                          con->delays[i].sym);
> +                               continue;
> +                       }
> +
> +                       con->delays[i].addr = map__unmap_ip(kmap, sym->start);
> +               }
> +               skel->rodata->lock_delay = 1;
> +               bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
> +       }
> +
>         bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
>         bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
>         bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
> @@ -272,6 +293,13 @@ int lock_contention_prepare(struct lock_contention *con)
>                         bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
>         }
>
> +       if (con->nr_delays) {
> +               fd = bpf_map__fd(skel->maps.lock_delays);
> +
> +               for (i = 0; i < con->nr_delays; i++)
> +                       bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
> +       }
> +
>         if (con->aggr_mode == LOCK_AGGR_CGROUP)
>                 read_all_cgroups(&con->cgroups);
>
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 6533ea9b044c71d1..0ac9ae2f1711a129 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -11,6 +11,9 @@
>  /* for collect_lock_syms().  4096 was rejected by the verifier */
>  #define MAX_CPUS  1024
>
> +/* for do_lock_delay().  Arbitrarily set to 1 million. */
> +#define MAX_LOOP  (1U << 20)
> +
>  /* lock contention flags from include/trace/events/lock.h */
>  #define LCB_F_SPIN     (1U << 0)
>  #define LCB_F_READ     (1U << 1)
> @@ -114,6 +117,13 @@ struct {
>         __uint(max_entries, 1);
>  } slab_caches SEC(".maps");
>
> +struct {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __uint(key_size, sizeof(__u64));
> +       __uint(value_size, sizeof(__u64));
> +       __uint(max_entries, 1);
> +} lock_delays SEC(".maps");
> +
>  struct rw_semaphore___old {
>         struct task_struct *owner;
>  } __attribute__((preserve_access_index));
> @@ -143,6 +153,7 @@ const volatile int needs_callstack;
>  const volatile int stack_skip;
>  const volatile int lock_owner;
>  const volatile int use_cgroup_v2;
> +const volatile int lock_delay;
>
>  /* determine the key of lock stat */
>  const volatile int aggr_mode;
> @@ -348,6 +359,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
>         return 0;
>  }
>
> +static inline long delay_callback(__u64 idx, void *arg)
> +{
> +       __u64 target = *(__u64 *)arg;
> +
> +       if (target <= bpf_ktime_get_ns())
> +               return 1;
> +
> +       /* just to kill time */
> +       (void)bpf_get_prandom_u32();
> +
> +       return 0;
> +}
> +
> +static inline void do_lock_delay(__u64 duration)
> +{
> +       __u64 target = bpf_ktime_get_ns() + duration;
> +
> +       bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
> +}
> +
> +static inline void check_lock_delay(__u64 lock)
> +{
> +       __u64 *delay;
> +
> +       delay = bpf_map_lookup_elem(&lock_delays, &lock);
> +       if (delay)
> +               do_lock_delay(*delay);
> +}
> +
>  static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
>  {
>         __u32 pid;
> @@ -566,6 +606,9 @@ int contention_end(u64 *ctx)
>                 data->min_time = duration;
>
>  out:
> +       if (lock_delay)
> +               check_lock_delay(pelem->lock);
> +

So the delay happens when a lock is coming out of contending, ideally
I think the code should slow releasing the lock as you are trying to
see how different tasks are impacted (how sensitive they are) to the
critical sections done when a lock is held. I don't believe there is a
way to do this with a regular kernel and BPF due to not having
tracepoints on the lock fast paths, it may be possible if the LOCKDEP
build option has been enabled for the kernel. Perhaps this work can
motivate the tracepoints on the fast paths.

Thanks,
Ian

>         pelem->lock = 0;
>         if (need_delete)
>                 bpf_map_delete_elem(&tstamp, &pid);
> diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
> index a09f7fe877df8184..12f6cb789ada1bc7 100644
> --- a/tools/perf/util/lock-contention.h
> +++ b/tools/perf/util/lock-contention.h
> @@ -18,6 +18,12 @@ struct lock_filter {
>         char                    **slabs;
>  };
>
> +struct lock_delay {
> +       char                    *sym;
> +       unsigned long           addr;
> +       unsigned long           time;
> +};
> +
>  struct lock_stat {
>         struct hlist_node       hash_entry;
>         struct rb_node          rb;             /* used for sorting */
> @@ -140,6 +146,7 @@ struct lock_contention {
>         struct machine *machine;
>         struct hlist_head *result;
>         struct lock_filter *filters;
> +       struct lock_delay *delays;
>         struct lock_contention_fails fails;
>         struct rb_root cgroups;
>         unsigned long map_nr_entries;
> @@ -148,6 +155,7 @@ struct lock_contention {
>         int aggr_mode;
>         int owner;
>         int nr_filtered;
> +       int nr_delays;
>         bool save_callstack;
>  };
>
> --
> 2.48.1.658.g4767266eb4-goog
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC/PATCH] perf lock contention: Add -J/--inject-delay option
  2025-02-25  7:59 [RFC/PATCH] perf lock contention: Add -J/--inject-delay option Namhyung Kim
  2025-02-25 16:44 ` Ian Rogers
@ 2025-05-08 14:45 ` Arnaldo Carvalho de Melo
  2025-05-08 17:30   ` Namhyung Kim
  1 sibling, 1 reply; 6+ messages in thread
From: Arnaldo Carvalho de Melo @ 2025-05-08 14:45 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf,
	Stephane Eranian

On Mon, Feb 24, 2025 at 11:59:29PM -0800, Namhyung Kim wrote:
> This is to slow down lock acquistion (on contention locks) deliberately.
> A possible use case is to estimate impact on application performance by
> optimization of kernel locking behavior.  By delaying the lock it can
> simulate the worse condition as a control group, and then compare with
> the current behavior as a optimized condition.

So this looks useful, I guess we can proceed and merge it?

- Arnaldo
 
> The syntax is 'time@function' and the time can have unit suffix like
> "us" and "ms".  For example, I ran a simple test like below.
> 
>   $ sudo perf lock con -abl -L tasklist_lock -- \
>     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
>    contended   total wait     max wait     avg wait            address   symbol
> 
>           92      1.18 ms    199.54 us     12.79 us   ffffffff8a806080   tasklist_lock (rwlock)
> 
> The contention count was 92 and the average wait time was around 10 us.
> But if I add 100 usec of delay to the tasklist_lock,
> 
>   $ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
>     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
>    contended   total wait     max wait     avg wait            address   symbol
> 
>          190     15.67 ms    230.10 us     82.46 us   ffffffff8a806080   tasklist_lock (rwlock)
> 
> The contention count increased and the average wait time was up closed
> to 100 usec.  If I increase the delay even more,
> 
>   $ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
>     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
>    contended   total wait     max wait     avg wait            address   symbol
> 
>         1002      2.80 s       3.01 ms      2.80 ms   ffffffff8a806080   tasklist_lock (rwlock)
> 
> Now every sleep process had contention and the wait time was more than 1
> msec.  This is on my 4 CPU laptop so I guess one CPU has the lock while
> other 3 are waiting for it mostly.
> 
> For simplicity, it only supports global locks for now.
> 
> Suggested-by: Stephane Eranian <eranian@google.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/Documentation/perf-lock.txt        | 11 +++
>  tools/perf/builtin-lock.c                     | 74 +++++++++++++++++++
>  tools/perf/util/bpf_lock_contention.c         | 28 +++++++
>  .../perf/util/bpf_skel/lock_contention.bpf.c  | 43 +++++++++++
>  tools/perf/util/lock-contention.h             |  8 ++
>  5 files changed, 164 insertions(+)
> 
> diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
> index d3793054f7d35626..151fc837587b216e 100644
> --- a/tools/perf/Documentation/perf-lock.txt
> +++ b/tools/perf/Documentation/perf-lock.txt
> @@ -215,6 +215,17 @@ CONTENTION OPTIONS
>  --cgroup-filter=<value>::
>  	Show lock contention only in the given cgroups (comma separated list).
>  
> +-J::
> +--inject-delay=<time@function>::
> +	Add delays to the given lock.  It's added to the contention-end part so
> +	that the (new) owner of the lock will be delayed.  But by slowing down
> +	the owner, the waiters will also be delayed as well.  This is working
> +	only with -b/--use-bpf.
> +
> +	The 'time' is specified in nsec but it can have a unit suffix.  Available
> +	units are "ms" and "us".  Note that it will busy-wait after it gets the
> +	lock.  Please use it at your own risk.
> +
>  
>  SEE ALSO
>  --------
> diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
> index 5d405cd8e696d21b..3ef452d5d9f5679d 100644
> --- a/tools/perf/builtin-lock.c
> +++ b/tools/perf/builtin-lock.c
> @@ -62,6 +62,8 @@ static const char *output_name = NULL;
>  static FILE *lock_output;
>  
>  static struct lock_filter filters;
> +static struct lock_delay *delays;
> +static int nr_delays;
>  
>  static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
>  
> @@ -1971,6 +1973,8 @@ static int __cmd_contention(int argc, const char **argv)
>  		.max_stack = max_stack_depth,
>  		.stack_skip = stack_skip,
>  		.filters = &filters,
> +		.delays = delays,
> +		.nr_delays = nr_delays,
>  		.save_callstack = needs_callstack(),
>  		.owner = show_lock_owner,
>  		.cgroups = RB_ROOT,
> @@ -2474,6 +2478,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
>  	return ret;
>  }
>  
> +static bool add_lock_delay(char *spec)
> +{
> +	char *at, *pos;
> +	struct lock_delay *tmp;
> +	unsigned long duration;
> +
> +	at = strchr(spec, '@');
> +	if (at == NULL) {
> +		pr_err("lock delay should have '@' sign: %s\n", spec);
> +		return false;
> +	}
> +	if (at == spec) {
> +		pr_err("lock delay should have time before '@': %s\n", spec);
> +		return false;
> +	}
> +
> +	*at = '\0';
> +	duration = strtoul(spec, &pos, 0);
> +	if (!strcmp(pos, "ns"))
> +		duration *= 1;
> +	else if (!strcmp(pos, "us"))
> +		duration *= 1000;
> +	else if (!strcmp(pos, "ms"))
> +		duration *= 1000 * 1000;
> +	else if (*pos) {
> +		pr_err("invalid delay time: %s@%s\n", spec, at + 1);
> +		return false;
> +	}
> +
> +	tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
> +	if (tmp == NULL) {
> +		pr_err("Memory allocation failure\n");
> +		return false;
> +	}
> +	delays = tmp;
> +
> +	delays[nr_delays].sym = strdup(at + 1);
> +	if (delays[nr_delays].sym == NULL) {
> +		pr_err("Memory allocation failure\n");
> +		return false;
> +	}
> +	delays[nr_delays].time = duration;
> +
> +	nr_delays++;
> +	return true;
> +}
> +
> +static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
> +			    int unset __maybe_unused)
> +{
> +	char *s, *tmp, *tok;
> +	int ret = 0;
> +
> +	s = strdup(str);
> +	if (s == NULL)
> +		return -1;
> +
> +	for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
> +		if (!add_lock_delay(tok)) {
> +			ret = -1;
> +			break;
> +		}
> +	}
> +
> +	free(s);
> +	return ret;
> +}
> +
>  int cmd_lock(int argc, const char **argv)
>  {
>  	const struct option lock_options[] = {
> @@ -2550,6 +2622,8 @@ int cmd_lock(int argc, const char **argv)
>  	OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
>  	OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
>  		     "Filter specific cgroups", parse_cgroup_filter),
> +	OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC",
> +		     "Inject delays to specific locks", parse_lock_delay),
>  	OPT_PARENT(lock_options)
>  	};
>  
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index fc8666222399c995..99b64f303e761cbc 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -183,6 +183,27 @@ int lock_contention_prepare(struct lock_contention *con)
>  		skel->rodata->has_addr = 1;
>  	}
>  
> +	/* resolve lock name in delays */
> +	if (con->nr_delays) {
> +		struct symbol *sym;
> +		struct map *kmap;
> +
> +		for (i = 0; i < con->nr_delays; i++) {
> +			sym = machine__find_kernel_symbol_by_name(con->machine,
> +								  con->delays[i].sym,
> +								  &kmap);
> +			if (sym == NULL) {
> +				pr_warning("ignore unknown symbol: %s\n",
> +					   con->delays[i].sym);
> +				continue;
> +			}
> +
> +			con->delays[i].addr = map__unmap_ip(kmap, sym->start);
> +		}
> +		skel->rodata->lock_delay = 1;
> +		bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
> +	}
> +
>  	bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
>  	bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
>  	bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
> @@ -272,6 +293,13 @@ int lock_contention_prepare(struct lock_contention *con)
>  			bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
>  	}
>  
> +	if (con->nr_delays) {
> +		fd = bpf_map__fd(skel->maps.lock_delays);
> +
> +		for (i = 0; i < con->nr_delays; i++)
> +			bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
> +	}
> +
>  	if (con->aggr_mode == LOCK_AGGR_CGROUP)
>  		read_all_cgroups(&con->cgroups);
>  
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 6533ea9b044c71d1..0ac9ae2f1711a129 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -11,6 +11,9 @@
>  /* for collect_lock_syms().  4096 was rejected by the verifier */
>  #define MAX_CPUS  1024
>  
> +/* for do_lock_delay().  Arbitrarily set to 1 million. */
> +#define MAX_LOOP  (1U << 20)
> +
>  /* lock contention flags from include/trace/events/lock.h */
>  #define LCB_F_SPIN	(1U << 0)
>  #define LCB_F_READ	(1U << 1)
> @@ -114,6 +117,13 @@ struct {
>  	__uint(max_entries, 1);
>  } slab_caches SEC(".maps");
>  
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__uint(key_size, sizeof(__u64));
> +	__uint(value_size, sizeof(__u64));
> +	__uint(max_entries, 1);
> +} lock_delays SEC(".maps");
> +
>  struct rw_semaphore___old {
>  	struct task_struct *owner;
>  } __attribute__((preserve_access_index));
> @@ -143,6 +153,7 @@ const volatile int needs_callstack;
>  const volatile int stack_skip;
>  const volatile int lock_owner;
>  const volatile int use_cgroup_v2;
> +const volatile int lock_delay;
>  
>  /* determine the key of lock stat */
>  const volatile int aggr_mode;
> @@ -348,6 +359,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
>  	return 0;
>  }
>  
> +static inline long delay_callback(__u64 idx, void *arg)
> +{
> +	__u64 target = *(__u64 *)arg;
> +
> +	if (target <= bpf_ktime_get_ns())
> +		return 1;
> +
> +	/* just to kill time */
> +	(void)bpf_get_prandom_u32();
> +
> +	return 0;
> +}
> +
> +static inline void do_lock_delay(__u64 duration)
> +{
> +	__u64 target = bpf_ktime_get_ns() + duration;
> +
> +	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
> +}
> +
> +static inline void check_lock_delay(__u64 lock)
> +{
> +	__u64 *delay;
> +
> +	delay = bpf_map_lookup_elem(&lock_delays, &lock);
> +	if (delay)
> +		do_lock_delay(*delay);
> +}
> +
>  static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
>  {
>  	__u32 pid;
> @@ -566,6 +606,9 @@ int contention_end(u64 *ctx)
>  		data->min_time = duration;
>  
>  out:
> +	if (lock_delay)
> +		check_lock_delay(pelem->lock);
> +
>  	pelem->lock = 0;
>  	if (need_delete)
>  		bpf_map_delete_elem(&tstamp, &pid);
> diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
> index a09f7fe877df8184..12f6cb789ada1bc7 100644
> --- a/tools/perf/util/lock-contention.h
> +++ b/tools/perf/util/lock-contention.h
> @@ -18,6 +18,12 @@ struct lock_filter {
>  	char			**slabs;
>  };
>  
> +struct lock_delay {
> +	char			*sym;
> +	unsigned long		addr;
> +	unsigned long		time;
> +};
> +
>  struct lock_stat {
>  	struct hlist_node	hash_entry;
>  	struct rb_node		rb;		/* used for sorting */
> @@ -140,6 +146,7 @@ struct lock_contention {
>  	struct machine *machine;
>  	struct hlist_head *result;
>  	struct lock_filter *filters;
> +	struct lock_delay *delays;
>  	struct lock_contention_fails fails;
>  	struct rb_root cgroups;
>  	unsigned long map_nr_entries;
> @@ -148,6 +155,7 @@ struct lock_contention {
>  	int aggr_mode;
>  	int owner;
>  	int nr_filtered;
> +	int nr_delays;
>  	bool save_callstack;
>  };
>  
> -- 
> 2.48.1.658.g4767266eb4-goog

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC/PATCH] perf lock contention: Add -J/--inject-delay option
  2025-05-08 14:45 ` Arnaldo Carvalho de Melo
@ 2025-05-08 17:30   ` Namhyung Kim
  2025-05-08 19:55     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 6+ messages in thread
From: Namhyung Kim @ 2025-05-08 17:30 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf,
	Stephane Eranian

Hi Arnaldo,

On Thu, May 08, 2025 at 11:45:58AM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Feb 24, 2025 at 11:59:29PM -0800, Namhyung Kim wrote:
> > This is to slow down lock acquistion (on contention locks) deliberately.
> > A possible use case is to estimate impact on application performance by
> > optimization of kernel locking behavior.  By delaying the lock it can
> > simulate the worse condition as a control group, and then compare with
> > the current behavior as a optimized condition.
> 
> So this looks useful, I guess we can proceed and merge it?

That'd be great. :)

Thanks,
Namhyung

>  
> > The syntax is 'time@function' and the time can have unit suffix like
> > "us" and "ms".  For example, I ran a simple test like below.
> > 
> >   $ sudo perf lock con -abl -L tasklist_lock -- \
> >     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
> >    contended   total wait     max wait     avg wait            address   symbol
> > 
> >           92      1.18 ms    199.54 us     12.79 us   ffffffff8a806080   tasklist_lock (rwlock)
> > 
> > The contention count was 92 and the average wait time was around 10 us.
> > But if I add 100 usec of delay to the tasklist_lock,
> > 
> >   $ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
> >     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
> >    contended   total wait     max wait     avg wait            address   symbol
> > 
> >          190     15.67 ms    230.10 us     82.46 us   ffffffff8a806080   tasklist_lock (rwlock)
> > 
> > The contention count increased and the average wait time was up closed
> > to 100 usec.  If I increase the delay even more,
> > 
> >   $ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
> >     sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
> >    contended   total wait     max wait     avg wait            address   symbol
> > 
> >         1002      2.80 s       3.01 ms      2.80 ms   ffffffff8a806080   tasklist_lock (rwlock)
> > 
> > Now every sleep process had contention and the wait time was more than 1
> > msec.  This is on my 4 CPU laptop so I guess one CPU has the lock while
> > other 3 are waiting for it mostly.
> > 
> > For simplicity, it only supports global locks for now.
> > 
> > Suggested-by: Stephane Eranian <eranian@google.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  tools/perf/Documentation/perf-lock.txt        | 11 +++
> >  tools/perf/builtin-lock.c                     | 74 +++++++++++++++++++
> >  tools/perf/util/bpf_lock_contention.c         | 28 +++++++
> >  .../perf/util/bpf_skel/lock_contention.bpf.c  | 43 +++++++++++
> >  tools/perf/util/lock-contention.h             |  8 ++
> >  5 files changed, 164 insertions(+)
> > 
> > diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
> > index d3793054f7d35626..151fc837587b216e 100644
> > --- a/tools/perf/Documentation/perf-lock.txt
> > +++ b/tools/perf/Documentation/perf-lock.txt
> > @@ -215,6 +215,17 @@ CONTENTION OPTIONS
> >  --cgroup-filter=<value>::
> >  	Show lock contention only in the given cgroups (comma separated list).
> >  
> > +-J::
> > +--inject-delay=<time@function>::
> > +	Add delays to the given lock.  It's added to the contention-end part so
> > +	that the (new) owner of the lock will be delayed.  But by slowing down
> > +	the owner, the waiters will also be delayed as well.  This is working
> > +	only with -b/--use-bpf.
> > +
> > +	The 'time' is specified in nsec but it can have a unit suffix.  Available
> > +	units are "ms" and "us".  Note that it will busy-wait after it gets the
> > +	lock.  Please use it at your own risk.
> > +
> >  
> >  SEE ALSO
> >  --------
> > diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
> > index 5d405cd8e696d21b..3ef452d5d9f5679d 100644
> > --- a/tools/perf/builtin-lock.c
> > +++ b/tools/perf/builtin-lock.c
> > @@ -62,6 +62,8 @@ static const char *output_name = NULL;
> >  static FILE *lock_output;
> >  
> >  static struct lock_filter filters;
> > +static struct lock_delay *delays;
> > +static int nr_delays;
> >  
> >  static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
> >  
> > @@ -1971,6 +1973,8 @@ static int __cmd_contention(int argc, const char **argv)
> >  		.max_stack = max_stack_depth,
> >  		.stack_skip = stack_skip,
> >  		.filters = &filters,
> > +		.delays = delays,
> > +		.nr_delays = nr_delays,
> >  		.save_callstack = needs_callstack(),
> >  		.owner = show_lock_owner,
> >  		.cgroups = RB_ROOT,
> > @@ -2474,6 +2478,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
> >  	return ret;
> >  }
> >  
> > +static bool add_lock_delay(char *spec)
> > +{
> > +	char *at, *pos;
> > +	struct lock_delay *tmp;
> > +	unsigned long duration;
> > +
> > +	at = strchr(spec, '@');
> > +	if (at == NULL) {
> > +		pr_err("lock delay should have '@' sign: %s\n", spec);
> > +		return false;
> > +	}
> > +	if (at == spec) {
> > +		pr_err("lock delay should have time before '@': %s\n", spec);
> > +		return false;
> > +	}
> > +
> > +	*at = '\0';
> > +	duration = strtoul(spec, &pos, 0);
> > +	if (!strcmp(pos, "ns"))
> > +		duration *= 1;
> > +	else if (!strcmp(pos, "us"))
> > +		duration *= 1000;
> > +	else if (!strcmp(pos, "ms"))
> > +		duration *= 1000 * 1000;
> > +	else if (*pos) {
> > +		pr_err("invalid delay time: %s@%s\n", spec, at + 1);
> > +		return false;
> > +	}
> > +
> > +	tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
> > +	if (tmp == NULL) {
> > +		pr_err("Memory allocation failure\n");
> > +		return false;
> > +	}
> > +	delays = tmp;
> > +
> > +	delays[nr_delays].sym = strdup(at + 1);
> > +	if (delays[nr_delays].sym == NULL) {
> > +		pr_err("Memory allocation failure\n");
> > +		return false;
> > +	}
> > +	delays[nr_delays].time = duration;
> > +
> > +	nr_delays++;
> > +	return true;
> > +}
> > +
> > +static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
> > +			    int unset __maybe_unused)
> > +{
> > +	char *s, *tmp, *tok;
> > +	int ret = 0;
> > +
> > +	s = strdup(str);
> > +	if (s == NULL)
> > +		return -1;
> > +
> > +	for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
> > +		if (!add_lock_delay(tok)) {
> > +			ret = -1;
> > +			break;
> > +		}
> > +	}
> > +
> > +	free(s);
> > +	return ret;
> > +}
> > +
> >  int cmd_lock(int argc, const char **argv)
> >  {
> >  	const struct option lock_options[] = {
> > @@ -2550,6 +2622,8 @@ int cmd_lock(int argc, const char **argv)
> >  	OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
> >  	OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
> >  		     "Filter specific cgroups", parse_cgroup_filter),
> > +	OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC",
> > +		     "Inject delays to specific locks", parse_lock_delay),
> >  	OPT_PARENT(lock_options)
> >  	};
> >  
> > diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> > index fc8666222399c995..99b64f303e761cbc 100644
> > --- a/tools/perf/util/bpf_lock_contention.c
> > +++ b/tools/perf/util/bpf_lock_contention.c
> > @@ -183,6 +183,27 @@ int lock_contention_prepare(struct lock_contention *con)
> >  		skel->rodata->has_addr = 1;
> >  	}
> >  
> > +	/* resolve lock name in delays */
> > +	if (con->nr_delays) {
> > +		struct symbol *sym;
> > +		struct map *kmap;
> > +
> > +		for (i = 0; i < con->nr_delays; i++) {
> > +			sym = machine__find_kernel_symbol_by_name(con->machine,
> > +								  con->delays[i].sym,
> > +								  &kmap);
> > +			if (sym == NULL) {
> > +				pr_warning("ignore unknown symbol: %s\n",
> > +					   con->delays[i].sym);
> > +				continue;
> > +			}
> > +
> > +			con->delays[i].addr = map__unmap_ip(kmap, sym->start);
> > +		}
> > +		skel->rodata->lock_delay = 1;
> > +		bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
> > +	}
> > +
> >  	bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
> >  	bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
> >  	bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
> > @@ -272,6 +293,13 @@ int lock_contention_prepare(struct lock_contention *con)
> >  			bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
> >  	}
> >  
> > +	if (con->nr_delays) {
> > +		fd = bpf_map__fd(skel->maps.lock_delays);
> > +
> > +		for (i = 0; i < con->nr_delays; i++)
> > +			bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
> > +	}
> > +
> >  	if (con->aggr_mode == LOCK_AGGR_CGROUP)
> >  		read_all_cgroups(&con->cgroups);
> >  
> > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > index 6533ea9b044c71d1..0ac9ae2f1711a129 100644
> > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > @@ -11,6 +11,9 @@
> >  /* for collect_lock_syms().  4096 was rejected by the verifier */
> >  #define MAX_CPUS  1024
> >  
> > +/* for do_lock_delay().  Arbitrarily set to 1 million. */
> > +#define MAX_LOOP  (1U << 20)
> > +
> >  /* lock contention flags from include/trace/events/lock.h */
> >  #define LCB_F_SPIN	(1U << 0)
> >  #define LCB_F_READ	(1U << 1)
> > @@ -114,6 +117,13 @@ struct {
> >  	__uint(max_entries, 1);
> >  } slab_caches SEC(".maps");
> >  
> > +struct {
> > +	__uint(type, BPF_MAP_TYPE_HASH);
> > +	__uint(key_size, sizeof(__u64));
> > +	__uint(value_size, sizeof(__u64));
> > +	__uint(max_entries, 1);
> > +} lock_delays SEC(".maps");
> > +
> >  struct rw_semaphore___old {
> >  	struct task_struct *owner;
> >  } __attribute__((preserve_access_index));
> > @@ -143,6 +153,7 @@ const volatile int needs_callstack;
> >  const volatile int stack_skip;
> >  const volatile int lock_owner;
> >  const volatile int use_cgroup_v2;
> > +const volatile int lock_delay;
> >  
> >  /* determine the key of lock stat */
> >  const volatile int aggr_mode;
> > @@ -348,6 +359,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
> >  	return 0;
> >  }
> >  
> > +static inline long delay_callback(__u64 idx, void *arg)
> > +{
> > +	__u64 target = *(__u64 *)arg;
> > +
> > +	if (target <= bpf_ktime_get_ns())
> > +		return 1;
> > +
> > +	/* just to kill time */
> > +	(void)bpf_get_prandom_u32();
> > +
> > +	return 0;
> > +}
> > +
> > +static inline void do_lock_delay(__u64 duration)
> > +{
> > +	__u64 target = bpf_ktime_get_ns() + duration;
> > +
> > +	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
> > +}
> > +
> > +static inline void check_lock_delay(__u64 lock)
> > +{
> > +	__u64 *delay;
> > +
> > +	delay = bpf_map_lookup_elem(&lock_delays, &lock);
> > +	if (delay)
> > +		do_lock_delay(*delay);
> > +}
> > +
> >  static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
> >  {
> >  	__u32 pid;
> > @@ -566,6 +606,9 @@ int contention_end(u64 *ctx)
> >  		data->min_time = duration;
> >  
> >  out:
> > +	if (lock_delay)
> > +		check_lock_delay(pelem->lock);
> > +
> >  	pelem->lock = 0;
> >  	if (need_delete)
> >  		bpf_map_delete_elem(&tstamp, &pid);
> > diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
> > index a09f7fe877df8184..12f6cb789ada1bc7 100644
> > --- a/tools/perf/util/lock-contention.h
> > +++ b/tools/perf/util/lock-contention.h
> > @@ -18,6 +18,12 @@ struct lock_filter {
> >  	char			**slabs;
> >  };
> >  
> > +struct lock_delay {
> > +	char			*sym;
> > +	unsigned long		addr;
> > +	unsigned long		time;
> > +};
> > +
> >  struct lock_stat {
> >  	struct hlist_node	hash_entry;
> >  	struct rb_node		rb;		/* used for sorting */
> > @@ -140,6 +146,7 @@ struct lock_contention {
> >  	struct machine *machine;
> >  	struct hlist_head *result;
> >  	struct lock_filter *filters;
> > +	struct lock_delay *delays;
> >  	struct lock_contention_fails fails;
> >  	struct rb_root cgroups;
> >  	unsigned long map_nr_entries;
> > @@ -148,6 +155,7 @@ struct lock_contention {
> >  	int aggr_mode;
> >  	int owner;
> >  	int nr_filtered;
> > +	int nr_delays;
> >  	bool save_callstack;
> >  };
> >  
> > -- 
> > 2.48.1.658.g4767266eb4-goog

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC/PATCH] perf lock contention: Add -J/--inject-delay option
  2025-05-08 17:30   ` Namhyung Kim
@ 2025-05-08 19:55     ` Arnaldo Carvalho de Melo
  2025-05-09 15:48       ` Namhyung Kim
  0 siblings, 1 reply; 6+ messages in thread
From: Arnaldo Carvalho de Melo @ 2025-05-08 19:55 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf,
	Stephane Eranian

On Thu, May 08, 2025 at 10:30:02AM -0700, Namhyung Kim wrote:
> On Thu, May 08, 2025 at 11:45:58AM -0300, Arnaldo Carvalho de Melo wrote:
> > On Mon, Feb 24, 2025 at 11:59:29PM -0800, Namhyung Kim wrote:
> > > This is to slow down lock acquistion (on contention locks) deliberately.
> > > A possible use case is to estimate impact on application performance by
> > > optimization of kernel locking behavior.  By delaying the lock it can
> > > simulate the worse condition as a control group, and then compare with
> > > the current behavior as a optimized condition.

> > So this looks useful, I guess we can proceed and merge it?

> That'd be great. :)

Can you please refresh it?

⬢ [acme@toolbox perf-tools-next]$        git am ./20250224_namhyung_perf_lock_contention_add_j_inject_delay_option.mbx
Applying: perf lock contention: Add -J/--inject-delay option
error: patch failed: tools/perf/util/bpf_skel/lock_contention.bpf.c:11
error: tools/perf/util/bpf_skel/lock_contention.bpf.c: patch does not apply
error: patch failed: tools/perf/util/lock-contention.h:140
error: tools/perf/util/lock-contention.h: patch does not apply
Patch failed at 0001 perf lock contention: Add -J/--inject-delay option
hint: Use 'git am --show-current-patch=diff' to see the failed patch
hint: When you have resolved this problem, run "git am --continue".
hint: If you prefer to skip this patch, run "git am --skip" instead.
hint: To restore the original branch and stop patching, run "git am --abort".
hint: Disable this message with "git config set advice.mergeConflict false"
⬢ [acme@toolbox perf-tools-next]$ git am --abort
⬢ [acme@toolbox perf-tools-next]$ 
⬢ [acme@toolbox perf-tools-next]$ patch -p1 < ./20250224_namhyung_perf_lock_contention_add_j_inject_delay_option.mbx
patching file tools/perf/Documentation/perf-lock.txt
Hunk #1 succeeded at 216 (offset 1 line).
patching file tools/perf/builtin-lock.c
Hunk #2 succeeded at 2003 (offset 30 lines).
Hunk #3 succeeded at 2508 (offset 30 lines).
Hunk #4 succeeded at 2652 (offset 30 lines).
patching file tools/perf/util/bpf_lock_contention.c
Hunk #1 succeeded at 261 (offset 78 lines).
Hunk #2 succeeded at 373 (offset 80 lines).
patching file tools/perf/util/bpf_skel/lock_contention.bpf.c
Hunk #1 succeeded at 14 with fuzz 2 (offset 3 lines).
Hunk #2 succeeded at 152 (offset 35 lines).
Hunk #3 FAILED at 153.
Hunk #4 succeeded at 397 (offset 39 lines).
Hunk #5 succeeded at 835 with fuzz 1 (offset 230 lines).
1 out of 5 hunks FAILED -- saving rejects to file tools/perf/util/bpf_skel/lock_contention.bpf.c.rej
patching file tools/perf/util/lock-contention.h
Hunk #2 succeeded at 146 with fuzz 1.
Hunk #3 succeeded at 156 (offset 1 line).
⬢ [acme@toolbox perf-tools-next]$

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC/PATCH] perf lock contention: Add -J/--inject-delay option
  2025-05-08 19:55     ` Arnaldo Carvalho de Melo
@ 2025-05-09 15:48       ` Namhyung Kim
  0 siblings, 0 replies; 6+ messages in thread
From: Namhyung Kim @ 2025-05-09 15:48 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf,
	Stephane Eranian

On Thu, May 08, 2025 at 04:55:54PM -0300, Arnaldo Carvalho de Melo wrote:
> On Thu, May 08, 2025 at 10:30:02AM -0700, Namhyung Kim wrote:
> > On Thu, May 08, 2025 at 11:45:58AM -0300, Arnaldo Carvalho de Melo wrote:
> > > On Mon, Feb 24, 2025 at 11:59:29PM -0800, Namhyung Kim wrote:
> > > > This is to slow down lock acquistion (on contention locks) deliberately.
> > > > A possible use case is to estimate impact on application performance by
> > > > optimization of kernel locking behavior.  By delaying the lock it can
> > > > simulate the worse condition as a control group, and then compare with
> > > > the current behavior as a optimized condition.
> 
> > > So this looks useful, I guess we can proceed and merge it?
> 
> > That'd be great. :)
> 
> Can you please refresh it?

Sure, will send v2.

Thanks,
Namhyung

> 
> ⬢ [acme@toolbox perf-tools-next]$        git am ./20250224_namhyung_perf_lock_contention_add_j_inject_delay_option.mbx
> Applying: perf lock contention: Add -J/--inject-delay option
> error: patch failed: tools/perf/util/bpf_skel/lock_contention.bpf.c:11
> error: tools/perf/util/bpf_skel/lock_contention.bpf.c: patch does not apply
> error: patch failed: tools/perf/util/lock-contention.h:140
> error: tools/perf/util/lock-contention.h: patch does not apply
> Patch failed at 0001 perf lock contention: Add -J/--inject-delay option
> hint: Use 'git am --show-current-patch=diff' to see the failed patch
> hint: When you have resolved this problem, run "git am --continue".
> hint: If you prefer to skip this patch, run "git am --skip" instead.
> hint: To restore the original branch and stop patching, run "git am --abort".
> hint: Disable this message with "git config set advice.mergeConflict false"
> ⬢ [acme@toolbox perf-tools-next]$ git am --abort
> ⬢ [acme@toolbox perf-tools-next]$ 
> ⬢ [acme@toolbox perf-tools-next]$ patch -p1 < ./20250224_namhyung_perf_lock_contention_add_j_inject_delay_option.mbx
> patching file tools/perf/Documentation/perf-lock.txt
> Hunk #1 succeeded at 216 (offset 1 line).
> patching file tools/perf/builtin-lock.c
> Hunk #2 succeeded at 2003 (offset 30 lines).
> Hunk #3 succeeded at 2508 (offset 30 lines).
> Hunk #4 succeeded at 2652 (offset 30 lines).
> patching file tools/perf/util/bpf_lock_contention.c
> Hunk #1 succeeded at 261 (offset 78 lines).
> Hunk #2 succeeded at 373 (offset 80 lines).
> patching file tools/perf/util/bpf_skel/lock_contention.bpf.c
> Hunk #1 succeeded at 14 with fuzz 2 (offset 3 lines).
> Hunk #2 succeeded at 152 (offset 35 lines).
> Hunk #3 FAILED at 153.
> Hunk #4 succeeded at 397 (offset 39 lines).
> Hunk #5 succeeded at 835 with fuzz 1 (offset 230 lines).
> 1 out of 5 hunks FAILED -- saving rejects to file tools/perf/util/bpf_skel/lock_contention.bpf.c.rej
> patching file tools/perf/util/lock-contention.h
> Hunk #2 succeeded at 146 with fuzz 1.
> Hunk #3 succeeded at 156 (offset 1 line).
> ⬢ [acme@toolbox perf-tools-next]$

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-05-09 15:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-25  7:59 [RFC/PATCH] perf lock contention: Add -J/--inject-delay option Namhyung Kim
2025-02-25 16:44 ` Ian Rogers
2025-05-08 14:45 ` Arnaldo Carvalho de Melo
2025-05-08 17:30   ` Namhyung Kim
2025-05-08 19:55     ` Arnaldo Carvalho de Melo
2025-05-09 15:48       ` Namhyung Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).