Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* Re: [PATCHv4 bpf-next 25/25] selftests/bpf: Add tracing multi attach rollback tests
From: Leon Hwang @ 2026-03-25  6:45 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260324081846.2334094-26-jolsa@kernel.org>

On 24/3/26 16:18, Jiri Olsa wrote:
> Adding tests for the rollback code when the tracing_multi
> link won't get attached, covering 2 reasons:
> 
>   - wrong btf id passed by user, where all previously allocated
>     trampolines will be released
>   - trampoline for requested function is fully attached (has already
>     maximum programs attached) and the link fails, the rollback code
>     needs to release all previously link-ed trampolines and release
>     them
> 
> We need the bpf_fentry_test* unattached for the tests to pass,
> so the rollback tests are serial.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  .../selftests/bpf/prog_tests/tracing_multi.c  | 213 ++++++++++++++++++
>  .../bpf/progs/tracing_multi_rollback.c        |  43 ++++
>  2 files changed, 256 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> index 6917471e329c..6ff0f72f8c46 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> @@ -10,6 +10,7 @@
>  #include "tracing_multi_session.skel.h"
>  #include "tracing_multi_fail.skel.h"
>  #include "tracing_multi_bench.skel.h"
> +#include "tracing_multi_rollback.skel.h"
>  #include "trace_helpers.h"
>  
>  static __u64 bpf_fentry_test_cookies[] = {
> @@ -669,6 +670,218 @@ void serial_test_tracing_multi_bench_attach(void)
>  	free(ids);
>  }
>  
> +static void tracing_multi_rollback_run(struct tracing_multi_rollback *skel)
> +{
> +	LIBBPF_OPTS(bpf_test_run_opts, topts);
> +	int err, prog_fd;
> +
> +	prog_fd = bpf_program__fd(skel->progs.test_fentry);
> +	err = bpf_prog_test_run_opts(prog_fd, &topts);
> +	ASSERT_OK(err, "test_run");
> +
> +	/* make sure the rollback code did not leave any program attached */
> +	ASSERT_EQ(skel->bss->test_result_fentry, 0, "test_result_fentry");
> +	ASSERT_EQ(skel->bss->test_result_fexit, 0, "test_result_fexit");
> +}
> +
> +static void test_rollback_put(void)
> +{
> +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> +	struct tracing_multi_rollback *skel = NULL;
> +	size_t cnt = FUNCS_CNT;
> +	__u32 *ids = NULL;
> +	int err;
> +
> +	skel = tracing_multi_rollback__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
> +		return;
> +
> +	bpf_program__set_autoload(skel->progs.test_fentry, true);
> +	bpf_program__set_autoload(skel->progs.test_fexit, true);
> +
> +	err = tracing_multi_rollback__load(skel);
> +	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
> +		goto cleanup;
> +
> +	ids = get_ids(bpf_fentry_test, cnt, NULL);
> +	if (!ASSERT_OK_PTR(ids, "get_ids"))
> +		goto cleanup;
> +
> +	/*
> +	 * Mangle last id to trigger rollback, which needs to do put
> +	 * on get-ed trampolines.
> +	 */
> +	ids[9] = 0;
> +
> +	opts.ids = ids;
> +	opts.cnt = cnt;
> +
> +	skel->bss->pid = getpid();
> +
> +	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
> +						NULL, &opts);
> +	if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
> +		goto cleanup;
> +
> +	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
> +						NULL, &opts);
> +	if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
> +		goto cleanup;
> +
> +	/* We don't really attach any program, but let's make sure. */
> +	tracing_multi_rollback_run(skel);
> +
> +cleanup:
> +	tracing_multi_rollback__destroy(skel);
> +	free(ids);
> +}
> +
> +

NIT: keep one blank line here.

> +static void fillers_cleanup(struct tracing_multi_rollback **skels, int cnt)
> +{
> +	int i;
> +
> +	for (i = 0; i < cnt; i++)
> +		tracing_multi_rollback__destroy(skels[i]);
> +
> +	free(skels);
> +}
> +
> +static struct tracing_multi_rollback *extra_load_and_link(void)
> +{
> +	struct tracing_multi_rollback *skel;
> +	int err;
> +
> +	skel = tracing_multi_rollback__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
> +		goto cleanup;
> +
> +	bpf_program__set_autoload(skel->progs.extra, true);
> +
> +	err = tracing_multi_rollback__load(skel);
> +	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
> +		goto cleanup;
> +
> +	skel->links.extra = bpf_program__attach_trace(skel->progs.extra);
> +	if (!ASSERT_OK_PTR(skel->links.extra, "bpf_program__attach_trace"))
> +		goto cleanup;
> +
> +	return skel;
> +
> +cleanup:
> +	tracing_multi_rollback__destroy(skel);
> +	return NULL;
> +}
> +
> +static struct tracing_multi_rollback **fillers_load_and_link(int max)
> +{
> +	struct tracing_multi_rollback **skels, *skel;
> +	int i, err;
> +
> +	skels = calloc(max + 1, sizeof(*skels));
> +	if (!ASSERT_OK_PTR(skels, "calloc"))
> +		return NULL;
> +
> +	for (i = 0; i < max; i++) {
> +		skel = skels[i] = tracing_multi_rollback__open();
> +		if (!ASSERT_OK_PTR(skels[i], "tracing_multi_rollback__open"))
> +			goto cleanup;
> +
> +		bpf_program__set_autoload(skel->progs.filler, true);
> +
> +		err = tracing_multi_rollback__load(skel);
> +		if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
> +			goto cleanup;
> +
> +		skel->links.filler = bpf_program__attach_trace(skel->progs.filler);
> +		if (!ASSERT_OK_PTR(skels[i]->links.filler, "bpf_program__attach_trace"))
> +			goto cleanup;
> +	}
> +
> +	return skels;
> +
> +cleanup:
> +	fillers_cleanup(skels, i);
> +	return NULL;
> +}
> +
> +static void test_rollback_unlink(void)
> +{
> +	struct tracing_multi_rollback *skel, *extra;
> +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> +	struct tracing_multi_rollback **fillers;
> +	size_t cnt = FUNCS_CNT;
> +	__u32 *ids = NULL;
> +	int err, max;
> +
> +	max = get_bpf_max_tramp_links();
> +	if (!ASSERT_GE(max, 1, "bpf_max_tramp_links"))
> +		return;
> +
> +	/* Attach maximum allowed programs to bpf_fentry_test10 */
> +	fillers = fillers_load_and_link(max);
> +	if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link"))
> +		return;
> +
> +	extra = extra_load_and_link();
> +	if (!ASSERT_OK_PTR(extra, "extra_load_and_link"))

Should the fillers be cleaned up here?

Thanks,
Leon

> +		return;
> +
> +	skel = tracing_multi_rollback__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
> +		goto cleanup;
> +
[...]


^ permalink raw reply

* Re: [PATCHv4 bpf-next 24/25] selftests/bpf: Add tracing multi attach benchmark test
From: Leon Hwang @ 2026-03-25  6:45 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260324081846.2334094-25-jolsa@kernel.org>

On 24/3/26 16:18, Jiri Olsa wrote:
> Adding benchmark test that attaches to (almost) all allowed tracing
> functions and displays attach/detach times.
> 
>   # ./test_progs -t tracing_multi_bench_attach -v
>   bpf_testmod.ko is already unloaded.
>   Loading bpf_testmod.ko...
>   Successfully loaded bpf_testmod.ko.
>   serial_test_tracing_multi_bench_attach:PASS:btf__load_vmlinux_btf 0 nsec
>   serial_test_tracing_multi_bench_attach:PASS:tracing_multi_bench__open_and_load 0 nsec
>   serial_test_tracing_multi_bench_attach:PASS:get_syms 0 nsec
>   serial_test_tracing_multi_bench_attach:PASS:bpf_program__attach_tracing_multi 0 nsec
>   serial_test_tracing_multi_bench_attach: found 51186 functions
>   serial_test_tracing_multi_bench_attach: attached in   1.295s
>   serial_test_tracing_multi_bench_attach: detached in   0.243s
>   #507     tracing_multi_bench_attach:OK
>   Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
>   Successfully unloaded bpf_testmod.ko.
> 
> Exporting skip_entry as is_unsafe_function and using it in the test.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  .../selftests/bpf/prog_tests/tracing_multi.c  | 98 +++++++++++++++++++
>  .../selftests/bpf/progs/tracing_multi_bench.c | 12 +++
>  tools/testing/selftests/bpf/trace_helpers.c   |  6 +-
>  tools/testing/selftests/bpf/trace_helpers.h   |  1 +
>  4 files changed, 114 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> index dece45d8fb5e..6917471e329c 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> @@ -9,6 +9,7 @@
>  #include "tracing_multi_intersect.skel.h"
>  #include "tracing_multi_session.skel.h"
>  #include "tracing_multi_fail.skel.h"
> +#include "tracing_multi_bench.skel.h"
>  #include "trace_helpers.h"
>  
>  static __u64 bpf_fentry_test_cookies[] = {
> @@ -571,6 +572,103 @@ static void test_attach_api_fails(void)
>  	free(ids2);
>  }
>  
> +void serial_test_tracing_multi_bench_attach(void)
> +{
> +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> +	struct tracing_multi_bench *skel = NULL;
> +	long attach_start_ns, attach_end_ns;
> +	long detach_start_ns, detach_end_ns;
> +	double attach_delta, detach_delta;
> +	struct bpf_link *link = NULL;
> +	size_t i, cap = 0, cnt = 0;
> +	struct ksyms *ksyms = NULL;
> +	void *root = NULL;
> +	__u32 *ids = NULL;
> +	__u32 nr, type_id;
> +	struct btf *btf;
> +	int err;
> +
> +#ifndef __x86_64__
> +	test__skip();
> +	return;
> +#endif
> +
> +	btf = btf__load_vmlinux_btf();
> +	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
> +		return;
> +
> +	skel = tracing_multi_bench__open_and_load();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load"))
> +		goto cleanup;
> +
> +	if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms"))
> +		goto cleanup;
> +
> +	/* Get all ftrace 'safe' symbols.. */
> +	for (i = 0; i < ksyms->filtered_cnt; i++) {
> +		if (is_unsafe_function(ksyms->filtered_syms[i]))
> +			continue;
> +		tsearch(&ksyms->filtered_syms[i], &root, compare);
> +	}
> +
> +	/* ..and filter them through BTF and btf_type_is_traceable_func. */
> +	nr = btf__type_cnt(btf);
> +	for (type_id = 1; type_id < nr; type_id++) {
> +		const struct btf_type *type;
> +		const char *str;
> +
> +		type = btf__type_by_id(btf, type_id);
> +		if (!type)
> +			break;
> +
> +		if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
> +			continue;
> +
> +		str = btf__name_by_offset(btf, type->name_off);
> +		if (!str)
> +			break;
> +
> +		if (!tfind(&str, &root, compare))
> +			continue;
> +
> +		if (!btf_type_is_traceable_func(btf, type))
> +			continue;
> +
> +		err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1);
> +		if (err)
> +			goto cleanup;
> +
> +		ids[cnt++] = type_id;
> +	}
> +
> +	opts.ids = ids;
> +	opts.cnt = cnt;
> +
> +	attach_start_ns = get_time_ns();
> +	link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts);
> +	attach_end_ns = get_time_ns();
> +
> +	if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi"))
> +		goto cleanup;
> +
> +	detach_start_ns = get_time_ns();
> +	bpf_link__destroy(link);
> +	detach_end_ns = get_time_ns();
> +
> +	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
> +	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
> +
> +	printf("%s: found %lu functions\n", __func__, cnt);
> +	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
> +	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
> +
> +cleanup:
> +	tracing_multi_bench__destroy(skel);
> +	tdestroy(root, tdestroy_free_nop);
> +	free_kallsyms_local(ksyms);
> +	free(ids);

Is btf__free(btf) missing here? 'btf' is allocated with calloc() inside
btf__load_vmlinux_btf().

Thanks,
Leon

> +}
> +
>  void test_tracing_multi_test(void)
>  {
>  #ifndef __x86_64__
> diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_bench.c b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
> new file mode 100644
> index 000000000000..beae946cb8c4
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <vmlinux.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +SEC("fentry.multi")
> +int BPF_PROG(bench)
> +{
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
> index 0e63daf83ed5..8de0b60766de 100644
> --- a/tools/testing/selftests/bpf/trace_helpers.c
> +++ b/tools/testing/selftests/bpf/trace_helpers.c
> @@ -548,7 +548,7 @@ static const char * const trace_blacklist[] = {
>  	"bpf_get_numa_node_id",
>  };
>  
> -static bool skip_entry(char *name)
> +bool is_unsafe_function(const char *name)
>  {
>  	int i;
>  
> @@ -651,7 +651,7 @@ int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel)
>  		free(name);
>  		if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
>  			continue;
> -		if (skip_entry(name))
> +		if (is_unsafe_function(name))
>  			continue;
>  
>  		ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
> @@ -728,7 +728,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
>  		free(name);
>  		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
>  			continue;
> -		if (skip_entry(name))
> +		if (is_unsafe_function(name))
>  			continue;
>  
>  		if (cnt == max_cnt) {
> diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
> index d5bf1433675d..01c8ecc45627 100644
> --- a/tools/testing/selftests/bpf/trace_helpers.h
> +++ b/tools/testing/selftests/bpf/trace_helpers.h
> @@ -63,4 +63,5 @@ int read_build_id(const char *path, char *build_id, size_t size);
>  int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel);
>  int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
>  
> +bool is_unsafe_function(const char *name);
>  #endif


^ permalink raw reply

* Re: [PATCHv4 bpf-next 13/25] bpf: Add support for tracing_multi link fdinfo
From: Leon Hwang @ 2026-03-25  6:43 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260324081846.2334094-14-jolsa@kernel.org>

On 24/3/26 16:18, Jiri Olsa wrote:
> Adding tracing_multi link fdinfo support with following output:
> 
> pos:    0
> flags:  02000000
> mnt_id: 19
> ino:    3091
> link_type:      tracing_multi
> link_id:        382

Would it be better to add attach_type?

attach_type:	[fentry,fexit,fsession]_multi

Thanks,
Leon

> prog_tag:       62073a1123f07ef7
> prog_id:        715
> cnt:    10
> cookie   BTF-id  func
> 8        91203   bpf_fentry_test1+0x4/0x10
> 9        91205   bpf_fentry_test2+0x4/0x10
> 7        91206   bpf_fentry_test3+0x4/0x20
> 5        91207   bpf_fentry_test4+0x4/0x20
> 4        91208   bpf_fentry_test5+0x4/0x20
> 2        91209   bpf_fentry_test6+0x4/0x20
> 3        91210   bpf_fentry_test7+0x4/0x10
> 1        91211   bpf_fentry_test8+0x4/0x10
> 10       91212   bpf_fentry_test9+0x4/0x10
> 6        91204   bpf_fentry_test10+0x4/0x10
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  kernel/trace/bpf_trace.c | 26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 761501ce3a5f..41b691e83dc4 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3618,9 +3618,35 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
>  	kvfree(tr_link);
>  }
>  
> +#ifdef CONFIG_PROC_FS
> +static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link,
> +					  struct seq_file *seq)
> +{
> +	struct bpf_tracing_multi_link *tr_link =
> +		container_of(link, struct bpf_tracing_multi_link, link);
> +	bool has_cookies = !!tr_link->cookies;
> +
> +	seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt);
> +
> +	seq_printf(seq, "%s\t %s\t %s\n", "cookie", "BTF-id", "func");
> +	for (int i = 0; i < tr_link->nodes_cnt; i++) {
> +		struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i];
> +		u32 btf_id;
> +
> +		bpf_trampoline_unpack_key(mnode->trampoline->key, NULL, &btf_id);
> +		seq_printf(seq, "%llu\t %u\t %pS\n",
> +			   has_cookies ? tr_link->cookies[i] : 0,
> +			   btf_id, (void *) mnode->trampoline->ip);
> +	}
> +}
> +#endif
> +
>  static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
>  	.release = bpf_tracing_multi_link_release,
>  	.dealloc_deferred = bpf_tracing_multi_link_dealloc,
> +#ifdef CONFIG_PROC_FS
> +	.show_fdinfo = bpf_tracing_multi_show_fdinfo,
> +#endif
>  };
>  
>  int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)


^ permalink raw reply

* Re: [PATCHv4 bpf-next 00/25] bpf: tracing_multi link
From: Leon Hwang @ 2026-03-25  6:42 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: Hengqi Chen, bpf, linux-trace-kernel, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, Menglong Dong,
	Steven Rostedt
In-Reply-To: <20260324081846.2334094-1-jolsa@kernel.org>

Hi Jiri,

Nice version for tracing_multi link.

I hope I have time to add tracing_multi link support to bpfsnoop, and
test this new tracing feature.

I left comments on patches #13, #24, and #25.

Hope this series lands in bpf-next soon.

Thanks,
Leon

On 24/3/26 16:18, Jiri Olsa wrote:
> hi,
> adding tracing_multi link support that allows fast attachment
> of tracing program to many functions.
> 
> RFC: https://lore.kernel.org/bpf/20260203093819.2105105-1-jolsa@kernel.org/
> v1: https://lore.kernel.org/bpf/20260220100649.628307-1-jolsa@kernel.org/
> v2: https://lore.kernel.org/bpf/20260304222141.497203-1-jolsa@kernel.org/
> v3: https://lore.kernel.org/bpf/20260316075138.465430-1-jolsa@kernel.org/
> 
> v4 changes:
> - unlink rollback fix (added ftrace_hash_count) [bot]
> - use const for some bpf_link_create_opts tracing_multi members [bot]
> - adding missing comment for lockdep keys [bot]
> - selftest error path fixes (leaks) and other assorted test fixes [Leon Hwang]
> - several compile fixes wrt CONFIG_BPF_SYSCALL and CONFIG_BPF_JIT [kernel test robot]
> - make ftrace_hash_clear global, because it's needed in rollback
> 
> v3 changes:
> - fix module parsing [Leon Hwang]
> - use function traceable check from libbpf [Leon Hwang]
> - use ptr_to_u64 and fix/updated few comments [ci]
> - display cookies as decimal numbers [ci]
> - added link_create.flags check [ci]
> - fix error path in bpf_trampoline_multi_detach [ci]
> - make fentry/fexit.multi not extendable [ci]
> - add missing OPTS_VALID to bpf_program__attach_tracing_multi [ci]
> 
> v2 changes:
> - allocate data.unreg in bpf_trampoline_multi_attach for rollback path [ci]
>   and fixed link count setup in rollback path [ci]
> - several small assorted fixes [ci]
> - added loongarch and powerpc changes for struct bpf_tramp_node change
> - added support to attach functions from modules
> - added tests for sleepable programs
> - added rollback tests
> 
> v1 changes:
> - added ftrace_hash_count as wrapper for hash_count [Steven]
> - added trampoline mutex pool [Andrii]
> - reworked 'struct bpf_tramp_node' separation [Andrii]
>   - the 'struct bpf_tramp_node' now holds pointer to bpf_link,
>     which is similar to what we do for uprobe_multi;
>     I understand it's not a fundamental change compared to previous
>     version which used bpf_prog pointer instead, but I don't see better
>     way of doing this.. I'm happy to discuss this further if there's
>     better idea
> - reworked 'struct bpf_fsession_link' based on bpf_tramp_node
> - made btf__find_by_glob_kind function internal helper [Andrii]
> - many small assorted fixes [Andrii,CI]
> - added session support [Leon Hwang]
> - added cookies support
> - added more tests
> 
> 
> Note I plan to send linkinfo support separately, the patchset is big enough.
> 
> thanks,
> jirka
> 
> 
> Cc: Hengqi Chen <hengqi.chen@gmail.com>
> ---
> Jiri Olsa (25):
>       ftrace: Add ftrace_hash_count function
>       ftrace: Make ftrace_hash_clear global
>       bpf: Use mutex lock pool for bpf trampolines
>       bpf: Add struct bpf_trampoline_ops object
>       bpf: Add struct bpf_tramp_node object
>       bpf: Factor fsession link to use struct bpf_tramp_node
>       bpf: Add multi tracing attach types
>       bpf: Move sleepable verification code to btf_id_allow_sleepable
>       bpf: Add bpf_trampoline_multi_attach/detach functions
>       bpf: Add support for tracing multi link
>       bpf: Add support for tracing_multi link cookies
>       bpf: Add support for tracing_multi link session
>       bpf: Add support for tracing_multi link fdinfo
>       libbpf: Add bpf_object_cleanup_btf function
>       libbpf: Add bpf_link_create support for tracing_multi link
>       libbpf: Add btf_type_is_traceable_func function
>       libbpf: Add support to create tracing multi link
>       selftests/bpf: Add tracing multi skel/pattern/ids attach tests
>       selftests/bpf: Add tracing multi skel/pattern/ids module attach tests
>       selftests/bpf: Add tracing multi intersect tests
>       selftests/bpf: Add tracing multi cookies test
>       selftests/bpf: Add tracing multi session test
>       selftests/bpf: Add tracing multi attach fails test
>       selftests/bpf: Add tracing multi attach benchmark test
>       selftests/bpf: Add tracing multi attach rollback tests
> 
>  arch/arm64/net/bpf_jit_comp.c                                      |  58 +++---
>  arch/loongarch/net/bpf_jit.c                                       |  44 ++---
>  arch/powerpc/net/bpf_jit_comp.c                                    |  46 ++---
>  arch/riscv/net/bpf_jit_comp64.c                                    |  52 ++---
>  arch/s390/net/bpf_jit_comp.c                                       |  44 ++---
>  arch/x86/net/bpf_jit_comp.c                                        |  54 ++---
>  include/linux/bpf.h                                                | 102 +++++++---
>  include/linux/bpf_types.h                                          |   1 +
>  include/linux/bpf_verifier.h                                       |   3 +
>  include/linux/btf_ids.h                                            |   1 +
>  include/linux/ftrace.h                                             |   2 +
>  include/linux/trace_events.h                                       |   6 +
>  include/uapi/linux/bpf.h                                           |   9 +
>  kernel/bpf/bpf_struct_ops.c                                        |  27 +--
>  kernel/bpf/btf.c                                                   |   4 +
>  kernel/bpf/syscall.c                                               |  88 ++++++---
>  kernel/bpf/trampoline.c                                            | 536 ++++++++++++++++++++++++++++++++++++++++---------
>  kernel/bpf/verifier.c                                              | 124 +++++++++---
>  kernel/trace/bpf_trace.c                                           | 149 +++++++++++++-
>  kernel/trace/ftrace.c                                              |   9 +-
>  net/bpf/bpf_dummy_struct_ops.c                                     |  14 +-
>  net/bpf/test_run.c                                                 |   3 +
>  tools/include/uapi/linux/bpf.h                                     |  10 +
>  tools/lib/bpf/bpf.c                                                |   9 +
>  tools/lib/bpf/bpf.h                                                |   5 +
>  tools/lib/bpf/libbpf.c                                             | 337 ++++++++++++++++++++++++++++++-
>  tools/lib/bpf/libbpf.h                                             |  15 ++
>  tools/lib/bpf/libbpf.map                                           |   1 +
>  tools/lib/bpf/libbpf_internal.h                                    |   1 +
>  tools/testing/selftests/bpf/Makefile                               |   9 +-
>  tools/testing/selftests/bpf/prog_tests/tracing_multi.c             | 912 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tools/testing/selftests/bpf/progs/tracing_multi_attach.c           |  39 ++++
>  tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c    |  25 +++
>  tools/testing/selftests/bpf/progs/tracing_multi_bench.c            |  12 ++
>  tools/testing/selftests/bpf/progs/tracing_multi_check.c            | 212 ++++++++++++++++++++
>  tools/testing/selftests/bpf/progs/tracing_multi_fail.c             |  18 ++
>  tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c |  41 ++++
>  tools/testing/selftests/bpf/progs/tracing_multi_rollback.c         |  43 ++++
>  tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c   |  43 ++++
>  tools/testing/selftests/bpf/trace_helpers.c                        |   6 +-
>  tools/testing/selftests/bpf/trace_helpers.h                        |   1 +
>  41 files changed, 2749 insertions(+), 366 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_multi.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_check.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_fail.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
> 


^ permalink raw reply

* Re: [RFC v5 6/7] ext4: fast commit: add lock_updates tracepoint
From: Li Chen @ 2026-03-25  6:16 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel,
	Vineeth Remanan Pillai
In-Reply-To: <20260317122149.5d07132a@gandalf.local.home>

Hi Steven,

Thank you for your review, and I apologize for my delayed response.

 ---- On Wed, 18 Mar 2026 00:21:29 +0800  Steven Rostedt <rostedt@goodmis.org> wrote --- 
 > On Tue, 17 Mar 2026 16:46:21 +0800
 > Li Chen <me@linux.beauty> wrote:
 > 
 > > Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
 > > so it is useful to quantify the time spent with updates locked and to
 > > understand why snapshotting can fail.
 > > 
 > > Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
 > > the updates-locked window along with the number of snapshotted inodes
 > > and ranges. Record the first snapshot failure reason in a stable snap_err
 > > field for tooling.
 > > 
 > > Signed-off-by: Li Chen <me@linux.beauty>
 > > ---
 > >  fs/ext4/ext4.h              | 15 ++++++++
 > >  fs/ext4/fast_commit.c       | 71 +++++++++++++++++++++++++++++--------
 > >  include/trace/events/ext4.h | 61 +++++++++++++++++++++++++++++++
 > >  3 files changed, 132 insertions(+), 15 deletions(-)
 > > 
 > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
 > > index 68a64fa0be926..b9e146f3dd9e4 100644
 > > --- a/fs/ext4/ext4.h
 > > +++ b/fs/ext4/ext4.h
 > > @@ -1037,6 +1037,21 @@ enum {
 > >  
 > >  struct ext4_fc_inode_snap;
 > >  
 > > +/*
 > > + * Snapshot failure reasons for ext4_fc_lock_updates tracepoint.
 > > + * Keep these stable for tooling.
 > > + */
 > > +enum ext4_fc_snap_err {
 > > +    EXT4_FC_SNAP_ERR_NONE        = 0,
 > > +    EXT4_FC_SNAP_ERR_ES_MISS    = 1,
 > > +    EXT4_FC_SNAP_ERR_ES_DELAYED    = 2,
 > > +    EXT4_FC_SNAP_ERR_ES_OTHER    = 3,
 > > +    EXT4_FC_SNAP_ERR_INODES_CAP    = 4,
 > > +    EXT4_FC_SNAP_ERR_RANGES_CAP    = 5,
 > > +    EXT4_FC_SNAP_ERR_NOMEM        = 6,
 > > +    EXT4_FC_SNAP_ERR_INODE_LOC    = 7,
 > 
 > You don't need to explicitly state the assignments, the enum will increment
 > them without them.

Agree.

 > > +};
 > > +
 > >  /*
 > >   * fourth extended file system inode data in memory
 > >   */
 > > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
 > > index d1eefee609120..4929e2990b292 100644
 > > --- a/fs/ext4/fast_commit.c
 > > +++ b/fs/ext4/fast_commit.c
 > > @@ -193,6 +193,12 @@ static struct kmem_cache *ext4_fc_range_cachep;
 > >  #define EXT4_FC_SNAPSHOT_MAX_INODES    1024
 > >  #define EXT4_FC_SNAPSHOT_MAX_RANGES    2048
 > >  
 > > +static inline void ext4_fc_set_snap_err(int *snap_err, int err)
 > > +{
 > > +    if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
 > > +        *snap_err = err;
 > > +}
 > > +
 > >  static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 > >  {
 > >      BUFFER_TRACE(bh, "");
 > > @@ -983,11 +989,12 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 > >  static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >                         struct list_head *ranges,
 > >                         unsigned int nr_ranges_total,
 > > -                       unsigned int *nr_rangesp)
 > > +                       unsigned int *nr_rangesp,
 > > +                       int *snap_err)
 > >  {
 > >      struct ext4_inode_info *ei = EXT4_I(inode);
 > > -    unsigned int nr_ranges = 0;
 > >      ext4_lblk_t start_lblk, end_lblk, cur_lblk;
 > > +    unsigned int nr_ranges = 0;
 > >  
 > >      spin_lock(&ei->i_fc_lock);
 > >      if (ei->i_fc_lblk_len == 0) {
 > > @@ -1010,11 +1017,16 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >          struct ext4_fc_range *range;
 > >          ext4_lblk_t len;
 > >  
 > > -        if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
 > > +        if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
 > > +            ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
 > >              return -EAGAIN;
 > > +        }
 > >  
 > > -        if (ext4_es_is_delayed(&es))
 > > +        if (ext4_es_is_delayed(&es)) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_ES_DELAYED);
 > >              return -EAGAIN;
 > > +        }
 > >  
 > >          len = es.es_len - (cur_lblk - es.es_lblk);
 > >          if (len > end_lblk - cur_lblk + 1)
 > > @@ -1024,12 +1036,17 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >              continue;
 > >          }
 > >  
 > > -        if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
 > > +        if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_RANGES_CAP);
 > >              return -E2BIG;
 > > +        }
 > >  
 > >          range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
 > > -        if (!range)
 > > +        if (!range) {
 > > +            ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 > >              return -ENOMEM;
 > > +        }
 > >          nr_ranges++;
 > >  
 > >          range->lblk = cur_lblk;
 > > @@ -1054,6 +1071,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >                  range->len = max;
 > >          } else {
 > >              kmem_cache_free(ext4_fc_range_cachep, range);
 > > +            ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
 > >              return -EAGAIN;
 > >          }
 > >  
 > > @@ -1070,7 +1088,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >  
 > >  static int ext4_fc_snapshot_inode(struct inode *inode,
 > >                    unsigned int nr_ranges_total,
 > > -                  unsigned int *nr_rangesp)
 > > +                  unsigned int *nr_rangesp, int *snap_err)
 > >  {
 > >      struct ext4_inode_info *ei = EXT4_I(inode);
 > >      struct ext4_fc_inode_snap *snap;
 > > @@ -1082,8 +1100,10 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 > >      int alloc_ctx;
 > >  
 > >      ret = ext4_get_inode_loc_noio(inode, &iloc);
 > > -    if (ret)
 > > +    if (ret) {
 > > +        ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
 > >          return ret;
 > > +    }
 > >  
 > >      if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 > >          inode_len = EXT4_INODE_SIZE(inode->i_sb);
 > > @@ -1092,6 +1112,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 > >  
 > >      snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
 > >      if (!snap) {
 > > +        ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 > >          brelse(iloc.bh);
 > >          return -ENOMEM;
 > >      }
 > > @@ -1102,7 +1123,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 > >      brelse(iloc.bh);
 > >  
 > >      ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
 > > -                      &nr_ranges);
 > > +                      &nr_ranges, snap_err);
 > >      if (ret) {
 > >          kfree(snap);
 > >          ext4_fc_free_ranges(&ranges);
 > > @@ -1203,7 +1224,10 @@ static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
 > >                       unsigned int *nr_inodesp);
 > >  
 > >  static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > > -                   unsigned int inodes_size)
 > > +                   unsigned int inodes_size,
 > > +                   unsigned int *nr_inodesp,
 > > +                   unsigned int *nr_rangesp,
 > > +                   int *snap_err)
 > >  {
 > >      struct super_block *sb = journal->j_private;
 > >      struct ext4_sb_info *sbi = EXT4_SB(sb);
 > > @@ -1221,6 +1245,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > >      alloc_ctx = ext4_fc_lock(sb);
 > >      list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 > >          if (i >= inodes_size) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_INODES_CAP);
 > >              ret = -E2BIG;
 > >              goto unlock;
 > >          }
 > > @@ -1244,6 +1270,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > >              continue;
 > >  
 > >          if (i >= inodes_size) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_INODES_CAP);
 > >              ret = -E2BIG;
 > >              goto unlock;
 > >          }
 > > @@ -1268,16 +1296,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > >          unsigned int inode_ranges = 0;
 > >  
 > >          ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
 > > -                         &inode_ranges);
 > > +                         &inode_ranges, snap_err);
 > >          if (ret)
 > >              break;
 > >          nr_ranges += inode_ranges;
 > >      }
 > >  
 > > +    if (nr_inodesp)
 > > +        *nr_inodesp = i;
 > > +    if (nr_rangesp)
 > > +        *nr_rangesp = nr_ranges;
 > >      return ret;
 > >  }
 > >  
 > > -static int ext4_fc_perform_commit(journal_t *journal)
 > > +static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 > >  {
 > >      struct super_block *sb = journal->j_private;
 > >      struct ext4_sb_info *sbi = EXT4_SB(sb);
 > > @@ -1286,10 +1318,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      struct inode *inode;
 > >      struct inode **inodes;
 > >      unsigned int inodes_size;
 > > +    unsigned int snap_inodes = 0;
 > > +    unsigned int snap_ranges = 0;
 > > +    int snap_err = EXT4_FC_SNAP_ERR_NONE;
 > >      struct blk_plug plug;
 > >      int ret = 0;
 > >      u32 crc = 0;
 > >      int alloc_ctx;
 > > +    ktime_t lock_start;
 > > +    u64 locked_ns;
 > >  
 > >      /*
 > >       * Step 1: Mark all inodes on s_fc_q[MAIN] with
 > > @@ -1337,13 +1374,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      if (ret)
 > >          return ret;
 > >  
 > > -
 > >      ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
 > >      if (ret)
 > >          return ret;
 > >  
 > >      /* Step 4: Mark all inodes as being committed. */
 > >      jbd2_journal_lock_updates(journal);
 > > +    lock_start = ktime_get();
 > >      /*
 > >       * The journal is now locked. No more handles can start and all the
 > >       * previous handles are now drained. Snapshotting happens in this
 > > @@ -1357,8 +1394,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      }
 > >      ext4_fc_unlock(sb, alloc_ctx);
 > >  
 > > -    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
 > > +    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
 > > +                      &snap_inodes, &snap_ranges, &snap_err);
 > >      jbd2_journal_unlock_updates(journal);
 > > +    locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
 > 
 > If locked_ns is only used for the tracepoint, it should either be
 > calculated in the tracepoint, or add:
 > 
 >     if (trace_ext4_fc_lock_updates_enabled()) {
 >         locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
 
Good catch!

 > > +    trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns, snap_inodes,
 > > +                   snap_ranges, ret, snap_err);
 > 
 >     }
 > 
 > Note, we are going to also add a code to call the tracepoint directly, to
 > remove the double static_branch.
 > 
 >     https://lore.kernel.org/all/20260312150523.2054552-1-vineeth@bitbyteword.org/
 > 
 > But that code is still being worked on so you don't need to worry about it
 > at the moment.

Thank you! I will monitor this patch set and incorporate it into future versions if possible.

 > 
 > >      kvfree(inodes);
 > >      if (ret)
 > >          return ret;
 > > @@ -1563,7 +1604,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 > >          journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
 > >      set_task_ioprio(current, journal_ioprio);
 > >      fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
 > > -    ret = ext4_fc_perform_commit(journal);
 > > +    ret = ext4_fc_perform_commit(journal, commit_tid);
 > >      if (ret < 0) {
 > >          if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
 > >              status = EXT4_FC_STATUS_INELIGIBLE;
 > > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
 > > index fd76d14c2776e..dc084f39b74ad 100644
 > > --- a/include/trace/events/ext4.h
 > > +++ b/include/trace/events/ext4.h
 > > @@ -104,6 +104,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
 > >  TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
 > >  TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
 > >  
 > > +#undef EM
 > > +#undef EMe
 > > +#define EM(a)    TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
 > > +#define EMe(a)    TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
 > > +
 > > +#define TRACE_SNAP_ERR                        \
 > > +    EM(NONE)                        \
 > > +    EM(ES_MISS)                        \
 > > +    EM(ES_DELAYED)                        \
 > > +    EM(ES_OTHER)                        \
 > > +    EM(INODES_CAP)                        \
 > > +    EM(RANGES_CAP)                        \
 > > +    EM(NOMEM)                        \
 > > +    EMe(INODE_LOC)
 > > +
 > > +TRACE_SNAP_ERR
 > > +
 > > +#undef EM
 > > +#undef EMe
 > > +
 > >  #define show_fc_reason(reason)                        \
 > >      __print_symbolic(reason,                    \
 > >          { EXT4_FC_REASON_XATTR,        "XATTR"},        \
 > > @@ -2812,6 +2832,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
 > >            __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
 > >  );
 > >  
 > > +#define EM(a)    { EXT4_FC_SNAP_ERR_##a, #a },
 > > +#define EMe(a)    { EXT4_FC_SNAP_ERR_##a, #a }
 > > +
 > > +TRACE_EVENT(ext4_fc_lock_updates,
 > > +        TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
 > > +             unsigned int nr_inodes, unsigned int nr_ranges, int err,
 > > +             int snap_err),
 > > +
 > > +    TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
 > > +
 > > +    TP_STRUCT__entry(/* entry */
 > > +        __field(dev_t, dev)
 > > +        __field(tid_t, tid)
 > > +        __field(u64, locked_ns)
 > > +        __field(unsigned int, nr_inodes)
 > > +        __field(unsigned int, nr_ranges)
 > > +        __field(int, err)
 > > +        __field(int, snap_err)
 > > +    ),
 > > +
 > > +    TP_fast_assign(/* assign */
 > > +        __entry->dev = sb->s_dev;
 > > +        __entry->tid = commit_tid;
 > > +        __entry->locked_ns = locked_ns;
 > > +        __entry->nr_inodes = nr_inodes;
 > > +        __entry->nr_ranges = nr_ranges;
 > > +        __entry->err = err;
 > > +        __entry->snap_err = snap_err;
 > > +    ),
 > > +
 > > +    TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
 > > +          MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
 > > +          __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
 > > +          __entry->err, __print_symbolic(__entry->snap_err,
 > > +                         TRACE_SNAP_ERR))
 > > +);
 > > +
 > > +#undef EM
 > > +#undef EMe
 > > +#undef TRACE_SNAP_ERR
 > > +
 > >  #define FC_REASON_NAME_STAT(reason)                    \
 > >      show_fc_reason(reason),                        \
 > >      __entry->fc_ineligible_rc[reason]
 > 
 > 

Regards,
Li


^ permalink raw reply

* [PATCH v5 3/3] PCI: dw-rockchip: Add pcie_ltssm_state_transition trace support
From: Shawn Lin @ 2026-03-25  1:58 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Bjorn Helgaas
  Cc: linux-rockchip, linux-pci, linux-trace-kernel, linux-doc,
	Steven Rostedt, Shawn Lin
In-Reply-To: <1774403912-210670-1-git-send-email-shawn.lin@rock-chips.com>

Rockchip platforms provide a 64 x 4-byte debug FIFO that records the
LTSSM history. Every LTSSM state change is recorded, which is useful
for debugging, for example, link failures.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
---

Changes in v5:
- rebase
- use trace_pcie_ltssm_state_transition_enabled()

Changes in v4:
- skip trace if pci_ltssm_tp_enabled() is false.(Steven)
- wrap into 80 columns(Bjorn)

Changes in v3:
- reorder variables(Mani)
- rename loop to i; rename en to enable(Mani)
- use FIELD_GET(Mani)
- add comment about how the FIFO works(Mani)

Changes in v2:
- use tracepoint

 drivers/pci/controller/dwc/pcie-dw-rockchip.c | 111 ++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
index bb5d1a3..e737103 100644
--- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c
+++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
@@ -22,6 +22,8 @@
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/reset.h>
+#include <linux/workqueue.h>
+#include <trace/events/pci_controller.h>
 
 #include "../../pci.h"
 #include "pcie-designware.h"
@@ -73,6 +75,20 @@
 #define  PCIE_CLIENT_CDM_RASDES_TBA_L1_1	BIT(4)
 #define  PCIE_CLIENT_CDM_RASDES_TBA_L1_2	BIT(5)
 
+/* Debug FIFO information */
+#define PCIE_CLIENT_DBG_FIFO_MODE_CON	0x310
+#define  PCIE_CLIENT_DBG_EN		0xffff0007
+#define  PCIE_CLIENT_DBG_DIS		0xffff0000
+#define PCIE_CLIENT_DBG_FIFO_PTN_HIT_D0	0x320
+#define PCIE_CLIENT_DBG_FIFO_PTN_HIT_D1	0x324
+#define PCIE_CLIENT_DBG_FIFO_TRN_HIT_D0	0x328
+#define PCIE_CLIENT_DBG_FIFO_TRN_HIT_D1	0x32c
+#define  PCIE_CLIENT_DBG_TRANSITION_DATA 0xffff0000
+#define PCIE_CLIENT_DBG_FIFO_STATUS	0x350
+#define  PCIE_DBG_FIFO_RATE_MASK	GENMASK(22, 20)
+#define  PCIE_DBG_FIFO_L1SUB_MASK	GENMASK(10, 8)
+#define PCIE_DBG_LTSSM_HISTORY_CNT	64
+
 /* Hot Reset Control Register */
 #define PCIE_CLIENT_HOT_RESET_CTRL	0x180
 #define  PCIE_LTSSM_APP_DLY2_EN		BIT(1)
@@ -98,6 +114,7 @@ struct rockchip_pcie {
 	struct irq_domain *irq_domain;
 	const struct rockchip_pcie_of_data *data;
 	bool supports_clkreq;
+	struct delayed_work trace_work;
 };
 
 struct rockchip_pcie_of_data {
@@ -208,6 +225,96 @@ static enum dw_pcie_ltssm rockchip_pcie_get_ltssm(struct dw_pcie *pci)
 	return rockchip_pcie_get_ltssm_reg(rockchip) & PCIE_LTSSM_STATUS_MASK;
 }
 
+#ifdef CONFIG_TRACING
+static void rockchip_pcie_ltssm_trace_work(struct work_struct *work)
+{
+	struct rockchip_pcie *rockchip = container_of(work,
+						struct rockchip_pcie,
+						trace_work.work);
+	struct dw_pcie *pci = &rockchip->pci;
+	enum dw_pcie_ltssm state;
+	u32 i, l1ss, prev_val = DW_PCIE_LTSSM_UNKNOWN, rate, val;
+
+	if (!trace_pcie_ltssm_state_transition_enabled())
+		goto skip_trace;
+
+	for (i = 0; i < PCIE_DBG_LTSSM_HISTORY_CNT; i++) {
+		val = rockchip_pcie_readl_apb(rockchip,
+				PCIE_CLIENT_DBG_FIFO_STATUS);
+		rate = FIELD_GET(PCIE_DBG_FIFO_RATE_MASK, val);
+		l1ss = FIELD_GET(PCIE_DBG_FIFO_L1SUB_MASK, val);
+		val = FIELD_GET(PCIE_LTSSM_STATUS_MASK, val);
+
+		/*
+		 * Hardware Mechanism: The ring FIFO employs two tracking
+		 * counters:
+		 * - 'last-read-point': maintains the user's last read position
+		 * - 'last-valid-point': tracks the HW's last state update
+		 *
+		 * Software Handling: When two consecutive LTSSM states are
+		 * identical, it indicates invalid subsequent data in the FIFO.
+		 * In this case, we skip the remaining entries. The dual counter
+		 * design ensures that on the next state transition, reading can
+		 * resume from the last user position.
+		 */
+		if ((i > 0 && val == prev_val) || val > DW_PCIE_LTSSM_RCVRY_EQ3)
+			break;
+
+		state = prev_val = val;
+		if (val == DW_PCIE_LTSSM_L1_IDLE) {
+			if (l1ss == 2)
+				state = DW_PCIE_LTSSM_L1_2;
+			else if (l1ss == 1)
+				state = DW_PCIE_LTSSM_L1_1;
+		}
+
+		trace_pcie_ltssm_state_transition(dev_name(pci->dev),
+				dw_pcie_ltssm_status_string(state),
+				((rate + 1) > pci->max_link_speed) ?
+				PCI_SPEED_UNKNOWN : PCIE_SPEED_2_5GT + rate);
+	}
+
+skip_trace:
+	schedule_delayed_work(&rockchip->trace_work, msecs_to_jiffies(5000));
+}
+
+static void rockchip_pcie_ltssm_trace(struct rockchip_pcie *rockchip,
+				      bool enable)
+{
+	if (enable) {
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_PTN_HIT_D0);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_PTN_HIT_D1);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_TRN_HIT_D0);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_TRN_HIT_D1);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_EN,
+					 PCIE_CLIENT_DBG_FIFO_MODE_CON);
+
+		INIT_DELAYED_WORK(&rockchip->trace_work,
+				  rockchip_pcie_ltssm_trace_work);
+		schedule_delayed_work(&rockchip->trace_work, 0);
+	} else {
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_DIS,
+					 PCIE_CLIENT_DBG_FIFO_MODE_CON);
+		cancel_delayed_work_sync(&rockchip->trace_work);
+	}
+}
+#else
+static void rockchip_pcie_ltssm_trace(struct rockchip_pcie *rockchip,
+				      bool enable)
+{
+}
+#endif
+
 static void rockchip_pcie_enable_ltssm(struct rockchip_pcie *rockchip)
 {
 	rockchip_pcie_writel_apb(rockchip, PCIE_CLIENT_ENABLE_LTSSM,
@@ -291,6 +398,9 @@ static int rockchip_pcie_start_link(struct dw_pcie *pci)
 	 * 100us as we don't know how long should the device need to reset.
 	 */
 	msleep(PCIE_T_PVPERL_MS);
+
+	rockchip_pcie_ltssm_trace(rockchip, true);
+
 	gpiod_set_value_cansleep(rockchip->rst_gpio, 1);
 
 	return 0;
@@ -301,6 +411,7 @@ static void rockchip_pcie_stop_link(struct dw_pcie *pci)
 	struct rockchip_pcie *rockchip = to_rockchip_pcie(pci);
 
 	rockchip_pcie_disable_ltssm(rockchip);
+	rockchip_pcie_ltssm_trace(rockchip, false);
 }
 
 static int rockchip_pcie_host_init(struct dw_pcie_rp *pp)
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH] tracing/osnoise: fix potential deadlock in cpu hotplug
From: hu.shengming @ 2026-03-25  2:25 UTC (permalink / raw)
  To: rostedt
  Cc: mhiramat, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	zhang.run, yang.tao172, ran.xiaokai, luo.haiyang
In-Reply-To: <20260324121918.454d6a7b@gandalf.local.home>

>On Tue, 24 Mar 2026 15:06:16 +0800 (CST)
><hu.shengming@zte.com.cn> wrote:
>
>> From: luohaiyang10243395 <luo.haiyang@zte.com.cn>
>> 
>> The following sequence may leads deadlock in cpu hotplug:
>> 
>>   CPU0                        |  CPU1
>>                               |  schedule_work_on
>>                               |
>>   _cpu_down//set CPU1 offline |
>>   cpus_write_lock             |
>>                               |  osnoise_hotplug_workfn
>>                               |    mutex_lock(&interface_lock);
>>                               |    cpus_read_lock();  //wait cpu_hotplug_lock
>>                               |
>>                               |  cpuhp/1
>>                               |    osnoise_cpu_die
>>                               |      kthread_stop
>>                               |        wait_for_completion //wait osnoise/1 exit
>>                               |
>>                               |  osnoise/1
>>                               |    osnoise_sleep
>>                               |      mutex_lock(&interface_lock); //deadlock
>> 
>> Fix by swap the order of cpus_read_lock() and mutex_lock(&interface_lock).
>
>So the deadlock is due to the "wait_for_completion"?

The osnoise_cpu_init callback returns immediately, which allows a CPU offline operation
to start before osnoise_hotplug_workfn has run. The offline task holds cpu_hotplug_lock
while waiting for the osnoise task to exit, but osnoise_hotplug_workfn may acquire
interface_lock first and then block on cpu_hotplug_lock, so the osnoise task can never
take interface_lock and exit. This is an ABBA deadlock.

>How did you find this bug? Inspection, AI, triggered?
>
>Thanks,
>
>-- Steve

We ran autotests on kernel 6.6 and got the following hung task warnings; we think the
same issue exists in linux-stable.
 [39401.476843] INFO: task cpuhp/7:47 blocked for more than 120 seconds.
 [39401.483196]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39401.490581] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39401.498398] task:cpuhp/7         state:D stack:0     pid:47    ppid:2      flags:0x00000208
 [39401.506739] Call trace:
 [39401.509175]  __switch_to+0x138/0x180
 [39401.512743]  __schedule+0x250/0x5e8
 [39401.516220]  schedule+0x60/0x100
 [39401.519437]  schedule_timeout+0x1a0/0x1c0
 [39401.523437]  wait_for_completion+0xbc/0x190
 [39401.527609]  kthread_stop+0x7c/0x268
 [39401.531175]  stop_kthread+0x8c/0x178
 [39401.534740]  osnoise_cpu_die+0xc/0x18
 [39401.538391]  cpuhp_invoke_callback+0x148/0x580
 [39401.542822]  cpuhp_thread_fun+0xc8/0x1a0
 [39401.546733]  smpboot_thread_fn+0x224/0x250
 [39401.550817]  kthread+0xf8/0x110
 [39401.553947]  ret_from_fork+0x10/0x20
 [39401.557545] INFO: task sh:28856 blocked for more than 120 seconds.
 [39401.563713]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39401.571095] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39401.578912] task:sh              state:D stack:0     pid:28856 ppid:1      flags:0x00800004
 [39401.587251] Call trace:
 [39401.589685]  __switch_to+0x138/0x180
 [39401.593250]  __schedule+0x250/0x5e8
 [39401.596725]  schedule+0x60/0x100
 [39401.599941]  schedule_timeout+0x1a0/0x1c0
 [39401.603940]  wait_for_completion+0xbc/0x190
 [39401.608113]  __flush_work+0x5c/0xa8
 [39401.611590]  work_on_cpu_key+0x88/0xc0
 [39401.615331]  cpu_down_maps_locked+0xd0/0xe8
 [39401.619503]  cpu_device_down+0x38/0x60
 [39401.623240]  cpu_subsys_offline+0x14/0x28
 [39401.627238]  device_offline+0xb8/0x130
 [39401.630976]  online_store+0x64/0xe0
 [39401.634453]  dev_attr_store+0x1c/0x38
 [39401.638104]  sysfs_kf_write+0x48/0x60
 [39401.641756]  kernfs_fop_write_iter+0x118/0x1e8
 [39401.646188]  vfs_write+0x1a4/0x2f8
 [39401.649580]  ksys_write+0x70/0x108
 [39401.652970]  __arm64_sys_write+0x20/0x30
 [39401.656880]  el0_svc_common.constprop.0+0x60/0x138
 [39401.661660]  do_el0_svc+0x20/0x30
 [39401.664964]  el0_svc+0x44/0x1f8
 [39401.668093]  el0t_64_sync_handler+0xf8/0x128
 [39401.672352]  el0t_64_sync+0x17c/0x180
 [39401.875086] INFO: task kworker/7:2:2314252 blocked for more than 121 seconds.
 [39401.882208]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39401.889590] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39401.897406] task:kworker/7:2     state:D stack:0     pid:2314252 ppid:2      flags:0x00000008
 [39401.905917] Workqueue: events osnoise_hotplug_workfn
 [39401.910871] Call trace:
 [39401.913306]  __switch_to+0x138/0x180
 [39401.916870]  __schedule+0x250/0x5e8
 [39401.920345]  schedule+0x60/0x100
 [39401.923561]  percpu_rwsem_wait+0xfc/0x128
 [39401.927559]  __percpu_down_read+0x60/0x198
 [39401.931644]  percpu_down_read.constprop.0+0xac/0xb8
 [39401.936510]  cpus_read_lock+0x14/0x20
 [39401.940160]  osnoise_hotplug_workfn+0x54/0xb0
 [39401.944506]  process_one_work+0x184/0x420
 [39401.948503]  worker_thread+0x2b4/0x3d8
 [39401.952241]  kthread+0xf8/0x110
 [39401.955370]  ret_from_fork+0x10/0x20
 [39402.125508] INFO: task osnoise/0:2356235 blocked for more than 121 seconds.
 [39402.132458]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39402.139840] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39402.147656] task:osnoise/0       state:D stack:0     pid:2356235 ppid:2      flags:0x00000008
 [39402.156168] Call trace:
 [39402.158602]  __switch_to+0x138/0x180
 [39402.162166]  __schedule+0x250/0x5e8
 [39402.165643]  schedule+0x60/0x100
 [39402.168860]  schedule_preempt_disabled+0x28/0x48
 [39402.173466]  __mutex_lock.constprop.0+0x324/0x5f8
 [39402.178158]  __mutex_lock_slowpath+0x18/0x28
 [39402.182416]  mutex_lock+0x64/0x78
 [39402.185720]  osnoise_sleep+0x30/0x130
 [39402.189371]  osnoise_main+0x164/0x190
 [39402.193021]  kthread+0xf8/0x110
 [39402.196149]  ret_from_fork+0x10/0x20
 [39402.199713] INFO: task osnoise/1:2356236 blocked for more than 121 seconds.
 [39402.206661]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39402.214044] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39402.221860] task:osnoise/1       state:D stack:0     pid:2356236 ppid:2      flags:0x00000008
 [39402.230372] Call trace:
 [39402.232804]  __switch_to+0x138/0x180
 [39402.236368]  __schedule+0x250/0x5e8
 [39402.239845]  schedule+0x60/0x100
 [39402.243061]  schedule_preempt_disabled+0x28/0x48
 [39402.247666]  __mutex_lock.constprop.0+0x324/0x5f8
 [39402.252359]  __mutex_lock_slowpath+0x18/0x28
 [39402.256618]  mutex_lock+0x64/0x78
 [39402.259921]  osnoise_sleep+0x30/0x130
 [39402.263572]  osnoise_main+0x164/0x190
 [39402.267223]  kthread+0xf8/0x110
 [39402.270352]  ret_from_fork+0x10/0x20
 [39402.273916] INFO: task osnoise/2:2356237 blocked for more than 121 seconds.
 [39402.280865]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39402.288247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39402.296064] task:osnoise/2       state:D stack:0     pid:2356237 ppid:2      flags:0x00000008
 [39402.304575] Call trace:
 [39402.307010]  __switch_to+0x138/0x180
 [39402.310574]  __schedule+0x250/0x5e8
 [39402.314051]  schedule+0x60/0x100
 [39402.317268]  schedule_preempt_disabled+0x28/0x48
 [39402.321873]  __mutex_lock.constprop.0+0x324/0x5f8
 [39402.326566]  __mutex_lock_slowpath+0x18/0x28
 [39402.330824]  mutex_lock+0x64/0x78
 [39402.334128]  osnoise_sleep+0x30/0x130
 [39402.337778]  osnoise_main+0x164/0x190
 [39402.341429]  kthread+0xf8/0x110
 [39402.344556]  ret_from_fork+0x10/0x20
 [39402.348120] Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings
 [39402.356295] Kernel panic - not syncing: hung_task: blocked tasks 

Thanks,
Haiyang

>> 
>> Signed-off-by: Luo Haiyang <luo.haiyang@zte.com.cn>
>> ---
>>  kernel/trace/trace_osnoise.c | 10 +++++-----
>>  1 file changed, 5 insertions(+), 5 deletions(-)
>> 
>> diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
>> index dee610e465b9..be6cf0bb3c03 100644
>> --- a/kernel/trace/trace_osnoise.c
>> +++ b/kernel/trace/trace_osnoise.c
>> @@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
>>      if (!osnoise_has_registered_instances())
>>          return;
>> 
>> -    guard(mutex)(&interface_lock);
>>      guard(cpus_read_lock)();
>> +    guard(mutex)(&interface_lock);
>> 
>>      if (!cpu_online(cpu))
>>          return;
>> @@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
>>      if (running)
>>          stop_per_cpu_kthreads();
>> 
>> -    mutex_lock(&interface_lock);
>>      /*
>>       * avoid CPU hotplug operations that might read options.
>>       */
>>      cpus_read_lock();
>> +    mutex_lock(&interface_lock);
>> 
>>      retval = cnt;
>> 
>> @@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
>>              clear_bit(option, &osnoise_options);
>>      }
>> 
>> -    cpus_read_unlock();
>>      mutex_unlock(&interface_lock);
>> +    cpus_read_unlock();
>> 
>>      if (running)
>>          start_per_cpu_kthreads();
>> @@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
>>      if (running)
>>          stop_per_cpu_kthreads();
>> 
>> -    mutex_lock(&interface_lock);
>>      /*
>>       * osnoise_cpumask is read by CPU hotplug operations.
>>       */
>>      cpus_read_lock();
>> +    mutex_lock(&interface_lock);
>> 
>>      cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
>> 
>> -    cpus_read_unlock();
>>      mutex_unlock(&interface_lock);
>> +    cpus_read_unlock();
>> 
>>      if (running)
>>          start_per_cpu_kthreads();

^ permalink raw reply

* [PATCH v13 4/4] ring-buffer: Add persistent ring buffer selftest
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a self-destructive test for the persistent ring buffer. This
will invalidate some sub-buffer pages in the persistent ring buffer
when the kernel panics, and check after reboot whether the number of
detected invalid pages and the total entry_bytes match the recorded
values.

This ensures that the kernel correctly recovers a partially corrupted
persistent ring buffer at boot.

The test only runs on the persistent ring buffer named
"ptracingtest", and the user has to fill it up with events before
the kernel panics.

To run the test, enable CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
and set up the kernel cmdline as follows:

 reserve_mem=20M:2M:trace trace_instance=ptracingtest^traceoff@trace
 panic=1

Then run the following commands after the first boot:

 cd /sys/kernel/tracing/instances/ptracingtest
 echo 1 > tracing_on
 echo 1 > events/enable
 sleep 3
 echo c > /proc/sysrq-trigger

After the panic message, the kernel reboots and runs the verification
on the persistent ring buffer, e.g.

 Ring buffer meta [2] invalid buffer page detected
 Ring buffer meta [2] is from previous boot! (318 pages discarded)
 Ring buffer testing [2] invalid pages: PASSED (318/318)
 Ring buffer testing [2] entry_bytes: PASSED (1300476/1300476)

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v10:
  - Add entry_bytes test.
  - Do not compile test code if CONFIG_RING_BUFFER_PERSISTENT_SELFTEST=n.
 Changes in v9:
  - Test also reader pages.
---
 include/linux/ring_buffer.h |    1 +
 kernel/trace/Kconfig        |   15 +++++++++
 kernel/trace/ring_buffer.c  |   69 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        |    4 ++
 4 files changed, 89 insertions(+)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 994f52b34344..0670742b2d60 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -238,6 +238,7 @@ int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
+	RB_FL_TESTING		= 1 << 1,
 };
 
 #ifdef CONFIG_RING_BUFFER
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..094d5511bb17 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1202,6 +1202,21 @@ config RING_BUFFER_VALIDATE_TIME_DELTAS
 	  Only say Y if you understand what this does, and you
 	  still want it enabled. Otherwise say N
 
+config RING_BUFFER_PERSISTENT_SELFTEST
+	bool "Enable persistent ring buffer selftest"
+	depends on RING_BUFFER
+	help
+	  Run a selftest on the persistent ring buffer which names
+	  "ptracingtest" (and its backup) when panic_on_reboot by
+	  invalidating ring buffer pages.
+	  Note that user has to enable events on the persistent ring
+	  buffer manually to fill up ring buffers before rebooting.
+	  Since this invalidates the data on test target ring buffer,
+	  "ptracingtest" persistent ring buffer must not be used for
+	  actual tracing, but only for testing.
+
+	  If unsure, say N
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e5178239f2f9..10443347a6d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -64,6 +64,10 @@ struct ring_buffer_cpu_meta {
 	unsigned long	commit_buffer;
 	__u32		subbuf_size;
 	__u32		nr_subbufs;
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
+	__u32		nr_invalid;
+	__u32		entry_bytes;
+#endif
 	int		buffers[];
 };
 
@@ -2077,6 +2081,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 	pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
 		cpu_buffer->cpu, discarded);
+
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
+	if (meta->nr_invalid)
+		pr_info("Ring buffer testing [%d] invalid pages: %s (%d/%d)\n",
+			cpu_buffer->cpu,
+			(discarded == meta->nr_invalid) ? "PASSED" : "FAILED",
+			discarded, meta->nr_invalid);
+	if (meta->entry_bytes)
+		pr_info("Ring buffer testing [%d] entry_bytes: %s (%ld/%ld)\n",
+			cpu_buffer->cpu,
+			(entry_bytes == meta->entry_bytes) ? "PASSED" : "FAILED",
+			(long)entry_bytes, (long)meta->entry_bytes);
+#endif
 	return;
 
  invalid:
@@ -2557,12 +2574,64 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
+static void rb_test_inject_invalid_pages(struct trace_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_cpu_meta *meta;
+	struct buffer_data_page *dpage;
+	u32 entry_bytes = 0;
+	unsigned long ptr;
+	int subbuf_size;
+	int invalid = 0;
+	int cpu;
+	int i;
+
+	if (!(buffer->flags & RB_FL_TESTING))
+		return;
+
+	guard(preempt)();
+	cpu = smp_processor_id();
+
+	cpu_buffer = buffer->buffers[cpu];
+	meta = cpu_buffer->ring_meta;
+	ptr = (unsigned long)rb_subbufs_from_meta(meta);
+	subbuf_size = meta->subbuf_size;
+
+	for (i = 0; i < meta->nr_subbufs; i++) {
+		int idx = meta->buffers[i];
+
+		dpage = (void *)(ptr + idx * subbuf_size);
+		/* Skip unused pages */
+		if (!local_read(&dpage->commit))
+			continue;
+
+		/* Invalidate even pages. */
+		if (!(i & 0x1)) {
+			local_add(subbuf_size + 1, &dpage->commit);
+			invalid++;
+		} else {
+			/* Count total commit bytes. */
+			entry_bytes += local_read(&dpage->commit);
+		}
+	}
+
+	pr_info("Inject invalidated %d pages on CPU%d, total size: %ld\n",
+		invalid, cpu, (long)entry_bytes);
+	meta->nr_invalid = invalid;
+	meta->entry_bytes = entry_bytes;
+}
+#else /* !CONFIG_RING_BUFFER_PERSISTENT_SELFTEST */
+#define rb_test_inject_invalid_pages(buffer)	do { } while (0)
+#endif
+
 /* Stop recording on a persistent buffer and flush cache if needed. */
 static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
 {
 	struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
 
 	ring_buffer_record_off(buffer);
+	rb_test_inject_invalid_pages(buffer);
 	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
 	return NOTIFY_DONE;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4189ec9df6a5..108b0d16badf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9366,6 +9366,8 @@ static void setup_trace_scratch(struct trace_array *tr,
 	memset(tscratch, 0, size);
 }
 
+#define TRACE_TEST_PTRACING_NAME	"ptracingtest"
+
 static int
 allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned long size)
 {
@@ -9378,6 +9380,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned
 	buf->tr = tr;
 
 	if (tr->range_addr_start && tr->range_addr_size) {
+		if (!strcmp(tr->name, TRACE_TEST_PTRACING_NAME))
+			rb_flags |= RB_FL_TESTING;
 		/* Add scratch buffer to handle 128 modules */
 		buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
 						      tr->range_addr_start,


^ permalink raw reply related

* [PATCH v13 3/4] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Skip invalid sub-buffers when rewinding the persistent ring buffer
instead of stopping the rewind at the first invalid one. The skipped
buffers are cleared.

To ensure the rewinding stops at the unused page, this also clears
buffer_data_page::time_stamp when tracing resets the buffer. This
allows us to identify unused pages and empty pages.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v12:
   - Fix build error.
 Changes in v11:
   - Reset timestamp when the buffer is invalid.
   - When rewinding, skip subbuf page if timestamp is wrong and
     check timestamp after validating buffer data page.
 Changes in v10:
   - Newly added.
---
 kernel/trace/ring_buffer.c |   76 +++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 33 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 31cad8edd488..e5178239f2f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -363,6 +363,7 @@ struct buffer_page {
 static void rb_init_page(struct buffer_data_page *bpage)
 {
 	local_set(&bpage->commit, 0);
+	bpage->time_stamp = 0;
 }
 
 static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
@@ -1878,12 +1879,14 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+static int rb_validate_buffer(struct buffer_page *bpage, int cpu,
 			      struct ring_buffer_cpu_meta *meta)
 {
+	struct buffer_data_page *dpage = bpage->page;
 	unsigned long long ts;
 	unsigned long tail;
 	u64 delta;
+	int ret = -1;
 
 	/*
 	 * When a sub-buffer is recovered from a read, the commit value may
@@ -1892,9 +1895,17 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
 	 * subbuf_size is considered invalid.
 	 */
 	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
-	if (tail > meta->subbuf_size)
-		return -1;
-	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+	if (tail <= meta->subbuf_size)
+		ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+
+	if (ret < 0) {
+		local_set(&bpage->entries, 0);
+		local_set(&bpage->page->commit, 0);
+	} else {
+		local_set(&bpage->entries, ret);
+	}
+
+	return ret;
 }
 
 /* If the meta data has been validated, now validate the events */
@@ -1915,18 +1926,14 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_head = head_page = cpu_buffer->head_page;
 
 	/* Do the reader page first */
-	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu, meta);
+	ret = rb_validate_buffer(cpu_buffer->reader_page, cpu_buffer->cpu, meta);
 	if (ret < 0) {
 		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
 			cpu_buffer->cpu);
 		discarded++;
-		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
-		local_set(&cpu_buffer->reader_page->entries, 0);
-		local_set(&cpu_buffer->reader_page->page->commit, 0);
 	} else {
 		entries += ret;
 		entry_bytes += rb_page_size(cpu_buffer->reader_page);
-		local_set(&cpu_buffer->reader_page->entries, ret);
 	}
 
 	ts = head_page->page->time_stamp;
@@ -1945,26 +1952,33 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->tail_page)
 			break;
 
-		/* Ensure the page has older data than head. */
-		if (ts < head_page->page->time_stamp)
-			break;
-
-		ts = head_page->page->time_stamp;
-		/* Ensure the page has correct timestamp and some data. */
-		if (!ts || rb_page_commit(head_page) == 0)
-			break;
-
-		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
-		if (ret < 0)
+		/* Rewind until unused page (no timestamp, no commit). */
+		if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0)
 			break;
 
-		/* Recover the number of entries and update stats. */
-		local_set(&head_page->entries, ret);
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-		entries += ret;
-		entry_bytes += rb_page_commit(head_page);
+		/*
+		 * Skip if the page is invalid, or its timestamp is newer than the
+		 * previous valid page.
+		 */
+		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
+		if (ret >= 0 && ts < head_page->page->time_stamp) {
+			local_set(&head_page->entries, 0);
+			local_set(&head_page->page->commit, 0);
+			head_page->page->time_stamp = ts;
+			ret = -1;
+		}
+		if (ret < 0) {
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+		} else {
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			if (ret > 0)
+				local_inc(&cpu_buffer->pages_touched);
+			ts = head_page->page->time_stamp;
+		}
 	}
 	if (i)
 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -2034,15 +2048,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->reader_page)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
+		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
 		if (ret < 0) {
 			if (!discarded)
 				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
 					cpu_buffer->cpu);
 			discarded++;
-			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
-			local_set(&head_page->entries, 0);
-			local_set(&head_page->page->commit, 0);
 		} else {
 			/* If the buffer has content, update pages_touched */
 			if (ret)
@@ -2050,7 +2061,6 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 			entries += ret;
 			entry_bytes += rb_page_size(head_page);
-			local_set(&head_page->entries, ret);
 		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
@@ -2081,7 +2091,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	/* Reset all the subbuffers */
 	for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
 		local_set(&head_page->entries, 0);
-		local_set(&head_page->page->commit, 0);
+		rb_init_page(head_page->page);
 	}
 }
 


^ permalink raw reply related

* [PATCH v13 2/4] ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Skip invalid sub-buffers when validating the persistent ring buffer
instead of discarding the entire ring buffer. Only skipped buffers
are invalidated (cleared).

If the cache data in memory fails to be synchronized during a reboot,
the persistent ring buffer may become partially corrupted, but other
sub-buffers may still contain readable event data. Only discard the
subbuffers that are found to be corrupted.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
  Changes in v11:
  - Fix a typo.
  Changes in v9:
  - Add meta->subbuf_size check.
  - Fix a typo.
  - Handle invalid reader_page case.
  Changes in v8:
  - Add comment in rb_validate_buffer()
  - Clear the RB_MISSED_* flags in rb_validate_buffer() instead of
    skipping subbuf.
  - Remove unused subbuf local variable from rb_cpu_meta_valid().
  Changes in v7:
  - Combined with Handling RB_MISSED_* flags patch, focus on validation at boot.
  - Remove checking subbuffer data when validating metadata, because it should be done
    later.
  - Do not mark the discarded sub buffer page but just reset it.
  Changes in v6:
  - Show invalid page detection message once per CPU.
  Changes in v5:
  - Instead of showing errors for each page, just show the number
    of discarded pages at last.
  Changes in v3:
  - Record missed data event on commit.
---
 kernel/trace/ring_buffer.c |   98 ++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 40 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3e793bd1c134..31cad8edd488 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,12 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 	return local_read(&bpage->page->commit);
 }
 
+/* Size is determined by what has been committed */
+static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
+{
+	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	/* Range pages are not to be freed */
@@ -1762,7 +1768,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			      unsigned long *subbuf_mask)
 {
 	int subbuf_size = PAGE_SIZE;
-	struct buffer_data_page *subbuf;
 	unsigned long buffers_start;
 	unsigned long buffers_end;
 	int i;
@@ -1770,6 +1775,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 	if (!subbuf_mask)
 		return false;
 
+	if (meta->subbuf_size != PAGE_SIZE) {
+		pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu);
+		return false;
+	}
+
 	buffers_start = meta->first_buffer;
 	buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
 
@@ -1786,11 +1796,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 		return false;
 	}
 
-	subbuf = rb_subbufs_from_meta(meta);
-
 	bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
 
-	/* Is the meta buffers and the subbufs themselves have correct data? */
+	/*
+	 * Ensure the meta::buffers array has correct data. The data in each subbufs
+	 * are checked later in rb_meta_validate_events().
+	 */
 	for (i = 0; i < meta->nr_subbufs; i++) {
 		if (meta->buffers[i] < 0 ||
 		    meta->buffers[i] >= meta->nr_subbufs) {
@@ -1798,18 +1809,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			return false;
 		}
 
-		if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
-			pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
-			return false;
-		}
-
 		if (test_bit(meta->buffers[i], subbuf_mask)) {
 			pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
 			return false;
 		}
 
 		set_bit(meta->buffers[i], subbuf_mask);
-		subbuf = (void *)subbuf + subbuf_size;
 	}
 
 	return true;
@@ -1873,13 +1878,22 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+			      struct ring_buffer_cpu_meta *meta)
 {
 	unsigned long long ts;
+	unsigned long tail;
 	u64 delta;
-	int tail;
 
-	tail = local_read(&dpage->commit);
+	/*
+	 * When a sub-buffer is recovered from a read, the commit value may
+	 * have RB_MISSED_* bits set, as these bits are reset on reuse.
+	 * Even after clearing these bits, a commit value greater than the
+	 * subbuf_size is considered invalid.
+	 */
+	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
+	if (tail > meta->subbuf_size)
+		return -1;
 	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
 }
 
@@ -1890,6 +1904,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	struct buffer_page *head_page, *orig_head;
 	unsigned long entry_bytes = 0;
 	unsigned long entries = 0;
+	int discarded = 0;
 	int ret;
 	u64 ts;
 	int i;
@@ -1900,14 +1915,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_head = head_page = cpu_buffer->head_page;
 
 	/* Do the reader page first */
-	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu, meta);
 	if (ret < 0) {
-		pr_info("Ring buffer reader page is invalid\n");
-		goto invalid;
+		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
+			cpu_buffer->cpu);
+		discarded++;
+		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+		local_set(&cpu_buffer->reader_page->entries, 0);
+		local_set(&cpu_buffer->reader_page->page->commit, 0);
+	} else {
+		entries += ret;
+		entry_bytes += rb_page_size(cpu_buffer->reader_page);
+		local_set(&cpu_buffer->reader_page->entries, ret);
 	}
-	entries += ret;
-	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
-	local_set(&cpu_buffer->reader_page->entries, ret);
 
 	ts = head_page->page->time_stamp;
 
@@ -1935,7 +1955,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 			break;
 
 		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0)
 			break;
 
@@ -2014,21 +2034,24 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->reader_page)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0) {
-			pr_info("Ring buffer meta [%d] invalid buffer page\n",
-				cpu_buffer->cpu);
-			goto invalid;
-		}
-
-		/* If the buffer has content, update pages_touched */
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-
-		entries += ret;
-		entry_bytes += local_read(&head_page->page->commit);
-		local_set(&head_page->entries, ret);
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+			local_set(&head_page->entries, 0);
+			local_set(&head_page->page->commit, 0);
+		} else {
+			/* If the buffer has content, update pages_touched */
+			if (ret)
+				local_inc(&cpu_buffer->pages_touched);
 
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			local_set(&head_page->entries, ret);
+		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
 	}
@@ -2042,7 +2065,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->entries, entries);
 	local_set(&cpu_buffer->entries_bytes, entry_bytes);
 
-	pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+	pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
+		cpu_buffer->cpu, discarded);
 	return;
 
  invalid:
@@ -3329,12 +3353,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	return NULL;
 }
 
-/* Size is determined by what has been committed */
-static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
-{
-	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
-}
-
 static __always_inline unsigned
 rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 {


^ permalink raw reply related

* [PATCH v13 1/4] ring-buffer: Flush and stop persistent ring buffer on panic
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

On real hardware, panic and machine reboot may not flush hardware cache
to memory. This means the persistent ring buffer, which relies on a
coherent state of memory, may not have its events written to the buffer
and they may be lost. Moreover, there may be inconsistency with the
counters which are used for validation of the integrity of the
persistent ring buffer which may cause all data to be discarded.

To avoid this issue, stop recording of the ring buffer on panic and
flush the cache of the ring buffer's memory.

Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v13:
   - Fix a rebase conflict.
 Changes in v11:
   - Do nothing by default since flush_cache_vmap() does nothing on x86
     but it can cause deadlock on some architectures via on_each_cpu()
     because other CPUs will be stopped when the panic notifier is called.
 Changes in v9:
   - Fix typo of & to &&.
   - Fix typo of "Generic"
 Changes in v6:
   - Introduce asm/ring_buffer.h for arch_ring_buffer_flush_range().
   - Use flush_cache_vmap() instead of flush_cache_all().
 Changes in v5:
   - Use ring_buffer_record_off() instead of ring_buffer_record_disable().
   - Use flush_cache_all() to ensure flush all cache.
 Changes in v3:
   - update patch description.
---
 arch/alpha/include/asm/Kbuild        |    1 +
 arch/arc/include/asm/Kbuild          |    1 +
 arch/arm/include/asm/Kbuild          |    1 +
 arch/arm64/include/asm/ring_buffer.h |   10 ++++++++++
 arch/csky/include/asm/Kbuild         |    1 +
 arch/hexagon/include/asm/Kbuild      |    1 +
 arch/loongarch/include/asm/Kbuild    |    1 +
 arch/m68k/include/asm/Kbuild         |    1 +
 arch/microblaze/include/asm/Kbuild   |    1 +
 arch/mips/include/asm/Kbuild         |    1 +
 arch/nios2/include/asm/Kbuild        |    1 +
 arch/openrisc/include/asm/Kbuild     |    1 +
 arch/parisc/include/asm/Kbuild       |    1 +
 arch/powerpc/include/asm/Kbuild      |    1 +
 arch/riscv/include/asm/Kbuild        |    1 +
 arch/s390/include/asm/Kbuild         |    1 +
 arch/sh/include/asm/Kbuild           |    1 +
 arch/sparc/include/asm/Kbuild        |    1 +
 arch/um/include/asm/Kbuild           |    1 +
 arch/x86/include/asm/Kbuild          |    1 +
 arch/xtensa/include/asm/Kbuild       |    1 +
 include/asm-generic/ring_buffer.h    |   13 +++++++++++++
 kernel/trace/ring_buffer.c           |   22 ++++++++++++++++++++++
 23 files changed, 65 insertions(+)
 create mode 100644 arch/arm64/include/asm/ring_buffer.h
 create mode 100644 include/asm-generic/ring_buffer.h

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 483965c5a4de..b154b4e3dfa8 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += agp.h
 generic-y += asm-offsets.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 4c69522e0328..483caacc6988 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -5,5 +5,6 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 03657ff8fbe3..decad5f2c826 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += early_ioremap.h
 generic-y += extable.h
 generic-y += flat.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 
 generated-y += mach-types.h
 generated-y += unistd-nr.h
diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h
new file mode 100644
index 000000000000..62316c406888
--- /dev/null
+++ b/arch/arm64/include/asm/ring_buffer.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_ARM64_RING_BUFFER_H
+#define _ASM_ARM64_RING_BUFFER_H
+
+#include <asm/cacheflush.h>
+
+/* Flush D-cache on persistent ring buffer */
+#define arch_ring_buffer_flush_range(start, end)	dcache_clean_pop(start, end)
+
+#endif /* _ASM_ARM64_RING_BUFFER_H */
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 3a5c7f6e5aac..7dca0c6cdc84 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += qrwlock.h
 generic-y += qrwlock_types.h
 generic-y += qspinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
 generic-y += text-patching.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 1efa1e993d4b..0f887d4238ed 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += extable.h
 generic-y += iomap.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 9034b583a88a..7e92957baf6a 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -10,5 +10,6 @@ generic-y += qrwlock.h
 generic-y += user.h
 generic-y += ioctl.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
 generic-y += statfs.h
 generic-y += text-patching.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index b282e0dd8dc1..62543bf305ff 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -3,5 +3,6 @@ generated-y += syscall_table.h
 generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += spinlock.h
 generic-y += text-patching.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 7178f990e8b3..0030309b47ad 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += syscalls.h
 generic-y += tlb.h
 generic-y += user.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 684569b2ecd6..9771c3d85074 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 28004301c236..0a2530964413 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += cmpxchg.h
 generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += spinlock.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index cef49d60d74c..8aa34621702d 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += spinlock_types.h
 generic-y += spinlock.h
 generic-y += qrwlock_types.h
 generic-y += qrwlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 4fb596d94c89..d48d158f7241 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2e23533b67e3..805b5aeebb6f 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h
 generic-y += agp.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
+generic-y += ring_buffer.h
 generic-y += early_ioremap.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index bd5fc9403295..7721b63642f4 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h
 generic-y += qrwlock.h
 generic-y += qrwlock_types.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 80bad7de7a04..0c1fc47c3ba0 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,3 +7,4 @@ generated-y += unistd_nr.h
 generic-y += asm-offsets.h
 generic-y += mcs_spinlock.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 4d3f10ed8275..f0403d3ee8ab 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -3,4 +3,5 @@ generated-y += syscall_table.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 17ee8a273aa6..49c6bb326b75 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 1b9b82bbe322..2a1629ba8140 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += module.lds.h
 generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += ring_buffer.h
 generic-y += runtime-const.h
 generic-y += softirq_stack.h
 generic-y += switch_to.h
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4566000e15c4..078fd2c0d69d 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -14,3 +14,4 @@ generic-y += early_ioremap.h
 generic-y += fprobe.h
 generic-y += mcs_spinlock.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 13fe45dea296..e57af619263a 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h
new file mode 100644
index 000000000000..201d2aee1005
--- /dev/null
+++ b/include/asm-generic/ring_buffer.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic arch dependent ring_buffer macros.
+ */
+#ifndef __ASM_GENERIC_RING_BUFFER_H__
+#define __ASM_GENERIC_RING_BUFFER_H__
+
+#include <linux/cacheflush.h>
+
+/* Flush cache on ring buffer range if needed. Do nothing by default. */
+#define arch_ring_buffer_flush_range(start, end)	do { } while (0)
+
+#endif /* __ASM_GENERIC_RING_BUFFER_H__ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b6c39bba56d..3e793bd1c134 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
 #include <linux/ring_buffer_types.h>
 #include <linux/sched/isolation.h>
 #include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
 #include <linux/trace_events.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
 #include <linux/oom.h>
 #include <linux/mm.h>
 
+#include <asm/ring_buffer.h>
 #include <asm/local64.h>
 #include <asm/local.h>
 #include <asm/setup.h>
@@ -559,6 +561,7 @@ struct trace_buffer {
 
 	unsigned long			range_addr_start;
 	unsigned long			range_addr_end;
+	struct notifier_block		flush_nb;
 
 	struct ring_buffer_meta		*meta;
 
@@ -2520,6 +2523,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+	ring_buffer_record_off(buffer);
+	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+	return NOTIFY_DONE;
+}
+
 static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 					 int order, unsigned long start,
 					 unsigned long end,
@@ -2650,6 +2663,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 
 	mutex_init(&buffer->mutex);
 
+	/* Persistent ring buffer needs to flush cache before reboot. */
+	if (start && end) {
+		buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+		atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+	}
+
 	return_ptr(buffer);
 
  fail_free_buffers:
@@ -2748,6 +2767,9 @@ ring_buffer_free(struct trace_buffer *buffer)
 {
 	int cpu;
 
+	if (buffer->range_addr_start && buffer->range_addr_end)
+		atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
 	cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
 
 	irq_work_sync(&buffer->irq_work.work);


^ permalink raw reply related

* [PATCH v13 0/4] ring-buffer: Making persistent ring buffers robust
From: Masami Hiramatsu (Google) @ 2026-03-25  2:24 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers

Hi,

Here is the 13th version of improvement patches for making persistent
ring buffers robust to failures.
The previous version is here:

https://lore.kernel.org/all/177431040276.3708637.10277850418341653677.stgit@mhiramat.tok.corp.google.com/

In this version, I rebased it on ring-buffer/for-next.

Thank you,

---

Masami Hiramatsu (Google) (4):
      ring-buffer: Flush and stop persistent ring buffer on panic
      ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
      ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
      ring-buffer: Add persistent ring buffer selftest


 arch/alpha/include/asm/Kbuild        |    1 
 arch/arc/include/asm/Kbuild          |    1 
 arch/arm/include/asm/Kbuild          |    1 
 arch/arm64/include/asm/ring_buffer.h |   10 +
 arch/csky/include/asm/Kbuild         |    1 
 arch/hexagon/include/asm/Kbuild      |    1 
 arch/loongarch/include/asm/Kbuild    |    1 
 arch/m68k/include/asm/Kbuild         |    1 
 arch/microblaze/include/asm/Kbuild   |    1 
 arch/mips/include/asm/Kbuild         |    1 
 arch/nios2/include/asm/Kbuild        |    1 
 arch/openrisc/include/asm/Kbuild     |    1 
 arch/parisc/include/asm/Kbuild       |    1 
 arch/powerpc/include/asm/Kbuild      |    1 
 arch/riscv/include/asm/Kbuild        |    1 
 arch/s390/include/asm/Kbuild         |    1 
 arch/sh/include/asm/Kbuild           |    1 
 arch/sparc/include/asm/Kbuild        |    1 
 arch/um/include/asm/Kbuild           |    1 
 arch/x86/include/asm/Kbuild          |    1 
 arch/xtensa/include/asm/Kbuild       |    1 
 include/asm-generic/ring_buffer.h    |   13 ++
 include/linux/ring_buffer.h          |    1 
 kernel/trace/Kconfig                 |   15 ++
 kernel/trace/ring_buffer.c           |  237 ++++++++++++++++++++++++++--------
 kernel/trace/trace.c                 |    4 +
 26 files changed, 241 insertions(+), 59 deletions(-)
 create mode 100644 arch/arm64/include/asm/ring_buffer.h
 create mode 100644 include/asm-generic/ring_buffer.h

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v5 2/3] Documentation: tracing: Add PCI controller event documentation
From: Shawn Lin @ 2026-03-25  1:58 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Bjorn Helgaas
  Cc: linux-rockchip, linux-pci, linux-trace-kernel, linux-doc,
	Steven Rostedt, Shawn Lin
In-Reply-To: <1774403912-210670-1-git-send-email-shawn.lin@rock-chips.com>

The available tracepoint, pcie_ltssm_state_transition, monitors LTSSM
state transitions for debugging purposes. Add documentation describing it.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
---

Changes in v5: None
Changes in v4: None
Changes in v3:
- Add toctree entry in Documentation/trace/index.rst(Bagas Sanjaya)
- fix mismatch section underline length(Bagas Sanjaya)
- Make example snippets in code block(Bagas Sanjaya)
- wrap context into 80 columns and fix the file name(Bjorn)

Changes in v2: None

 Documentation/trace/events-pci-controller.rst | 42 +++++++++++++++++++++++++++
 Documentation/trace/index.rst                 |  1 +
 2 files changed, 43 insertions(+)
 create mode 100644 Documentation/trace/events-pci-controller.rst

diff --git a/Documentation/trace/events-pci-controller.rst b/Documentation/trace/events-pci-controller.rst
new file mode 100644
index 0000000..cb9f715
--- /dev/null
+++ b/Documentation/trace/events-pci-controller.rst
@@ -0,0 +1,42 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+Subsystem Trace Points: PCI Controller
+======================================
+
+Overview
+========
+The PCI controller tracing system provides tracepoints to monitor controller
+level information for debugging purpose. The events normally show up here:
+
+	/sys/kernel/tracing/events/pci_controller
+
+Cf. include/trace/events/pci_controller.h for the events definitions.
+
+Available Tracepoints
+=====================
+
+pcie_ltssm_state_transition
+---------------------------
+
+Monitors PCIe LTSSM state transition including state and rate information
+::
+
+    pcie_ltssm_state_transition  "dev: %s state: %s rate: %s\n"
+
+**Parameters**:
+
+* ``dev`` - PCIe controller instance
+* ``state`` - PCIe LTSSM state
+* ``rate`` - PCIe data rate
+
+**Example Usage**:
+
+.. code-block:: shell
+
+    # Enable the tracepoint
+    echo 1 > /sys/kernel/debug/tracing/events/pci_controller/pcie_ltssm_state_transition/enable
+
+    # Monitor events (the following output is generated when a device is linking)
+    cat /sys/kernel/debug/tracing/trace_pipe
+       kworker/0:0-9       [000] .....     5.600221: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_EQ2 rate: 8.0 GT/s
diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst
index 036db96..5d9bf469 100644
--- a/Documentation/trace/index.rst
+++ b/Documentation/trace/index.rst
@@ -55,6 +55,7 @@ applications.
    events-nmi
    events-msr
    events-pci
+   events-pci-controller
    boottime-trace
    histogram
    histogram-design
-- 
2.7.4


^ permalink raw reply related

* [PATCH v5 1/3] PCI: trace: Add PCI controller LTSSM transition tracepoint
From: Shawn Lin @ 2026-03-25  1:58 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Bjorn Helgaas
  Cc: linux-rockchip, linux-pci, linux-trace-kernel, linux-doc,
	Steven Rostedt, Shawn Lin
In-Reply-To: <1774403912-210670-1-git-send-email-shawn.lin@rock-chips.com>

Some platforms may provide LTSSM trace functionality, recording historical
LTSSM state transition information. This is very useful for debugging, such
as when certain devices cannot be recognized or the link breaks during
testing. Implement the PCI controller tracepoint for recording the LTSSM
state and link rate.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
---

Changes in v5:
- use EM/EMe instead
- remove reg/unreg function and back to use TRACE_EVENT

Changes in v4:
- use TRACE_EVENT_FN to notify when to start and stop the tracepoint,
  and export pci_ltssm_tp_enabled() for host drivers to use

Changes in v3:
- add TRACE_DEFINE_ENUM for all enums(Steven Rostedt)

Changes in v2: None

 drivers/pci/trace.c                   |  1 +
 include/trace/events/pci_controller.h | 58 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 include/trace/events/pci_controller.h

diff --git a/drivers/pci/trace.c b/drivers/pci/trace.c
index cf11abc..c1da9d3 100644
--- a/drivers/pci/trace.c
+++ b/drivers/pci/trace.c
@@ -9,3 +9,4 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/pci.h>
+#include <trace/events/pci_controller.h>
diff --git a/include/trace/events/pci_controller.h b/include/trace/events/pci_controller.h
new file mode 100644
index 0000000..a4b387c
--- /dev/null
+++ b/include/trace/events/pci_controller.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pci_controller
+
+#if !defined(_TRACE_HW_EVENT_PCI_CONTROLLER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HW_EVENT_PCI_CONTROLLER_H
+
+#include <uapi/linux/pci_regs.h>
+#include <linux/tracepoint.h>
+
+#define RATE					\
+	EM(PCIE_SPEED_2_5GT,  "2.5 GT/s")	\
+	EM(PCIE_SPEED_5_0GT,  "5.0 GT/s")	\
+	EM(PCIE_SPEED_8_0GT,  "8.0 GT/s")	\
+	EM(PCIE_SPEED_16_0GT, "16.0 GT/s")	\
+	EM(PCIE_SPEED_32_0GT, "32.0 GT/s")	\
+	EM(PCIE_SPEED_64_0GT, "64.0 GT/s")	\
+	EMe(PCI_SPEED_UNKNOWN, "Unknown")
+
+
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+RATE
+
+#undef EM
+#undef EMe
+#define EM(a, b)	{a, b},
+#define EMe(a, b)	{a, b}
+
+TRACE_EVENT(pcie_ltssm_state_transition,
+	TP_PROTO(const char *dev_name, const char *state, u32 rate),
+	TP_ARGS(dev_name, state, rate),
+
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name)
+		__string(state, state)
+		__field(u32, rate)
+	),
+
+	TP_fast_assign(
+		__assign_str(dev_name);
+		__assign_str(state);
+		__entry->rate = rate;
+	),
+
+	TP_printk("dev: %s state: %s rate: %s",
+		__get_str(dev_name), __get_str(state),
+		__print_symbolic(__entry->rate, RATE)
+	)
+);
+
+#endif /* _TRACE_HW_EVENT_PCI_CONTROLLER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
2.7.4


^ permalink raw reply related

* [PATCH v5 0/3] PCI Controller event and LTSSM tracepoint support
From: Shawn Lin @ 2026-03-25  1:58 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Bjorn Helgaas
  Cc: linux-rockchip, linux-pci, linux-trace-kernel, linux-doc,
	Steven Rostedt, Shawn Lin


This patch set adds a new PCI controller event and an LTSSM tracepoint for use by host
drivers that provide LTSSM trace functionality. The first user is pcie-dw-rockchip, which
has a 256-byte FIFO for recording LTSSM transitions.

Testing
=========

This series was tested on RK3588/RK3588s EVB1 with NVMe SSD connected to PCIe3 and PCIe2
root ports.

echo 1 > /sys/kernel/debug/tracing/events/pci_controller/pcie_ltssm_state_transition/enable
cat /sys/kernel/debug/tracing/trace_pipe

 # tracer: nop
 #
 # entries-in-buffer/entries-written: 64/64   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
      kworker/0:0-9       [000] .....     5.600194: pcie_ltssm_state_transition: dev: a40000000.pcie state: DETECT_ACT rate: Unknown
      kworker/0:0-9       [000] .....     5.600198: pcie_ltssm_state_transition: dev: a40000000.pcie state: DETECT_WAIT rate: Unknown
      kworker/0:0-9       [000] .....     5.600199: pcie_ltssm_state_transition: dev: a40000000.pcie state: DETECT_ACT rate: Unknown
      kworker/0:0-9       [000] .....     5.600201: pcie_ltssm_state_transition: dev: a40000000.pcie state: POLL_ACTIVE rate: Unknown
      kworker/0:0-9       [000] .....     5.600202: pcie_ltssm_state_transition: dev: a40000000.pcie state: POLL_CONFIG rate: Unknown
      kworker/0:0-9       [000] .....     5.600204: pcie_ltssm_state_transition: dev: a40000000.pcie state: CFG_LINKWD_START rate: Unknown
      kworker/0:0-9       [000] .....     5.600206: pcie_ltssm_state_transition: dev: a40000000.pcie state: CFG_LINKWD_ACEPT rate: Unknown
      kworker/0:0-9       [000] .....     5.600207: pcie_ltssm_state_transition: dev: a40000000.pcie state: CFG_LANENUM_WAI rate: Unknown
      kworker/0:0-9       [000] .....     5.600208: pcie_ltssm_state_transition: dev: a40000000.pcie state: CFG_LANENUM_ACEPT rate: Unknown
      kworker/0:0-9       [000] .....     5.600210: pcie_ltssm_state_transition: dev: a40000000.pcie state: CFG_COMPLETE rate: Unknown
      kworker/0:0-9       [000] .....     5.600212: pcie_ltssm_state_transition: dev: a40000000.pcie state: CFG_IDLE rate: Unknown
      kworker/0:0-9       [000] .....     5.600213: pcie_ltssm_state_transition: dev: a40000000.pcie state: L0 rate: 2.5 GT/s
      kworker/0:0-9       [000] .....     5.600214: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_LOCK rate: Unknown
      kworker/0:0-9       [000] .....     5.600216: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_RCVRCFG rate: Unknown
      kworker/0:0-9       [000] .....     5.600217: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_SPEED rate: Unknown
      kworker/0:0-9       [000] .....     5.600218: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_LOCK rate: Unknown
      kworker/0:0-9       [000] .....     5.600220: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_EQ1 rate: Unknown
      kworker/0:0-9       [000] .....     5.600221: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_EQ2 rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600222: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_EQ3 rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600224: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_LOCK rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600225: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_RCVRCFG rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600226: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_IDLE rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600227: pcie_ltssm_state_transition: dev: a40000000.pcie state: L0 rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600228: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_LOCK rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600229: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_RCVRCFG rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600231: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_IDLE rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600232: pcie_ltssm_state_transition: dev: a40000000.pcie state: L0 rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600233: pcie_ltssm_state_transition: dev: a40000000.pcie state: L123_SEND_EIDLE rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600234: pcie_ltssm_state_transition: dev: a40000000.pcie state: L1_IDLE rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600236: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_LOCK rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600237: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_RCVRCFG rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600238: pcie_ltssm_state_transition: dev: a40000000.pcie state: RCVRY_IDLE rate: 8.0 GT/s
      kworker/0:0-9       [000] .....     5.600239: pcie_ltssm_state_transition: dev: a40000000.pcie state: L0 rate: 8.0 GT/s


Changes in v5:
- rebase
- use EM/EMe instead
- remove reg/unreg function and back to use TRACE_EVENT
- use trace_pcie_ltssm_state_transition_enabled()

Changes in v4:
- use TRACE_EVENT_FN to notify when to start and stop the tracepoint,
  and export pci_ltssm_tp_enabled() for host drivers to use
- skip trace if pci_ltssm_tp_enabled() is false.(Steven)
- wrap into 80 columns(Bjorn)

Changes in v3:
- add TRACE_DEFINE_ENUM for all enums(Steven Rostedt)
- Add toctree entry in Documentation/trace/index.rst(Bagas Sanjaya)
- fix mismatch section underline length(Bagas Sanjaya)
- Make example snippets in code block(Bagas Sanjaya)
- wrap context into 80 columns and fix the file name(Bjorn)
- reorder variables(Mani)
- rename loop to i; rename en to enable(Mani)
- use FIELD_GET(Mani)
- add comment about how the FIFO works(Mani)

Changes in v2:
- use tracepoint

Shawn Lin (3):
  PCI: trace: Add PCI controller LTSSM transition tracepoint
  Documentation: tracing: Add PCI controller event documentation
  PCI: dw-rockchip: Add pcie_ltssm_state_transition trace support

 Documentation/trace/events-pci-controller.rst |  42 ++++++++++
 Documentation/trace/index.rst                 |   1 +
 drivers/pci/controller/dwc/pcie-dw-rockchip.c | 111 ++++++++++++++++++++++++++
 drivers/pci/trace.c                           |   1 +
 include/trace/events/pci_controller.h         |  58 ++++++++++++++
 5 files changed, 213 insertions(+)
 create mode 100644 Documentation/trace/events-pci-controller.rst
 create mode 100644 include/trace/events/pci_controller.h

-- 
2.7.4


^ permalink raw reply

* Re: [PATCH v12 0/4] ring-buffer: Making persistent ring buffers robust
From: Steven Rostedt @ 2026-03-25  0:47 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, Ian Rogers
In-Reply-To: <20260324203941.2bd628d9@gandalf.local.home>

On Tue, 24 Mar 2026 20:39:41 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 25 Mar 2026 09:13:04 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > I found the branch does not have a fix [1] and it makes a conflict with
> > [2/4]. Can you update it to merge the trace/fixes branch?
> > 
> > [1] f35dbac69421 ("ring-buffer: Fix to update per-subbuf entries of persistent ring buffer")
> > 
> > I can send a series on ring-buffer/for-next but I think it may
> > cause the same conflict on git-pull later.  
> 
> Ah crap. OK, let me merge in that commit to the for-next branch, and then see if it works.
> 
> Thanks for letting me know.

OK, refresh the ring-buffer/for-next branch and it has the fix merged.

There's still a small conflict (with a header file include). Could you
rebase on that?

Thanks,

-- Steve

^ permalink raw reply

* Re: [PATCH v12 0/4] ring-buffer: Making persistent ring buffers robust
From: Steven Rostedt @ 2026-03-25  0:39 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, Ian Rogers
In-Reply-To: <20260325091304.c94f4e9505660202f96006a1@kernel.org>

On Wed, 25 Mar 2026 09:13:04 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> I found the branch does not have a fix [1] and it makes a conflict with
> [2/4]. Can you update it to merge the trace/fixes branch?
> 
> [1] f35dbac69421 ("ring-buffer: Fix to update per-subbuf entries of persistent ring buffer")
> 
> I can send a series on ring-buffer/for-next but I think it may
> cause the same conflict on git-pull later.

Ah crap. OK, let me merge in that commit to the for-next branch, and then see if it works.

Thanks for letting me know.

-- Steve

^ permalink raw reply

* Re: [PATCH] ring-buffer: Show what clock function is used on timestamp errors
From: Masami Hiramatsu @ 2026-03-25  0:17 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Linux Trace Kernel, Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <20260323202212.479bb288@gandalf.local.home>

On Mon, 23 Mar 2026 20:22:12 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> The testing for tracing was triggering a timestamp count issue that was
> always off by one. This has been happening for some time but has never
> been reported by anyone else. It was finally discovered to be an issue
> with the "uptime" (jiffies) clock that happened to be traced and the
> internal recursion caused the discrepancy. This would have been much
> easier to solve if the clock function being used was displayed when the
> error was detected.
> 
> Add the clock function to the error output.

Looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
>  kernel/trace/ring_buffer.c | 10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 170170bd83bd..a99d1c7d180b 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -4435,18 +4435,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
>  	ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
>  	if (ret < 0) {
>  		if (delta < ts) {
> -			buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
> -					   cpu_buffer->cpu, ts, delta);
> +			buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
> +					   cpu_buffer->cpu, ts, delta,
> +					   cpu_buffer->buffer->clock);
>  			goto out;
>  		}
>  	}
>  	if ((full && ts > info->ts) ||
>  	    (!full && ts + info->delta != info->ts)) {
> -		buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
> +		buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS",
>  				   cpu_buffer->cpu,
>  				   ts + info->delta, info->ts, info->delta,
>  				   info->before, info->after,
> -				   full ? " (full)" : "", show_interrupt_level());
> +				   full ? " (full)" : "", show_interrupt_level(),
> +				   cpu_buffer->buffer->clock);
>  	}
>  out:
>  	atomic_dec(this_cpu_ptr(&checking));
> -- 
> 2.51.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v12 0/4] ring-buffer: Making persistent ring buffers robust
From: Masami Hiramatsu @ 2026-03-25  0:13 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Steven Rostedt, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <20260325085644.4310a1295b0b962ed87b5b60@kernel.org>

On Wed, 25 Mar 2026 08:56:44 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Tue, 24 Mar 2026 17:25:57 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Tue, 24 Mar 2026 09:00:03 +0900
> > "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> > 
> > > Hi,
> > > 
> > > Here is the 12th version of improvement patches for making persistent
> > > ring buffers robust to failures.
> > > The previous version is here:
> > > 
> > > https://lore.kernel.org/all/177391152793.193994.8986943289250629418.stgit@mhiramat.tok.corp.google.com/
> > > 
> > > 
> > > In this version, I rebased it on tracing/fixes branch (thus
> > > the first fix patch has been removed) and fixed
> > > a build error which came from a copy-paste mistake.
> > > 
> > 
> > Hi Masami,
> > 
> > Can you rebase this on the ring-buffer/for-next branch?
> > 
> > It has the remote updates for pkvm tracing, and there's some conflicts.
> > That branch is also used by the ARM tree so it can't be rebased.
> 
> OK, let me rebase the series on it.

I found the branch does not have a fix [1] and it makes a conflict with
[2/4]. Can you update it to merge the trace/fixes branch?

[1] f35dbac69421 ("ring-buffer: Fix to update per-subbuf entries of persistent ring buffer")

I can send a series on ring-buffer/for-next but I think it may
cause the same conflict on git-pull later.

Thanks,

> 
> Thanks!
> 
> > 
> > Thanks,
> > 
> > -- Steve
> > 
> 
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v12 0/4] ring-buffer: Making persistent ring buffers robust
From: Masami Hiramatsu @ 2026-03-24 23:56 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, Ian Rogers
In-Reply-To: <20260324172557.04cfc2a9@gandalf.local.home>

On Tue, 24 Mar 2026 17:25:57 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 24 Mar 2026 09:00:03 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> 
> > Hi,
> > 
> > Here is the 12th version of improvement patches for making persistent
> > ring buffers robust to failures.
> > The previous version is here:
> > 
> > https://lore.kernel.org/all/177391152793.193994.8986943289250629418.stgit@mhiramat.tok.corp.google.com/
> > 
> > 
> > In this version, I rebased it on tracing/fixes branch (thus
> > the first fix patch has been removed) and fixed
> > a build error which came from a copy-paste mistake.
> > 
> 
> Hi Masami,
> 
> Can you rebase this on the ring-buffer/for-next branch?
> 
> It has the remote updates for pkvm tracing, and there's some conflicts.
> That branch is also used by the ARM tree so it can't be rebased.

OK, let me rebase the series on it.

Thanks!

> 
> Thanks,
> 
> -- Steve
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4 0/5] mm: zone lock tracepoint instrumentation
From: Andrew Morton @ 2026-03-24 23:39 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Steven Rostedt, Matthew Wilcox, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Masami Hiramatsu, Mathieu Desnoyers, Rafael J. Wysocki,
	Pavel Machek, Len Brown, Brendan Jackman, Johannes Weiner, Zi Yan,
	Oscar Salvador, Qi Zheng, Shakeel Butt, linux-kernel, linux-mm,
	linux-trace-kernel, linux-pm
In-Reply-To: <abv4riGLlubast8v@shell.ilvokhin.com>

On Thu, 19 Mar 2026 13:22:54 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> On Mon, Mar 16, 2026 at 05:40:50PM +0000, Dmitry Ilvokhin wrote:
> 
> [...]
> 
> > A possible generic solution is a trace_contended_release() for spin
> > locks, for example:
> > 
> >     if (trace_contended_release_enabled() &&
> >         atomic_read(&lock->val) & ~_Q_LOCKED_MASK)
> >         trace_contended_release(lock);
> > 
> > This might work on x86, but could increase code size and regress
> > performance on arches where spin_unlock() is inlined, such as arm64
> > under !PREEMPTION.
> 
> I took a stab at this idea and submitted an RFC [1].
> 
> The implementation builds on your earlier observation from Matthew that
> _raw_spin_unlock() is not inlined in most configurations. In those
> cases, when the tracepoint is disabled, this adds a single NOP on the
> fast path, with the conditional check staying out of line. The measured
> text size increase in this configuration is +983 bytes.
> 
> For configurations where _raw_spin_unlock() is inlined, the
> instrumentation does increase code size more noticeably
> (+71 KB in my measurements), since the check and out of line call is
> replicated at each call site.
> 
> This provides a generic release-side signal for contended locks,
> allowing: correlation of lock holders with waiters and measurement of
> contended hold times
> 
> This RFC addresses the same visibility gap without introducing per-lock
> instrumentation.
> 
> If this tradeoff is acceptable, this could be a generic alternative to
> lock-specific tracepoints.
> 
> [1]: https://lore.kernel.org/all/51aad0415b78c5a39f2029722118fa01eac77538.1773858853.git.d@ilvokhin.com 

That submission has met a disappointing response.

How should I proceed with this series "mm: zone lock tracepoint
instrumentation"?  It's not urgent so I'm inclined to put this on hold
while you pursue "locking: Add contended_release tracepoint to spinning
locks"?

Please send that v2 sometime and hopefully Steven can help push it along?

^ permalink raw reply

* [PATCH v4 2/2] tracing: Drain deferred trigger frees if kthread creation fails
From: Wesley Atwell @ 2026-03-24 22:13 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: linux-kernel, rostedt, mhiramat, mark.rutland, mathieu.desnoyers,
	tom.zanussi, Wesley Atwell
In-Reply-To: <20260324221326.1395799-1-atwellwea@gmail.com>

Boot-time trigger registration can fail before the trigger-data cleanup
kthread exists. Deferring those frees until late init is fine, but the
post-boot fallback must still drain the deferred list if kthread
creation never succeeds.

Otherwise, boot-deferred nodes can accumulate on
trigger_data_free_list, later frees fall back to synchronously freeing
only the current object, and the older queued entries are leaked
forever.

Keep the deferred boot-time behavior, but when kthread creation fails,
drain the whole queued list synchronously. Do the same in the late-init
drain path so queued entries are not stranded there either.

Fixes: 61d445af0a7c ("tracing: Add bulk garbage collection of freeing event_trigger_data")
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 kernel/trace/trace_events_trigger.c | 79 ++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d5230b759a2d..655db2e82513 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread;
 static struct llist_head trigger_data_free_list;
 static DEFINE_MUTEX(trigger_data_kthread_mutex);
 
+static int trigger_kthread_fn(void *ignore);
+
+static void trigger_create_kthread_locked(void)
+{
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	if (!trigger_kthread) {
+		struct task_struct *kthread;
+
+		kthread = kthread_create(trigger_kthread_fn, NULL,
+					 "trigger_data_free");
+		if (!IS_ERR(kthread))
+			WRITE_ONCE(trigger_kthread, kthread);
+	}
+}
+
+static void trigger_data_free_queued_locked(void)
+{
+	struct event_trigger_data *data, *tmp;
+	struct llist_node *llnodes;
+
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	llnodes = llist_del_all(&trigger_data_free_list);
+	if (!llnodes)
+		return;
+
+	tracepoint_synchronize_unregister();
+
+	llist_for_each_entry_safe(data, tmp, llnodes, llist)
+		kfree(data);
+}
+
 /* Bulk garbage collection of event_trigger_data elements */
 static int trigger_kthread_fn(void *ignore)
 {
@@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data)
 	if (data->cmd_ops->set_filter)
 		data->cmd_ops->set_filter(NULL, data, NULL);
 
+	/*
+	 * Boot-time trigger registration can fail before kthread creation
+	 * works. Keep the deferred-free semantics during boot and let late
+	 * init start the kthread to drain the list.
+	 */
+	if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+		llist_add(&data->llist, &trigger_data_free_list);
+		return;
+	}
+
 	if (unlikely(!trigger_kthread)) {
 		guard(mutex)(&trigger_data_kthread_mutex);
+
+		trigger_create_kthread_locked();
 		/* Check again after taking mutex */
 		if (!trigger_kthread) {
-			struct task_struct *kthread;
-
-			kthread = kthread_create(trigger_kthread_fn, NULL,
-						 "trigger_data_free");
-			if (!IS_ERR(kthread))
-				WRITE_ONCE(trigger_kthread, kthread);
+			llist_add(&data->llist, &trigger_data_free_list);
+			/* Drain the queued frees synchronously if creation failed. */
+			trigger_data_free_queued_locked();
+			return;
 		}
 	}
 
-	if (!trigger_kthread) {
-		/* Do it the slow way */
-		tracepoint_synchronize_unregister();
-		kfree(data);
-		return;
-	}
-
 	llist_add(&data->llist, &trigger_data_free_list);
 	wake_up_process(trigger_kthread);
 }
 
+static int __init trigger_data_free_init(void)
+{
+	guard(mutex)(&trigger_data_kthread_mutex);
+
+	if (llist_empty(&trigger_data_free_list))
+		return 0;
+
+	trigger_create_kthread_locked();
+	if (trigger_kthread)
+		wake_up_process(trigger_kthread);
+	else
+		trigger_data_free_queued_locked();
+
+	return 0;
+}
+late_initcall(trigger_data_free_init);
+
 static inline void data_ops_trigger(struct event_trigger_data *data,
 				    struct trace_buffer *buffer,  void *rec,
 				    struct ring_buffer_event *event)
-- 
2.43.0


^ permalink raw reply related

* [PATCH v4 1/2] tracing: Preserve repeated boot-time tracing parameters
From: Wesley Atwell @ 2026-03-24 22:13 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: linux-kernel, rostedt, mhiramat, mark.rutland, mathieu.desnoyers,
	tom.zanussi, Wesley Atwell
In-Reply-To: <20260324221326.1395799-1-atwellwea@gmail.com>

Some tracing boot parameters already accept delimited value lists, but
their __setup() handlers keep only the last instance seen at boot.
Make repeated instances append to the same boot-time buffer in the
format each parser already consumes.

Use a shared trace_append_boot_param() helper for the ftrace filters,
trace_options, and kprobe_event boot parameters. trace_trigger=
tokenizes its backing storage in place, so keep a running offset and
only parse the newly appended chunk into bootup_triggers[].

This also lets Bootconfig array values work naturally when they expand
to repeated param=value entries.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 kernel/trace/ftrace.c       | 12 ++++++++----
 kernel/trace/trace.c        | 31 ++++++++++++++++++++++++++++++-
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c | 26 +++++++++++++++++++++++---
 kernel/trace/trace_kprobe.c |  3 ++-
 5 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 413310912609..8bd3dd1d549c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6841,7 +6841,8 @@ bool ftrace_filter_param __initdata;
 static int __init set_ftrace_notrace(char *str)
 {
 	ftrace_filter_param = true;
-	strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	trace_append_boot_param(ftrace_notrace_buf, str, ',',
+				FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
@@ -6849,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
 static int __init set_ftrace_filter(char *str)
 {
 	ftrace_filter_param = true;
-	strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	trace_append_boot_param(ftrace_filter_buf, str, ',',
+				FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
@@ -6861,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
 
 static int __init set_graph_function(char *str)
 {
-	strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+	trace_append_boot_param(ftrace_graph_buf, str, ',',
+				FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_graph_filter=", set_graph_function);
 
 static int __init set_graph_notrace_function(char *str)
 {
-	strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+	trace_append_boot_param(ftrace_graph_notrace_buf, str, ',',
+				FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_graph_notrace=", set_graph_notrace_function);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a626211ceb9a..1b1f6f9035c6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -228,6 +228,34 @@ static int boot_instance_index;
 static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
 static int boot_snapshot_index;
 
+/*
+ * Repeated boot parameters, including Bootconfig array expansions, need
+ * to stay in the delimiter form that the existing parser consumes.
+ */
+void __init trace_append_boot_param(char *buf, const char *str, char sep,
+				    size_t size)
+{
+	size_t len, str_len;
+
+	if (!buf[0]) {
+		strscpy(buf, str, size);
+		return;
+	}
+
+	str_len = strlen(str);
+	if (!str_len)
+		return;
+
+	len = strlen(buf);
+	if (len >= size - 1)
+		return;
+	if (str_len >= size - len - 1)
+		return;
+
+	buf[len] = sep;
+	strscpy(buf + len + 1, str, size - len - 1);
+}
+
 static int __init set_cmdline_ftrace(char *str)
 {
 	strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
@@ -329,7 +357,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
 
 static int __init set_trace_boot_options(char *str)
 {
-	strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	trace_append_boot_param(trace_boot_options_buf, str, ',',
+				MAX_TRACER_SIZE);
 	return 1;
 }
 __setup("trace_options=", set_trace_boot_options);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b8f3804586a0..64ce179d12dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -863,6 +863,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
 extern int DYN_FTRACE_TEST_NAME2(void);
 
 extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
+void __init trace_append_boot_param(char *buf, const char *str,
+				    char sep, size_t size);
 extern bool tracing_selftest_disabled;
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 249d1cba72c0..5f72be33f2d1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3679,20 +3679,40 @@ static struct boot_triggers {
 } bootup_triggers[MAX_BOOT_TRIGGERS];
 
 static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static size_t bootup_trigger_buf_len;
 static int nr_boot_triggers;
 
 static __init int setup_trace_triggers(char *str)
 {
 	char *trigger;
 	char *buf;
+	size_t start, str_len;
 	int i;
 
-	strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+	if (bootup_trigger_buf_len >= COMMAND_LINE_SIZE)
+		return 1;
+
+	start = bootup_trigger_buf_len;
+	if (start && !*str)
+		return 1;
+
+	str_len = strlen(str);
+	if (start && str_len >= COMMAND_LINE_SIZE - start)
+		return 1;
+
+	/*
+	 * trace_trigger= parsing tokenizes the backing storage in place.
+	 * Copy each repeated parameter into fresh space and only parse that
+	 * newly copied chunk here.
+	 */
+	trace_append_boot_param(bootup_trigger_buf + start, str, '\0',
+				COMMAND_LINE_SIZE - start);
+	bootup_trigger_buf_len += strlen(bootup_trigger_buf + start) + 1;
 	trace_set_ring_buffer_expanded(NULL);
 	disable_tracing_selftest("running event triggers");
 
-	buf = bootup_trigger_buf;
-	for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+	buf = bootup_trigger_buf + start;
+	for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) {
 		trigger = strsep(&buf, ",");
 		if (!trigger)
 			break;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a5dbb72528e0..e9f1c55aea64 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
 
 static int __init set_kprobe_boot_events(char *str)
 {
-	strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+	trace_append_boot_param(kprobe_boot_events_buf, str, ';',
+				COMMAND_LINE_SIZE);
 	disable_tracing_selftest("running kprobe events");
 
 	return 1;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v4 0/2] tracing: Preserve repeated boot-time parameters and drain deferred trigger frees
From: Wesley Atwell @ 2026-03-24 22:13 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: linux-kernel, rostedt, mhiramat, mark.rutland, mathieu.desnoyers,
	tom.zanussi, Wesley Atwell

Patch 1 preserves repeated boot-time tracing parameters by appending to
the existing parser buffers instead of overwriting earlier values, so
repeated command-line entries and Bootconfig array expansions work with
the delimited formats the parsers already consume.

Patch 2 fixes the deferred event-trigger free path so queued frees are
still drained if the cleanup kthread never comes up after boot.

v4:
 - drop the kernel-parameters.txt update from patch 1
 - move trace_append_boot_param() into trace.c and keep only the
   prototype in trace.h
 - capitalize the tracing patch subjects
 - rename trigger_start_kthread_locked() to trigger_create_kthread_locked()
 - change the failure comment to say "creation failed"

Wesley Atwell (2):
  tracing: Preserve repeated boot-time tracing parameters
  tracing: Drain deferred trigger frees if kthread creation fails

 kernel/trace/ftrace.c               | 12 +++--
 kernel/trace/trace.c                | 31 ++++++++++-
 kernel/trace/trace.h                |  2 +
 kernel/trace/trace_events.c         | 26 ++++++++--
 kernel/trace/trace_events_trigger.c | 79 ++++++++++++++++++++++++-----
 kernel/trace/trace_kprobe.c         |  3 +-
 6 files changed, 131 insertions(+), 22 deletions(-)

-- 
2.43.0

^ permalink raw reply

* Re: [PATCH] tracing: Allow perf to read synthetic events
From: Steven Rostedt @ 2026-03-24 21:41 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: LKML, Linux Trace Kernel, Masami Hiramatsu, Mathieu Desnoyers,
	Ian Rogers, Arnaldo Carvalho de Melo, Namhyung Kim,
	Peter Zijlstra
In-Reply-To: <20251229183632.7b518b8d@gandalf.local.home>

On Mon, 29 Dec 2025 18:36:32 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> diff --git a/include/traceevent/event-parse.h b/include/traceevent/event-parse.h
> index ebfc2c7..9c1abfa 100644
> --- a/include/traceevent/event-parse.h
> +++ b/include/traceevent/event-parse.h
> @@ -138,6 +138,11 @@ struct tep_print_arg_bitmask {
>  	struct tep_format_field	*field;
>  };
>  
> +struct tep_print_arg_stacktrace {
> +	char			*stacktrace;
> +	struct tep_format_field	*field;
> +};
> +

[..]

So this has already been applied to libtraceevent 1.9. I'll get the kernel
side patch ready for the next merge window.

-- Steve

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox