Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* Re: [PATCHv4 bpf-next 24/25] selftests/bpf: Add tracing multi attach benchmark test
From: Jiri Olsa @ 2026-03-25 21:48 UTC (permalink / raw)
  To: Leon Hwang
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
	linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <7a119223-9994-4edc-af0b-f1ee9876cd20@linux.dev>

On Wed, Mar 25, 2026 at 02:45:31PM +0800, Leon Hwang wrote:

SNIP

> > +
> > +	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
> > +	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
> > +
> > +	printf("%s: found %lu functions\n", __func__, cnt);
> > +	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
> > +	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
> > +
> > +cleanup:
> > +	tracing_multi_bench__destroy(skel);
> > +	tdestroy(root, tdestroy_free_nop);
> > +	free_kallsyms_local(ksyms);
> > +	free(ids);
> 
> Is btf__free(btf) missing here? 'btf' was allocated with calloc() inside
> btf__load_vmlinux_btf().

ah yea, will add, thanks

jirka

^ permalink raw reply

* Re: [PATCHv4 bpf-next 24/25] selftests/bpf: Add tracing multi attach benchmark test
From: Jiri Olsa @ 2026-03-25 21:48 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Leon Hwang, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <CAADnVQL3gO=kuvDHQNa8VfL_NWUMvBv6=vrXkOd=8Fe9-dcU3A@mail.gmail.com>

On Wed, Mar 25, 2026 at 08:11:00AM -0700, Alexei Starovoitov wrote:

SNIP

> > > +     attach_start_ns = get_time_ns();
> > > +     link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts);
> > > +     attach_end_ns = get_time_ns();
> > > +
> > > +     if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi"))
> > > +             goto cleanup;
> > > +
> > > +     detach_start_ns = get_time_ns();
> > > +     bpf_link__destroy(link);
> > > +     detach_end_ns = get_time_ns();
> > > +
> > > +     attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
> > > +     detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
> > > +
> > > +     printf("%s: found %lu functions\n", __func__, cnt);
> > > +     printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
> > > +     printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
> > > +
> > > +cleanup:
> > > +     tracing_multi_bench__destroy(skel);
> > > +     tdestroy(root, tdestroy_free_nop);
> > > +     free_kallsyms_local(ksyms);
> > > +     free(ids);
> >
> > Is btf__free(btf) missing here? 'btf' was allocated with calloc() inside
> > btf__load_vmlinux_btf().
> 
> Good point.
> Leon, please trim your replies. No need to quote the whole patch.
> 
> btw sashiko caught it too:
> https://sashiko.dev/#/patchset/20260324081846.2334094-1-jolsa%40kernel.org
> and many other bugs beyond what bpf CI could find.
> 
> Jiri, please address them all.

ok, will check

jirka

^ permalink raw reply

* Re: [PATCH v2 02/19] kernel: Use trace_call__##name() at guarded tracepoint call sites
From: Thomas Gleixner @ 2026-03-25 17:16 UTC (permalink / raw)
  To: Vineeth Pillai (Google)
  Cc: Vineeth Pillai (Google), Steven Rostedt, Peter Zijlstra,
	Tejun Heo, David Vernet, Andrea Righi, Changwoo Min, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Ben Segall,
	Mel Gorman, Valentin Schneider, Yury Norov [NVIDIA],
	Paul E. McKenney, Rik van Riel, Roman Kisel, Joel Fernandes,
	Rafael J. Wysocki, Ulf Hansson, linux-kernel, sched-ext,
	linux-trace-kernel
In-Reply-To: <20260323160052.17528-3-vineeth@bitbyteword.org>

On Mon, Mar 23 2026 at 12:00, Vineeth Pillai wrote:

> Replace trace_foo() with the new trace_call__foo() at sites already
> guarded by trace_foo_enabled(), avoiding a redundant
> static_branch_unlikely() re-evaluation inside the tracepoint.
> trace_call__foo() calls the tracepoint callbacks directly without
> utilizing the static branch again.
>
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Assisted-by: Claude:claude-sonnet-4-6

Acked-by: Thomas Gleixner <tglx@kernel.org>

^ permalink raw reply

* Re: [PATCHv4 bpf-next 24/25] selftests/bpf: Add tracing multi attach benchmark test
From: Alexei Starovoitov @ 2026-03-25 15:11 UTC (permalink / raw)
  To: Leon Hwang
  Cc: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <7a119223-9994-4edc-af0b-f1ee9876cd20@linux.dev>

On Tue, Mar 24, 2026 at 11:45 PM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> > +
> > +     btf = btf__load_vmlinux_btf();
> > +     if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
> > +             return;
> > +
> > +     skel = tracing_multi_bench__open_and_load();
> > +     if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load"))
> > +             goto cleanup;
> > +
> > +     if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms"))
> > +             goto cleanup;
> > +
> > +     /* Get all ftrace 'safe' symbols.. */
> > +     for (i = 0; i < ksyms->filtered_cnt; i++) {
> > +             if (is_unsafe_function(ksyms->filtered_syms[i]))
> > +                     continue;
> > +             tsearch(&ksyms->filtered_syms[i], &root, compare);
> > +     }
> > +
> > +     /* ..and filter them through BTF and btf_type_is_traceable_func. */
> > +     nr = btf__type_cnt(btf);
> > +     for (type_id = 1; type_id < nr; type_id++) {
> > +             const struct btf_type *type;
> > +             const char *str;
> > +
> > +             type = btf__type_by_id(btf, type_id);
> > +             if (!type)
> > +                     break;
> > +
> > +             if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
> > +                     continue;
> > +
> > +             str = btf__name_by_offset(btf, type->name_off);
> > +             if (!str)
> > +                     break;
> > +
> > +             if (!tfind(&str, &root, compare))
> > +                     continue;
> > +
> > +             if (!btf_type_is_traceable_func(btf, type))
> > +                     continue;
> > +
> > +             err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1);
> > +             if (err)
> > +                     goto cleanup;
> > +
> > +             ids[cnt++] = type_id;
> > +     }
> > +
> > +     opts.ids = ids;
> > +     opts.cnt = cnt;
> > +
> > +     attach_start_ns = get_time_ns();
> > +     link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts);
> > +     attach_end_ns = get_time_ns();
> > +
> > +     if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi"))
> > +             goto cleanup;
> > +
> > +     detach_start_ns = get_time_ns();
> > +     bpf_link__destroy(link);
> > +     detach_end_ns = get_time_ns();
> > +
> > +     attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
> > +     detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
> > +
> > +     printf("%s: found %lu functions\n", __func__, cnt);
> > +     printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
> > +     printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
> > +
> > +cleanup:
> > +     tracing_multi_bench__destroy(skel);
> > +     tdestroy(root, tdestroy_free_nop);
> > +     free_kallsyms_local(ksyms);
> > +     free(ids);
>
> Is btf__free(btf) missing here? 'btf' was allocated with calloc() inside
> btf__load_vmlinux_btf().

Good point.
Leon, please trim your replies. No need to quote the whole patch.

btw sashiko caught it too:
https://sashiko.dev/#/patchset/20260324081846.2334094-1-jolsa%40kernel.org
and many other bugs beyond what bpf CI could find.

Jiri, please address them all.

^ permalink raw reply

* Re: [PATCHv4 bpf-next 00/25] bpf: tracing_multi link
From: Leon Hwang @ 2026-03-25 14:58 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: Hengqi Chen, bpf, linux-trace-kernel, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, Menglong Dong,
	Steven Rostedt
In-Reply-To: <7fc7e5ad-ac42-4c7d-8314-bd252f8887a3@linux.dev>

On 2026/3/25 14:42, Leon Hwang wrote:
> Hi Jiri,
> 
> Nice version for tracing_multi link.
> 
> I hope I have time to add tracing_multi link support to bpfsnoop, and
> test this new tracing feature.
> 
> I left comments on patches #13, #24, and #25.
> 
Hmm, sashiko's reviews [1] cover my comments on patches #24 and #25. I
should check them first.

[1]
https://sashiko.dev/#/patchset/20260324081846.2334094-1-jolsa%40kernel.org

Thanks,
Leon

[...]


^ permalink raw reply

* Re: [PATCH] tracing/osnoise: fix potential deadlock in cpu hotplug
From: Steven Rostedt @ 2026-03-25 14:34 UTC (permalink / raw)
  To: hu.shengming
  Cc: mhiramat, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	zhang.run, yang.tao172, ran.xiaokai, luo.haiyang
In-Reply-To: <20260325102542300G48VT-wLNp-dOgT_9Qi2f@zte.com.cn>

On Wed, 25 Mar 2026 10:25:42 +0800 (CST)
<hu.shengming@zte.com.cn> wrote:

> >On Tue, 24 Mar 2026 15:06:16 +0800 (CST)
> ><hu.shengming@zte.com.cn> wrote:
> >  
> >> From: luohaiyang10243395 <luo.haiyang@zte.com.cn>
> >> 
> >> The following sequence may lead to a deadlock in cpu hotplug:
> >> 
> >>   CPU0                        |  CPU1
> >>                               |  schedule_work_on
> >>                               |
> >>   _cpu_down//set CPU1 offline |
> >>   cpus_write_lock             |
> >>                               |  osnoise_hotplug_workfn
> >>                               |    mutex_lock(&interface_lock);
> >>                               |    cpus_read_lock();  //wait cpu_hotplug_lock
> >>                               |
> >>                               |  cpuhp/1
> >>                               |    osnoise_cpu_die
> >>                               |      kthread_stop
> >>                               |        wait_for_completion //wait osnoise/1 exit
> >>                               |
> >>                               |  osnoise/1
> >>                               |    osnoise_sleep
> >>                               |      mutex_lock(&interface_lock); //deadlock
> >> 
> >> Fix by swapping the order of cpus_read_lock() and mutex_lock(&interface_lock).  
> >
> >So the deadlock is due to the "wait_for_completion"?  
> 
> The osnoise_cpu_init callback returns directly, which may allow another CPU offline task to run.
> The offline task holds the cpu_hotplug_lock while waiting for the osnoise task to exit.
> osnoise_hotplug_workfn may acquire interface_lock first, causing the offline task to be blocked.
> This is an ABBA deadlock.

Right, as I said, it is due to the "wait_for_completion" and not due to two
different locks. One is waiting for the osnoise task to exit (the
"wait_for_completion") but the osnoise task is blocked on the interface_lock().

Better to show it as:


    task1		task2		task3
    -----		-----		-----

 mutex_lock(&interface_lock)

		    [CPU GOING OFFLINE]

		    cpus_write_lock();
		    osnoise_cpu_die();
		      kthread_stop(task3);
		        wait_for_completion();

				      osnoise_sleep();
				        mutex_lock(&interface_lock);

 cpus_read_lock();

 [DEAD LOCK]

> 
> >How did you find this bug? Inspection, AI, triggered?
> >
> >Thanks,
> >
> >-- Steve  
> 
> We ran autotests on kernel-6.6, which reported the following hung task warning, and we think the same issue exists
> in linux-stable.

Thanks. It's usually good to state how a bug was discovered when fixing it.

Could you send a v2 with an updated change log?

-- Steve

^ permalink raw reply

* Re: [PATCH v2] bootconfig: Apply early options from embedded config
From: Masami Hiramatsu @ 2026-03-25 14:22 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Jonathan Corbet, Shuah Khan, linux-kernel, linux-trace-kernel,
	linux-doc, oss, paulmck, rostedt, kernel-team
In-Reply-To: <20260325-early_bootconfig-v2-1-6b05a36fbfb5@debian.org>

Hi Breno,

On Wed, 25 Mar 2026 03:05:38 -0700
Breno Leitao <leitao@debian.org> wrote:

> Bootconfig currently cannot be used to configure early kernel
> parameters. For example, the "mitigations=" parameter must be passed
> through traditional boot methods because bootconfig parsing happens
> after these early parameters need to be processed.
> 
> This patch allows early options such as:
> 
>   kernel.mitigations = off
> 
> to be placed in the embedded bootconfig and take effect, without
> requiring them to be on the kernel command line.
> 
> Add bootconfig_apply_early_params() which walks all kernel.* keys in the
> parsed XBC tree and calls do_early_param() for each one. It is called
> from setup_boot_config() immediately after a successful xbc_init() on
> the embedded data, which happens before parse_early_param() runs in
> start_kernel().
> 
> Early options in initrd bootconfig are still silently ignored, as the
> initrd is only available after the early param window has closed.
> 
> Document this behaviour in both Kconfig and the admin guide.

AI review made some comments. Some of the review comments seem
reasonable.

https://sashiko.dev/#/patchset/20260325-early_bootconfig-v2-1-6b05a36fbfb5%40debian.org

[..]
> 
> diff --git a/init/main.c b/init/main.c
> index 453ac9dff2da0..14a04c283fa48 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -416,9 +416,64 @@ static int __init warn_bootconfig(char *str)
>  	return 0;
>  }
>  
> +/*
> + * do_early_param() is defined later in this file but called from
> + * bootconfig_apply_early_params() below, so we need a forward declaration.
> + */
> +static int __init do_early_param(char *param, char *val,
> +				 const char *unused, void *arg);
> +
> +/*
> + * bootconfig_apply_early_params - dispatch kernel.* keys from the embedded
> + * bootconfig as early_param() calls.
> + *
> + * early_param() handlers must run before most of the kernel initialises
> + * (e.g. before the GIC driver reads irqchip.gicv3_pseudo_nmi).  A bootconfig
> + * attached to the initrd arrives too late for this because the initrd is not
> + * mapped yet when early params are processed.  The embedded bootconfig lives
> + * in the kernel image itself (.init.data), so it is always reachable.
> + *
> + * This function is called from setup_boot_config() which runs in
> + * start_kernel() before parse_early_param(), making the timing correct.
> + */
> +static void __init bootconfig_apply_early_params(void)

[sashiko comment]
| Does this run early enough for architectural parameters?
| While setup_boot_config() runs before parse_early_param() in start_kernel(),
| it runs after setup_arch(). setup_boot_config() relies on xbc_init() which
| uses the memblock allocator, requiring setup_arch() to have already
| initialized it.
| However, the kernel expects many early parameters (like mem=, earlycon,
| noapic, and iommu) to be parsed during setup_arch() via the architecture's
| call to parse_early_param(). Since setup_arch() completes before
| setup_boot_config() runs, will these architectural early parameters be
| silently ignored because the decisions they influence were already
| finalized?

This is the major reason that I did not support early parameters
in bootconfig. Some archs initialize kernel_cmdline in setup_arch()
and set up early parameters in it.
To fix this, we need to change setup_arch() for each architecture so
that it calls this bootconfig_apply_early_params().

> +{
> +	static char val_buf[COMMAND_LINE_SIZE] __initdata;

[sashiko comment]
| Can using a single shared static buffer cause data corruption for handlers
| that save the argument pointer?
| Several early_param handlers assume the passed string pointer is persistent
| (like the boot_command_line) and retain it internally. For example,
| setup_earlycon() calls register_earlycon(), which sets
| early_console_dev.con->options = options, where options is a pointer
| directly into the passed buffer.
| Because val_buf is overwritten on every loop iteration, the stored pointer
| will point to the value of the last bootconfig key processed.

Ah, good catch. Since we don't have any standard way to handle the
parameters, some of the handlers do not copy the value but instead keep
a reference to the given string.

> +	struct xbc_node *knode, *root;
> +	const char *val;
> +	ssize_t ret;
> +
> +	root = xbc_find_node("kernel");
> +	if (!root)
> +		return;
> +
> +	/*
> +	 * Keys that do not match any early_param() handler are silently
> +	 * ignored — do_early_param() always returns 0.
> +	 */
> +	xbc_node_for_each_key_value(root, knode, val) {

[sashiko comment]
| Does this loop handle array values correctly?
| xbc_node_for_each_key_value() only assigns the first value of an array to
| the val pointer before advancing to the next key. It does not iterate over
| the child nodes of the array.
| If the bootconfig contains a multi-value key like
| kernel.console = "ttyS0", "tty0", will the subsequent values in the array
| be silently dropped instead of passed to the early_param handlers?

Also, good catch :) we need to use xbc_node_for_each_array_value()
for inner loop.

> +		if (xbc_node_compose_key_after(root, knode, xbc_namebuf, XBC_KEYLEN_MAX) < 0)
> +			continue;
> +
> +		/*
> +		 * We need to copy const char *val to a char pointer,
> +		 * which is what do_early_param() needs, given it might
> +		 * call strsep(), strtok() later.
> +		 */
> +		ret = strscpy(val_buf, val, sizeof(val_buf));
> +		if (ret < 0) {
> +			pr_warn("ignoring bootconfig value '%s', too long\n",
> +				xbc_namebuf);
> +			continue;
> +		}
> +		do_early_param(xbc_namebuf, val_buf, NULL, NULL);

[sashiko comment]
| How does this handle valueless parameters (boolean flags)?
| When parsing the standard kernel command line, parse_args() passes a NULL
| value to the setup function for flags that lack an = sign (e.g., ro or
| earlycon).
| However, the bootconfig parser returns a zero-length string for valueless
| keys, which gets copied into val_buf as "" and passed to do_early_param().
| This semantic deviation breaks handlers that explicitly check if (!val).
| For instance, param_setup_earlycon() and parse_lapic() check for a NULL
| argument to enable features. Will passing "" instead of NULL prevent these
| handlers from working correctly?

See fs/proc/bootconfig.c. You can check whether the key has a value or
not by checking xbc_node_get_child(knode) != NULL.

Thank you,

> +	}
> +}
> +
>  static void __init setup_boot_config(void)
>  {
>  	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
> +	bool using_embedded = false;
>  	const char *msg, *data;
>  	int pos, ret;
>  	size_t size;
> @@ -427,8 +482,17 @@ static void __init setup_boot_config(void)
>  	/* Cut out the bootconfig data even if we have no bootconfig option */
>  	data = get_boot_config_from_initrd(&size);
>  	/* If there is no bootconfig in initrd, try embedded one. */
> -	if (!data)
> +	if (!data) {
>  		data = xbc_get_embedded_bootconfig(&size);
> +		/*
> +		 * Record that we are using the embedded config so that
> +		 * bootconfig_apply_early_params() is called below.
> +		 * When CONFIG_BOOT_CONFIG_EMBED is not set,
> +		 * xbc_get_embedded_bootconfig() is a stub returning NULL, so
> +		 * data is always NULL here and using_embedded stays false.
> +		 */
> +		using_embedded = data;
> +	}
>  
>  	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
>  	err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
> @@ -466,6 +530,8 @@ static void __init setup_boot_config(void)
>  	} else {
>  		xbc_get_info(&ret, NULL);
>  		pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
> +		if (using_embedded)
> +			bootconfig_apply_early_params();
>  		/* keys starting with "kernel." are passed via cmdline */
>  		extra_command_line = xbc_make_cmdline("kernel");
>  		/* Also, "init." keys are init arguments */
> 
> ---
> base-commit: 785f0eb2f85decbe7c1ef9ae922931f0194ffc2e
> change-id: 20260323-early_bootconfig-2efc4509af3d
> 
> Best regards,
> --  
> Breno Leitao <leitao@debian.org>
> 

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4 0/5] mm: zone lock tracepoint instrumentation
From: Steven Rostedt @ 2026-03-25 14:19 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Masami Hiramatsu, Mathieu Desnoyers, Rafael J. Wysocki,
	Pavel Machek, Len Brown, Brendan Jackman, Johannes Weiner, Zi Yan,
	Oscar Salvador, Qi Zheng, Shakeel Butt, linux-kernel, linux-mm,
	linux-trace-kernel, linux-pm
In-Reply-To: <acPRq1YPeGR8EqMB@shell.ilvokhin.com>

On Wed, 25 Mar 2026 12:14:35 +0000
Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> > Please send that v2 sometime and hopefully Steven can help push it along?  
> 
> I'll send the next version of the generic locking series soon. Any help
> in pushing it along would be appreciated.

I'll see what I can do when I see v2!

-- Steve

^ permalink raw reply

* Re: [PATCH v4 0/5] mm: zone lock tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-03-25 12:14 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Steven Rostedt, Matthew Wilcox, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Masami Hiramatsu, Mathieu Desnoyers, Rafael J. Wysocki,
	Pavel Machek, Len Brown, Brendan Jackman, Johannes Weiner, Zi Yan,
	Oscar Salvador, Qi Zheng, Shakeel Butt, linux-kernel, linux-mm,
	linux-trace-kernel, linux-pm
In-Reply-To: <20260324163918.1a3c5c960d85a4243c9ae314@linux-foundation.org>

On Tue, Mar 24, 2026 at 04:39:18PM -0700, Andrew Morton wrote:
> On Thu, 19 Mar 2026 13:22:54 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
> 
> > On Mon, Mar 16, 2026 at 05:40:50PM +0000, Dmitry Ilvokhin wrote:
> > 
> > [...]
> > 
> > > A possible generic solution is a trace_contended_release() for spin
> > > locks, for example:
> > > 
> > >     if (trace_contended_release_enabled() &&
> > >         atomic_read(&lock->val) & ~_Q_LOCKED_MASK)
> > >         trace_contended_release(lock);
> > > 
> > > This might work on x86, but could increase code size and regress
> > > performance on arches where spin_unlock() is inlined, such as arm64
> > > under !PREEMPTION.
> > 
> > I took a stab at this idea and submitted an RFC [1].
> > 
> > The implementation builds on the earlier observation from Matthew that
> > _raw_spin_unlock() is not inlined in most configurations. In those
> > cases, when the tracepoint is disabled, this adds a single NOP on the
> > fast path, with the conditional check staying out of line. The measured
> > text size increase in this configuration is +983 bytes.
> > 
> > For configurations where _raw_spin_unlock() is inlined, the
> > instrumentation does increase code size more noticeably
> > (+71 KB in my measurements), since the check and out of line call is
> > replicated at each call site.
> > 
> > This provides a generic release-side signal for contended locks,
> > allowing correlation of lock holders with waiters and measurement of
> > contended hold times.
> > 
> > This RFC addresses the same visibility gap without introducing per-lock
> > instrumentation.
> > 
> > If this tradeoff is acceptable, this could be a generic alternative to
> > lock-specific tracepoints.
> > 
> > [1]: https://lore.kernel.org/all/51aad0415b78c5a39f2029722118fa01eac77538.1773858853.git.d@ilvokhin.com 
> 
> That submission has met a disappointing response.
> 
> How should I proceed with this series "mm: zone lock tracepoint
> instrumentation"?  It's not urgent so I'm inclined to put this on hold
> while you pursue "locking: Add contended_release tracepoint to spinning
> locks"?

Thanks for the follow-up, Andrew.

My current plan is to focus on the "locking: Add contended_release
tracepoint to spinning locks" work and drive it to a clear conclusion:
either by getting feedback that it's not a good direction, or by getting
it into mainline.

In the meantime, it seems reasonable to drop the "mm: zone lock
tracepoint instrumentation" patchset from mm-new to avoid confusion
until the direction is clearer. I can revisit and respin it if the more
generic locking approach doesn't pan out.

> 
> Please send that v2 sometime and hopefully Steven can help push it along?

I'll send the next version of the generic locking series soon. Any help
in pushing it along would be appreciated.

^ permalink raw reply

* [PATCH v2] bootconfig: Apply early options from embedded config
From: Breno Leitao @ 2026-03-25 10:05 UTC (permalink / raw)
  To: Masami Hiramatsu, Jonathan Corbet, Shuah Khan
  Cc: linux-kernel, linux-trace-kernel, linux-doc, oss, paulmck,
	rostedt, kernel-team, Breno Leitao

Bootconfig currently cannot be used to configure early kernel
parameters. For example, the "mitigations=" parameter must be passed
through traditional boot methods because bootconfig parsing happens
after these early parameters need to be processed.

This patch allows early options such as:

  kernel.mitigations = off

to be placed in the embedded bootconfig and take effect, without
requiring them to be on the kernel command line.

Add bootconfig_apply_early_params() which walks all kernel.* keys in the
parsed XBC tree and calls do_early_param() for each one. It is called
from setup_boot_config() immediately after a successful xbc_init() on
the embedded data, which happens before parse_early_param() runs in
start_kernel().

Early options in initrd bootconfig are still silently ignored, as the
initrd is only available after the early param window has closed.

Document this behaviour in both Kconfig and the admin guide.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
Changes in v2:
- Made val_buf static __initdata to keep 2KB off the stack
- Removed dead !val branch — xbc_node_find_next_key_value() returns "" for boolean keys, never NULL
- Added pr_warn + continue when strscpy truncates the value
- Link to v1: https://patch.msgid.link/20260324-early_bootconfig-v1-1-1c0e625aff06@debian.org
---
 Documentation/admin-guide/bootconfig.rst |  4 ++
 init/Kconfig                             |  6 +++
 init/main.c                              | 68 +++++++++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index f712758472d5c..e820f33d3ad16 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -169,6 +169,10 @@ Boot Kernel With a Boot Config
 There are two options to boot the kernel with bootconfig: attaching the
 bootconfig to the initrd image or embedding it in the kernel itself.
 
+Early options (those registered with ``early_param()``) may only be
+specified in the embedded bootconfig, because the initrd is not yet
+available when early parameters are processed.
+
 Attaching a Boot Config to Initrd
 ---------------------------------
 
diff --git a/init/Kconfig b/init/Kconfig
index 938fbe6a91e15..5e8057e73fe06 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1534,6 +1534,12 @@ config BOOT_CONFIG_EMBED
 	  image. But if the system doesn't support initrd, this option will
 	  help you by embedding a bootconfig file while building the kernel.
 
+	  Unlike bootconfig attached to initrd, the embedded bootconfig also
+	  supports early options (those registered with early_param()). Any
+	  kernel.* key in the embedded bootconfig is applied before
+	  parse_early_param() runs. Early options in initrd bootconfig will
+	  not be applied.
+
 	  If unsure, say N.
 
 config BOOT_CONFIG_EMBED_FILE
diff --git a/init/main.c b/init/main.c
index 453ac9dff2da0..14a04c283fa48 100644
--- a/init/main.c
+++ b/init/main.c
@@ -416,9 +416,64 @@ static int __init warn_bootconfig(char *str)
 	return 0;
 }
 
+/*
+ * do_early_param() is defined later in this file but called from
+ * bootconfig_apply_early_params() below, so we need a forward declaration.
+ */
+static int __init do_early_param(char *param, char *val,
+				 const char *unused, void *arg);
+
+/*
+ * bootconfig_apply_early_params - dispatch kernel.* keys from the embedded
+ * bootconfig as early_param() calls.
+ *
+ * early_param() handlers must run before most of the kernel initialises
+ * (e.g. before the GIC driver reads irqchip.gicv3_pseudo_nmi).  A bootconfig
+ * attached to the initrd arrives too late for this because the initrd is not
+ * mapped yet when early params are processed.  The embedded bootconfig lives
+ * in the kernel image itself (.init.data), so it is always reachable.
+ *
+ * This function is called from setup_boot_config() which runs in
+ * start_kernel() before parse_early_param(), making the timing correct.
+ */
+static void __init bootconfig_apply_early_params(void)
+{
+	static char val_buf[COMMAND_LINE_SIZE] __initdata;
+	struct xbc_node *knode, *root;
+	const char *val;
+	ssize_t ret;
+
+	root = xbc_find_node("kernel");
+	if (!root)
+		return;
+
+	/*
+	 * Keys that do not match any early_param() handler are silently
+	 * ignored — do_early_param() always returns 0.
+	 */
+	xbc_node_for_each_key_value(root, knode, val) {
+		if (xbc_node_compose_key_after(root, knode, xbc_namebuf, XBC_KEYLEN_MAX) < 0)
+			continue;
+
+		/*
+		 * We need to copy const char *val to a char pointer,
+		 * which is what do_early_param() needs, given it might
+		 * call strsep(), strtok() later.
+		 */
+		ret = strscpy(val_buf, val, sizeof(val_buf));
+		if (ret < 0) {
+			pr_warn("ignoring bootconfig value '%s', too long\n",
+				xbc_namebuf);
+			continue;
+		}
+		do_early_param(xbc_namebuf, val_buf, NULL, NULL);
+	}
+}
+
 static void __init setup_boot_config(void)
 {
 	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
+	bool using_embedded = false;
 	const char *msg, *data;
 	int pos, ret;
 	size_t size;
@@ -427,8 +482,17 @@ static void __init setup_boot_config(void)
 	/* Cut out the bootconfig data even if we have no bootconfig option */
 	data = get_boot_config_from_initrd(&size);
 	/* If there is no bootconfig in initrd, try embedded one. */
-	if (!data)
+	if (!data) {
 		data = xbc_get_embedded_bootconfig(&size);
+		/*
+		 * Record that we are using the embedded config so that
+		 * bootconfig_apply_early_params() is called below.
+		 * When CONFIG_BOOT_CONFIG_EMBED is not set,
+		 * xbc_get_embedded_bootconfig() is a stub returning NULL, so
+		 * data is always NULL here and using_embedded stays false.
+		 */
+		using_embedded = data;
+	}
 
 	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
 	err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
@@ -466,6 +530,8 @@ static void __init setup_boot_config(void)
 	} else {
 		xbc_get_info(&ret, NULL);
 		pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
+		if (using_embedded)
+			bootconfig_apply_early_params();
 		/* keys starting with "kernel." are passed via cmdline */
 		extra_command_line = xbc_make_cmdline("kernel");
 		/* Also, "init." keys are init arguments */

---
base-commit: 785f0eb2f85decbe7c1ef9ae922931f0194ffc2e
change-id: 20260323-early_bootconfig-2efc4509af3d

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply related

* Re: [PATCH] module/kallsyms: sort function symbols and use binary search
From: Stanislaw Gruszka @ 2026-03-25 10:02 UTC (permalink / raw)
  To: Petr Pavlu
  Cc: linux-modules, Sami Tolvanen, Luis Chamberlain, linux-kernel,
	linux-trace-kernel, live-patching, Daniel Gomez, Aaron Tomlin,
	Steven Rostedt, Masami Hiramatsu, Jordan Rome, Viktor Malik
In-Reply-To: <20260325082648.GA18968@wp.pl>

On Wed, Mar 25, 2026 at 09:26:56AM +0100, Stanislaw Gruszka wrote:
> On Tue, Mar 24, 2026 at 05:00:19PM +0100, Petr Pavlu wrote:
> > On 3/24/26 1:53 PM, Stanislaw Gruszka wrote:
> > > Hi,
> > > 
> > > On Mon, Mar 23, 2026 at 02:06:43PM +0100, Petr Pavlu wrote:
> > >> On 3/17/26 12:04 PM, Stanislaw Gruszka wrote:
> > >>> Module symbol lookup via find_kallsyms_symbol() performs a linear scan
> > >>> over the entire symtab when resolving an address. The number of symbols
> > >>> in module symtabs has grown over the years, largely due to additional
> > >>> metadata in non-standard sections, making this lookup very slow.
> > >>>
> > >>> Improve this by separating function symbols during module load, placing
> > >>> them at the beginning of the symtab, sorting them by address, and using
> > >>> binary search when resolving addresses in module text.
> > >>
> > >> Doesn't considering only function symbols break the expected behavior
> > >> with CONFIG_KALLSYMS_ALL=y. For instance, when using kdb, is it still
> > >> able to see all symbols in a module? The module loader should remain
> > >> consistent with the main kallsyms code regarding which symbols can be
> > >> looked up.
> > > 
> > > We already have a CONFIG_KALLSYMS_ALL=y inconsistency between kernel and 
> > > module symbol lookup, independent of this patch. find_kallsyms_symbol()
> > > restricts the search to MOD_TEXT (or MOD_INIT_TEXT) address ranges, so
> > > it cannot resolve data or rodata symbols.
> > 
> > My understanding is that find_kallsyms_symbol() can identify all symbols
> > in a module by their addresses. However, the issue I see with
> > MOD_TEXT/MOD_INIT_TEXT is that the function may incorrectly calculate
> > the size of symbols that are not within these ranges, which is a bug
> > that should be fixed.
> 
> You are right, I misinterpreted the code:
> 
> 	if (within_module_init(addr, mod))
> 		mod_mem = &mod->mem[MOD_INIT_TEXT];
> 	else
> 		mod_mem = &mod->mem[MOD_TEXT];
> 
> 	nextval = (unsigned long)mod_mem->base + mod_mem->size;
> 
> 	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
> 
> For best = 0, bestval is also 0 as it comes from the ELF null symbol.
> 
> > A test using kdb confirms that non-text symbols can be found by their
> > addresses. The following shows the current behavior with 7.0-rc5 when
> > printing a module parameter in mlx4_en:
> > 
> > [1]kdb> mds __param_arr_num_vfs
> > 0xffffffffc1209f20 0000000100000003   ........
> > 0xffffffffc1209f28 ffffffffc0fbf07c [mlx4_core]num_vfs_argc  
> > 0xffffffffc1209f30 ffffffff8844bba0 param_ops_byte  
> > 0xffffffffc1209f38 ffffffffc0fbf080 [mlx4_core]num_vfs  
> > 0xffffffffc1209f40 000000785f69736d   msi_x...
> > 0xffffffffc1209f48 656c5f6775626564   debug_le
> > 0xffffffffc1209f50 00000000006c6576   vel.....
> > 0xffffffffc1209f58 0000000000000000   ........
> > 
> > .. and the behavior with the proposed patch:
> > 
> > [1]kdb> mds __param_arr_num_vfs
> > 0xffffffffc1077f20 0000000100000003   ........
> > 0xffffffffc1077f28 ffffffffc104707c   |p......
> > 0xffffffffc1077f30 ffffffffb4a4bba0 param_ops_byte  
> > 0xffffffffc1077f38 ffffffffc1047080   .p......
> > 0xffffffffc1077f40 000000785f69736d   msi_x...
> > 0xffffffffc1077f48 656c5f6775626564   debug_le
> > 0xffffffffc1077f50 00000000006c6576   vel.....
> > 0xffffffffc1077f58 0000000000000000   ........
> 
> Thanks for testing and pointing this out. Patch indeed breaks
> the CONFIG_KALLSYMS_ALL case. 
> 
> I think, possible fix would be to track the relevant sections in 
> __layout_sections() and use defined symbols from those sections,
> instead of just function symbols. 

I considered sorting data symbols as well, but this is nontrivial, 
it is difficult to reliably distinguish real data sections from metadata
sections containing symbols we do not want to include.

An alternative approach is to check the module memory type and fall back to
a linear search for ranges other than MOD_TEXT. This approach would also
fix the incorrect nextval/size problem.

Regards
Stanislaw

^ permalink raw reply

* Re: [PATCH v2 0/9] memblock: improve late freeing of reserved memory
From: Mike Rapoport @ 2026-03-25  8:51 UTC (permalink / raw)
  To: Andrew Morton, Mike Rapoport
  Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
	Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
	Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
	David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
	Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
	Liam R. Howlett, Madhavan Srinivasan, Marco Elver,
	Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
	Michal Hocko, Nicholas Piggin, H. Peter Anvin, Rob Herring,
	Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
	Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
	iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
	linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
	sparclinux, x86, Lorenzo Stoakes
In-Reply-To: <20260323074836.3653702-1-rppt@kernel.org>

On Mon, 23 Mar 2026 09:48:27 +0200, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> Hi,
> 
> Following a recent discussion about leaks in x86 EFI [1], I audited usage of
> memblock_free_late() and free_reserved_area() and made some improvements to how
> we handle late freeing of the memory allocated with memblock.
> 
> [...]

Applied to for-next branch of memblock.git tree, thanks!

[1/9] memblock: reserve_mem: fix end caclulation in reserve_mem_release_by_name()
      commit: ea459d3c24fefd90b60a702f4a73833434ae0248
[2/9] powerpc: fadump: pair alloc_pages_exact() with free_pages_exact()
      commit: 6e827110aea5fb9c53a5bf070413ffe5cad105b0
[3/9] powerpc: opal-core: pair alloc_pages_exact() with free_pages_exact()
      commit: 3cf80188ecb828ed034ba562614cf1d48156b126
[4/9] mm: move free_reserved_area() to mm/memblock.c
      commit: 0aa264cda784f9fbe1a80ef13144cf81610086c7
[5/9] memblock: make free_reserved_area() more robust
      commit: 456ac994018598bc57ceaacb8a2c72e722c9755b
[6/9] memblock: extract page freeing from free_reserved_area() into a helper
      commit: 40191dae9ed84c816b593bb1b36a80f86c2279d1
[7/9] memblock: make free_reserved_area() update memblock if ARCH_KEEP_MEMBLOCK=y
      commit: b9e028ca869de24df00206d7ec640380670fc38f
[8/9] memblock, treewide: make memblock_free() handle late freeing
      commit: 64cb853c2ab4d8bd25b965f05e33ac0c6672bae7
[9/9] memblock: warn when freeing reserved memory before memory map is initialized
      commit: c7fc9cde41be029cf6675befbafcbb2dab40b39b

tree: https://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock
branch: for-next

--
Sincerely yours,
Mike.


^ permalink raw reply

* Re: [PATCH v2 00/11] Add spi-hid transport driver
From: Krzysztof Kozlowski @ 2026-03-25  8:49 UTC (permalink / raw)
  To: Jingyuan Liang
  Cc: Jiri Kosina, Benjamin Tissoires, Jonathan Corbet, Mark Brown,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Dmitry Torokhov, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	linux-input, linux-doc, linux-kernel, linux-spi,
	linux-trace-kernel, devicetree, hbarnor, tfiga, Jarrett Schultz,
	Dmitry Antipov, Angela Czubak
In-Reply-To: <20260324-send-upstream-v2-0-521ce8afff86@chromium.org>

On Tue, Mar 24, 2026 at 06:39:33AM +0000, Jingyuan Liang wrote:
> This series picks up the spi-hid driver work originally started by
> Microsoft. The patch breakdown has been modified and the implementation
> has been refactored to address upstream feedback and testing issues. We
> are submitting this as a new series while keeping the original sign-off
> chain to reflect the history.
> 
> Same as the original series, there is a change to HID documentation, some
> HID core changes to support a SPI device, the SPI HID transport driver,
> and HID over SPI Device Tree binding. We have added the HID over SPI ACPI
> support, power management, panel follower, and quirks for Ilitek touch
> controllers.
> 
> Original authors: Jarrett Schultz <jaschultz@microsoft.com>,
> 		  Dmitry Antipov <dmanti@microsoft.com>
> Link: https://lore.kernel.org/r/86b63b7b-afda-d7f4-7bfa-175085d5a8ef@gmail.com
> 
> Signed-off-by: Jingyuan Liang <jingyliang@chromium.org>
> ---
> Changes in v2:
> - Fix style problems and remove unnecessary fields from the DT binding file

Style and removal? So other comments were skipped?

Please write detailed changelogs, otherwise it feels you just ignore
parts of the feedback.

Best regards,
Krzysztof


^ permalink raw reply

* Re: [PATCH] module/kallsyms: sort function symbols and use binary search
From: Stanislaw Gruszka @ 2026-03-25  8:26 UTC (permalink / raw)
  To: Petr Pavlu
  Cc: linux-modules, Sami Tolvanen, Luis Chamberlain, linux-kernel,
	linux-trace-kernel, live-patching, Daniel Gomez, Aaron Tomlin,
	Steven Rostedt, Masami Hiramatsu, Jordan Rome, Viktor Malik
In-Reply-To: <282574df-7689-4677-929b-b844e7201bd5@suse.com>

On Tue, Mar 24, 2026 at 05:00:19PM +0100, Petr Pavlu wrote:
> On 3/24/26 1:53 PM, Stanislaw Gruszka wrote:
> > Hi,
> > 
> > On Mon, Mar 23, 2026 at 02:06:43PM +0100, Petr Pavlu wrote:
> >> On 3/17/26 12:04 PM, Stanislaw Gruszka wrote:
> >>> Module symbol lookup via find_kallsyms_symbol() performs a linear scan
> >>> over the entire symtab when resolving an address. The number of symbols
> >>> in module symtabs has grown over the years, largely due to additional
> >>> metadata in non-standard sections, making this lookup very slow.
> >>>
> >>> Improve this by separating function symbols during module load, placing
> >>> them at the beginning of the symtab, sorting them by address, and using
> >>> binary search when resolving addresses in module text.
> >>
> >> Doesn't considering only function symbols break the expected behavior
> >> with CONFIG_KALLSYMS_ALL=y. For instance, when using kdb, is it still
> >> able to see all symbols in a module? The module loader should remain
> >> consistent with the main kallsyms code regarding which symbols can be
> >> looked up.
> > 
> > We already have a CONFIG_KALLSYMS_ALL=y inconsistency between kernel and 
> > module symbol lookup, independent of this patch. find_kallsyms_symbol()
> > restricts the search to MOD_TEXT (or MOD_INIT_TEXT) address ranges, so
> > it cannot resolve data or rodata symbols.
> 
> My understanding is that find_kallsyms_symbol() can identify all symbols
> in a module by their addresses. However, the issue I see with
> MOD_TEXT/MOD_INIT_TEXT is that the function may incorrectly calculate
> the size of symbols that are not within these ranges, which is a bug
> that should be fixed.

You are right, I misinterpreted the code:

	if (within_module_init(addr, mod))
		mod_mem = &mod->mem[MOD_INIT_TEXT];
	else
		mod_mem = &mod->mem[MOD_TEXT];

	nextval = (unsigned long)mod_mem->base + mod_mem->size;

	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);

For best = 0, bestval is also 0 as it comes from the ELF null symbol.

> A test using kdb confirms that non-text symbols can be found by their
> addresses. The following shows the current behavior with 7.0-rc5 when
> printing a module parameter in mlx4_en:
> 
> [1]kdb> mds __param_arr_num_vfs
> 0xffffffffc1209f20 0000000100000003   ........
> 0xffffffffc1209f28 ffffffffc0fbf07c [mlx4_core]num_vfs_argc  
> 0xffffffffc1209f30 ffffffff8844bba0 param_ops_byte  
> 0xffffffffc1209f38 ffffffffc0fbf080 [mlx4_core]num_vfs  
> 0xffffffffc1209f40 000000785f69736d   msi_x...
> 0xffffffffc1209f48 656c5f6775626564   debug_le
> 0xffffffffc1209f50 00000000006c6576   vel.....
> 0xffffffffc1209f58 0000000000000000   ........
> 
> .. and the behavior with the proposed patch:
> 
> [1]kdb> mds __param_arr_num_vfs
> 0xffffffffc1077f20 0000000100000003   ........
> 0xffffffffc1077f28 ffffffffc104707c   |p......
> 0xffffffffc1077f30 ffffffffb4a4bba0 param_ops_byte  
> 0xffffffffc1077f38 ffffffffc1047080   .p......
> 0xffffffffc1077f40 000000785f69736d   msi_x...
> 0xffffffffc1077f48 656c5f6775626564   debug_le
> 0xffffffffc1077f50 00000000006c6576   vel.....
> 0xffffffffc1077f58 0000000000000000   ........

Thanks for testing and pointing this out. Patch indeed breaks
the CONFIG_KALLSYMS_ALL case. 

I think, possible fix would be to track the relevant sections in 
__layout_sections() and use defined symbols from those sections,
instead of just function symbols. 

Regards
Stanislaw

^ permalink raw reply

* Re: [PATCHv4 bpf-next 25/25] selftests/bpf: Add tracing multi attach rollback tests
From: Leon Hwang @ 2026-03-25  6:45 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260324081846.2334094-26-jolsa@kernel.org>

On 24/3/26 16:18, Jiri Olsa wrote:
> Adding tests for the rollback code when the tracing_multi
> link won't get attached, covering 2 reasons:
> 
>   - wrong btf id passed by user, where all previously allocated
>     trampolines will be released
>   - trampoline for requested function is fully attached (has already
>     maximum programs attached) and the link fails, the rollback code
>     needs to release all previously link-ed trampolines and release
>     them
> 
> We need the bpf_fentry_test* unattached for the tests to pass,
> so the rollback tests are serial.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  .../selftests/bpf/prog_tests/tracing_multi.c  | 213 ++++++++++++++++++
>  .../bpf/progs/tracing_multi_rollback.c        |  43 ++++
>  2 files changed, 256 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> index 6917471e329c..6ff0f72f8c46 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> @@ -10,6 +10,7 @@
>  #include "tracing_multi_session.skel.h"
>  #include "tracing_multi_fail.skel.h"
>  #include "tracing_multi_bench.skel.h"
> +#include "tracing_multi_rollback.skel.h"
>  #include "trace_helpers.h"
>  
>  static __u64 bpf_fentry_test_cookies[] = {
> @@ -669,6 +670,218 @@ void serial_test_tracing_multi_bench_attach(void)
>  	free(ids);
>  }
>  
> +static void tracing_multi_rollback_run(struct tracing_multi_rollback *skel)
> +{
> +	LIBBPF_OPTS(bpf_test_run_opts, topts);
> +	int err, prog_fd;
> +
> +	prog_fd = bpf_program__fd(skel->progs.test_fentry);
> +	err = bpf_prog_test_run_opts(prog_fd, &topts);
> +	ASSERT_OK(err, "test_run");
> +
> +	/* make sure the rollback code did not leave any program attached */
> +	ASSERT_EQ(skel->bss->test_result_fentry, 0, "test_result_fentry");
> +	ASSERT_EQ(skel->bss->test_result_fexit, 0, "test_result_fexit");
> +}
> +
> +static void test_rollback_put(void)
> +{
> +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> +	struct tracing_multi_rollback *skel = NULL;
> +	size_t cnt = FUNCS_CNT;
> +	__u32 *ids = NULL;
> +	int err;
> +
> +	skel = tracing_multi_rollback__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
> +		return;
> +
> +	bpf_program__set_autoload(skel->progs.test_fentry, true);
> +	bpf_program__set_autoload(skel->progs.test_fexit, true);
> +
> +	err = tracing_multi_rollback__load(skel);
> +	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
> +		goto cleanup;
> +
> +	ids = get_ids(bpf_fentry_test, cnt, NULL);
> +	if (!ASSERT_OK_PTR(ids, "get_ids"))
> +		goto cleanup;
> +
> +	/*
> +	 * Mangle last id to trigger rollback, which needs to do put
> +	 * on get-ed trampolines.
> +	 */
> +	ids[9] = 0;
> +
> +	opts.ids = ids;
> +	opts.cnt = cnt;
> +
> +	skel->bss->pid = getpid();
> +
> +	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
> +						NULL, &opts);
> +	if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
> +		goto cleanup;
> +
> +	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
> +						NULL, &opts);
> +	if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
> +		goto cleanup;
> +
> +	/* We don't really attach any program, but let's make sure. */
> +	tracing_multi_rollback_run(skel);
> +
> +cleanup:
> +	tracing_multi_rollback__destroy(skel);
> +	free(ids);
> +}
> +
> +

NIT: keep one blank line here.

> +static void fillers_cleanup(struct tracing_multi_rollback **skels, int cnt)
> +{
> +	int i;
> +
> +	for (i = 0; i < cnt; i++)
> +		tracing_multi_rollback__destroy(skels[i]);
> +
> +	free(skels);
> +}
> +
> +static struct tracing_multi_rollback *extra_load_and_link(void)
> +{
> +	struct tracing_multi_rollback *skel;
> +	int err;
> +
> +	skel = tracing_multi_rollback__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
> +		goto cleanup;
> +
> +	bpf_program__set_autoload(skel->progs.extra, true);
> +
> +	err = tracing_multi_rollback__load(skel);
> +	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
> +		goto cleanup;
> +
> +	skel->links.extra = bpf_program__attach_trace(skel->progs.extra);
> +	if (!ASSERT_OK_PTR(skel->links.extra, "bpf_program__attach_trace"))
> +		goto cleanup;
> +
> +	return skel;
> +
> +cleanup:
> +	tracing_multi_rollback__destroy(skel);
> +	return NULL;
> +}
> +
> +static struct tracing_multi_rollback **fillers_load_and_link(int max)
> +{
> +	struct tracing_multi_rollback **skels, *skel;
> +	int i, err;
> +
> +	skels = calloc(max + 1, sizeof(*skels));
> +	if (!ASSERT_OK_PTR(skels, "calloc"))
> +		return NULL;
> +
> +	for (i = 0; i < max; i++) {
> +		skel = skels[i] = tracing_multi_rollback__open();
> +		if (!ASSERT_OK_PTR(skels[i], "tracing_multi_rollback__open"))
> +			goto cleanup;
> +
> +		bpf_program__set_autoload(skel->progs.filler, true);
> +
> +		err = tracing_multi_rollback__load(skel);
> +		if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
> +			goto cleanup;
> +
> +		skel->links.filler = bpf_program__attach_trace(skel->progs.filler);
> +		if (!ASSERT_OK_PTR(skels[i]->links.filler, "bpf_program__attach_trace"))
> +			goto cleanup;
> +	}
> +
> +	return skels;
> +
> +cleanup:
> +	fillers_cleanup(skels, i);
> +	return NULL;
> +}
> +
> +static void test_rollback_unlink(void)
> +{
> +	struct tracing_multi_rollback *skel, *extra;
> +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> +	struct tracing_multi_rollback **fillers;
> +	size_t cnt = FUNCS_CNT;
> +	__u32 *ids = NULL;
> +	int err, max;
> +
> +	max = get_bpf_max_tramp_links();
> +	if (!ASSERT_GE(max, 1, "bpf_max_tramp_links"))
> +		return;
> +
> +	/* Attach maximum allowed programs to bpf_fentry_test10 */
> +	fillers = fillers_load_and_link(max);
> +	if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link"))
> +		return;
> +
> +	extra = extra_load_and_link();
> +	if (!ASSERT_OK_PTR(extra, "extra_load_and_link"))

Should the fillers be cleaned up here?

Thanks,
Leon

> +		return;
> +
> +	skel = tracing_multi_rollback__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
> +		goto cleanup;
> +
[...]


^ permalink raw reply

* Re: [PATCHv4 bpf-next 24/25] selftests/bpf: Add tracing multi attach benchmark test
From: Leon Hwang @ 2026-03-25  6:45 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260324081846.2334094-25-jolsa@kernel.org>

On 24/3/26 16:18, Jiri Olsa wrote:
> Adding benchmark test that attaches to (almost) all allowed tracing
> functions and display attach/detach times.
> 
>   # ./test_progs -t tracing_multi_bench_attach -v
>   bpf_testmod.ko is already unloaded.
>   Loading bpf_testmod.ko...
>   Successfully loaded bpf_testmod.ko.
>   serial_test_tracing_multi_bench_attach:PASS:btf__load_vmlinux_btf 0 nsec
>   serial_test_tracing_multi_bench_attach:PASS:tracing_multi_bench__open_and_load 0 nsec
>   serial_test_tracing_multi_bench_attach:PASS:get_syms 0 nsec
>   serial_test_tracing_multi_bench_attach:PASS:bpf_program__attach_tracing_multi 0 nsec
>   serial_test_tracing_multi_bench_attach: found 51186 functions
>   serial_test_tracing_multi_bench_attach: attached in   1.295s
>   serial_test_tracing_multi_bench_attach: detached in   0.243s
>   #507     tracing_multi_bench_attach:OK
>   Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
>   Successfully unloaded bpf_testmod.ko.
> 
> Exporting skip_entry as is_unsafe_function and using it in the test.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  .../selftests/bpf/prog_tests/tracing_multi.c  | 98 +++++++++++++++++++
>  .../selftests/bpf/progs/tracing_multi_bench.c | 12 +++
>  tools/testing/selftests/bpf/trace_helpers.c   |  6 +-
>  tools/testing/selftests/bpf/trace_helpers.h   |  1 +
>  4 files changed, 114 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> index dece45d8fb5e..6917471e329c 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> @@ -9,6 +9,7 @@
>  #include "tracing_multi_intersect.skel.h"
>  #include "tracing_multi_session.skel.h"
>  #include "tracing_multi_fail.skel.h"
> +#include "tracing_multi_bench.skel.h"
>  #include "trace_helpers.h"
>  
>  static __u64 bpf_fentry_test_cookies[] = {
> @@ -571,6 +572,103 @@ static void test_attach_api_fails(void)
>  	free(ids2);
>  }
>  
> +void serial_test_tracing_multi_bench_attach(void)
> +{
> +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> +	struct tracing_multi_bench *skel = NULL;
> +	long attach_start_ns, attach_end_ns;
> +	long detach_start_ns, detach_end_ns;
> +	double attach_delta, detach_delta;
> +	struct bpf_link *link = NULL;
> +	size_t i, cap = 0, cnt = 0;
> +	struct ksyms *ksyms = NULL;
> +	void *root = NULL;
> +	__u32 *ids = NULL;
> +	__u32 nr, type_id;
> +	struct btf *btf;
> +	int err;
> +
> +#ifndef __x86_64__
> +	test__skip();
> +	return;
> +#endif
> +
> +	btf = btf__load_vmlinux_btf();
> +	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
> +		return;
> +
> +	skel = tracing_multi_bench__open_and_load();
> +	if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load"))
> +		goto cleanup;
> +
> +	if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms"))
> +		goto cleanup;
> +
> +	/* Get all ftrace 'safe' symbols.. */
> +	for (i = 0; i < ksyms->filtered_cnt; i++) {
> +		if (is_unsafe_function(ksyms->filtered_syms[i]))
> +			continue;
> +		tsearch(&ksyms->filtered_syms[i], &root, compare);
> +	}
> +
> +	/* ..and filter them through BTF and btf_type_is_traceable_func. */
> +	nr = btf__type_cnt(btf);
> +	for (type_id = 1; type_id < nr; type_id++) {
> +		const struct btf_type *type;
> +		const char *str;
> +
> +		type = btf__type_by_id(btf, type_id);
> +		if (!type)
> +			break;
> +
> +		if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
> +			continue;
> +
> +		str = btf__name_by_offset(btf, type->name_off);
> +		if (!str)
> +			break;
> +
> +		if (!tfind(&str, &root, compare))
> +			continue;
> +
> +		if (!btf_type_is_traceable_func(btf, type))
> +			continue;
> +
> +		err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1);
> +		if (err)
> +			goto cleanup;
> +
> +		ids[cnt++] = type_id;
> +	}
> +
> +	opts.ids = ids;
> +	opts.cnt = cnt;
> +
> +	attach_start_ns = get_time_ns();
> +	link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts);
> +	attach_end_ns = get_time_ns();
> +
> +	if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi"))
> +		goto cleanup;
> +
> +	detach_start_ns = get_time_ns();
> +	bpf_link__destroy(link);
> +	detach_end_ns = get_time_ns();
> +
> +	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
> +	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
> +
> +	printf("%s: found %lu functions\n", __func__, cnt);
> +	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
> +	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
> +
> +cleanup:
> +	tracing_multi_bench__destroy(skel);
> +	tdestroy(root, tdestroy_free_nop);
> +	free_kallsyms_local(ksyms);
> +	free(ids);

Is btf__free(btf) missing here? 'btf' is allocated with calloc() inside
btf__load_vmlinux_btf().

Thanks,
Leon

> +}
> +
>  void test_tracing_multi_test(void)
>  {
>  #ifndef __x86_64__
> diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_bench.c b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
> new file mode 100644
> index 000000000000..beae946cb8c4
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <vmlinux.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +SEC("fentry.multi")
> +int BPF_PROG(bench)
> +{
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
> index 0e63daf83ed5..8de0b60766de 100644
> --- a/tools/testing/selftests/bpf/trace_helpers.c
> +++ b/tools/testing/selftests/bpf/trace_helpers.c
> @@ -548,7 +548,7 @@ static const char * const trace_blacklist[] = {
>  	"bpf_get_numa_node_id",
>  };
>  
> -static bool skip_entry(char *name)
> +bool is_unsafe_function(const char *name)
>  {
>  	int i;
>  
> @@ -651,7 +651,7 @@ int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel)
>  		free(name);
>  		if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
>  			continue;
> -		if (skip_entry(name))
> +		if (is_unsafe_function(name))
>  			continue;
>  
>  		ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
> @@ -728,7 +728,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
>  		free(name);
>  		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
>  			continue;
> -		if (skip_entry(name))
> +		if (is_unsafe_function(name))
>  			continue;
>  
>  		if (cnt == max_cnt) {
> diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
> index d5bf1433675d..01c8ecc45627 100644
> --- a/tools/testing/selftests/bpf/trace_helpers.h
> +++ b/tools/testing/selftests/bpf/trace_helpers.h
> @@ -63,4 +63,5 @@ int read_build_id(const char *path, char *build_id, size_t size);
>  int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel);
>  int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
>  
> +bool is_unsafe_function(const char *name);
>  #endif


^ permalink raw reply

* Re: [PATCHv4 bpf-next 13/25] bpf: Add support for tracing_multi link fdinfo
From: Leon Hwang @ 2026-03-25  6:43 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260324081846.2334094-14-jolsa@kernel.org>

On 24/3/26 16:18, Jiri Olsa wrote:
> Adding tracing_multi link fdinfo support with following output:
> 
> pos:    0
> flags:  02000000
> mnt_id: 19
> ino:    3091
> link_type:      tracing_multi
> link_id:        382

Would it be better to add attach_type?

attach_type:	[fentry,fexit,fsession]_multi

Thanks,
Leon

> prog_tag:       62073a1123f07ef7
> prog_id:        715
> cnt:    10
> cookie   BTF-id  func
> 8        91203   bpf_fentry_test1+0x4/0x10
> 9        91205   bpf_fentry_test2+0x4/0x10
> 7        91206   bpf_fentry_test3+0x4/0x20
> 5        91207   bpf_fentry_test4+0x4/0x20
> 4        91208   bpf_fentry_test5+0x4/0x20
> 2        91209   bpf_fentry_test6+0x4/0x20
> 3        91210   bpf_fentry_test7+0x4/0x10
> 1        91211   bpf_fentry_test8+0x4/0x10
> 10       91212   bpf_fentry_test9+0x4/0x10
> 6        91204   bpf_fentry_test10+0x4/0x10
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  kernel/trace/bpf_trace.c | 26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 761501ce3a5f..41b691e83dc4 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3618,9 +3618,35 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
>  	kvfree(tr_link);
>  }
>  
> +#ifdef CONFIG_PROC_FS
> +static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link,
> +					  struct seq_file *seq)
> +{
> +	struct bpf_tracing_multi_link *tr_link =
> +		container_of(link, struct bpf_tracing_multi_link, link);
> +	bool has_cookies = !!tr_link->cookies;
> +
> +	seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt);
> +
> +	seq_printf(seq, "%s\t %s\t %s\n", "cookie", "BTF-id", "func");
> +	for (int i = 0; i < tr_link->nodes_cnt; i++) {
> +		struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i];
> +		u32 btf_id;
> +
> +		bpf_trampoline_unpack_key(mnode->trampoline->key, NULL, &btf_id);
> +		seq_printf(seq, "%llu\t %u\t %pS\n",
> +			   has_cookies ? tr_link->cookies[i] : 0,
> +			   btf_id, (void *) mnode->trampoline->ip);
> +	}
> +}
> +#endif
> +
>  static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
>  	.release = bpf_tracing_multi_link_release,
>  	.dealloc_deferred = bpf_tracing_multi_link_dealloc,
> +#ifdef CONFIG_PROC_FS
> +	.show_fdinfo = bpf_tracing_multi_show_fdinfo,
> +#endif
>  };
>  
>  int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)


^ permalink raw reply

* Re: [PATCHv4 bpf-next 00/25] bpf: tracing_multi link
From: Leon Hwang @ 2026-03-25  6:42 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: Hengqi Chen, bpf, linux-trace-kernel, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, Menglong Dong,
	Steven Rostedt
In-Reply-To: <20260324081846.2334094-1-jolsa@kernel.org>

Hi Jiri,

Nice version for tracing_multi link.

I hope I have time to add tracing_multi link support to bpfsnoop, and
test this new tracing feature.

I left comments on patches #13, #24, and #25.

Hope this series lands in bpf-next soon.

Thanks,
Leon

On 24/3/26 16:18, Jiri Olsa wrote:
> hi,
> adding tracing_multi link support that allows fast attachment
> of tracing program to many functions.
> 
> RFC: https://lore.kernel.org/bpf/20260203093819.2105105-1-jolsa@kernel.org/
> v1: https://lore.kernel.org/bpf/20260220100649.628307-1-jolsa@kernel.org/
> v2: https://lore.kernel.org/bpf/20260304222141.497203-1-jolsa@kernel.org/
> v3: https://lore.kernel.org/bpf/20260316075138.465430-1-jolsa@kernel.org/
> 
> v4 changes:
> - unlink rollback fix (added ftrace_hash_count) [bot]
> - use const for some bpf_link_create_opts tracing_multi members [bot]
> - adding missing comment for lockdep keys [bot]
> - selftest error path fixes (leaks) and other assorted test fixes [Leon Hwang]
> - several compile fixes wrt CONFIG_BPF_SYSCALL and CONFIG_BPF_JIT [kernel test robot]
> - make ftrace_hash_clear global, because it's needed in rollback
> 
> v3 changes:
> - fix module parsing [Leon Hwang]
> - use function traceable check from libbpf [Leon Hwang]
> - use ptr_to_u64 and fix/updated few comments [ci]
> - display cookies as decimal numbers [ci]
> - added link_create.flags check [ci]
> - fix error path in bpf_trampoline_multi_detach [ci]
> - make fentry/fexit.multi not extendable [ci]
> - add missing OPTS_VALID to bpf_program__attach_tracing_multi [ci]
> 
> v2 changes:
> - allocate data.unreg in bpf_trampoline_multi_attach for rollback path [ci]
>   and fixed link count setup in rollback path [ci]
> - several small assorted fixes [ci]
> - added loongarch and powerpc changes for struct bpf_tramp_node change
> - added support to attach functions from modules
> - added tests for sleepable programs
> - added rollback tests
> 
> v1 changes:
> - added ftrace_hash_count as wrapper for hash_count [Steven]
> - added trampoline mutex pool [Andrii]
> - reworked 'struct bpf_tramp_node' separation [Andrii]
>   - the 'struct bpf_tramp_node' now holds pointer to bpf_link,
>     which is similar to what we do for uprobe_multi;
>     I understand it's not a fundamental change compared to previous
>     version which used bpf_prog pointer instead, but I don't see better
>     way of doing this.. I'm happy to discuss this further if there's
>     better idea
> - reworked 'struct bpf_fsession_link' based on bpf_tramp_node
> - made btf__find_by_glob_kind function internal helper [Andrii]
> - many small assorted fixes [Andrii,CI]
> - added session support [Leon Hwang]
> - added cookies support
> - added more tests
> 
> 
> Note I plan to send linkinfo support separately, the patchset is big enough.
> 
> thanks,
> jirka
> 
> 
> Cc: Hengqi Chen <hengqi.chen@gmail.com>
> ---
> Jiri Olsa (25):
>       ftrace: Add ftrace_hash_count function
>       ftrace: Make ftrace_hash_clear global
>       bpf: Use mutex lock pool for bpf trampolines
>       bpf: Add struct bpf_trampoline_ops object
>       bpf: Add struct bpf_tramp_node object
>       bpf: Factor fsession link to use struct bpf_tramp_node
>       bpf: Add multi tracing attach types
>       bpf: Move sleepable verification code to btf_id_allow_sleepable
>       bpf: Add bpf_trampoline_multi_attach/detach functions
>       bpf: Add support for tracing multi link
>       bpf: Add support for tracing_multi link cookies
>       bpf: Add support for tracing_multi link session
>       bpf: Add support for tracing_multi link fdinfo
>       libbpf: Add bpf_object_cleanup_btf function
>       libbpf: Add bpf_link_create support for tracing_multi link
>       libbpf: Add btf_type_is_traceable_func function
>       libbpf: Add support to create tracing multi link
>       selftests/bpf: Add tracing multi skel/pattern/ids attach tests
>       selftests/bpf: Add tracing multi skel/pattern/ids module attach tests
>       selftests/bpf: Add tracing multi intersect tests
>       selftests/bpf: Add tracing multi cookies test
>       selftests/bpf: Add tracing multi session test
>       selftests/bpf: Add tracing multi attach fails test
>       selftests/bpf: Add tracing multi attach benchmark test
>       selftests/bpf: Add tracing multi attach rollback tests
> 
>  arch/arm64/net/bpf_jit_comp.c                                      |  58 +++---
>  arch/loongarch/net/bpf_jit.c                                       |  44 ++---
>  arch/powerpc/net/bpf_jit_comp.c                                    |  46 ++---
>  arch/riscv/net/bpf_jit_comp64.c                                    |  52 ++---
>  arch/s390/net/bpf_jit_comp.c                                       |  44 ++---
>  arch/x86/net/bpf_jit_comp.c                                        |  54 ++---
>  include/linux/bpf.h                                                | 102 +++++++---
>  include/linux/bpf_types.h                                          |   1 +
>  include/linux/bpf_verifier.h                                       |   3 +
>  include/linux/btf_ids.h                                            |   1 +
>  include/linux/ftrace.h                                             |   2 +
>  include/linux/trace_events.h                                       |   6 +
>  include/uapi/linux/bpf.h                                           |   9 +
>  kernel/bpf/bpf_struct_ops.c                                        |  27 +--
>  kernel/bpf/btf.c                                                   |   4 +
>  kernel/bpf/syscall.c                                               |  88 ++++++---
>  kernel/bpf/trampoline.c                                            | 536 ++++++++++++++++++++++++++++++++++++++++---------
>  kernel/bpf/verifier.c                                              | 124 +++++++++---
>  kernel/trace/bpf_trace.c                                           | 149 +++++++++++++-
>  kernel/trace/ftrace.c                                              |   9 +-
>  net/bpf/bpf_dummy_struct_ops.c                                     |  14 +-
>  net/bpf/test_run.c                                                 |   3 +
>  tools/include/uapi/linux/bpf.h                                     |  10 +
>  tools/lib/bpf/bpf.c                                                |   9 +
>  tools/lib/bpf/bpf.h                                                |   5 +
>  tools/lib/bpf/libbpf.c                                             | 337 ++++++++++++++++++++++++++++++-
>  tools/lib/bpf/libbpf.h                                             |  15 ++
>  tools/lib/bpf/libbpf.map                                           |   1 +
>  tools/lib/bpf/libbpf_internal.h                                    |   1 +
>  tools/testing/selftests/bpf/Makefile                               |   9 +-
>  tools/testing/selftests/bpf/prog_tests/tracing_multi.c             | 912 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tools/testing/selftests/bpf/progs/tracing_multi_attach.c           |  39 ++++
>  tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c    |  25 +++
>  tools/testing/selftests/bpf/progs/tracing_multi_bench.c            |  12 ++
>  tools/testing/selftests/bpf/progs/tracing_multi_check.c            | 212 ++++++++++++++++++++
>  tools/testing/selftests/bpf/progs/tracing_multi_fail.c             |  18 ++
>  tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c |  41 ++++
>  tools/testing/selftests/bpf/progs/tracing_multi_rollback.c         |  43 ++++
>  tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c   |  43 ++++
>  tools/testing/selftests/bpf/trace_helpers.c                        |   6 +-
>  tools/testing/selftests/bpf/trace_helpers.h                        |   1 +
>  41 files changed, 2749 insertions(+), 366 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_multi.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_check.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_fail.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
>  create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
> 


^ permalink raw reply

* Re: [RFC v5 6/7] ext4: fast commit: add lock_updates tracepoint
From: Li Chen @ 2026-03-25  6:16 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel,
	Vineeth Remanan Pillai
In-Reply-To: <20260317122149.5d07132a@gandalf.local.home>

Hi Steven,

Thank you for your review, and I apologize for my delayed response.

 ---- On Wed, 18 Mar 2026 00:21:29 +0800  Steven Rostedt <rostedt@goodmis.org> wrote --- 
 > On Tue, 17 Mar 2026 16:46:21 +0800
 > Li Chen <me@linux.beauty> wrote:
 > 
 > > Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
 > > so it is useful to quantify the time spent with updates locked and to
 > > understand why snapshotting can fail.
 > > 
 > > Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
 > > the updates-locked window along with the number of snapshotted inodes
 > > and ranges. Record the first snapshot failure reason in a stable snap_err
 > > field for tooling.
 > > 
 > > Signed-off-by: Li Chen <me@linux.beauty>
 > > ---
 > >  fs/ext4/ext4.h              | 15 ++++++++
 > >  fs/ext4/fast_commit.c       | 71 +++++++++++++++++++++++++++++--------
 > >  include/trace/events/ext4.h | 61 +++++++++++++++++++++++++++++++
 > >  3 files changed, 132 insertions(+), 15 deletions(-)
 > > 
 > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
 > > index 68a64fa0be926..b9e146f3dd9e4 100644
 > > --- a/fs/ext4/ext4.h
 > > +++ b/fs/ext4/ext4.h
 > > @@ -1037,6 +1037,21 @@ enum {
 > >  
 > >  struct ext4_fc_inode_snap;
 > >  
 > > +/*
 > > + * Snapshot failure reasons for ext4_fc_lock_updates tracepoint.
 > > + * Keep these stable for tooling.
 > > + */
 > > +enum ext4_fc_snap_err {
 > > +    EXT4_FC_SNAP_ERR_NONE        = 0,
 > > +    EXT4_FC_SNAP_ERR_ES_MISS    = 1,
 > > +    EXT4_FC_SNAP_ERR_ES_DELAYED    = 2,
 > > +    EXT4_FC_SNAP_ERR_ES_OTHER    = 3,
 > > +    EXT4_FC_SNAP_ERR_INODES_CAP    = 4,
 > > +    EXT4_FC_SNAP_ERR_RANGES_CAP    = 5,
 > > +    EXT4_FC_SNAP_ERR_NOMEM        = 6,
 > > +    EXT4_FC_SNAP_ERR_INODE_LOC    = 7,
 > 
 > You don't need to explicitly state the assignments, the enum will increment
 > them without them.

Agree.

 > > +};
 > > +
 > >  /*
 > >   * fourth extended file system inode data in memory
 > >   */
 > > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
 > > index d1eefee609120..4929e2990b292 100644
 > > --- a/fs/ext4/fast_commit.c
 > > +++ b/fs/ext4/fast_commit.c
 > > @@ -193,6 +193,12 @@ static struct kmem_cache *ext4_fc_range_cachep;
 > >  #define EXT4_FC_SNAPSHOT_MAX_INODES    1024
 > >  #define EXT4_FC_SNAPSHOT_MAX_RANGES    2048
 > >  
 > > +static inline void ext4_fc_set_snap_err(int *snap_err, int err)
 > > +{
 > > +    if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
 > > +        *snap_err = err;
 > > +}
 > > +
 > >  static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 > >  {
 > >      BUFFER_TRACE(bh, "");
 > > @@ -983,11 +989,12 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 > >  static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >                         struct list_head *ranges,
 > >                         unsigned int nr_ranges_total,
 > > -                       unsigned int *nr_rangesp)
 > > +                       unsigned int *nr_rangesp,
 > > +                       int *snap_err)
 > >  {
 > >      struct ext4_inode_info *ei = EXT4_I(inode);
 > > -    unsigned int nr_ranges = 0;
 > >      ext4_lblk_t start_lblk, end_lblk, cur_lblk;
 > > +    unsigned int nr_ranges = 0;
 > >  
 > >      spin_lock(&ei->i_fc_lock);
 > >      if (ei->i_fc_lblk_len == 0) {
 > > @@ -1010,11 +1017,16 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >          struct ext4_fc_range *range;
 > >          ext4_lblk_t len;
 > >  
 > > -        if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
 > > +        if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
 > > +            ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
 > >              return -EAGAIN;
 > > +        }
 > >  
 > > -        if (ext4_es_is_delayed(&es))
 > > +        if (ext4_es_is_delayed(&es)) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_ES_DELAYED);
 > >              return -EAGAIN;
 > > +        }
 > >  
 > >          len = es.es_len - (cur_lblk - es.es_lblk);
 > >          if (len > end_lblk - cur_lblk + 1)
 > > @@ -1024,12 +1036,17 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >              continue;
 > >          }
 > >  
 > > -        if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
 > > +        if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_RANGES_CAP);
 > >              return -E2BIG;
 > > +        }
 > >  
 > >          range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
 > > -        if (!range)
 > > +        if (!range) {
 > > +            ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 > >              return -ENOMEM;
 > > +        }
 > >          nr_ranges++;
 > >  
 > >          range->lblk = cur_lblk;
 > > @@ -1054,6 +1071,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >                  range->len = max;
 > >          } else {
 > >              kmem_cache_free(ext4_fc_range_cachep, range);
 > > +            ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
 > >              return -EAGAIN;
 > >          }
 > >  
 > > @@ -1070,7 +1088,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 > >  
 > >  static int ext4_fc_snapshot_inode(struct inode *inode,
 > >                    unsigned int nr_ranges_total,
 > > -                  unsigned int *nr_rangesp)
 > > +                  unsigned int *nr_rangesp, int *snap_err)
 > >  {
 > >      struct ext4_inode_info *ei = EXT4_I(inode);
 > >      struct ext4_fc_inode_snap *snap;
 > > @@ -1082,8 +1100,10 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 > >      int alloc_ctx;
 > >  
 > >      ret = ext4_get_inode_loc_noio(inode, &iloc);
 > > -    if (ret)
 > > +    if (ret) {
 > > +        ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
 > >          return ret;
 > > +    }
 > >  
 > >      if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 > >          inode_len = EXT4_INODE_SIZE(inode->i_sb);
 > > @@ -1092,6 +1112,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 > >  
 > >      snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
 > >      if (!snap) {
 > > +        ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 > >          brelse(iloc.bh);
 > >          return -ENOMEM;
 > >      }
 > > @@ -1102,7 +1123,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 > >      brelse(iloc.bh);
 > >  
 > >      ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
 > > -                      &nr_ranges);
 > > +                      &nr_ranges, snap_err);
 > >      if (ret) {
 > >          kfree(snap);
 > >          ext4_fc_free_ranges(&ranges);
 > > @@ -1203,7 +1224,10 @@ static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
 > >                       unsigned int *nr_inodesp);
 > >  
 > >  static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > > -                   unsigned int inodes_size)
 > > +                   unsigned int inodes_size,
 > > +                   unsigned int *nr_inodesp,
 > > +                   unsigned int *nr_rangesp,
 > > +                   int *snap_err)
 > >  {
 > >      struct super_block *sb = journal->j_private;
 > >      struct ext4_sb_info *sbi = EXT4_SB(sb);
 > > @@ -1221,6 +1245,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > >      alloc_ctx = ext4_fc_lock(sb);
 > >      list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 > >          if (i >= inodes_size) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_INODES_CAP);
 > >              ret = -E2BIG;
 > >              goto unlock;
 > >          }
 > > @@ -1244,6 +1270,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > >              continue;
 > >  
 > >          if (i >= inodes_size) {
 > > +            ext4_fc_set_snap_err(snap_err,
 > > +                         EXT4_FC_SNAP_ERR_INODES_CAP);
 > >              ret = -E2BIG;
 > >              goto unlock;
 > >          }
 > > @@ -1268,16 +1296,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 > >          unsigned int inode_ranges = 0;
 > >  
 > >          ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
 > > -                         &inode_ranges);
 > > +                         &inode_ranges, snap_err);
 > >          if (ret)
 > >              break;
 > >          nr_ranges += inode_ranges;
 > >      }
 > >  
 > > +    if (nr_inodesp)
 > > +        *nr_inodesp = i;
 > > +    if (nr_rangesp)
 > > +        *nr_rangesp = nr_ranges;
 > >      return ret;
 > >  }
 > >  
 > > -static int ext4_fc_perform_commit(journal_t *journal)
 > > +static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 > >  {
 > >      struct super_block *sb = journal->j_private;
 > >      struct ext4_sb_info *sbi = EXT4_SB(sb);
 > > @@ -1286,10 +1318,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      struct inode *inode;
 > >      struct inode **inodes;
 > >      unsigned int inodes_size;
 > > +    unsigned int snap_inodes = 0;
 > > +    unsigned int snap_ranges = 0;
 > > +    int snap_err = EXT4_FC_SNAP_ERR_NONE;
 > >      struct blk_plug plug;
 > >      int ret = 0;
 > >      u32 crc = 0;
 > >      int alloc_ctx;
 > > +    ktime_t lock_start;
 > > +    u64 locked_ns;
 > >  
 > >      /*
 > >       * Step 1: Mark all inodes on s_fc_q[MAIN] with
 > > @@ -1337,13 +1374,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      if (ret)
 > >          return ret;
 > >  
 > > -
 > >      ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
 > >      if (ret)
 > >          return ret;
 > >  
 > >      /* Step 4: Mark all inodes as being committed. */
 > >      jbd2_journal_lock_updates(journal);
 > > +    lock_start = ktime_get();
 > >      /*
 > >       * The journal is now locked. No more handles can start and all the
 > >       * previous handles are now drained. Snapshotting happens in this
 > > @@ -1357,8 +1394,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      }
 > >      ext4_fc_unlock(sb, alloc_ctx);
 > >  
 > > -    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
 > > +    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
 > > +                      &snap_inodes, &snap_ranges, &snap_err);
 > >      jbd2_journal_unlock_updates(journal);
 > > +    locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
 > 
 > If locked_ns is only used for the tracepoint, it should either be
 > calculated in the tracepoint, or add:
 > 
 >     if (trace_ext4_fc_lock_updates_enabled()) {
 >         locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
 
Good catch!

 > > +    trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns, snap_inodes,
 > > +                   snap_ranges, ret, snap_err);
 > 
 >     }
 > 
 > Note, we are also going to add code to call the tracepoint directly, to
 > remove the double static_branch.
 > 
 >     https://lore.kernel.org/all/20260312150523.2054552-1-vineeth@bitbyteword.org/
 > 
 > But that code is still being worked on so you don't need to worry about it
 > at the moment.

Thank you! I will monitor this patch set and incorporate it into future versions if possible.

 > 
 > >      kvfree(inodes);
 > >      if (ret)
 > >          return ret;
 > > @@ -1563,7 +1604,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 > >          journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
 > >      set_task_ioprio(current, journal_ioprio);
 > >      fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
 > > -    ret = ext4_fc_perform_commit(journal);
 > > +    ret = ext4_fc_perform_commit(journal, commit_tid);
 > >      if (ret < 0) {
 > >          if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
 > >              status = EXT4_FC_STATUS_INELIGIBLE;
 > > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
 > > index fd76d14c2776e..dc084f39b74ad 100644
 > > --- a/include/trace/events/ext4.h
 > > +++ b/include/trace/events/ext4.h
 > > @@ -104,6 +104,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
 > >  TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
 > >  TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
 > >  
 > > +#undef EM
 > > +#undef EMe
 > > +#define EM(a)    TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
 > > +#define EMe(a)    TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
 > > +
 > > +#define TRACE_SNAP_ERR                        \
 > > +    EM(NONE)                        \
 > > +    EM(ES_MISS)                        \
 > > +    EM(ES_DELAYED)                        \
 > > +    EM(ES_OTHER)                        \
 > > +    EM(INODES_CAP)                        \
 > > +    EM(RANGES_CAP)                        \
 > > +    EM(NOMEM)                        \
 > > +    EMe(INODE_LOC)
 > > +
 > > +TRACE_SNAP_ERR
 > > +
 > > +#undef EM
 > > +#undef EMe
 > > +
 > >  #define show_fc_reason(reason)                        \
 > >      __print_symbolic(reason,                    \
 > >          { EXT4_FC_REASON_XATTR,        "XATTR"},        \
 > > @@ -2812,6 +2832,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
 > >            __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
 > >  );
 > >  
 > > +#define EM(a)    { EXT4_FC_SNAP_ERR_##a, #a },
 > > +#define EMe(a)    { EXT4_FC_SNAP_ERR_##a, #a }
 > > +
 > > +TRACE_EVENT(ext4_fc_lock_updates,
 > > +        TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
 > > +             unsigned int nr_inodes, unsigned int nr_ranges, int err,
 > > +             int snap_err),
 > > +
 > > +    TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
 > > +
 > > +    TP_STRUCT__entry(/* entry */
 > > +        __field(dev_t, dev)
 > > +        __field(tid_t, tid)
 > > +        __field(u64, locked_ns)
 > > +        __field(unsigned int, nr_inodes)
 > > +        __field(unsigned int, nr_ranges)
 > > +        __field(int, err)
 > > +        __field(int, snap_err)
 > > +    ),
 > > +
 > > +    TP_fast_assign(/* assign */
 > > +        __entry->dev = sb->s_dev;
 > > +        __entry->tid = commit_tid;
 > > +        __entry->locked_ns = locked_ns;
 > > +        __entry->nr_inodes = nr_inodes;
 > > +        __entry->nr_ranges = nr_ranges;
 > > +        __entry->err = err;
 > > +        __entry->snap_err = snap_err;
 > > +    ),
 > > +
 > > +    TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
 > > +          MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
 > > +          __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
 > > +          __entry->err, __print_symbolic(__entry->snap_err,
 > > +                         TRACE_SNAP_ERR))
 > > +);
 > > +
 > > +#undef EM
 > > +#undef EMe
 > > +#undef TRACE_SNAP_ERR
 > > +
 > >  #define FC_REASON_NAME_STAT(reason)                    \
 > >      show_fc_reason(reason),                        \
 > >      __entry->fc_ineligible_rc[reason]
 > 
 > 

Regards,
Li


^ permalink raw reply

* [PATCH v5 3/3] PCI: dw-rockchip: Add pcie_ltssm_state_transition trace support
From: Shawn Lin @ 2026-03-25  1:58 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Bjorn Helgaas
  Cc: linux-rockchip, linux-pci, linux-trace-kernel, linux-doc,
	Steven Rostedt, Shawn Lin
In-Reply-To: <1774403912-210670-1-git-send-email-shawn.lin@rock-chips.com>

Rockchip platforms provide a 64x4-byte debug FIFO to trace the
LTSSM history. Any LTSSM change will be recorded. It's useful
for debugging purposes, for example when diagnosing link failures.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
---

Changes in v5:
- rebase
- use trace_pcie_ltssm_state_transition_enabled()

Changes in v4:
- skip trace if pci_ltssm_tp_enabled() is false.(Steven)
- wrap into 80 columns(Bjorn)

Changes in v3:
- reorder variables(Mani)
- rename loop to i; rename en to enable(Mani)
- use FIELD_GET(Mani)
- add comment about how the FIFO works(Mani)

Changes in v2:
- use tracepoint

 drivers/pci/controller/dwc/pcie-dw-rockchip.c | 111 ++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
index bb5d1a3..e737103 100644
--- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c
+++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
@@ -22,6 +22,8 @@
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/reset.h>
+#include <linux/workqueue.h>
+#include <trace/events/pci_controller.h>
 
 #include "../../pci.h"
 #include "pcie-designware.h"
@@ -73,6 +75,20 @@
 #define  PCIE_CLIENT_CDM_RASDES_TBA_L1_1	BIT(4)
 #define  PCIE_CLIENT_CDM_RASDES_TBA_L1_2	BIT(5)
 
+/* Debug FIFO information */
+#define PCIE_CLIENT_DBG_FIFO_MODE_CON	0x310
+#define  PCIE_CLIENT_DBG_EN		0xffff0007
+#define  PCIE_CLIENT_DBG_DIS		0xffff0000
+#define PCIE_CLIENT_DBG_FIFO_PTN_HIT_D0	0x320
+#define PCIE_CLIENT_DBG_FIFO_PTN_HIT_D1	0x324
+#define PCIE_CLIENT_DBG_FIFO_TRN_HIT_D0	0x328
+#define PCIE_CLIENT_DBG_FIFO_TRN_HIT_D1	0x32c
+#define  PCIE_CLIENT_DBG_TRANSITION_DATA 0xffff0000
+#define PCIE_CLIENT_DBG_FIFO_STATUS	0x350
+#define  PCIE_DBG_FIFO_RATE_MASK	GENMASK(22, 20)
+#define  PCIE_DBG_FIFO_L1SUB_MASK	GENMASK(10, 8)
+#define PCIE_DBG_LTSSM_HISTORY_CNT	64
+
 /* Hot Reset Control Register */
 #define PCIE_CLIENT_HOT_RESET_CTRL	0x180
 #define  PCIE_LTSSM_APP_DLY2_EN		BIT(1)
@@ -98,6 +114,7 @@ struct rockchip_pcie {
 	struct irq_domain *irq_domain;
 	const struct rockchip_pcie_of_data *data;
 	bool supports_clkreq;
+	struct delayed_work trace_work;
 };
 
 struct rockchip_pcie_of_data {
@@ -208,6 +225,96 @@ static enum dw_pcie_ltssm rockchip_pcie_get_ltssm(struct dw_pcie *pci)
 	return rockchip_pcie_get_ltssm_reg(rockchip) & PCIE_LTSSM_STATUS_MASK;
 }
 
+#ifdef CONFIG_TRACING
+static void rockchip_pcie_ltssm_trace_work(struct work_struct *work)
+{
+	struct rockchip_pcie *rockchip = container_of(work,
+						struct rockchip_pcie,
+						trace_work.work);
+	struct dw_pcie *pci = &rockchip->pci;
+	enum dw_pcie_ltssm state;
+	u32 i, l1ss, prev_val = DW_PCIE_LTSSM_UNKNOWN, rate, val;
+
+	if (!trace_pcie_ltssm_state_transition_enabled())
+		goto skip_trace;
+
+	for (i = 0; i < PCIE_DBG_LTSSM_HISTORY_CNT; i++) {
+		val = rockchip_pcie_readl_apb(rockchip,
+				PCIE_CLIENT_DBG_FIFO_STATUS);
+		rate = FIELD_GET(PCIE_DBG_FIFO_RATE_MASK, val);
+		l1ss = FIELD_GET(PCIE_DBG_FIFO_L1SUB_MASK, val);
+		val = FIELD_GET(PCIE_LTSSM_STATUS_MASK, val);
+
+		/*
+		 * Hardware Mechanism: The ring FIFO employs two tracking
+		 * counters:
+		 * - 'last-read-point': maintains the user's last read position
+		 * - 'last-valid-point': tracks the HW's last state update
+		 *
+		 * Software Handling: When two consecutive LTSSM states are
+		 * identical, it indicates invalid subsequent data in the FIFO.
+		 * In this case, we skip the remaining entries. The dual counter
+		 * design ensures that on the next state transition, reading can
+		 * resume from the last user position.
+		 */
+		if ((i > 0 && val == prev_val) || val > DW_PCIE_LTSSM_RCVRY_EQ3)
+			break;
+
+		state = prev_val = val;
+		if (val == DW_PCIE_LTSSM_L1_IDLE) {
+			if (l1ss == 2)
+				state = DW_PCIE_LTSSM_L1_2;
+			else if (l1ss == 1)
+				state = DW_PCIE_LTSSM_L1_1;
+		}
+
+		trace_pcie_ltssm_state_transition(dev_name(pci->dev),
+				dw_pcie_ltssm_status_string(state),
+				((rate + 1) > pci->max_link_speed) ?
+				PCI_SPEED_UNKNOWN : PCIE_SPEED_2_5GT + rate);
+	}
+
+skip_trace:
+	schedule_delayed_work(&rockchip->trace_work, msecs_to_jiffies(5000));
+}
+
+static void rockchip_pcie_ltssm_trace(struct rockchip_pcie *rockchip,
+				      bool enable)
+{
+	if (enable) {
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_PTN_HIT_D0);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_PTN_HIT_D1);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_TRN_HIT_D0);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_TRANSITION_DATA,
+					 PCIE_CLIENT_DBG_FIFO_TRN_HIT_D1);
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_EN,
+					 PCIE_CLIENT_DBG_FIFO_MODE_CON);
+
+		INIT_DELAYED_WORK(&rockchip->trace_work,
+				  rockchip_pcie_ltssm_trace_work);
+		schedule_delayed_work(&rockchip->trace_work, 0);
+	} else {
+		rockchip_pcie_writel_apb(rockchip,
+					 PCIE_CLIENT_DBG_DIS,
+					 PCIE_CLIENT_DBG_FIFO_MODE_CON);
+		cancel_delayed_work_sync(&rockchip->trace_work);
+	}
+}
+#else
+static void rockchip_pcie_ltssm_trace(struct rockchip_pcie *rockchip,
+				      bool enable)
+{
+}
+#endif
+
 static void rockchip_pcie_enable_ltssm(struct rockchip_pcie *rockchip)
 {
 	rockchip_pcie_writel_apb(rockchip, PCIE_CLIENT_ENABLE_LTSSM,
@@ -291,6 +398,9 @@ static int rockchip_pcie_start_link(struct dw_pcie *pci)
 	 * 100us as we don't know how long should the device need to reset.
 	 */
 	msleep(PCIE_T_PVPERL_MS);
+
+	rockchip_pcie_ltssm_trace(rockchip, true);
+
 	gpiod_set_value_cansleep(rockchip->rst_gpio, 1);
 
 	return 0;
@@ -301,6 +411,7 @@ static void rockchip_pcie_stop_link(struct dw_pcie *pci)
 	struct rockchip_pcie *rockchip = to_rockchip_pcie(pci);
 
 	rockchip_pcie_disable_ltssm(rockchip);
+	rockchip_pcie_ltssm_trace(rockchip, false);
 }
 
 static int rockchip_pcie_host_init(struct dw_pcie_rp *pp)
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH] tracing/osnoise: fix potential deadlock in cpu hotplug
From: hu.shengming @ 2026-03-25  2:25 UTC (permalink / raw)
  To: rostedt
  Cc: mhiramat, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	zhang.run, yang.tao172, ran.xiaokai, luo.haiyang
In-Reply-To: <20260324121918.454d6a7b@gandalf.local.home>

>On Tue, 24 Mar 2026 15:06:16 +0800 (CST)
><hu.shengming@zte.com.cn> wrote:
>
>> From: luohaiyang10243395 <luo.haiyang@zte.com.cn>
>> 
>> The following sequence may lead to a deadlock in cpu hotplug:
>> 
>>   CPU0                        |  CPU1
>>                               |  schedule_work_on
>>                               |
>>   _cpu_down//set CPU1 offline |
>>   cpus_write_lock             |
>>                               |  osnoise_hotplug_workfn
>>                               |    mutex_lock(&interface_lock);
>>                               |    cpus_read_lock();  //wait cpu_hotplug_lock
>>                               |
>>                               |  cpuhp/1
>>                               |    osnoise_cpu_die
>>                               |      kthread_stop
>>                               |        wait_for_completion //wait osnoise/1 exit
>>                               |
>>                               |  osnoise/1
>>                               |    osnoise_sleep
>>                               |      mutex_lock(&interface_lock); //deadlock
>> 
>> Fix by swapping the order of cpus_read_lock() and mutex_lock(&interface_lock).
>
>So the deadlock is due to the "wait_for_completion"?

The osnoise_cpu_init callback returns directly, which may allow another CPU offline task to run.
The offline task holds the cpu_hotplug_lock while waiting for the osnoise task to exit.
osnoise_hotplug_workfn may acquire interface_lock first, causing the offline task to be blocked.
This is an ABBA deadlock.

>How did you find this bug? Inspection, AI, triggered?
>
>Thanks,
>
>-- Steve

We ran autotests on kernel-6.6, which reported the following hung task warning, and we think the same
issue exists in linux-stable.
 [39401.476843] INFO: task cpuhp/7:47 blocked for more than 120 seconds.
 [39401.483196]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39401.490581] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39401.498398] task:cpuhp/7         state:D stack:0     pid:47    ppid:2      flags:0x00000208
 [39401.506739] Call trace:
 [39401.509175]  __switch_to+0x138/0x180
 [39401.512743]  __schedule+0x250/0x5e8
 [39401.516220]  schedule+0x60/0x100
 [39401.519437]  schedule_timeout+0x1a0/0x1c0
 [39401.523437]  wait_for_completion+0xbc/0x190
 [39401.527609]  kthread_stop+0x7c/0x268
 [39401.531175]  stop_kthread+0x8c/0x178
 [39401.534740]  osnoise_cpu_die+0xc/0x18
 [39401.538391]  cpuhp_invoke_callback+0x148/0x580
 [39401.542822]  cpuhp_thread_fun+0xc8/0x1a0
 [39401.546733]  smpboot_thread_fn+0x224/0x250
 [39401.550817]  kthread+0xf8/0x110
 [39401.553947]  ret_from_fork+0x10/0x20
 [39401.557545] INFO: task sh:28856 blocked for more than 120 seconds.
 [39401.563713]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39401.571095] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39401.578912] task:sh              state:D stack:0     pid:28856 ppid:1      flags:0x00800004
 [39401.587251] Call trace:
 [39401.589685]  __switch_to+0x138/0x180
 [39401.593250]  __schedule+0x250/0x5e8
 [39401.596725]  schedule+0x60/0x100
 [39401.599941]  schedule_timeout+0x1a0/0x1c0
 [39401.603940]  wait_for_completion+0xbc/0x190
 [39401.608113]  __flush_work+0x5c/0xa8
 [39401.611590]  work_on_cpu_key+0x88/0xc0
 [39401.615331]  cpu_down_maps_locked+0xd0/0xe8
 [39401.619503]  cpu_device_down+0x38/0x60
 [39401.623240]  cpu_subsys_offline+0x14/0x28
 [39401.627238]  device_offline+0xb8/0x130
 [39401.630976]  online_store+0x64/0xe0
 [39401.634453]  dev_attr_store+0x1c/0x38
 [39401.638104]  sysfs_kf_write+0x48/0x60
 [39401.641756]  kernfs_fop_write_iter+0x118/0x1e8
 [39401.646188]  vfs_write+0x1a4/0x2f8
 [39401.649580]  ksys_write+0x70/0x108
 [39401.652970]  __arm64_sys_write+0x20/0x30
 [39401.656880]  el0_svc_common.constprop.0+0x60/0x138
 [39401.661660]  do_el0_svc+0x20/0x30
 [39401.664964]  el0_svc+0x44/0x1f8
 [39401.668093]  el0t_64_sync_handler+0xf8/0x128
 [39401.672352]  el0t_64_sync+0x17c/0x180
 [39401.875086] INFO: task kworker/7:2:2314252 blocked for more than 121 seconds.
 [39401.882208]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39401.889590] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39401.897406] task:kworker/7:2     state:D stack:0     pid:2314252 ppid:2      flags:0x00000008
 [39401.905917] Workqueue: events osnoise_hotplug_workfn
 [39401.910871] Call trace:
 [39401.913306]  __switch_to+0x138/0x180
 [39401.916870]  __schedule+0x250/0x5e8
 [39401.920345]  schedule+0x60/0x100
 [39401.923561]  percpu_rwsem_wait+0xfc/0x128
 [39401.927559]  __percpu_down_read+0x60/0x198
 [39401.931644]  percpu_down_read.constprop.0+0xac/0xb8
 [39401.936510]  cpus_read_lock+0x14/0x20
 [39401.940160]  osnoise_hotplug_workfn+0x54/0xb0
 [39401.944506]  process_one_work+0x184/0x420
 [39401.948503]  worker_thread+0x2b4/0x3d8
 [39401.952241]  kthread+0xf8/0x110
 [39401.955370]  ret_from_fork+0x10/0x20
 [39402.125508] INFO: task osnoise/0:2356235 blocked for more than 121 seconds.
 [39402.132458]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39402.139840] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39402.147656] task:osnoise/0       state:D stack:0     pid:2356235 ppid:2      flags:0x00000008
 [39402.156168] Call trace:
 [39402.158602]  __switch_to+0x138/0x180
 [39402.162166]  __schedule+0x250/0x5e8
 [39402.165643]  schedule+0x60/0x100
 [39402.168860]  schedule_preempt_disabled+0x28/0x48
 [39402.173466]  __mutex_lock.constprop.0+0x324/0x5f8
 [39402.178158]  __mutex_lock_slowpath+0x18/0x28
 [39402.182416]  mutex_lock+0x64/0x78
 [39402.185720]  osnoise_sleep+0x30/0x130
 [39402.189371]  osnoise_main+0x164/0x190
 [39402.193021]  kthread+0xf8/0x110
 [39402.196149]  ret_from_fork+0x10/0x20
 [39402.199713] INFO: task osnoise/1:2356236 blocked for more than 121 seconds.
 [39402.206661]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39402.214044] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39402.221860] task:osnoise/1       state:D stack:0     pid:2356236 ppid:2      flags:0x00000008
 [39402.230372] Call trace:
 [39402.232804]  __switch_to+0x138/0x180
 [39402.236368]  __schedule+0x250/0x5e8
 [39402.239845]  schedule+0x60/0x100
 [39402.243061]  schedule_preempt_disabled+0x28/0x48
 [39402.247666]  __mutex_lock.constprop.0+0x324/0x5f8
 [39402.252359]  __mutex_lock_slowpath+0x18/0x28
 [39402.256618]  mutex_lock+0x64/0x78
 [39402.259921]  osnoise_sleep+0x30/0x130
 [39402.263572]  osnoise_main+0x164/0x190
 [39402.267223]  kthread+0xf8/0x110
 [39402.270352]  ret_from_fork+0x10/0x20
 [39402.273916] INFO: task osnoise/2:2356237 blocked for more than 121 seconds.
 [39402.280865]       Tainted: G            E      6.6.102-5.2.1.an23.103.aarch64 #1
 [39402.288247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [39402.296064] task:osnoise/2       state:D stack:0     pid:2356237 ppid:2      flags:0x00000008
 [39402.304575] Call trace:
 [39402.307010]  __switch_to+0x138/0x180
 [39402.310574]  __schedule+0x250/0x5e8
 [39402.314051]  schedule+0x60/0x100
 [39402.317268]  schedule_preempt_disabled+0x28/0x48
 [39402.321873]  __mutex_lock.constprop.0+0x324/0x5f8
 [39402.326566]  __mutex_lock_slowpath+0x18/0x28
 [39402.330824]  mutex_lock+0x64/0x78
 [39402.334128]  osnoise_sleep+0x30/0x130
 [39402.337778]  osnoise_main+0x164/0x190
 [39402.341429]  kthread+0xf8/0x110
 [39402.344556]  ret_from_fork+0x10/0x20
 [39402.348120] Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings
 [39402.356295] Kernel panic - not syncing: hung_task: blocked tasks 

Thanks,
Haiyang

>> 
>> Signed-off-by: Luo Haiyang <luo.haiyang@zte.com.cn>
>> ---
>>  kernel/trace/trace_osnoise.c | 10 +++++-----
>>  1 file changed, 5 insertions(+), 5 deletions(-)
>> 
>> diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
>> index dee610e465b9..be6cf0bb3c03 100644
>> --- a/kernel/trace/trace_osnoise.c
>> +++ b/kernel/trace/trace_osnoise.c
>> @@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
>>      if (!osnoise_has_registered_instances())
>>          return;
>> 
>> -    guard(mutex)(&interface_lock);
>>      guard(cpus_read_lock)();
>> +    guard(mutex)(&interface_lock);
>> 
>>      if (!cpu_online(cpu))
>>          return;
>> @@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
>>      if (running)
>>          stop_per_cpu_kthreads();
>> 
>> -    mutex_lock(&interface_lock);
>>      /*
>>       * avoid CPU hotplug operations that might read options.
>>       */
>>      cpus_read_lock();
>> +    mutex_lock(&interface_lock);
>> 
>>      retval = cnt;
>> 
>> @@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
>>              clear_bit(option, &osnoise_options);
>>      }
>> 
>> -    cpus_read_unlock();
>>      mutex_unlock(&interface_lock);
>> +    cpus_read_unlock();
>> 
>>      if (running)
>>          start_per_cpu_kthreads();
>> @@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
>>      if (running)
>>          stop_per_cpu_kthreads();
>> 
>> -    mutex_lock(&interface_lock);
>>      /*
>>       * osnoise_cpumask is read by CPU hotplug operations.
>>       */
>>      cpus_read_lock();
>> +    mutex_lock(&interface_lock);
>> 
>>      cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
>> 
>> -    cpus_read_unlock();
>>      mutex_unlock(&interface_lock);
>> +    cpus_read_unlock();
>> 
>>      if (running)
>>          start_per_cpu_kthreads();

^ permalink raw reply

* [PATCH v13 4/4] ring-buffer: Add persistent ring buffer selftest
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a self-destructive test for the persistent ring buffer. This
will invalidate some sub-buffer pages in the persistent ring buffer
when the kernel panics, and check after reboot whether the number of
detected invalid pages and the total entry_bytes are the same as
recorded.

This ensures that the kernel correctly recovers a partially corrupted
persistent ring buffer at boot.

The test only runs on the persistent ring buffer whose name is
"ptracingtest". The user has to fill it up with events before the
kernel panics.

To run the test, enable CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
and set up the kernel cmdline as follows:

 reserve_mem=20M:2M:trace trace_instance=ptracingtest^traceoff@trace
 panic=1

Then run the following commands after the 1st boot:

 cd /sys/kernel/tracing/instances/ptracingtest
 echo 1 > tracing_on
 echo 1 > events/enable
 sleep 3
 echo c > /proc/sysrq-trigger

After the panic message, the kernel will reboot and run the verification
on the persistent ring buffer, e.g.

 Ring buffer meta [2] invalid buffer page detected
 Ring buffer meta [2] is from previous boot! (318 pages discarded)
 Ring buffer testing [2] invalid pages: PASSED (318/318)
 Ring buffer testing [2] entry_bytes: PASSED (1300476/1300476)

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v10:
  - Add entry_bytes test.
  - Do not compile test code if CONFIG_RING_BUFFER_PERSISTENT_SELFTEST=n.
 Changes in v9:
  - Test also reader pages.
---
 include/linux/ring_buffer.h |    1 +
 kernel/trace/Kconfig        |   15 +++++++++
 kernel/trace/ring_buffer.c  |   69 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        |    4 ++
 4 files changed, 89 insertions(+)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 994f52b34344..0670742b2d60 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -238,6 +238,7 @@ int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
+	RB_FL_TESTING		= 1 << 1,
 };
 
 #ifdef CONFIG_RING_BUFFER
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..094d5511bb17 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1202,6 +1202,21 @@ config RING_BUFFER_VALIDATE_TIME_DELTAS
 	  Only say Y if you understand what this does, and you
 	  still want it enabled. Otherwise say N
 
+config RING_BUFFER_PERSISTENT_SELFTEST
+	bool "Enable persistent ring buffer selftest"
+	depends on RING_BUFFER
+	help
+	  Run a selftest on the persistent ring buffer which names
+	  "ptracingtest" (and its backup) when panic_on_reboot by
+	  invalidating ring buffer pages.
+	  Note that user has to enable events on the persistent ring
+	  buffer manually to fill up ring buffers before rebooting.
+	  Since this invalidates the data on test target ring buffer,
+	  "ptracingtest" persistent ring buffer must not be used for
+	  actual tracing, but only for testing.
+
+	  If unsure, say N
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e5178239f2f9..10443347a6d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -64,6 +64,10 @@ struct ring_buffer_cpu_meta {
 	unsigned long	commit_buffer;
 	__u32		subbuf_size;
 	__u32		nr_subbufs;
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
+	__u32		nr_invalid;
+	__u32		entry_bytes;
+#endif
 	int		buffers[];
 };
 
@@ -2077,6 +2081,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 	pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
 		cpu_buffer->cpu, discarded);
+
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
+	if (meta->nr_invalid)
+		pr_info("Ring buffer testing [%d] invalid pages: %s (%d/%d)\n",
+			cpu_buffer->cpu,
+			(discarded == meta->nr_invalid) ? "PASSED" : "FAILED",
+			discarded, meta->nr_invalid);
+	if (meta->entry_bytes)
+		pr_info("Ring buffer testing [%d] entry_bytes: %s (%ld/%ld)\n",
+			cpu_buffer->cpu,
+			(entry_bytes == meta->entry_bytes) ? "PASSED" : "FAILED",
+			(long)entry_bytes, (long)meta->entry_bytes);
+#endif
 	return;
 
  invalid:
@@ -2557,12 +2574,64 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_SELFTEST
+static void rb_test_inject_invalid_pages(struct trace_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_cpu_meta *meta;
+	struct buffer_data_page *dpage;
+	u32 entry_bytes = 0;
+	unsigned long ptr;
+	int subbuf_size;
+	int invalid = 0;
+	int cpu;
+	int i;
+
+	if (!(buffer->flags & RB_FL_TESTING))
+		return;
+
+	guard(preempt)();
+	cpu = smp_processor_id();
+
+	cpu_buffer = buffer->buffers[cpu];
+	meta = cpu_buffer->ring_meta;
+	ptr = (unsigned long)rb_subbufs_from_meta(meta);
+	subbuf_size = meta->subbuf_size;
+
+	for (i = 0; i < meta->nr_subbufs; i++) {
+		int idx = meta->buffers[i];
+
+		dpage = (void *)(ptr + idx * subbuf_size);
+		/* Skip unused pages */
+		if (!local_read(&dpage->commit))
+			continue;
+
+		/* Invalidate even pages. */
+		if (!(i & 0x1)) {
+			local_add(subbuf_size + 1, &dpage->commit);
+			invalid++;
+		} else {
+			/* Count total commit bytes. */
+			entry_bytes += local_read(&dpage->commit);
+		}
+	}
+
+	pr_info("Inject invalidated %d pages on CPU%d, total size: %ld\n",
+		invalid, cpu, (long)entry_bytes);
+	meta->nr_invalid = invalid;
+	meta->entry_bytes = entry_bytes;
+}
+#else /* !CONFIG_RING_BUFFER_PERSISTENT_SELFTEST */
+#define rb_test_inject_invalid_pages(buffer)	do { } while (0)
+#endif
+
 /* Stop recording on a persistent buffer and flush cache if needed. */
 static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
 {
 	struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
 
 	ring_buffer_record_off(buffer);
+	rb_test_inject_invalid_pages(buffer);
 	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
 	return NOTIFY_DONE;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4189ec9df6a5..108b0d16badf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9366,6 +9366,8 @@ static void setup_trace_scratch(struct trace_array *tr,
 	memset(tscratch, 0, size);
 }
 
+#define TRACE_TEST_PTRACING_NAME	"ptracingtest"
+
 static int
 allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned long size)
 {
@@ -9378,6 +9380,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned
 	buf->tr = tr;
 
 	if (tr->range_addr_start && tr->range_addr_size) {
+		if (!strcmp(tr->name, TRACE_TEST_PTRACING_NAME))
+			rb_flags |= RB_FL_TESTING;
 		/* Add scratch buffer to handle 128 modules */
 		buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
 						      tr->range_addr_start,


^ permalink raw reply related

* [PATCH v13 3/4] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Skip invalid sub-buffers when rewinding the persistent ring buffer
instead of stopping the rewind of the ring buffer. The skipped
buffers are cleared.

To ensure the rewinding stops at the unused page, this also clears
buffer_data_page::time_stamp when tracing resets the buffer. This
allows us to identify unused pages and empty pages.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v12:
   - Fix build error.
 Changes in v11:
   - Reset timestamp when the buffer is invalid.
   - When rewinding, skip subbuf page if timestamp is wrong and
     check timestamp after validating buffer data page.
 Changes in v10:
   - Newly added.
---
 kernel/trace/ring_buffer.c |   76 +++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 33 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 31cad8edd488..e5178239f2f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -363,6 +363,7 @@ struct buffer_page {
 static void rb_init_page(struct buffer_data_page *bpage)
 {
 	local_set(&bpage->commit, 0);
+	bpage->time_stamp = 0;
 }
 
 static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
@@ -1878,12 +1879,14 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+static int rb_validate_buffer(struct buffer_page *bpage, int cpu,
 			      struct ring_buffer_cpu_meta *meta)
 {
+	struct buffer_data_page *dpage = bpage->page;
 	unsigned long long ts;
 	unsigned long tail;
 	u64 delta;
+	int ret = -1;
 
 	/*
 	 * When a sub-buffer is recovered from a read, the commit value may
@@ -1892,9 +1895,17 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
 	 * subbuf_size is considered invalid.
 	 */
 	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
-	if (tail > meta->subbuf_size)
-		return -1;
-	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+	if (tail <= meta->subbuf_size)
+		ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+
+	if (ret < 0) {
+		local_set(&bpage->entries, 0);
+		local_set(&bpage->page->commit, 0);
+	} else {
+		local_set(&bpage->entries, ret);
+	}
+
+	return ret;
 }
 
 /* If the meta data has been validated, now validate the events */
@@ -1915,18 +1926,14 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_head = head_page = cpu_buffer->head_page;
 
 	/* Do the reader page first */
-	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu, meta);
+	ret = rb_validate_buffer(cpu_buffer->reader_page, cpu_buffer->cpu, meta);
 	if (ret < 0) {
 		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
 			cpu_buffer->cpu);
 		discarded++;
-		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
-		local_set(&cpu_buffer->reader_page->entries, 0);
-		local_set(&cpu_buffer->reader_page->page->commit, 0);
 	} else {
 		entries += ret;
 		entry_bytes += rb_page_size(cpu_buffer->reader_page);
-		local_set(&cpu_buffer->reader_page->entries, ret);
 	}
 
 	ts = head_page->page->time_stamp;
@@ -1945,26 +1952,33 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->tail_page)
 			break;
 
-		/* Ensure the page has older data than head. */
-		if (ts < head_page->page->time_stamp)
-			break;
-
-		ts = head_page->page->time_stamp;
-		/* Ensure the page has correct timestamp and some data. */
-		if (!ts || rb_page_commit(head_page) == 0)
-			break;
-
-		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
-		if (ret < 0)
+		/* Rewind until unused page (no timestamp, no commit). */
+		if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0)
 			break;
 
-		/* Recover the number of entries and update stats. */
-		local_set(&head_page->entries, ret);
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-		entries += ret;
-		entry_bytes += rb_page_commit(head_page);
+		/*
+		 * Skip if the page is invalid, or its timestamp is newer than the
+		 * previous valid page.
+		 */
+		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
+		if (ret >= 0 && ts < head_page->page->time_stamp) {
+			local_set(&head_page->entries, 0);
+			local_set(&head_page->page->commit, 0);
+			head_page->page->time_stamp = ts;
+			ret = -1;
+		}
+		if (ret < 0) {
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+		} else {
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			if (ret > 0)
+				local_inc(&cpu_buffer->pages_touched);
+			ts = head_page->page->time_stamp;
+		}
 	}
 	if (i)
 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -2034,15 +2048,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->reader_page)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
+		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
 		if (ret < 0) {
 			if (!discarded)
 				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
 					cpu_buffer->cpu);
 			discarded++;
-			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
-			local_set(&head_page->entries, 0);
-			local_set(&head_page->page->commit, 0);
 		} else {
 			/* If the buffer has content, update pages_touched */
 			if (ret)
@@ -2050,7 +2061,6 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 			entries += ret;
 			entry_bytes += rb_page_size(head_page);
-			local_set(&head_page->entries, ret);
 		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
@@ -2081,7 +2091,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	/* Reset all the subbuffers */
 	for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
 		local_set(&head_page->entries, 0);
-		local_set(&head_page->page->commit, 0);
+		rb_init_page(head_page->page);
 	}
 }
 


^ permalink raw reply related

* [PATCH v13 2/4] ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Skip invalid sub-buffers when validating the persistent ring buffer
instead of discarding the entire ring buffer. Only skipped buffers
are invalidated (cleared).

If the cache data in memory fails to be synchronized during a reboot,
the persistent ring buffer may become partially corrupted, but other
sub-buffers may still contain readable event data. Only discard the
subbuffers that are found to be corrupted.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
  Changes in v11:
  - Fix a typo.
  Changes in v9:
  - Add meta->subbuf_size check.
  - Fix a typo.
  - Handle invalid reader_page case.
  Changes in v8:
  - Add comment in rb_validate_buffer()
  - Clear the RB_MISSED_* flags in rb_validate_buffer() instead of
    skipping subbuf.
  - Remove unused subbuf local variable from rb_cpu_meta_valid().
  Changes in v7:
  - Combined with Handling RB_MISSED_* flags patch, focus on validation at boot.
  - Remove checking subbuffer data when validating metadata, because it should be done
    later.
  - Do not mark the discarded sub buffer page but just reset it.
  Changes in v6:
  - Show invalid page detection message once per CPU.
  Changes in v5:
  - Instead of showing errors for each page, just show the number
    of discarded pages at last.
  Changes in v3:
  - Record missed data event on commit.
---
 kernel/trace/ring_buffer.c |   98 ++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 40 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3e793bd1c134..31cad8edd488 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,12 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 	return local_read(&bpage->page->commit);
 }
 
+/* Size is determined by what has been committed */
+static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
+{
+	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	/* Range pages are not to be freed */
@@ -1762,7 +1768,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			      unsigned long *subbuf_mask)
 {
 	int subbuf_size = PAGE_SIZE;
-	struct buffer_data_page *subbuf;
 	unsigned long buffers_start;
 	unsigned long buffers_end;
 	int i;
@@ -1770,6 +1775,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 	if (!subbuf_mask)
 		return false;
 
+	if (meta->subbuf_size != PAGE_SIZE) {
+		pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu);
+		return false;
+	}
+
 	buffers_start = meta->first_buffer;
 	buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
 
@@ -1786,11 +1796,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 		return false;
 	}
 
-	subbuf = rb_subbufs_from_meta(meta);
-
 	bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
 
-	/* Is the meta buffers and the subbufs themselves have correct data? */
+	/*
+	 * Ensure the meta::buffers array has correct data. The data in each subbufs
+	 * are checked later in rb_meta_validate_events().
+	 */
 	for (i = 0; i < meta->nr_subbufs; i++) {
 		if (meta->buffers[i] < 0 ||
 		    meta->buffers[i] >= meta->nr_subbufs) {
@@ -1798,18 +1809,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			return false;
 		}
 
-		if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
-			pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
-			return false;
-		}
-
 		if (test_bit(meta->buffers[i], subbuf_mask)) {
 			pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
 			return false;
 		}
 
 		set_bit(meta->buffers[i], subbuf_mask);
-		subbuf = (void *)subbuf + subbuf_size;
 	}
 
 	return true;
@@ -1873,13 +1878,22 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+			      struct ring_buffer_cpu_meta *meta)
 {
 	unsigned long long ts;
+	unsigned long tail;
 	u64 delta;
-	int tail;
 
-	tail = local_read(&dpage->commit);
+	/*
+	 * When a sub-buffer is recovered from a read, the commit value may
+	 * have RB_MISSED_* bits set, as these bits are reset on reuse.
+	 * Even after clearing these bits, a commit value greater than the
+	 * subbuf_size is considered invalid.
+	 */
+	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
+	if (tail > meta->subbuf_size)
+		return -1;
 	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
 }
 
@@ -1890,6 +1904,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	struct buffer_page *head_page, *orig_head;
 	unsigned long entry_bytes = 0;
 	unsigned long entries = 0;
+	int discarded = 0;
 	int ret;
 	u64 ts;
 	int i;
@@ -1900,14 +1915,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_head = head_page = cpu_buffer->head_page;
 
 	/* Do the reader page first */
-	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu, meta);
 	if (ret < 0) {
-		pr_info("Ring buffer reader page is invalid\n");
-		goto invalid;
+		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
+			cpu_buffer->cpu);
+		discarded++;
+		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+		local_set(&cpu_buffer->reader_page->entries, 0);
+		local_set(&cpu_buffer->reader_page->page->commit, 0);
+	} else {
+		entries += ret;
+		entry_bytes += rb_page_size(cpu_buffer->reader_page);
+		local_set(&cpu_buffer->reader_page->entries, ret);
 	}
-	entries += ret;
-	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
-	local_set(&cpu_buffer->reader_page->entries, ret);
 
 	ts = head_page->page->time_stamp;
 
@@ -1935,7 +1955,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 			break;
 
 		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0)
 			break;
 
@@ -2014,21 +2034,24 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->reader_page)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0) {
-			pr_info("Ring buffer meta [%d] invalid buffer page\n",
-				cpu_buffer->cpu);
-			goto invalid;
-		}
-
-		/* If the buffer has content, update pages_touched */
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-
-		entries += ret;
-		entry_bytes += local_read(&head_page->page->commit);
-		local_set(&head_page->entries, ret);
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+			local_set(&head_page->entries, 0);
+			local_set(&head_page->page->commit, 0);
+		} else {
+			/* If the buffer has content, update pages_touched */
+			if (ret)
+				local_inc(&cpu_buffer->pages_touched);
 
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			local_set(&head_page->entries, ret);
+		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
 	}
@@ -2042,7 +2065,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->entries, entries);
 	local_set(&cpu_buffer->entries_bytes, entry_bytes);
 
-	pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+	pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
+		cpu_buffer->cpu, discarded);
 	return;
 
  invalid:
@@ -3329,12 +3353,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	return NULL;
 }
 
-/* Size is determined by what has been committed */
-static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
-{
-	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
-}
-
 static __always_inline unsigned
 rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 {


^ permalink raw reply related

* [PATCH v13 1/4] ring-buffer: Flush and stop persistent ring buffer on panic
From: Masami Hiramatsu (Google) @ 2026-03-25  2:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177440549083.1529621.15486836623498328967.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

On real hardware, panic and machine reboot may not flush hardware cache
to memory. This means the persistent ring buffer, which relies on a
coherent state of memory, may not have its events written to the buffer
and they may be lost. Moreover, there may be inconsistency with the
counters which are used for validation of the integrity of the
persistent ring buffer which may cause all data to be discarded.

To avoid this issue, stop recording of the ring buffer on panic and
flush the cache of the ring buffer's memory.

Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v13:
   - Fix a rebase conflict.
 Changes in v11:
   - Do nothing by default since flush_cache_vmap() does nothing on x86
     but it can cause deadlock on some architectures via on_each_cpu()
     because other CPUs will be stopped when the panic notifier is called.
 Changes in v9:
   - Fix typo of & to &&.
   - Fix typo of "Generic"
 Changes in v6:
   - Introduce asm/ring_buffer.h for arch_ring_buffer_flush_range().
   - Use flush_cache_vmap() instead of flush_cache_all().
 Changes in v5:
   - Use ring_buffer_record_off() instead of ring_buffer_record_disable().
   - Use flush_cache_all() to ensure flush all cache.
 Changes in v3:
   - update patch description.
---
 arch/alpha/include/asm/Kbuild        |    1 +
 arch/arc/include/asm/Kbuild          |    1 +
 arch/arm/include/asm/Kbuild          |    1 +
 arch/arm64/include/asm/ring_buffer.h |   10 ++++++++++
 arch/csky/include/asm/Kbuild         |    1 +
 arch/hexagon/include/asm/Kbuild      |    1 +
 arch/loongarch/include/asm/Kbuild    |    1 +
 arch/m68k/include/asm/Kbuild         |    1 +
 arch/microblaze/include/asm/Kbuild   |    1 +
 arch/mips/include/asm/Kbuild         |    1 +
 arch/nios2/include/asm/Kbuild        |    1 +
 arch/openrisc/include/asm/Kbuild     |    1 +
 arch/parisc/include/asm/Kbuild       |    1 +
 arch/powerpc/include/asm/Kbuild      |    1 +
 arch/riscv/include/asm/Kbuild        |    1 +
 arch/s390/include/asm/Kbuild         |    1 +
 arch/sh/include/asm/Kbuild           |    1 +
 arch/sparc/include/asm/Kbuild        |    1 +
 arch/um/include/asm/Kbuild           |    1 +
 arch/x86/include/asm/Kbuild          |    1 +
 arch/xtensa/include/asm/Kbuild       |    1 +
 include/asm-generic/ring_buffer.h    |   13 +++++++++++++
 kernel/trace/ring_buffer.c           |   22 ++++++++++++++++++++++
 23 files changed, 65 insertions(+)
 create mode 100644 arch/arm64/include/asm/ring_buffer.h
 create mode 100644 include/asm-generic/ring_buffer.h

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 483965c5a4de..b154b4e3dfa8 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += agp.h
 generic-y += asm-offsets.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 4c69522e0328..483caacc6988 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -5,5 +5,6 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 03657ff8fbe3..decad5f2c826 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += early_ioremap.h
 generic-y += extable.h
 generic-y += flat.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 
 generated-y += mach-types.h
 generated-y += unistd-nr.h
diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h
new file mode 100644
index 000000000000..62316c406888
--- /dev/null
+++ b/arch/arm64/include/asm/ring_buffer.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_ARM64_RING_BUFFER_H
+#define _ASM_ARM64_RING_BUFFER_H
+
+#include <asm/cacheflush.h>
+
+/* Flush D-cache on persistent ring buffer */
+#define arch_ring_buffer_flush_range(start, end)	dcache_clean_pop(start, end)
+
+#endif /* _ASM_ARM64_RING_BUFFER_H */
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 3a5c7f6e5aac..7dca0c6cdc84 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += qrwlock.h
 generic-y += qrwlock_types.h
 generic-y += qspinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
 generic-y += text-patching.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 1efa1e993d4b..0f887d4238ed 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += extable.h
 generic-y += iomap.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 9034b583a88a..7e92957baf6a 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -10,5 +10,6 @@ generic-y += qrwlock.h
 generic-y += user.h
 generic-y += ioctl.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
 generic-y += statfs.h
 generic-y += text-patching.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index b282e0dd8dc1..62543bf305ff 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -3,5 +3,6 @@ generated-y += syscall_table.h
 generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += spinlock.h
 generic-y += text-patching.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 7178f990e8b3..0030309b47ad 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += syscalls.h
 generic-y += tlb.h
 generic-y += user.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 684569b2ecd6..9771c3d85074 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 28004301c236..0a2530964413 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += cmpxchg.h
 generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += spinlock.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index cef49d60d74c..8aa34621702d 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += spinlock_types.h
 generic-y += spinlock.h
 generic-y += qrwlock_types.h
 generic-y += qrwlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 4fb596d94c89..d48d158f7241 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2e23533b67e3..805b5aeebb6f 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h
 generic-y += agp.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
+generic-y += ring_buffer.h
 generic-y += early_ioremap.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index bd5fc9403295..7721b63642f4 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h
 generic-y += qrwlock.h
 generic-y += qrwlock_types.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 80bad7de7a04..0c1fc47c3ba0 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,3 +7,4 @@ generated-y += unistd_nr.h
 generic-y += asm-offsets.h
 generic-y += mcs_spinlock.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 4d3f10ed8275..f0403d3ee8ab 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -3,4 +3,5 @@ generated-y += syscall_table.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 17ee8a273aa6..49c6bb326b75 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 1b9b82bbe322..2a1629ba8140 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += module.lds.h
 generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += ring_buffer.h
 generic-y += runtime-const.h
 generic-y += softirq_stack.h
 generic-y += switch_to.h
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4566000e15c4..078fd2c0d69d 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -14,3 +14,4 @@ generic-y += early_ioremap.h
 generic-y += fprobe.h
 generic-y += mcs_spinlock.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 13fe45dea296..e57af619263a 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h
new file mode 100644
index 000000000000..201d2aee1005
--- /dev/null
+++ b/include/asm-generic/ring_buffer.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic arch dependent ring_buffer macros.
+ */
+#ifndef __ASM_GENERIC_RING_BUFFER_H__
+#define __ASM_GENERIC_RING_BUFFER_H__
+
+#include <linux/cacheflush.h>
+
+/* Flush cache on ring buffer range if needed. Do nothing by default. */
+#define arch_ring_buffer_flush_range(start, end)	do { } while (0)
+
+#endif /* __ASM_GENERIC_RING_BUFFER_H__ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b6c39bba56d..3e793bd1c134 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
 #include <linux/ring_buffer_types.h>
 #include <linux/sched/isolation.h>
 #include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
 #include <linux/trace_events.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
 #include <linux/oom.h>
 #include <linux/mm.h>
 
+#include <asm/ring_buffer.h>
 #include <asm/local64.h>
 #include <asm/local.h>
 #include <asm/setup.h>
@@ -559,6 +561,7 @@ struct trace_buffer {
 
 	unsigned long			range_addr_start;
 	unsigned long			range_addr_end;
+	struct notifier_block		flush_nb;
 
 	struct ring_buffer_meta		*meta;
 
@@ -2520,6 +2523,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+	ring_buffer_record_off(buffer);
+	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+	return NOTIFY_DONE;
+}
+
 static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 					 int order, unsigned long start,
 					 unsigned long end,
@@ -2650,6 +2663,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 
 	mutex_init(&buffer->mutex);
 
+	/* Persistent ring buffer needs to flush cache before reboot. */
+	if (start && end) {
+		buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+		atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+	}
+
 	return_ptr(buffer);
 
  fail_free_buffers:
@@ -2748,6 +2767,9 @@ ring_buffer_free(struct trace_buffer *buffer)
 {
 	int cpu;
 
+	if (buffer->range_addr_start && buffer->range_addr_end)
+		atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
 	cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
 
 	irq_work_sync(&buffer->irq_work.work);


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox