Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* Re: [PATCH net-next v4 13/13] net/mlx5: Add a shared devlink instance for PFs on same chip
From: Adam Young @ 2026-03-20 23:16 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, edumazet, kuba, pabeni, horms, donald.hunter, corbet,
	skhan, saeedm, leon, tariqt, mbloch, przemyslaw.kitszel, mschmidt,
	andrew+netdev, rostedt, mhiramat, mathieu.desnoyers, chuck.lever,
	matttbe, cjubran, daniel.zahka, linux-doc, linux-rdma,
	linux-trace-kernel
In-Reply-To: <20260312100407.551173-14-jiri@resnulli.us>

This breaks on my system:

On 7.0.0 It boots fine.  With net-next/main currently at this commit


commit 8737d7194d6d5947c3d7d8813895b44a25b84477 (net-next/main, 
net-next/HEAD)
Author: Lorenzo Bianconi <lorenzo@kernel.org>
Date:   Fri Mar 13 17:28:36 2026 +0100

I get:

[   21.859081] mlx5_core 0005:01:00.0: probe_one:2017:(pid 10): 
mlx5_shd_init failed with error code -2
[   21.863266] mlx5_core 0005:01:00.0: probe with driver mlx5_core 
failed with error -2
[   21.866360] mlx5_core 0005:01:00.1: probe_one:2017:(pid 10): 
mlx5_shd_init failed with error code -2
[   21.869937] mlx5_core 0005:01:00.1: probe with driver mlx5_core 
failed with error -2


I am happy to help debug:   what do you need from me?


On 3/12/26 06:04, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@nvidia.com>
>
> Use the previously introduced shared devlink infrastructure to create
> a shared devlink instance for mlx5 PFs that reside on the same physical
> chip. The shared instance is identified by the chip's serial number
> extracted from PCI VPD (V3 keyword, with fallback to serial number
> for older devices).
>
> Each PF that probes calls mlx5_shd_init() which extracts the chip serial
> number and uses devlink_shd_get() to get or create the shared instance.
> When a PF is removed, mlx5_shd_uninit() calls devlink_shd_put()
> to release the reference. The shared instance is automatically destroyed
> when the last PF is removed.
>
> Make the PF devlink instances nested in this shared devlink instance,
> allowing userspace to identify which PFs belong to the same physical
> chip.
>
> Example:
>
> pci/0000:08:00.0: index 0
>    nested_devlink:
>      auxiliary/mlx5_core.eth.0
> devlink_index/1: index 1
>    nested_devlink:
>      pci/0000:08:00.0
>      pci/0000:08:00.1
> auxiliary/mlx5_core.eth.0: index 2
> pci/0000:08:00.1: index 3
>    nested_devlink:
>      auxiliary/mlx5_core.eth.1
> auxiliary/mlx5_core.eth.1: index 4
>
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---
> v2->v3:
> - removed "const" from "sn"
> - passing driver pointer to devlink_shd_get()
> ---
>   .../net/ethernet/mellanox/mlx5/core/Makefile  |  5 +-
>   .../net/ethernet/mellanox/mlx5/core/main.c    | 17 ++++++
>   .../ethernet/mellanox/mlx5/core/sh_devlink.c  | 61 +++++++++++++++++++
>   .../ethernet/mellanox/mlx5/core/sh_devlink.h  | 12 ++++
>   include/linux/mlx5/driver.h                   |  1 +
>   5 files changed, 94 insertions(+), 2 deletions(-)
>   create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
>   create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> index 8ffa286a18f5..d39fe9c4a87c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> @@ -16,8 +16,9 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
>   		transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
>   		fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
>   		lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
> -		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
> -		fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o
> +		diag/fw_tracer.o diag/crdump.o devlink.o sh_devlink.o diag/rsc_dump.o \
> +		diag/reporter_vnic.o fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o \
> +		lib/nv_param.o
>   
>   #
>   # Netdev basic
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> index fdc3ba20912e..1c35c3fc3bb3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> @@ -74,6 +74,7 @@
>   #include "mlx5_irq.h"
>   #include "hwmon.h"
>   #include "lag/lag.h"
> +#include "sh_devlink.h"
>   
>   MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
>   MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
> @@ -1520,10 +1521,16 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
>   	int err;
>   
>   	devl_lock(devlink);
> +	if (dev->shd) {
> +		err = devl_nested_devlink_set(dev->shd, devlink);
> +		if (err)
> +			goto unlock;
> +	}
>   	devl_register(devlink);
>   	err = mlx5_init_one_devl_locked(dev);
>   	if (err)
>   		devl_unregister(devlink);
> +unlock:
>   	devl_unlock(devlink);
>   	return err;
>   }
> @@ -2005,6 +2012,13 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
>   		goto pci_init_err;
>   	}
>   
> +	err = mlx5_shd_init(dev);
> +	if (err) {
> +		mlx5_core_err(dev, "mlx5_shd_init failed with error code %d\n",
> +			      err);
> +		goto shd_init_err;
> +	}
> +
>   	err = mlx5_init_one(dev);
>   	if (err) {
>   		mlx5_core_err(dev, "mlx5_init_one failed with error code %d\n",
> @@ -2018,6 +2032,8 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
>   	return 0;
>   
>   err_init_one:
> +	mlx5_shd_uninit(dev);
> +shd_init_err:
>   	mlx5_pci_close(dev);
>   pci_init_err:
>   	mlx5_mdev_uninit(dev);
> @@ -2039,6 +2055,7 @@ static void remove_one(struct pci_dev *pdev)
>   	mlx5_drain_health_wq(dev);
>   	mlx5_sriov_disable(pdev, false);
>   	mlx5_uninit_one(dev);
> +	mlx5_shd_uninit(dev);
>   	mlx5_pci_close(dev);
>   	mlx5_mdev_uninit(dev);
>   	mlx5_adev_idx_free(dev->priv.adev_idx);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> new file mode 100644
> index 000000000000..bc33f95302df
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> @@ -0,0 +1,61 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
> +
> +#include <linux/mlx5/driver.h>
> +#include <net/devlink.h>
> +
> +#include "sh_devlink.h"
> +
> +static const struct devlink_ops mlx5_shd_ops = {
> +};
> +
> +int mlx5_shd_init(struct mlx5_core_dev *dev)
> +{
> +	u8 *vpd_data __free(kfree) = NULL;
> +	struct pci_dev *pdev = dev->pdev;
> +	unsigned int vpd_size, kw_len;
> +	struct devlink *devlink;
> +	char *sn, *end;
> +	int start;
> +	int err;
> +
> +	if (!mlx5_core_is_pf(dev))
> +		return 0;
> +
> +	vpd_data = pci_vpd_alloc(pdev, &vpd_size);
> +	if (IS_ERR(vpd_data)) {
> +		err = PTR_ERR(vpd_data);
> +		return err == -ENODEV ? 0 : err;
> +	}
> +	start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3", &kw_len);
> +	if (start < 0) {
> +		/* Fall-back to SN for older devices. */
> +		start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size,
> +						     PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len);
> +		if (start < 0)
> +			return -ENOENT;
> +	}
> +	sn = kstrndup(vpd_data + start, kw_len, GFP_KERNEL);
> +	if (!sn)
> +		return -ENOMEM;
> +	/* Firmware may return spaces at the end of the string, strip it. */
> +	end = strchrnul(sn, ' ');
> +	*end = '\0';
> +
> +	/* Get or create shared devlink instance */
> +	devlink = devlink_shd_get(sn, &mlx5_shd_ops, 0, pdev->dev.driver);
> +	kfree(sn);
> +	if (!devlink)
> +		return -ENOMEM;
> +
> +	dev->shd = devlink;
> +	return 0;
> +}
> +
> +void mlx5_shd_uninit(struct mlx5_core_dev *dev)
> +{
> +	if (!dev->shd)
> +		return;
> +
> +	devlink_shd_put(dev->shd);
> +}
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> new file mode 100644
> index 000000000000..8ab8d6940227
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
> +
> +#ifndef __MLX5_SH_DEVLINK_H__
> +#define __MLX5_SH_DEVLINK_H__
> +
> +#include <linux/mlx5/driver.h>
> +
> +int mlx5_shd_init(struct mlx5_core_dev *dev);
> +void mlx5_shd_uninit(struct mlx5_core_dev *dev);
> +
> +#endif /* __MLX5_SH_DEVLINK_H__ */
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 04dcd09f7517..1268fcf35ec7 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -798,6 +798,7 @@ struct mlx5_core_dev {
>   	enum mlx5_wc_state wc_state;
>   	/* sync write combining state */
>   	struct mutex wc_state_lock;
> +	struct devlink *shd;
>   };
>   
>   struct mlx5_db {

^ permalink raw reply

* Re: [PATCH] tracing: Adjust cmd_check_undefined to show unexpected undefined symbols
From: Arnd Bergmann @ 2026-03-20 21:34 UTC (permalink / raw)
  To: Nathan Chancellor, Marc Zyngier, Vincent Donnefort
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kvmarm
In-Reply-To: <20260320-cmd_check_undefined-verbose-v1-1-54fc5b061f94@kernel.org>

On Fri, Mar 20, 2026, at 22:29, Nathan Chancellor wrote:
> When the check_undefined command in kernel/trace/Makefile fails, there
> is no output, making it hard to understand why the build failed. Capture
> the output of the $(NM) + grep command and print it when failing to make
> it clearer what the problem is.
>
> Fixes: a717943d8ecc ("tracing: Check for undefined symbols in 
> simple_ring_buffer")
> Signed-off-by: Nathan Chancellor <nathan@kernel.org>

Acked-by: Arnd Bergmann <arnd@arndb.de>

This does seem very helpful, as I still expect this to come up regularly.

      Arnd

^ permalink raw reply

* [PATCH] tracing: Adjust cmd_check_undefined to show unexpected undefined symbols
From: Nathan Chancellor @ 2026-03-20 21:29 UTC (permalink / raw)
  To: Marc Zyngier, Vincent Donnefort
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Arnd Bergmann, linux-kernel, linux-trace-kernel, kvmarm,
	Nathan Chancellor

When the check_undefined command in kernel/trace/Makefile fails, there
is no output, making it hard to understand why the build failed. Capture
the output of the $(NM) + grep command and print it when failing to make
it clearer what the problem is.

Fixes: a717943d8ecc ("tracing: Check for undefined symbols in simple_ring_buffer")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
Commit a717943d8ecc ("tracing: Check for undefined symbols in
simple_ring_buffer") and its follow up fixes are in the kvmarm tree so
this should go there as well. This is the rebased version of my
suggestion in the original thread:

https://lore.kernel.org/20260311221816.GA316631@ax162/
---
 kernel/trace/Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c5e14ffd36ee..d662c1a64cd5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -174,7 +174,13 @@ UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitize
 		      $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
 
 quiet_cmd_check_undefined = NM      $<
-      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST))`"
+      cmd_check_undefined = \
+          undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \
+          if [ -n "$$undefsyms" ]; then \
+              echo "Unexpected symbols in $<:" >&2; \
+              echo "$$undefsyms" >&2; \
+              false; \
+          fi
 
 $(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
 	$(call if_changed,check_undefined)

---
base-commit: e3d585ed3ff891a00c2284fef4be9cf8581735ab
change-id: 20260320-cmd_check_undefined-verbose-7d15f13f615d

Best regards,
--  
Nathan Chancellor <nathan@kernel.org>


^ permalink raw reply related

* Re: [PATCH] coredump: add tracepoint for coredump events
From: Steven Rostedt @ 2026-03-20 18:48 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Breno Leitao, Alexander Viro, Jan Kara, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-fsdevel,
	linux-trace-kernel, bpf, kernel-team, Andrii Nakryiko
In-Reply-To: <20260320-erlitt-ergibt-255e86a66414@brauner>

On Fri, 20 Mar 2026 14:21:23 +0100
Christian Brauner <brauner@kernel.org> wrote:

> > +TRACE_EVENT(coredump,
> > +
> > +	TP_PROTO(int sig),
> > +
> > +	TP_ARGS(sig),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(int, sig)
> > +		__array(char, comm, TASK_COMM_LEN)
> > +		__field(pid_t, pid)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->sig = sig;
> > +		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> > +		__entry->pid = current->pid;  
> 
> That's the TID as seen in the global pid namespace.
> I assume this is what you want but worth noting.

Not to mention the pid is saved in all trace events and is available for
perf and bpf too. Even the change log showed it:

             sleep-634     [036] .....   145.222206: coredump: sig=11 comm=sleep pid=634

                   ^^^                                                               ^^^

So it should not be included. It's duplicate and only wastes space. Now if
you wanted to save the name space pid, that may be useful.

-- Steve

^ permalink raw reply

* Re: [PATCH 3/3] rtla: Parse cmdline using libsubcmd
From: Wander Lairson Costa @ 2026-03-20 17:31 UTC (permalink / raw)
  To: Tomas Glozar
  Cc: Steven Rostedt, John Kacur, Luis Goncalves, Crystal Wood,
	Costa Shulyupin, Ivan Pravdin, Namhyung Kim, Ian Rogers,
	Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
	linux-perf-users
In-Reply-To: <20260320150651.51057-4-tglozar@redhat.com>

On Fri, Mar 20, 2026 at 04:06:51PM +0100, Tomas Glozar wrote:
> Instead of using getopt_long() directly to parse the command line
> arguments given to an RTLA tool, use libsubcmd's parse_options().
> 
> Utilizing libsubcmd for parsing command line arguments has several
> benefits:
> 
> - A help message is automatically generated by libsubcmd from the
>   specification, removing the need of writing it by hand.
> - Options are sorted into groups based on which part of tracing (CPU,
>   thread, auto-analysis, tuning, histogram) they relate to.
> - Common parsing patterns for numerical and boolean values now share
>   code, with the target variable being stored in the option array.
> 
> To avoid duplication of the option parsing logic, RTLA-specific
> macros defining struct option values are created:
> 
> - RTLA_OPT_* for options common to all tools
> - OSNOISE_OPT_* and TIMERLAT_OPT_* for options specific to

[snip]

> -int getopt_auto(int argc, char **argv, const struct option *long_opts);
>  int common_parse_options(int argc, char **argv, struct common_params *common);


The function common_parse_options() body was removed, but the declaration remains.

>  int common_apply_config(struct osnoise_tool *tool, struct common_params *params);

[snip]


^ permalink raw reply

* Re: [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs
From: Laurence Oberman @ 2026-03-20 15:08 UTC (permalink / raw)
  To: Aaron Tomlin, axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: johannes.thumshirn, kch, bvanassche, dlemoal, ritesh.list, neelx,
	sean, mproche, chjohnst, linux-block, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260319221956.332770-3-atomlin@atomlin.com>

On Thu, 2026-03-19 at 18:19 -0400, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED),
> severe
> latency spikes can occur when fast devices are starved of available
> tags.
> 
> This patch introduces two new debugfs attributes for each block
> hardware queue:
>   - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
>   - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag
> 
> These files expose atomic counters that increment each time a
> submitting
> context is forced into an uninterruptible sleep via io_schedule() due
> to
> the complete exhaustion of physical driver tags or software scheduler
> tags, respectively.
> 
> To guarantee zero performance overhead for production kernels
> compiled
> without debugfs, the underlying atomic_t variables and their
> associated
> increment routines are strictly guarded behind CONFIG_BLK_DEBUG_FS.
> When this configuration is disabled, the tracking logic compiles down
> to a safe no-op.
> 
> Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
> ---
>  block/blk-mq-debugfs.c | 56
> ++++++++++++++++++++++++++++++++++++++++++
>  block/blk-mq-debugfs.h |  7 ++++++
>  block/blk-mq-tag.c     |  4 +++
>  include/linux/blk-mq.h | 10 ++++++++
>  4 files changed, 77 insertions(+)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 28167c9baa55..078561d7da38 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -483,6 +483,42 @@ static int hctx_dispatch_busy_show(void *data,
> struct seq_file *m)
>  	return 0;
>  }
>  
> +/**
> + * hctx_wait_on_hw_tag_show - display hardware tag starvation count
> + * @data: generic pointer to the associated hardware context (hctx)
> + * @m: seq_file pointer for debugfs output formatting
> + *
> + * Prints the cumulative number of times a submitting context was
> forced
> + * to block due to the exhaustion of physical hardware driver tags.
> + *
> + * Return: 0 on success.
> + */
> +static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
> +{
> +	struct blk_mq_hw_ctx *hctx = data;
> +
> +	seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_hw_tag));
> +	return 0;
> +}
> +
> +/**
> + * hctx_wait_on_sched_tag_show - display scheduler tag starvation
> count
> + * @data: generic pointer to the associated hardware context (hctx)
> + * @m: seq_file pointer for debugfs output formatting
> + *
> + * Prints the cumulative number of times a submitting context was
> forced
> + * to block due to the exhaustion of software scheduler tags.
> + *
> + * Return: 0 on success.
> + */
> +static int hctx_wait_on_sched_tag_show(void *data, struct seq_file
> *m)
> +{
> +	struct blk_mq_hw_ctx *hctx = data;
> +
> +	seq_printf(m, "%d\n", atomic_read(&hctx-
> >wait_on_sched_tag));
> +	return 0;
> +}
> +
>  #define CTX_RQ_SEQ_OPS(name,
> type)					\
>  static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t
> *pos) \
>  	__acquires(&ctx-
> >lock)						\
> @@ -598,6 +634,8 @@ static const struct blk_mq_debugfs_attr
> blk_mq_debugfs_hctx_attrs[] = {
>  	{"active", 0400, hctx_active_show},
>  	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
>  	{"type", 0400, hctx_type_show},
> +	{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
> +	{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
>  	{},
>  };
>  
> @@ -814,3 +852,21 @@ void blk_mq_debugfs_unregister_sched_hctx(struct
> blk_mq_hw_ctx *hctx)
>  	debugfs_remove_recursive(hctx->sched_debugfs_dir);
>  	hctx->sched_debugfs_dir = NULL;
>  }
> +
> +/**
> + * blk_mq_debugfs_inc_wait_tags - increment the tag starvation
> counters
> + * @hctx: hardware context associated with the tag allocation
> + * @is_sched: boolean indicating whether the starved pool is the
> software scheduler
> + *
> + * Evaluates the exhausted tag pool and increments the appropriate
> debugfs
> + * starvation counter. This is invoked immediately before the
> submitting
> + * context is forced into an uninterruptible sleep via
> io_schedule().
> + */
> +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
> +				  bool is_sched)
> +{
> +	if (is_sched)
> +		atomic_inc(&hctx->wait_on_sched_tag);
> +	else
> +		atomic_inc(&hctx->wait_on_hw_tag);
> +}
> diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
> index 49bb1aaa83dc..2cda555d5730 100644
> --- a/block/blk-mq-debugfs.h
> +++ b/block/blk-mq-debugfs.h
> @@ -34,6 +34,8 @@ void blk_mq_debugfs_register_sched_hctx(struct
> request_queue *q,
>  void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx
> *hctx);
>  
>  void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
> +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
> +				  bool is_sched);
>  #else
>  static inline void blk_mq_debugfs_register(struct request_queue *q)
>  {
> @@ -77,6 +79,11 @@ static inline void
> blk_mq_debugfs_register_rq_qos(struct request_queue *q)
>  {
>  }
>  
> +static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx
> *hctx,
> +						bool is_sched)
> +{
> +}
> +
>  #endif
>  
>  #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 66138dd043d4..3cc6a97a87a0 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -17,6 +17,7 @@
>  #include "blk.h"
>  #include "blk-mq.h"
>  #include "blk-mq-sched.h"
> +#include "blk-mq-debugfs.h"
>  
>  /*
>   * Recalculate wakeup batch when tag is shared by hctx.
> @@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct
> blk_mq_alloc_data *data)
>  		trace_block_rq_tag_wait(data->q, data->hctx,
>  					data->rq_flags &
> RQF_SCHED_TAGS);
>  
> +		blk_mq_debugfs_inc_wait_tags(data->hctx,
> +					     data->rq_flags &
> RQF_SCHED_TAGS);
> +
>  		bt_prev = bt;
>  		io_schedule();
>  
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 18a2388ba581..f3d8ea93b23f 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -453,6 +453,16 @@ struct blk_mq_hw_ctx {
>  	struct dentry		*debugfs_dir;
>  	/** @sched_debugfs_dir:	debugfs directory for the
> scheduler. */
>  	struct dentry		*sched_debugfs_dir;
> +	/**
> +	 * @wait_on_hw_tag: Cumulative counter incremented each time
> a submitting
> +	 * context is forced to block due to physical hardware
> driver tag exhaustion.
> +	 */
> +	atomic_t		wait_on_hw_tag;
> +	/**
> +	 * @wait_on_sched_tag: Cumulative counter incremented each
> time a submitting
> +	 * context is forced to block due to software scheduler tag
> exhaustion.
> +	 */
> +	atomic_t		wait_on_sched_tag;
>  #endif
>  
>  	/**

For [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs

Tested-by: Laurence Oberman <loberman@redhat.com>

Every 10.0s: grep . /sys/kernel/debug/block/nvme0n1/hctx0/wait_on_*   
rhel95: Fri Mar 20 11:04:15 2026

/sys/kernel/debug/block/nvme0n1/hctx0/wait_on_hw_tag:103260 <---
cumulative 
/sys/kernel/debug/block/nvme0n1/hctx0/wait_on_sched_tag:0

The patch to me looks good, but will need others to confirm
Reviewed-by: Laurence Oberman <loberman@redhat.com>


^ permalink raw reply

* [PATCH 3/3] rtla: Parse cmdline using libsubcmd
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar
  Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
	Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
	Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
	linux-perf-users
In-Reply-To: <20260320150651.51057-1-tglozar@redhat.com>

Instead of using getopt_long() directly to parse the command line
arguments given to an RTLA tool, use libsubcmd's parse_options().

Utilizing libsubcmd for parsing command line arguments has several
benefits:

- A help message is automatically generated by libsubcmd from the
  specification, removing the need of writing it by hand.
- Options are sorted into groups based on which part of tracing (CPU,
  thread, auto-analysis, tuning, histogram) they relate to.
- Common parsing patterns for numerical and boolean values now share
  code, with the target variable being stored in the option array.

To avoid duplication of the option parsing logic, RTLA-specific
macros defining struct option values are created:

- RTLA_OPT_* for options common to all tools
- OSNOISE_OPT_* and TIMERLAT_OPT_* for options specific to
  osnoise/timerlat tools
, HIST_OPT_* macros for options specific to histogram-based tools.

Individual *_parse_args() functions then construct an array out of
these macros that is then passed to libsubcmd's parse_options().

All code specific to command line options parsing is moved out of the
individual tool files into a new file, cli.c, which also contains the
contents of the rtla.c file.

The return value of tool-level help option changes to 129, as this is
the value set by libsubcmd; this is reflected in affected test cases.
The implementation of help for command-level and tracer-level help
remains the same.

Assisted-by: Composer:composer-1.5
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/Build           |    2 +-
 tools/tracing/rtla/src/cli.c           | 1207 ++++++++++++++++++++++++
 tools/tracing/rtla/src/cli.h           |    7 +
 tools/tracing/rtla/src/common.c        |  109 ---
 tools/tracing/rtla/src/common.h        |   26 +-
 tools/tracing/rtla/src/osnoise_hist.c  |  221 +----
 tools/tracing/rtla/src/osnoise_top.c   |  200 +---
 tools/tracing/rtla/src/rtla.c          |   89 --
 tools/tracing/rtla/src/timerlat.h      |    4 +-
 tools/tracing/rtla/src/timerlat_hist.c |  317 +------
 tools/tracing/rtla/src/timerlat_top.c  |  285 +-----
 tools/tracing/rtla/src/utils.c         |   28 +-
 tools/tracing/rtla/src/utils.h         |    3 +-
 tools/tracing/rtla/tests/hwnoise.t     |    2 +-
 14 files changed, 1236 insertions(+), 1264 deletions(-)
 create mode 100644 tools/tracing/rtla/src/cli.c
 create mode 100644 tools/tracing/rtla/src/cli.h
 delete mode 100644 tools/tracing/rtla/src/rtla.c

diff --git a/tools/tracing/rtla/src/Build b/tools/tracing/rtla/src/Build
index 329e24a40cf7..a1f3ab927207 100644
--- a/tools/tracing/rtla/src/Build
+++ b/tools/tracing/rtla/src/Build
@@ -11,4 +11,4 @@ rtla-y += timerlat_hist.o
 rtla-y += timerlat_u.o
 rtla-y += timerlat_aa.o
 rtla-y += timerlat_bpf.o
-rtla-y += rtla.o
+rtla-y += cli.o
diff --git a/tools/tracing/rtla/src/cli.c b/tools/tracing/rtla/src/cli.c
new file mode 100644
index 000000000000..d029a698e8a7
--- /dev/null
+++ b/tools/tracing/rtla/src/cli.c
@@ -0,0 +1,1207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#include <linux/kernel.h>
+#include <subcmd/parse-options.h>
+
+#include "cli.h"
+#include "osnoise.h"
+#include "timerlat.h"
+
+struct osnoise_cb_data {
+	struct osnoise_params *params;
+	char *trace_output;
+};
+
+struct timerlat_cb_data {
+	struct timerlat_params *params;
+	char *trace_output;
+};
+
+static const char * const osnoise_top_usage[] = {
+	"rtla osnoise [top] [<options>]",
+	NULL,
+};
+
+static const char * const osnoise_hist_usage[] = {
+	"rtla osnoise hist [<options>]",
+	NULL,
+};
+
+static const char * const timerlat_top_usage[] = {
+	"rtla timerlat [top] [<options>]",
+	NULL,
+};
+
+static const char * const timerlat_hist_usage[] = {
+	"rtla timerlat hist [<options>]",
+	NULL,
+};
+
+static const char * const hwnoise_usage[] = {
+	"rtla hwnoise [<options>]",
+	NULL,
+};
+
+static const int common_parse_options_flags = PARSE_OPT_OPTARG_ALLOW_NEXT;
+
+/*
+ * Macros for command line options common to all tools
+ *
+ * Note: Some of the options are common to both timerlat and osnoise, but
+ * have a slightly different meaning. Such options take additional arguments
+ * that have to be provided by the *_parse_args() function of the corresponding
+ * tool.
+ *
+ * All macros defined here assume the presence of a params variable of
+ * the corresponding tool type (i.e struct timerlat_params or struct osnoise_params)
+ * and a cb_data variable of the matching type.
+ */
+
+#define RTLA_OPT_STOP(short, long, name) OPT_CALLBACK(short, long, \
+	&params->common.stop_us, \
+	"us", \
+	"stop trace if " name " is higher than the argument in us", \
+	opt_llong_callback)
+
+#define RTLA_OPT_STOP_TOTAL(short, long, name) OPT_CALLBACK(short, long, \
+	&params->common.stop_total_us, \
+	"us", \
+	"stop trace if " name " is higher than the argument in us", \
+	opt_llong_callback)
+
+#define RTLA_OPT_TRACE_OUTPUT(tracer, cb) OPT_CALLBACK_OPTARG('t', "trace", \
+	(const char **)&cb_data.trace_output, \
+	tracer "_trace.txt", \
+	"[file]", \
+	"save the stopped trace to [file|" tracer "_trace.txt]", \
+	cb)
+
+#define RTLA_OPT_CPUS OPT_CALLBACK('c', "cpus", &params->common, \
+	"cpu-list", \
+	"run the tracer only on the given cpus", \
+	opt_cpus_cb)
+
+#define RTLA_OPT_CGROUP OPT_CALLBACK_OPTARG('C', "cgroup", &params->common, \
+	"[cgroup_name]", NULL, \
+	"set cgroup, no argument means rtla's cgroup will be inherited", \
+	opt_cgroup_cb)
+
+#define RTLA_OPT_USER_THREADS OPT_CALLBACK('u', "user-threads", params, NULL, \
+	"use rtla user-space threads instead of kernel-space timerlat threads", \
+	opt_user_threads_cb)
+
+#define RTLA_OPT_KERNEL_THREADS OPT_BOOLEAN('k', "kernel-threads", \
+	&params->common.kernel_workload, \
+	"use timerlat kernel-space threads instead of rtla user-space threads")
+
+#define RTLA_OPT_USER_LOAD OPT_BOOLEAN('U', "user-load", &params->common.user_data, \
+	"enable timerlat for user-defined user-space workload")
+
+#define RTLA_OPT_DURATION OPT_CALLBACK('d', "duration", &params->common, \
+	"time[s|m|h|d]", \
+	"set the duration of the session", \
+	opt_duration_cb)
+
+#define RTLA_OPT_EVENT OPT_CALLBACK('e', "event", &params->common.events, \
+	"sys:event", \
+	"enable the <sys:event> in the trace instance, multiple -e are allowed", \
+	opt_event_cb)
+
+#define RTLA_OPT_HOUSEKEEPING OPT_CALLBACK('H', "house-keeping", &params->common, \
+	"cpu-list", \
+	"run rtla control threads only on the given cpus", \
+	opt_housekeeping_cb)
+
+#define RTLA_OPT_PRIORITY OPT_CALLBACK('P', "priority", &params->common, \
+	"o:prio|r:prio|f:prio|d:runtime:period", \
+	"set scheduling parameters", \
+	opt_priority_cb)
+
+#define RTLA_OPT_TRIGGER OPT_CALLBACK(0, "trigger", &params->common, \
+	"trigger", \
+	"enable a trace event trigger to the previous -e event", \
+	opt_trigger_cb)
+
+#define RTLA_OPT_FILTER OPT_CALLBACK(0, "filter", &params->common, \
+	"filter", \
+	"enable a trace event filter to the previous -e event", \
+	opt_filter_cb)
+
+#define RTLA_OPT_QUIET OPT_BOOLEAN('q', "quiet", &params->common.quiet, \
+	"print only a summary at the end")
+
+#define RTLA_OPT_TRACE_BUFFER_SIZE OPT_CALLBACK(0, "trace-buffer-size", \
+	&params->common.buffer_size, "kB", \
+	"set the per-cpu trace buffer size in kB", \
+	opt_int_callback)
+
+#define RTLA_OPT_WARM_UP OPT_CALLBACK(0, "warm-up", &params->common.warmup, "s", \
+	"let the workload run for s seconds before collecting data", \
+	opt_int_callback)
+
+#define RTLA_OPT_AUTO(cb) OPT_CALLBACK('a', "auto", &cb_data, "us", \
+	"set automatic trace mode, stopping the session if argument in us sample is hit", \
+	cb)
+
+#define RTLA_OPT_ON_THRESHOLD(threshold, cb) OPT_CALLBACK(0, "on-threshold", \
+	&params->common.threshold_actions, \
+	"action", \
+	"define action to be executed at " threshold " threshold, multiple are allowed", \
+	cb)
+
+#define RTLA_OPT_ON_END(cb) OPT_CALLBACK(0, "on-end", &params->common.end_actions, \
+	"action", \
+	"define action to be executed at measurement end, multiple are allowed", \
+	cb)
+
+#define RTLA_OPT_DEBUG OPT_BOOLEAN('D', "debug", &config_debug, \
+	"print debug info")
+
+#define RTLA_OPT_HELP OPT_BOOLEAN('h', "help", (bool *)NULL, \
+	"show help")
+
+/*
+ * Common callback functions for command line options
+ */
+
+static int opt_llong_callback(const struct option *opt, const char *arg, int unset)
+{
+	long long *value = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*value = get_llong_from_str((char *)arg);
+	return 0;
+}
+
+static int opt_int_callback(const struct option *opt, const char *arg, int unset)
+{
+	int *value = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	if (strtoi(arg, value))
+		return -1;
+
+	return 0;
+}
+
+static int opt_cpus_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = parse_cpu_set((char *)arg, &params->monitored_cpus);
+	if (retval)
+		fatal("Invalid -c cpu list");
+	params->cpus = (char *)arg;
+
+	return 0;
+}
+
+static int opt_cgroup_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+
+	params->cgroup = 1;
+	params->cgroup_name = (char *)arg;
+	if (params->cgroup_name && params->cgroup_name[0] == '=')
+		/* Allow -C=<cgroup_name> next to -C[ ]<cgroup_name> */
+		++params->cgroup_name;
+
+	return 0;
+}
+
+static int opt_duration_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	params->duration = parse_seconds_duration((char *)arg);
+	if (!params->duration)
+		fatal("Invalid -d duration");
+
+	return 0;
+}
+
+static int opt_event_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct trace_events **events = opt->value;
+	struct trace_events *tevent;
+
+	if (unset || !arg)
+		return -1;
+
+	tevent = trace_event_alloc((char *)arg);
+	if (!tevent)
+		fatal("Error alloc trace event");
+
+	if (*events)
+		tevent->next = *events;
+	*events = tevent;
+
+	return 0;
+}
+
+static int opt_housekeeping_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	params->hk_cpus = 1;
+	retval = parse_cpu_set((char *)arg, &params->hk_cpu_set);
+	if (retval)
+		fatal("Error parsing house keeping CPUs");
+
+	return 0;
+}
+
+static int opt_priority_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = parse_prio((char *)arg, &params->sched_param);
+	if (retval == -1)
+		fatal("Invalid -P priority");
+	params->set_sched = 1;
+
+	return 0;
+}
+
+static int opt_trigger_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	if (!params->events)
+		fatal("--trigger requires a previous -e");
+
+	trace_event_add_trigger(params->events, (char *)arg);
+
+	return 0;
+}
+
+static int opt_filter_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct common_params *params = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	if (!params->events)
+		fatal("--filter requires a previous -e");
+
+	trace_event_add_filter(params->events, (char *)arg);
+
+	return 0;
+}
+
+/*
+ * Macros for command line options specific to osnoise
+ */
+#define OSNOISE_OPT_PERIOD OPT_CALLBACK('p', "period", &params->period, "us", \
+	"osnoise period in us", \
+	opt_osnoise_period_cb)
+
+#define OSNOISE_OPT_RUNTIME OPT_CALLBACK('r', "runtime", &params->runtime, "us", \
+	"osnoise runtime in us", \
+	opt_osnoise_runtime_cb)
+
+#define OSNOISE_OPT_THRESHOLD OPT_CALLBACK('T', "threshold", &params->threshold, "us", \
+	"the minimum delta to be considered a noise", \
+	opt_osnoise_threshold_cb)
+
+/*
+ * Callback functions for command line options for osnoise tools
+ */
+
+static int opt_osnoise_auto_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct osnoise_cb_data *cb_data = opt->value;
+	struct osnoise_params *params = cb_data->params;
+	long long auto_thresh;
+
+	if (unset || !arg)
+		return -1;
+
+	auto_thresh = get_llong_from_str((char *)arg);
+	params->common.stop_us = auto_thresh;
+	params->threshold = 1;
+
+	if (!cb_data->trace_output)
+		cb_data->trace_output = "osnoise_trace.txt";
+
+	return 0;
+}
+
+static int opt_osnoise_period_cb(const struct option *opt, const char *arg, int unset)
+{
+	unsigned long long *period = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*period = get_llong_from_str((char *)arg);
+	if (*period > 10000000)
+		fatal("Period longer than 10 s");
+
+	return 0;
+}
+
+static int opt_osnoise_runtime_cb(const struct option *opt, const char *arg, int unset)
+{
+	unsigned long long *runtime = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*runtime = get_llong_from_str((char *)arg);
+	if (*runtime < 100)
+		fatal("Runtime shorter than 100 us");
+
+	return 0;
+}
+
+static int opt_osnoise_trace_output_cb(const struct option *opt, const char *arg, int unset)
+{
+	const char **trace_output = opt->value;
+
+	if (unset)
+		return -1;
+
+	if (!arg) {
+		*trace_output = "osnoise_trace.txt";
+	} else {
+		*trace_output = (char *)arg;
+		if (*trace_output && (*trace_output)[0] == '=')
+			/* Allow -t=<trace_output> next to -t[ ]<trace_output> */
+			++*trace_output;
+	}
+
+	return 0;
+}
+
+static int opt_osnoise_threshold_cb(const struct option *opt, const char *arg, int unset)
+{
+	long long *threshold = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*threshold = get_llong_from_str((char *)arg);
+
+	return 0;
+}
+
+static int opt_osnoise_on_threshold_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct actions *actions = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = actions_parse(actions, (char *)arg, "osnoise_trace.txt");
+	if (retval)
+		fatal("Invalid action %s", arg);
+
+	return 0;
+}
+
+static int opt_osnoise_on_end_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct actions *actions = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = actions_parse(actions, (char *)arg, "osnoise_trace.txt");
+	if (retval)
+		fatal("Invalid action %s", arg);
+
+	return 0;
+}
+
+/*
+ * Macros for command line options specific to timerlat
+ */
+#define TIMERLAT_OPT_PERIOD OPT_CALLBACK('p', "period", &params->timerlat_period_us, "us", \
+	"timerlat period in us", \
+	opt_timerlat_period_cb)
+
+#define TIMERLAT_OPT_STACK OPT_CALLBACK('s', "stack", &params->print_stack, "us", \
+	"save the stack trace at the IRQ if a thread latency is higher than the argument in us", \
+	opt_llong_callback)
+
+#define TIMERLAT_OPT_NANO OPT_CALLBACK('n', "nano", params, NULL, \
+	"display data in nanoseconds", \
+	opt_nano_cb)
+
+#define TIMERLAT_OPT_DMA_LATENCY OPT_CALLBACK(0, "dma-latency", &params->dma_latency, "us", \
+	"set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency", \
+	opt_dma_latency_cb)
+
+#define TIMERLAT_OPT_DEEPEST_IDLE_STATE OPT_CALLBACK(0, "deepest-idle-state", \
+	&params->deepest_idle_state, "n", \
+	"only go down to idle state n on cpus used by timerlat to reduce exit from idle latency", \
+	opt_int_callback)
+
+#define TIMERLAT_OPT_AA_ONLY OPT_CALLBACK(0, "aa-only", params, "us", \
+	"stop if <us> latency is hit, only printing the auto analysis (reduces CPU usage)", \
+	opt_aa_only_cb)
+
+#define TIMERLAT_OPT_NO_AA OPT_BOOLEAN(0, "no-aa", &params->no_aa, \
+	"disable auto-analysis, reducing rtla timerlat cpu usage")
+
+#define TIMERLAT_OPT_DUMPS_TASKS OPT_BOOLEAN(0, "dump-tasks", &params->dump_tasks, \
+	"prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)")
+
+#define TIMERLAT_OPT_BPF_ACTION OPT_STRING(0, "bpf-action", &params->bpf_action_program, \
+	"program", \
+	"load and execute BPF program when latency threshold is exceeded")
+
+#define TIMERLAT_OPT_STACK_FORMAT OPT_CALLBACK(0, "stack-format", &params->stack_format, "format", \
+	"set the stack format (truncate, skip, full)", \
+	opt_stack_format_cb)
+
+/*
+ * Callback functions for command line options for timerlat tools
+ */
+
+static int opt_timerlat_period_cb(const struct option *opt, const char *arg, int unset)
+{
+	long long *period = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*period = get_llong_from_str((char *)arg);
+	if (*period > 1000000)
+		fatal("Period longer than 1 s");
+
+	return 0;
+}
+
+static int opt_timerlat_auto_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct timerlat_cb_data *cb_data = opt->value;
+	struct timerlat_params *params = cb_data->params;
+	long long auto_thresh;
+
+	if (unset || !arg)
+		return -1;
+
+	auto_thresh = get_llong_from_str((char *)arg);
+	params->common.stop_total_us = auto_thresh;
+	params->common.stop_us = auto_thresh;
+	params->print_stack = auto_thresh;
+
+	if (!cb_data->trace_output)
+		cb_data->trace_output = "timerlat_trace.txt";
+
+	return 0;
+}
+
+static int opt_dma_latency_cb(const struct option *opt, const char *arg, int unset)
+{
+	int *dma_latency = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = strtoi((char *)arg, dma_latency);
+	if (retval)
+		fatal("Invalid -dma-latency %s", arg);
+	if (*dma_latency < 0 || *dma_latency > 10000)
+		fatal("--dma-latency needs to be >= 0 and < 10000");
+
+	return 0;
+}
+
+static int opt_aa_only_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct timerlat_params *params = opt->value;
+	long long auto_thresh;
+
+	if (unset || !arg)
+		return -1;
+
+	auto_thresh = get_llong_from_str((char *)arg);
+	params->common.stop_total_us = auto_thresh;
+	params->common.stop_us = auto_thresh;
+	params->print_stack = auto_thresh;
+	params->common.aa_only = 1;
+
+	return 0;
+}
+
+static int opt_timerlat_trace_output_cb(const struct option *opt, const char *arg, int unset)
+{
+	const char **trace_output = opt->value;
+
+	if (unset)
+		return -1;
+
+	if (!arg) {
+		*trace_output = "timerlat_trace.txt";
+	} else {
+		*trace_output = (char *)arg;
+		if (*trace_output && (*trace_output)[0] == '=')
+			/* Allow -t=<trace_output> next to -t[ ]<trace_output> */
+			++*trace_output;
+	}
+
+	return 0;
+}
+
+static int opt_timerlat_on_threshold_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct actions *actions = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = actions_parse(actions, (char *)arg, "timerlat_trace.txt");
+	if (retval)
+		fatal("Invalid action %s", arg);
+
+	return 0;
+}
+
+static int opt_timerlat_on_end_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct actions *actions = opt->value;
+	int retval;
+
+	if (unset || !arg)
+		return -1;
+
+	retval = actions_parse(actions, (char *)arg, "timerlat_trace.txt");
+	if (retval)
+		fatal("Invalid action %s", arg);
+
+	return 0;
+}
+
+static int opt_user_threads_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct timerlat_params *params = opt->value;
+
+	if (unset)
+		return 0;
+
+	params->common.user_workload = true;
+	params->common.user_data = true;
+
+	return 0;
+}
+
+static int opt_nano_cb(const struct option *opt, const char *arg, int unset)
+{
+	struct timerlat_params *params = opt->value;
+
+	if (unset)
+		return 0;
+
+	params->common.output_divisor = 1;
+
+	return 0;
+}
+
+static int opt_stack_format_cb(const struct option *opt, const char *arg, int unset)
+{
+	int *format = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*format = parse_stack_format((char *)arg);
+
+	if (*format == -1)
+		fatal("Invalid --stack-format option");
+
+	return 0;
+}
+
+/*
+ * Macros for command line options specific to histogram-based tools
+ */
+#define HIST_OPT_BUCKET_SIZE OPT_CALLBACK('b', "bucket-size", \
+	&params->common.hist.bucket_size, "N", \
+	"set the histogram bucket size (default 1)", \
+	opt_bucket_size_cb)
+
+#define HIST_OPT_ENTRIES OPT_CALLBACK('E', "entries", &params->common.hist.entries, "N", \
+	"set the number of entries of the histogram (default 256)", \
+	opt_entries_cb)
+
+#define HIST_OPT_NO_IRQ OPT_BOOLEAN(0, "no-irq", &params->common.hist.no_irq, \
+	"ignore IRQ latencies")
+
+#define HIST_OPT_NO_THREAD OPT_BOOLEAN(0, "no-thread", &params->common.hist.no_thread, \
+	"ignore thread latencies")
+
+#define HIST_OPT_NO_HEADER OPT_BOOLEAN(0, "no-header", &params->common.hist.no_header, \
+	"do not print header")
+
+#define HIST_OPT_NO_SUMMARY OPT_BOOLEAN(0, "no-summary", &params->common.hist.no_summary, \
+	"do not print summary")
+
+#define HIST_OPT_NO_INDEX OPT_BOOLEAN(0, "no-index", &params->common.hist.no_index, \
+	"do not print index")
+
+#define HIST_OPT_WITH_ZEROS OPT_BOOLEAN(0, "with-zeros", &params->common.hist.with_zeros, \
+	"print zero only entries")
+
+/* Histogram-specific callbacks */
+
+static int opt_bucket_size_cb(const struct option *opt, const char *arg, int unset)
+{
+	int *bucket_size = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*bucket_size = get_llong_from_str((char *)arg);
+	if (*bucket_size == 0 || *bucket_size >= 1000000)
+		fatal("Bucket size needs to be > 0 and <= 1000000");
+
+	return 0;
+}
+
+static int opt_entries_cb(const struct option *opt, const char *arg, int unset)
+{
+	int *entries = opt->value;
+
+	if (unset || !arg)
+		return -1;
+
+	*entries = get_llong_from_str((char *)arg);
+	if (*entries < 10 || *entries > 9999999)
+		fatal("Entries must be > 10 and < 9999999");
+
+	return 0;
+}
+
+/*
+ * osnoise_top_parse_args - allocs, parse and fill the cmd line parameters
+ */
+struct common_params *osnoise_top_parse_args(int argc, char **argv)
+{
+	struct osnoise_params *params;
+	struct osnoise_cb_data cb_data;
+	const char * const *usage;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	if (strcmp(argv[0], "hwnoise") == 0) {
+		params->mode = MODE_HWNOISE;
+		/*
+		 * Reduce CPU usage for 75% to avoid killing the system.
+		 */
+		params->runtime = 750000;
+		params->period = 1000000;
+		usage = hwnoise_usage;
+	} else {
+		usage = osnoise_top_usage;
+	}
+
+	const struct option osnoise_top_options[] = {
+	OPT_GROUP("Tracing Options:"),
+		OSNOISE_OPT_PERIOD,
+		OSNOISE_OPT_RUNTIME,
+		RTLA_OPT_STOP('s', "stop", "single sample"),
+		RTLA_OPT_STOP_TOTAL('S', "stop-total", "total sample"),
+		OSNOISE_OPT_THRESHOLD,
+		RTLA_OPT_TRACE_OUTPUT("osnoise", opt_osnoise_trace_output_cb),
+
+	OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+	OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+	OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+
+	OPT_GROUP("Output:"),
+		RTLA_OPT_QUIET,
+
+	OPT_GROUP("System Tuning:"),
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+	OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_osnoise_auto_cb),
+		RTLA_OPT_ON_THRESHOLD("stop-total", opt_osnoise_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_osnoise_on_end_cb),
+
+	OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+	OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	argc = parse_options(argc, (const char **)argv,
+			     osnoise_top_options,
+			     usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("osnoise needs root permission");
+
+	return &params->common;
+}
+
+/*
+ * osnoise_hist_parse_args - allocs, parse and fill the cmd line parameters
+ */
+struct common_params *osnoise_hist_parse_args(int argc, char *argv[])
+{
+	struct osnoise_params *params;
+	struct osnoise_cb_data cb_data;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	const struct option osnoise_hist_options[] = {
+	OPT_GROUP("Tracing Options:"),
+		OSNOISE_OPT_PERIOD,
+		OSNOISE_OPT_RUNTIME,
+		RTLA_OPT_STOP('s', "stop", "single sample"),
+		RTLA_OPT_STOP_TOTAL('S', "stop-total", "total sample"),
+		OSNOISE_OPT_THRESHOLD,
+		RTLA_OPT_TRACE_OUTPUT("osnoise", opt_osnoise_trace_output_cb),
+
+	OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+	OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+	OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+
+	OPT_GROUP("Histogram Options:"),
+		HIST_OPT_BUCKET_SIZE,
+		HIST_OPT_ENTRIES,
+		HIST_OPT_NO_HEADER,
+		HIST_OPT_NO_SUMMARY,
+		HIST_OPT_NO_INDEX,
+		HIST_OPT_WITH_ZEROS,
+
+	OPT_GROUP("System Tuning:"),
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+	OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_osnoise_auto_cb),
+		RTLA_OPT_ON_THRESHOLD("stop-total", opt_osnoise_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_osnoise_on_end_cb),
+
+	OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+	OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	/* display data in microseconds */
+	params->common.output_divisor = 1000;
+	params->common.hist.bucket_size = 1;
+	params->common.hist.entries = 256;
+
+	argc = parse_options(argc, (const char **)argv,
+			     osnoise_hist_options, osnoise_hist_usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("rtla needs root permission");
+
+	if (params->common.hist.no_index && !params->common.hist.with_zeros)
+		fatal("no-index set and with-zeros not set - it does not make sense");
+
+	return &params->common;
+}
+
+struct common_params *timerlat_top_parse_args(int argc, char **argv)
+{
+	struct timerlat_params *params;
+	struct timerlat_cb_data cb_data;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	const struct option timerlat_top_options[] = {
+	OPT_GROUP("Tracing Options:"),
+		TIMERLAT_OPT_PERIOD,
+		RTLA_OPT_STOP('i', "irq", "irq latency"),
+		RTLA_OPT_STOP_TOTAL('T', "thread", "thread latency"),
+		TIMERLAT_OPT_STACK,
+		RTLA_OPT_TRACE_OUTPUT("timerlat", opt_timerlat_trace_output_cb),
+
+	OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+	OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+	OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+		RTLA_OPT_USER_THREADS,
+		RTLA_OPT_KERNEL_THREADS,
+		RTLA_OPT_USER_LOAD,
+
+	OPT_GROUP("Output:"),
+		TIMERLAT_OPT_NANO,
+		RTLA_OPT_QUIET,
+
+	OPT_GROUP("System Tuning:"),
+		TIMERLAT_OPT_DMA_LATENCY,
+		TIMERLAT_OPT_DEEPEST_IDLE_STATE,
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+	OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_timerlat_auto_cb),
+		TIMERLAT_OPT_AA_ONLY,
+		TIMERLAT_OPT_NO_AA,
+		TIMERLAT_OPT_DUMPS_TASKS,
+		RTLA_OPT_ON_THRESHOLD("latency", opt_timerlat_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_timerlat_on_end_cb),
+		TIMERLAT_OPT_BPF_ACTION,
+		TIMERLAT_OPT_STACK_FORMAT,
+
+	OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+	OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	/* disabled by default */
+	params->dma_latency = -1;
+	params->deepest_idle_state = -2;
+
+	/* display data in microseconds */
+	params->common.output_divisor = 1000;
+
+	/* default to BPF mode */
+	params->mode = TRACING_MODE_BPF;
+
+	/* default to truncate stack format */
+	params->stack_format = STACK_FORMAT_TRUNCATE;
+
+	argc = parse_options(argc, (const char **)argv,
+			     timerlat_top_options, timerlat_top_usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("rtla needs root permission");
+
+	/*
+	 * Auto analysis only happens if stop tracing, thus:
+	 */
+	if (!params->common.stop_us && !params->common.stop_total_us)
+		params->no_aa = 1;
+
+	if (params->no_aa && params->common.aa_only)
+		fatal("--no-aa and --aa-only are mutually exclusive!");
+
+	if (params->common.kernel_workload && params->common.user_workload)
+		fatal("--kernel-threads and --user-threads are mutually exclusive!");
+
+	/*
+	 * If auto-analysis or trace output is enabled, switch from BPF mode to
+	 * mixed mode
+	 */
+	if (params->mode == TRACING_MODE_BPF &&
+		(params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
+		params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
+		!params->no_aa))
+		params->mode = TRACING_MODE_MIXED;
+
+	return &params->common;
+}
+
+struct common_params *timerlat_hist_parse_args(int argc, char **argv)
+{
+	struct timerlat_params *params;
+	struct timerlat_cb_data cb_data;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	const struct option timerlat_hist_options[] = {
+	OPT_GROUP("Tracing Options:"),
+		TIMERLAT_OPT_PERIOD,
+		RTLA_OPT_STOP('i', "irq", "irq latency"),
+		RTLA_OPT_STOP_TOTAL('T', "thread", "thread latency"),
+		TIMERLAT_OPT_STACK,
+		RTLA_OPT_TRACE_OUTPUT("timerlat", opt_timerlat_trace_output_cb),
+
+	OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+	OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+	OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+		RTLA_OPT_USER_THREADS,
+		RTLA_OPT_KERNEL_THREADS,
+		RTLA_OPT_USER_LOAD,
+
+	OPT_GROUP("Histogram Options:"),
+		HIST_OPT_BUCKET_SIZE,
+		HIST_OPT_ENTRIES,
+		HIST_OPT_NO_IRQ,
+		HIST_OPT_NO_THREAD,
+		HIST_OPT_NO_HEADER,
+		HIST_OPT_NO_SUMMARY,
+		HIST_OPT_NO_INDEX,
+		HIST_OPT_WITH_ZEROS,
+
+	OPT_GROUP("Output:"),
+		TIMERLAT_OPT_NANO,
+
+	OPT_GROUP("System Tuning:"),
+		TIMERLAT_OPT_DMA_LATENCY,
+		TIMERLAT_OPT_DEEPEST_IDLE_STATE,
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+	OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_timerlat_auto_cb),
+		TIMERLAT_OPT_NO_AA,
+		TIMERLAT_OPT_DUMPS_TASKS,
+		RTLA_OPT_ON_THRESHOLD("latency", opt_timerlat_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_timerlat_on_end_cb),
+		TIMERLAT_OPT_BPF_ACTION,
+		TIMERLAT_OPT_STACK_FORMAT,
+
+	OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+		OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	/* disabled by default */
+	params->dma_latency = -1;
+
+	/* disabled by default */
+	params->deepest_idle_state = -2;
+
+	/* display data in microseconds */
+	params->common.output_divisor = 1000;
+	params->common.hist.bucket_size = 1;
+	params->common.hist.entries = 256;
+
+	/* default to BPF mode */
+	params->mode = TRACING_MODE_BPF;
+
+	/* default to truncate stack format */
+	params->stack_format = STACK_FORMAT_TRUNCATE;
+
+	argc = parse_options(argc, (const char **)argv,
+			     timerlat_hist_options, timerlat_hist_usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("rtla needs root permission");
+
+	if (params->common.hist.no_irq && params->common.hist.no_thread)
+		fatal("no-irq and no-thread set, there is nothing to do here");
+
+	if (params->common.hist.no_index && !params->common.hist.with_zeros)
+		fatal("no-index set with with-zeros is not set - it does not make sense");
+
+	/*
+	 * Auto analysis only happens if stop tracing, thus:
+	 */
+	if (!params->common.stop_us && !params->common.stop_total_us)
+		params->no_aa = 1;
+
+	if (params->common.kernel_workload && params->common.user_workload)
+		fatal("--kernel-threads and --user-threads are mutually exclusive!");
+
+	/*
+	 * If auto-analysis or trace output is enabled, switch from BPF mode to
+	 * mixed mode
+	 */
+	if (params->mode == TRACING_MODE_BPF &&
+		(params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
+		params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
+		!params->no_aa))
+		params->mode = TRACING_MODE_MIXED;
+
+	return &params->common;
+}
+
+/*
+ * rtla_usage - print rtla usage
+ */
+static void rtla_usage(int err)
+{
+	int i;
+
+	static const char *msg[] = {
+		"",
+		"rtla version " VERSION,
+		"",
+		"  usage: rtla COMMAND ...",
+		"",
+		"  commands:",
+		"     osnoise  - gives information about the operating system noise (osnoise)",
+		"     hwnoise  - gives information about hardware-related noise",
+		"     timerlat - measures the timer irq and thread latency",
+		"",
+		NULL,
+	};
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(err);
+}
+
+/*
+ * run_tool_command - try to run a rtla tool command
+ *
+ * It returns 0 if it fails. The tool's main will generally not
+ * return as they should call exit().
+ */
+int run_tool_command(int argc, char **argv, int start_position)
+{
+	if (strcmp(argv[start_position], "osnoise") == 0) {
+		osnoise_main(argc-start_position, &argv[start_position]);
+		goto ran;
+	} else if (strcmp(argv[start_position], "hwnoise") == 0) {
+		hwnoise_main(argc-start_position, &argv[start_position]);
+		goto ran;
+	} else if (strcmp(argv[start_position], "timerlat") == 0) {
+		timerlat_main(argc-start_position, &argv[start_position]);
+		goto ran;
+	}
+
+	return 0;
+ran:
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	int retval;
+
+	/* is it an alias? */
+	retval = run_tool_command(argc, argv, 0);
+	if (retval)
+		exit(0);
+
+	if (argc < 2)
+		goto usage;
+
+	if (strcmp(argv[1], "-h") == 0)
+		rtla_usage(0);
+	else if (strcmp(argv[1], "--help") == 0)
+		rtla_usage(0);
+
+	retval = run_tool_command(argc, argv, 1);
+	if (retval)
+		exit(0);
+
+usage:
+	rtla_usage(1);
+	exit(1);
+}
diff --git a/tools/tracing/rtla/src/cli.h b/tools/tracing/rtla/src/cli.h
new file mode 100644
index 000000000000..c49ccb3e92f5
--- /dev/null
+++ b/tools/tracing/rtla/src/cli.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma once
+
+struct common_params *osnoise_top_parse_args(int argc, char **argv);
+struct common_params *osnoise_hist_parse_args(int argc, char **argv);
+struct common_params *timerlat_top_parse_args(int argc, char **argv);
+struct common_params *timerlat_hist_parse_args(int argc, char **argv);
diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 35e3d3aa922e..7403dcc8f6c1 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -5,7 +5,6 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
-#include <getopt.h>
 #include <sys/sysinfo.h>
 
 #include "common.h"
@@ -53,114 +52,6 @@ static void unset_signals(struct common_params *params)
 	}
 }
 
-/*
- * getopt_auto - auto-generates optstring from long_options
- */
-int getopt_auto(int argc, char **argv, const struct option *long_opts)
-{
-	char opts[256];
-	int n = 0;
-
-	for (int i = 0; long_opts[i].name; i++) {
-		if (long_opts[i].val < 32 || long_opts[i].val > 127)
-			continue;
-
-		if (n + 4 >= sizeof(opts))
-			fatal("optstring buffer overflow");
-
-		opts[n++] = long_opts[i].val;
-
-		if (long_opts[i].has_arg == required_argument)
-			opts[n++] = ':';
-		else if (long_opts[i].has_arg == optional_argument) {
-			opts[n++] = ':';
-			opts[n++] = ':';
-		}
-	}
-
-	opts[n] = '\0';
-
-	return getopt_long(argc, argv, opts, long_opts, NULL);
-}
-
-/*
- * common_parse_options - parse common command line options
- *
- * @argc: argument count
- * @argv: argument vector
- * @common: common parameters structure
- *
- * Parse command line options that are common to all rtla tools.
- *
- * Returns: non zero if a common option was parsed, or 0
- * if the option should be handled by tool-specific parsing.
- */
-int common_parse_options(int argc, char **argv, struct common_params *common)
-{
-	struct trace_events *tevent;
-	int saved_state = optind;
-	int c;
-
-	static struct option long_options[] = {
-		{"cpus",                required_argument,      0, 'c'},
-		{"cgroup",              optional_argument,      0, 'C'},
-		{"debug",               no_argument,            0, 'D'},
-		{"duration",            required_argument,      0, 'd'},
-		{"event",               required_argument,      0, 'e'},
-		{"house-keeping",       required_argument,      0, 'H'},
-		{"priority",            required_argument,      0, 'P'},
-		{0, 0, 0, 0}
-	};
-
-	opterr = 0;
-	c = getopt_auto(argc, argv, long_options);
-	opterr = 1;
-
-	switch (c) {
-	case 'c':
-		if (parse_cpu_set(optarg, &common->monitored_cpus))
-			fatal("Invalid -c cpu list");
-		common->cpus = optarg;
-		break;
-	case 'C':
-		common->cgroup = 1;
-		common->cgroup_name = parse_optional_arg(argc, argv);
-		break;
-	case 'D':
-		config_debug = 1;
-		break;
-	case 'd':
-		common->duration = parse_seconds_duration(optarg);
-		if (!common->duration)
-			fatal("Invalid -d duration");
-		break;
-	case 'e':
-		tevent = trace_event_alloc(optarg);
-		if (!tevent)
-			fatal("Error alloc trace event");
-
-		if (common->events)
-			tevent->next = common->events;
-		common->events = tevent;
-		break;
-	case 'H':
-		common->hk_cpus = 1;
-		if (parse_cpu_set(optarg, &common->hk_cpu_set))
-			fatal("Error parsing house keeping CPUs");
-		break;
-	case 'P':
-		if (parse_prio(optarg, &common->sched_param) == -1)
-			fatal("Invalid -P priority");
-		common->set_sched = 1;
-		break;
-	default:
-		optind = saved_state;
-		return 0;
-	}
-
-	return c;
-}
-
 /*
  * common_apply_config - apply common configs to the initialized tool
  */
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 51665db4ffce..27439b10ffd5 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #pragma once
 
-#include <getopt.h>
 #include "actions.h"
 #include "timerlat_u.h"
 #include "trace.h"
@@ -58,12 +57,12 @@ extern struct trace_instance *trace_inst;
 extern volatile int stop_tracing;
 
 struct hist_params {
-	char			no_irq;
-	char			no_thread;
-	char			no_header;
-	char			no_summary;
-	char			no_index;
-	char			with_zeros;
+	bool			no_irq;
+	bool			no_thread;
+	bool			no_header;
+	bool			no_summary;
+	bool			no_index;
+	bool			with_zeros;
 	int			bucket_size;
 	int			entries;
 };
@@ -96,12 +95,12 @@ struct common_params {
 	/* Other parameters */
 	struct hist_params	hist;
 	int			output_divisor;
-	int			pretty_output;
-	int			quiet;
-	int			user_workload;
-	int			kernel_workload;
-	int			user_data;
-	int			aa_only;
+	bool			pretty_output;
+	bool			quiet;
+	bool			user_workload;
+	bool			kernel_workload;
+	bool			user_data;
+	bool			aa_only;
 
 	struct actions		threshold_actions;
 	struct actions		end_actions;
@@ -177,7 +176,6 @@ int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us);
 int osnoise_set_stop_total_us(struct osnoise_context *context,
 			      long long stop_total_us);
 
-int getopt_auto(int argc, char **argv, const struct option *long_opts);
 int common_parse_options(int argc, char **argv, struct common_params *common);
 int common_apply_config(struct osnoise_tool *tool, struct common_params *params);
 int top_main_loop(struct osnoise_tool *tool);
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 8ad816b80265..dfa91d0681f8 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -4,7 +4,6 @@
  */
 
 #define _GNU_SOURCE
-#include <getopt.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
@@ -13,6 +12,7 @@
 #include <time.h>
 
 #include "osnoise.h"
+#include "cli.h"
 
 struct osnoise_hist_cpu {
 	int			*samples;
@@ -400,225 +400,6 @@ osnoise_print_stats(struct osnoise_tool *tool)
 	osnoise_report_missed_events(tool);
 }
 
-/*
- * osnoise_hist_usage - prints osnoise hist usage message
- */
-static void osnoise_hist_usage(void)
-{
-	static const char * const msg_start[] = {
-		"[-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
-		"	  [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
-		"	  [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\",
-		"	  [--no-index] [--with-zeros] [-C [cgroup_name]] [--warm-up]",
-		NULL,
-	};
-
-	static const char * const msg_opts[] = {
-		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit",
-		"	  -p/--period us: osnoise period in us",
-		"	  -r/--runtime us: osnoise runtime in us",
-		"	  -s/--stop us: stop trace if a single sample is higher than the argument in us",
-		"	  -S/--stop-total us: stop trace if the total sample is higher than the argument in us",
-		"	  -T/--threshold us: the minimum delta to be considered a noise",
-		"	  -c/--cpus cpu-list: list of cpus to run osnoise threads",
-		"	  -H/--house-keeping cpus: run rtla control threads only on the given cpus",
-		"	  -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
-		"	  -d/--duration time[s|m|h|d]: duration of the session",
-		"	  -D/--debug: print debug info",
-		"	  -t/--trace [file]: save the stopped trace to [file|osnoise_trace.txt]",
-		"	  -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
-		"	     --filter <filter>: enable a trace event filter to the previous -e event",
-		"	     --trigger <trigger>: enable a trace event trigger to the previous -e event",
-		"	  -b/--bucket-size N: set the histogram bucket size (default 1)",
-		"	  -E/--entries N: set the number of entries of the histogram (default 256)",
-		"	     --no-header: do not print header",
-		"	     --no-summary: do not print summary",
-		"	     --no-index: do not print index",
-		"	     --with-zeros: print zero only entries",
-		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period: set scheduling parameters",
-		"		o:prio - use SCHED_OTHER with prio",
-		"		r:prio - use SCHED_RR with prio",
-		"		f:prio - use SCHED_FIFO with prio",
-		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
-		"						       in nanoseconds",
-		"	     --warm-up: let the workload run for s seconds before collecting data",
-		"	     --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
-		"	     --on-threshold <action>: define action to be executed at stop-total threshold, multiple are allowed",
-		"	     --on-end <action>: define action to be executed at measurement end, multiple are allowed",
-		NULL,
-	};
-
-	common_usage("osnoise", "hist", "a per-cpu histogram of the OS noise",
-		     msg_start, msg_opts);
-}
-
-/*
- * osnoise_hist_parse_args - allocs, parse and fill the cmd line parameters
- */
-static struct common_params
-*osnoise_hist_parse_args(int argc, char *argv[])
-{
-	struct osnoise_params *params;
-	int retval;
-	int c;
-	char *trace_output = NULL;
-
-	params = calloc_fatal(1, sizeof(*params));
-
-	actions_init(&params->common.threshold_actions);
-	actions_init(&params->common.end_actions);
-
-	/* display data in microseconds */
-	params->common.output_divisor = 1000;
-	params->common.hist.bucket_size = 1;
-	params->common.hist.entries = 256;
-
-	while (1) {
-		static struct option long_options[] = {
-			{"auto",		required_argument,	0, 'a'},
-			{"bucket-size",		required_argument,	0, 'b'},
-			{"entries",		required_argument,	0, 'E'},
-			{"help",		no_argument,		0, 'h'},
-			{"period",		required_argument,	0, 'p'},
-			{"runtime",		required_argument,	0, 'r'},
-			{"stop",		required_argument,	0, 's'},
-			{"stop-total",		required_argument,	0, 'S'},
-			{"trace",		optional_argument,	0, 't'},
-			{"threshold",		required_argument,	0, 'T'},
-			{"no-header",		no_argument,		0, '0'},
-			{"no-summary",		no_argument,		0, '1'},
-			{"no-index",		no_argument,		0, '2'},
-			{"with-zeros",		no_argument,		0, '3'},
-			{"trigger",		required_argument,	0, '4'},
-			{"filter",		required_argument,	0, '5'},
-			{"warm-up",		required_argument,	0, '6'},
-			{"trace-buffer-size",	required_argument,	0, '7'},
-			{"on-threshold",	required_argument,	0, '8'},
-			{"on-end",		required_argument,	0, '9'},
-			{0, 0, 0, 0}
-		};
-
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
-		c = getopt_auto(argc, argv, long_options);
-
-		/* detect the end of the options. */
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a':
-			/* set sample stop to auto_thresh */
-			params->common.stop_us = get_llong_from_str(optarg);
-
-			/* set sample threshold to 1 */
-			params->threshold = 1;
-
-			/* set trace */
-			if (!trace_output)
-				trace_output = "osnoise_trace.txt";
-
-			break;
-		case 'b':
-			params->common.hist.bucket_size = get_llong_from_str(optarg);
-			if (params->common.hist.bucket_size == 0 ||
-			    params->common.hist.bucket_size >= 1000000)
-				fatal("Bucket size needs to be > 0 and <= 1000000");
-			break;
-		case 'E':
-			params->common.hist.entries = get_llong_from_str(optarg);
-			if (params->common.hist.entries < 10 ||
-			    params->common.hist.entries > 9999999)
-				fatal("Entries must be > 10 and < 9999999");
-			break;
-		case 'h':
-		case '?':
-			osnoise_hist_usage();
-			break;
-		case 'p':
-			params->period = get_llong_from_str(optarg);
-			if (params->period > 10000000)
-				fatal("Period longer than 10 s");
-			break;
-		case 'r':
-			params->runtime = get_llong_from_str(optarg);
-			if (params->runtime < 100)
-				fatal("Runtime shorter than 100 us");
-			break;
-		case 's':
-			params->common.stop_us = get_llong_from_str(optarg);
-			break;
-		case 'S':
-			params->common.stop_total_us = get_llong_from_str(optarg);
-			break;
-		case 'T':
-			params->threshold = get_llong_from_str(optarg);
-			break;
-		case 't':
-			trace_output = parse_optional_arg(argc, argv);
-			if (!trace_output)
-				trace_output = "osnoise_trace.txt";
-			break;
-		case '0': /* no header */
-			params->common.hist.no_header = 1;
-			break;
-		case '1': /* no summary */
-			params->common.hist.no_summary = 1;
-			break;
-		case '2': /* no index */
-			params->common.hist.no_index = 1;
-			break;
-		case '3': /* with zeros */
-			params->common.hist.with_zeros = 1;
-			break;
-		case '4': /* trigger */
-			if (params->common.events)
-				trace_event_add_trigger(params->common.events, optarg);
-			else
-				fatal("--trigger requires a previous -e");
-			break;
-		case '5': /* filter */
-			if (params->common.events)
-				trace_event_add_filter(params->common.events, optarg);
-			else
-				fatal("--filter requires a previous -e");
-			break;
-		case '6':
-			params->common.warmup = get_llong_from_str(optarg);
-			break;
-		case '7':
-			params->common.buffer_size = get_llong_from_str(optarg);
-			break;
-		case '8':
-			retval = actions_parse(&params->common.threshold_actions, optarg,
-					       "osnoise_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		case '9':
-			retval = actions_parse(&params->common.end_actions, optarg,
-					       "osnoise_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		default:
-			fatal("Invalid option");
-		}
-	}
-
-	if (trace_output)
-		actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
-	if (geteuid())
-		fatal("rtla needs root permission");
-
-	if (params->common.hist.no_index && !params->common.hist.with_zeros)
-		fatal("no-index set and with-zeros not set - it does not make sense");
-
-	return &params->common;
-}
-
 /*
  * osnoise_hist_apply_config - apply the hist configs to the initialized tool
  */
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 244bdce022ad..512a6299cb01 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -4,7 +4,6 @@
  */
 
 #define _GNU_SOURCE
-#include <getopt.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
@@ -13,6 +12,7 @@
 #include <time.h>
 
 #include "osnoise.h"
+#include "cli.h"
 
 struct osnoise_top_cpu {
 	unsigned long long	sum_runtime;
@@ -245,204 +245,6 @@ osnoise_print_stats(struct osnoise_tool *top)
 	osnoise_report_missed_events(top);
 }
 
-/*
- * osnoise_top_usage - prints osnoise top usage message
- */
-static void osnoise_top_usage(struct osnoise_params *params)
-{
-	const char *tool, *mode, *desc;
-
-	static const char * const msg_start[] = {
-		"[-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
-		"	  [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
-		"	  [-c cpu-list] [-H cpu-list] [-P priority] [-C [cgroup_name]] [--warm-up s]",
-		NULL,
-	};
-
-	static const char * const msg_opts[] = {
-		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit",
-		"	  -p/--period us: osnoise period in us",
-		"	  -r/--runtime us: osnoise runtime in us",
-		"	  -s/--stop us: stop trace if a single sample is higher than the argument in us",
-		"	  -S/--stop-total us: stop trace if the total sample is higher than the argument in us",
-		"	  -T/--threshold us: the minimum delta to be considered a noise",
-		"	  -c/--cpus cpu-list: list of cpus to run osnoise threads",
-		"	  -H/--house-keeping cpus: run rtla control threads only on the given cpus",
-		"	  -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
-		"	  -d/--duration time[s|m|h|d]: duration of the session",
-		"	  -D/--debug: print debug info",
-		"	  -t/--trace [file]: save the stopped trace to [file|osnoise_trace.txt]",
-		"	  -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
-		"	     --filter <filter>: enable a trace event filter to the previous -e event",
-		"	     --trigger <trigger>: enable a trace event trigger to the previous -e event",
-		"	  -q/--quiet print only a summary at the end",
-		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
-		"		o:prio - use SCHED_OTHER with prio",
-		"		r:prio - use SCHED_RR with prio",
-		"		f:prio - use SCHED_FIFO with prio",
-		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
-		"						       in nanoseconds",
-		"	     --warm-up s: let the workload run for s seconds before collecting data",
-		"	     --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
-		"	     --on-threshold <action>: define action to be executed at stop-total threshold, multiple are allowed",
-		"	     --on-end: define action to be executed at measurement end, multiple are allowed",
-		NULL,
-	};
-
-	if (params->mode == MODE_OSNOISE) {
-		tool = "osnoise";
-		mode = "top";
-		desc = "a per-cpu summary of the OS noise";
-	} else {
-		tool = "hwnoise";
-		mode = "";
-		desc = "a summary of hardware-related noise";
-	}
-
-	common_usage(tool, mode, desc, msg_start, msg_opts);
-}
-
-/*
- * osnoise_top_parse_args - allocs, parse and fill the cmd line parameters
- */
-struct common_params *osnoise_top_parse_args(int argc, char **argv)
-{
-	struct osnoise_params *params;
-	int retval;
-	int c;
-	char *trace_output = NULL;
-
-	params = calloc_fatal(1, sizeof(*params));
-
-	actions_init(&params->common.threshold_actions);
-	actions_init(&params->common.end_actions);
-
-	if (strcmp(argv[0], "hwnoise") == 0) {
-		params->mode = MODE_HWNOISE;
-		/*
-		 * Reduce CPU usage for 75% to avoid killing the system.
-		 */
-		params->runtime = 750000;
-		params->period = 1000000;
-	}
-
-	while (1) {
-		static struct option long_options[] = {
-			{"auto",		required_argument,	0, 'a'},
-			{"help",		no_argument,		0, 'h'},
-			{"period",		required_argument,	0, 'p'},
-			{"quiet",		no_argument,		0, 'q'},
-			{"runtime",		required_argument,	0, 'r'},
-			{"stop",		required_argument,	0, 's'},
-			{"stop-total",		required_argument,	0, 'S'},
-			{"threshold",		required_argument,	0, 'T'},
-			{"trace",		optional_argument,	0, 't'},
-			{"trigger",		required_argument,	0, '0'},
-			{"filter",		required_argument,	0, '1'},
-			{"warm-up",		required_argument,	0, '2'},
-			{"trace-buffer-size",	required_argument,	0, '3'},
-			{"on-threshold",	required_argument,	0, '4'},
-			{"on-end",		required_argument,	0, '5'},
-			{0, 0, 0, 0}
-		};
-
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
-		c = getopt_auto(argc, argv, long_options);
-
-		/* Detect the end of the options. */
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a':
-			/* set sample stop to auto_thresh */
-			params->common.stop_us = get_llong_from_str(optarg);
-
-			/* set sample threshold to 1 */
-			params->threshold = 1;
-
-			/* set trace */
-			if (!trace_output)
-				trace_output = "osnoise_trace.txt";
-
-			break;
-		case 'h':
-		case '?':
-			osnoise_top_usage(params);
-			break;
-		case 'p':
-			params->period = get_llong_from_str(optarg);
-			if (params->period > 10000000)
-				fatal("Period longer than 10 s");
-			break;
-		case 'q':
-			params->common.quiet = 1;
-			break;
-		case 'r':
-			params->runtime = get_llong_from_str(optarg);
-			if (params->runtime < 100)
-				fatal("Runtime shorter than 100 us");
-			break;
-		case 's':
-			params->common.stop_us = get_llong_from_str(optarg);
-			break;
-		case 'S':
-			params->common.stop_total_us = get_llong_from_str(optarg);
-			break;
-		case 't':
-			trace_output = parse_optional_arg(argc, argv);
-			if (!trace_output)
-				trace_output = "osnoise_trace.txt";
-			break;
-		case 'T':
-			params->threshold = get_llong_from_str(optarg);
-			break;
-		case '0': /* trigger */
-			if (params->common.events)
-				trace_event_add_trigger(params->common.events, optarg);
-			else
-				fatal("--trigger requires a previous -e");
-			break;
-		case '1': /* filter */
-			if (params->common.events)
-				trace_event_add_filter(params->common.events, optarg);
-			else
-				fatal("--filter requires a previous -e");
-			break;
-		case '2':
-			params->common.warmup = get_llong_from_str(optarg);
-			break;
-		case '3':
-			params->common.buffer_size = get_llong_from_str(optarg);
-			break;
-		case '4':
-			retval = actions_parse(&params->common.threshold_actions, optarg,
-					       "osnoise_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		case '5':
-			retval = actions_parse(&params->common.end_actions, optarg,
-					       "osnoise_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		default:
-			fatal("Invalid option");
-		}
-	}
-
-	if (trace_output)
-		actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
-	if (geteuid())
-		fatal("osnoise needs root permission");
-
-	return &params->common;
-}
-
 /*
  * osnoise_top_apply_config - apply the top configs to the initialized tool
  */
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
deleted file mode 100644
index 845932f902ef..000000000000
--- a/tools/tracing/rtla/src/rtla.c
+++ /dev/null
@@ -1,89 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
- */
-
-#include <getopt.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "osnoise.h"
-#include "timerlat.h"
-
-/*
- * rtla_usage - print rtla usage
- */
-static void rtla_usage(int err)
-{
-	int i;
-
-	static const char *msg[] = {
-		"",
-		"rtla version " VERSION,
-		"",
-		"  usage: rtla COMMAND ...",
-		"",
-		"  commands:",
-		"     osnoise  - gives information about the operating system noise (osnoise)",
-		"     hwnoise  - gives information about hardware-related noise",
-		"     timerlat - measures the timer irq and thread latency",
-		"",
-		NULL,
-	};
-
-	for (i = 0; msg[i]; i++)
-		fprintf(stderr, "%s\n", msg[i]);
-	exit(err);
-}
-
-/*
- * run_tool_command - try to run a rtla tool command
- *
- * It returns 0 if it fails. The tool's main will generally not
- * return as they should call exit().
- */
-int run_tool_command(int argc, char **argv, int start_position)
-{
-	if (strcmp(argv[start_position], "osnoise") == 0) {
-		osnoise_main(argc-start_position, &argv[start_position]);
-		goto ran;
-	} else if (strcmp(argv[start_position], "hwnoise") == 0) {
-		hwnoise_main(argc-start_position, &argv[start_position]);
-		goto ran;
-	} else if (strcmp(argv[start_position], "timerlat") == 0) {
-		timerlat_main(argc-start_position, &argv[start_position]);
-		goto ran;
-	}
-
-	return 0;
-ran:
-	return 1;
-}
-
-int main(int argc, char *argv[])
-{
-	int retval;
-
-	/* is it an alias? */
-	retval = run_tool_command(argc, argv, 0);
-	if (retval)
-		exit(0);
-
-	if (argc < 2)
-		goto usage;
-
-	if (strcmp(argv[1], "-h") == 0) {
-		rtla_usage(0);
-	} else if (strcmp(argv[1], "--help") == 0) {
-		rtla_usage(0);
-	}
-
-	retval = run_tool_command(argc, argv, 1);
-	if (retval)
-		exit(0);
-
-usage:
-	rtla_usage(1);
-	exit(1);
-}
diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h
index 364203a29abd..37a808f1611e 100644
--- a/tools/tracing/rtla/src/timerlat.h
+++ b/tools/tracing/rtla/src/timerlat.h
@@ -23,8 +23,8 @@ struct timerlat_params {
 	long long		timerlat_period_us;
 	long long		print_stack;
 	int			dma_latency;
-	int			no_aa;
-	int			dump_tasks;
+	bool			no_aa;
+	bool			dump_tasks;
 	int			deepest_idle_state;
 	enum timerlat_tracing_mode mode;
 	const char		*bpf_action_program;
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 79142af4f566..df7b1398a966 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -4,7 +4,6 @@
  */
 
 #define _GNU_SOURCE
-#include <getopt.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
@@ -17,6 +16,7 @@
 #include "timerlat.h"
 #include "timerlat_aa.h"
 #include "timerlat_bpf.h"
+#include "cli.h"
 #include "common.h"
 
 struct timerlat_hist_cpu {
@@ -685,321 +685,6 @@ timerlat_print_stats(struct osnoise_tool *tool)
 	osnoise_report_missed_events(tool);
 }
 
-/*
- * timerlat_hist_usage - prints timerlat top usage message
- */
-static void timerlat_hist_usage(void)
-{
-	static const char * const msg_start[] = {
-		"[-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\",
-		"         [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\",
-		"	  [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\",
-		"	  [--no-index] [--with-zeros] [--dma-latency us] [-C [cgroup_name]] [--no-aa] [--dump-task] [-u|-k]",
-		"	  [--warm-up s] [--deepest-idle-state n]",
-		NULL,
-	};
-
-	static const char * const msg_opts[] = {
-		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
-		"	  -p/--period us: timerlat period in us",
-		"	  -i/--irq us: stop trace if the irq latency is higher than the argument in us",
-		"	  -T/--thread us: stop trace if the thread latency is higher than the argument in us",
-		"	  -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us",
-		"	  -c/--cpus cpus: run the tracer only on the given cpus",
-		"	  -H/--house-keeping cpus: run rtla control threads only on the given cpus",
-		"	  -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
-		"	  -d/--duration time[m|h|d]: duration of the session in seconds",
-		"	     --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)",
-		"	  -D/--debug: print debug info",
-		"	  -t/--trace [file]: save the stopped trace to [file|timerlat_trace.txt]",
-		"	  -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
-		"	     --filter <filter>: enable a trace event filter to the previous -e event",
-		"	     --trigger <trigger>: enable a trace event trigger to the previous -e event",
-		"	  -n/--nano: display data in nanoseconds",
-		"	     --no-aa: disable auto-analysis, reducing rtla timerlat cpu usage",
-		"	  -b/--bucket-size N: set the histogram bucket size (default 1)",
-		"	  -E/--entries N: set the number of entries of the histogram (default 256)",
-		"	     --no-irq: ignore IRQ latencies",
-		"	     --no-thread: ignore thread latencies",
-		"	     --no-header: do not print header",
-		"	     --no-summary: do not print summary",
-		"	     --no-index: do not print index",
-		"	     --with-zeros: print zero only entries",
-		"	     --dma-latency us: set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency",
-		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
-		"		o:prio - use SCHED_OTHER with prio",
-		"		r:prio - use SCHED_RR with prio",
-		"		f:prio - use SCHED_FIFO with prio",
-		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
-		"						       in nanoseconds",
-		"	  -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads",
-		"	  -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads",
-		"	  -U/--user-load: enable timerlat for user-defined user-space workload",
-		"	     --warm-up s: let the workload run for s seconds before collecting data",
-		"	     --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
-		"	     --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
-		"	     --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed",
-		"	     --on-end <action>: define action to be executed at measurement end, multiple are allowed",
-		"	     --bpf-action <program>: load and execute BPF program when latency threshold is exceeded",
-		"	     --stack-format <format>: set the stack format (truncate, skip, full)",
-		NULL,
-	};
-
-	common_usage("timerlat", "hist", "a per-cpu histogram of the timer latency",
-		     msg_start, msg_opts);
-}
-
-/*
- * timerlat_hist_parse_args - allocs, parse and fill the cmd line parameters
- */
-static struct common_params
-*timerlat_hist_parse_args(int argc, char *argv[])
-{
-	struct timerlat_params *params;
-	int auto_thresh;
-	int retval;
-	int c;
-	char *trace_output = NULL;
-
-	params = calloc_fatal(1, sizeof(*params));
-
-	actions_init(&params->common.threshold_actions);
-	actions_init(&params->common.end_actions);
-
-	/* disabled by default */
-	params->dma_latency = -1;
-
-	/* disabled by default */
-	params->deepest_idle_state = -2;
-
-	/* display data in microseconds */
-	params->common.output_divisor = 1000;
-	params->common.hist.bucket_size = 1;
-	params->common.hist.entries = 256;
-
-	/* default to BPF mode */
-	params->mode = TRACING_MODE_BPF;
-
-	/* default to truncate stack format */
-	params->stack_format = STACK_FORMAT_TRUNCATE;
-
-	while (1) {
-		static struct option long_options[] = {
-			{"auto",		required_argument,	0, 'a'},
-			{"bucket-size",		required_argument,	0, 'b'},
-			{"entries",		required_argument,	0, 'E'},
-			{"help",		no_argument,		0, 'h'},
-			{"irq",			required_argument,	0, 'i'},
-			{"nano",		no_argument,		0, 'n'},
-			{"period",		required_argument,	0, 'p'},
-			{"stack",		required_argument,	0, 's'},
-			{"thread",		required_argument,	0, 'T'},
-			{"trace",		optional_argument,	0, 't'},
-			{"user-threads",	no_argument,		0, 'u'},
-			{"kernel-threads",	no_argument,		0, 'k'},
-			{"user-load",		no_argument,		0, 'U'},
-			{"no-irq",		no_argument,		0, '0'},
-			{"no-thread",		no_argument,		0, '1'},
-			{"no-header",		no_argument,		0, '2'},
-			{"no-summary",		no_argument,		0, '3'},
-			{"no-index",		no_argument,		0, '4'},
-			{"with-zeros",		no_argument,		0, '5'},
-			{"trigger",		required_argument,	0, '6'},
-			{"filter",		required_argument,	0, '7'},
-			{"dma-latency",		required_argument,	0, '8'},
-			{"no-aa",		no_argument,		0, '9'},
-			{"dump-task",		no_argument,		0, '\1'},
-			{"warm-up",		required_argument,	0, '\2'},
-			{"trace-buffer-size",	required_argument,	0, '\3'},
-			{"deepest-idle-state",	required_argument,	0, '\4'},
-			{"on-threshold",	required_argument,	0, '\5'},
-			{"on-end",		required_argument,	0, '\6'},
-			{"bpf-action",		required_argument,	0, '\7'},
-			{"stack-format",	required_argument,	0, '\10'},
-			{0, 0, 0, 0}
-		};
-
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
-		c = getopt_auto(argc, argv, long_options);
-
-		/* detect the end of the options. */
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a':
-			auto_thresh = get_llong_from_str(optarg);
-
-			/* set thread stop to auto_thresh */
-			params->common.stop_total_us = auto_thresh;
-			params->common.stop_us = auto_thresh;
-
-			/* get stack trace */
-			params->print_stack = auto_thresh;
-
-			/* set trace */
-			if (!trace_output)
-				trace_output = "timerlat_trace.txt";
-
-			break;
-		case 'b':
-			params->common.hist.bucket_size = get_llong_from_str(optarg);
-			if (params->common.hist.bucket_size == 0 ||
-			    params->common.hist.bucket_size >= 1000000)
-				fatal("Bucket size needs to be > 0 and <= 1000000");
-			break;
-		case 'E':
-			params->common.hist.entries = get_llong_from_str(optarg);
-			if (params->common.hist.entries < 10 ||
-			    params->common.hist.entries > 9999999)
-				fatal("Entries must be > 10 and < 9999999");
-			break;
-		case 'h':
-		case '?':
-			timerlat_hist_usage();
-			break;
-		case 'i':
-			params->common.stop_us = get_llong_from_str(optarg);
-			break;
-		case 'k':
-			params->common.kernel_workload = 1;
-			break;
-		case 'n':
-			params->common.output_divisor = 1;
-			break;
-		case 'p':
-			params->timerlat_period_us = get_llong_from_str(optarg);
-			if (params->timerlat_period_us > 1000000)
-				fatal("Period longer than 1 s");
-			break;
-		case 's':
-			params->print_stack = get_llong_from_str(optarg);
-			break;
-		case 'T':
-			params->common.stop_total_us = get_llong_from_str(optarg);
-			break;
-		case 't':
-			trace_output = parse_optional_arg(argc, argv);
-			if (!trace_output)
-				trace_output = "timerlat_trace.txt";
-			break;
-		case 'u':
-			params->common.user_workload = 1;
-			/* fallback: -u implies in -U */
-		case 'U':
-			params->common.user_data = 1;
-			break;
-		case '0': /* no irq */
-			params->common.hist.no_irq = 1;
-			break;
-		case '1': /* no thread */
-			params->common.hist.no_thread = 1;
-			break;
-		case '2': /* no header */
-			params->common.hist.no_header = 1;
-			break;
-		case '3': /* no summary */
-			params->common.hist.no_summary = 1;
-			break;
-		case '4': /* no index */
-			params->common.hist.no_index = 1;
-			break;
-		case '5': /* with zeros */
-			params->common.hist.with_zeros = 1;
-			break;
-		case '6': /* trigger */
-			if (params->common.events)
-				trace_event_add_trigger(params->common.events, optarg);
-			else
-				fatal("--trigger requires a previous -e");
-			break;
-		case '7': /* filter */
-			if (params->common.events)
-				trace_event_add_filter(params->common.events, optarg);
-			else
-				fatal("--filter requires a previous -e");
-			break;
-		case '8':
-			params->dma_latency = get_llong_from_str(optarg);
-			if (params->dma_latency < 0 || params->dma_latency > 10000)
-				fatal("--dma-latency needs to be >= 0 and < 10000");
-			break;
-		case '9':
-			params->no_aa = 1;
-			break;
-		case '\1':
-			params->dump_tasks = 1;
-			break;
-		case '\2':
-			params->common.warmup = get_llong_from_str(optarg);
-			break;
-		case '\3':
-			params->common.buffer_size = get_llong_from_str(optarg);
-			break;
-		case '\4':
-			params->deepest_idle_state = get_llong_from_str(optarg);
-			break;
-		case '\5':
-			retval = actions_parse(&params->common.threshold_actions, optarg,
-					       "timerlat_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		case '\6':
-			retval = actions_parse(&params->common.end_actions, optarg,
-					       "timerlat_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		case '\7':
-			params->bpf_action_program = optarg;
-			break;
-		case '\10':
-			params->stack_format = parse_stack_format(optarg);
-			if (params->stack_format == -1)
-				fatal("Invalid --stack-format option");
-			break;
-		default:
-			fatal("Invalid option");
-		}
-	}
-
-	if (trace_output)
-		actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
-	if (geteuid())
-		fatal("rtla needs root permission");
-
-	if (params->common.hist.no_irq && params->common.hist.no_thread)
-		fatal("no-irq and no-thread set, there is nothing to do here");
-
-	if (params->common.hist.no_index && !params->common.hist.with_zeros)
-		fatal("no-index set with with-zeros is not set - it does not make sense");
-
-	/*
-	 * Auto analysis only happens if stop tracing, thus:
-	 */
-	if (!params->common.stop_us && !params->common.stop_total_us)
-		params->no_aa = 1;
-
-	if (params->common.kernel_workload && params->common.user_workload)
-		fatal("--kernel-threads and --user-threads are mutually exclusive!");
-
-	/*
-	 * If auto-analysis or trace output is enabled, switch from BPF mode to
-	 * mixed mode
-	 */
-	if (params->mode == TRACING_MODE_BPF &&
-	    (params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
-	     params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
-	     !params->no_aa))
-		params->mode = TRACING_MODE_MIXED;
-
-	return &params->common;
-}
-
 /*
  * timerlat_hist_apply_config - apply the hist configs to the initialized tool
  */
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 64cbdcc878b0..18e1071a2e24 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -4,7 +4,6 @@
  */
 
 #define _GNU_SOURCE
-#include <getopt.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
@@ -17,6 +16,7 @@
 #include "timerlat.h"
 #include "timerlat_aa.h"
 #include "timerlat_bpf.h"
+#include "cli.h"
 #include "common.h"
 
 struct timerlat_top_cpu {
@@ -459,289 +459,6 @@ timerlat_print_stats(struct osnoise_tool *top)
 	osnoise_report_missed_events(top);
 }
 
-/*
- * timerlat_top_usage - prints timerlat top usage message
- */
-static void timerlat_top_usage(void)
-{
-	static const char *const msg_start[] = {
-		"[-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\",
-		"	  [[-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\",
-		"	  [-P priority] [--dma-latency us] [--aa-only us] [-C [cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]",
-		NULL,
-	};
-
-	static const char *const msg_opts[] = {
-		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
-		"	     --aa-only us: stop if <us> latency is hit, only printing the auto analysis (reduces CPU usage)",
-		"	  -p/--period us: timerlat period in us",
-		"	  -i/--irq us: stop trace if the irq latency is higher than the argument in us",
-		"	  -T/--thread us: stop trace if the thread latency is higher than the argument in us",
-		"	  -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us",
-		"	  -c/--cpus cpus: run the tracer only on the given cpus",
-		"	  -H/--house-keeping cpus: run rtla control threads only on the given cpus",
-		"	  -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
-		"	  -d/--duration time[s|m|h|d]: duration of the session",
-		"	  -D/--debug: print debug info",
-		"	     --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)",
-		"	  -t/--trace [file]: save the stopped trace to [file|timerlat_trace.txt]",
-		"	  -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
-		"	     --filter <command>: enable a trace event filter to the previous -e event",
-		"	     --trigger <command>: enable a trace event trigger to the previous -e event",
-		"	  -n/--nano: display data in nanoseconds",
-		"	     --no-aa: disable auto-analysis, reducing rtla timerlat cpu usage",
-		"	  -q/--quiet print only a summary at the end",
-		"	     --dma-latency us: set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency",
-		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
-		"		o:prio - use SCHED_OTHER with prio",
-		"		r:prio - use SCHED_RR with prio",
-		"		f:prio - use SCHED_FIFO with prio",
-		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
-		"						       in nanoseconds",
-		"	  -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads",
-		"	  -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads",
-		"	  -U/--user-load: enable timerlat for user-defined user-space workload",
-		"	     --warm-up s: let the workload run for s seconds before collecting data",
-		"	     --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
-		"	     --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
-		"	     --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed",
-		"	     --on-end: define action to be executed at measurement end, multiple are allowed",
-		"	     --bpf-action <program>: load and execute BPF program when latency threshold is exceeded",
-		"	     --stack-format <format>: set the stack format (truncate, skip, full)",
-		NULL,
-	};
-
-	common_usage("timerlat", "top", "a per-cpu summary of the timer latency",
-		     msg_start, msg_opts);
-}
-
-/*
- * timerlat_top_parse_args - allocs, parse and fill the cmd line parameters
- */
-static struct common_params
-*timerlat_top_parse_args(int argc, char **argv)
-{
-	struct timerlat_params *params;
-	long long auto_thresh;
-	int retval;
-	int c;
-	char *trace_output = NULL;
-
-	params = calloc_fatal(1, sizeof(*params));
-
-	actions_init(&params->common.threshold_actions);
-	actions_init(&params->common.end_actions);
-
-	/* disabled by default */
-	params->dma_latency = -1;
-
-	/* disabled by default */
-	params->deepest_idle_state = -2;
-
-	/* display data in microseconds */
-	params->common.output_divisor = 1000;
-
-	/* default to BPF mode */
-	params->mode = TRACING_MODE_BPF;
-
-	/* default to truncate stack format */
-	params->stack_format = STACK_FORMAT_TRUNCATE;
-
-	while (1) {
-		static struct option long_options[] = {
-			{"auto",		required_argument,	0, 'a'},
-			{"help",		no_argument,		0, 'h'},
-			{"irq",			required_argument,	0, 'i'},
-			{"nano",		no_argument,		0, 'n'},
-			{"period",		required_argument,	0, 'p'},
-			{"quiet",		no_argument,		0, 'q'},
-			{"stack",		required_argument,	0, 's'},
-			{"thread",		required_argument,	0, 'T'},
-			{"trace",		optional_argument,	0, 't'},
-			{"user-threads",	no_argument,		0, 'u'},
-			{"kernel-threads",	no_argument,		0, 'k'},
-			{"user-load",		no_argument,		0, 'U'},
-			{"trigger",		required_argument,	0, '0'},
-			{"filter",		required_argument,	0, '1'},
-			{"dma-latency",		required_argument,	0, '2'},
-			{"no-aa",		no_argument,		0, '3'},
-			{"dump-tasks",		no_argument,		0, '4'},
-			{"aa-only",		required_argument,	0, '5'},
-			{"warm-up",		required_argument,	0, '6'},
-			{"trace-buffer-size",	required_argument,	0, '7'},
-			{"deepest-idle-state",	required_argument,	0, '8'},
-			{"on-threshold",	required_argument,	0, '9'},
-			{"on-end",		required_argument,	0, '\1'},
-			{"bpf-action",		required_argument,	0, '\2'},
-			{"stack-format",	required_argument,	0, '\3'},
-			{0, 0, 0, 0}
-		};
-
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
-		c = getopt_auto(argc, argv, long_options);
-
-		/* detect the end of the options. */
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a':
-			auto_thresh = get_llong_from_str(optarg);
-
-			/* set thread stop to auto_thresh */
-			params->common.stop_total_us = auto_thresh;
-			params->common.stop_us = auto_thresh;
-
-			/* get stack trace */
-			params->print_stack = auto_thresh;
-
-			/* set trace */
-			if (!trace_output)
-				trace_output = "timerlat_trace.txt";
-
-			break;
-		case '5':
-			/* it is here because it is similar to -a */
-			auto_thresh = get_llong_from_str(optarg);
-
-			/* set thread stop to auto_thresh */
-			params->common.stop_total_us = auto_thresh;
-			params->common.stop_us = auto_thresh;
-
-			/* get stack trace */
-			params->print_stack = auto_thresh;
-
-			/* set aa_only to avoid parsing the trace */
-			params->common.aa_only = 1;
-			break;
-		case 'h':
-		case '?':
-			timerlat_top_usage();
-			break;
-		case 'i':
-			params->common.stop_us = get_llong_from_str(optarg);
-			break;
-		case 'k':
-			params->common.kernel_workload = true;
-			break;
-		case 'n':
-			params->common.output_divisor = 1;
-			break;
-		case 'p':
-			params->timerlat_period_us = get_llong_from_str(optarg);
-			if (params->timerlat_period_us > 1000000)
-				fatal("Period longer than 1 s");
-			break;
-		case 'q':
-			params->common.quiet = 1;
-			break;
-		case 's':
-			params->print_stack = get_llong_from_str(optarg);
-			break;
-		case 'T':
-			params->common.stop_total_us = get_llong_from_str(optarg);
-			break;
-		case 't':
-			trace_output = parse_optional_arg(argc, argv);
-			if (!trace_output)
-				trace_output = "timerlat_trace.txt";
-			break;
-		case 'u':
-			params->common.user_workload = true;
-			/* fallback: -u implies -U */
-		case 'U':
-			params->common.user_data = true;
-			break;
-		case '0': /* trigger */
-			if (params->common.events)
-				trace_event_add_trigger(params->common.events, optarg);
-			else
-				fatal("--trigger requires a previous -e");
-			break;
-		case '1': /* filter */
-			if (params->common.events)
-				trace_event_add_filter(params->common.events, optarg);
-			else
-				fatal("--filter requires a previous -e");
-			break;
-		case '2': /* dma-latency */
-			params->dma_latency = get_llong_from_str(optarg);
-			if (params->dma_latency < 0 || params->dma_latency > 10000)
-				fatal("--dma-latency needs to be >= 0 and < 10000");
-			break;
-		case '3': /* no-aa */
-			params->no_aa = 1;
-			break;
-		case '4':
-			params->dump_tasks = 1;
-			break;
-		case '6':
-			params->common.warmup = get_llong_from_str(optarg);
-			break;
-		case '7':
-			params->common.buffer_size = get_llong_from_str(optarg);
-			break;
-		case '8':
-			params->deepest_idle_state = get_llong_from_str(optarg);
-			break;
-		case '9':
-			retval = actions_parse(&params->common.threshold_actions, optarg,
-					       "timerlat_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		case '\1':
-			retval = actions_parse(&params->common.end_actions, optarg,
-					       "timerlat_trace.txt");
-			if (retval)
-				fatal("Invalid action %s", optarg);
-			break;
-		case '\2':
-			params->bpf_action_program = optarg;
-			break;
-		case '\3':
-			params->stack_format = parse_stack_format(optarg);
-			if (params->stack_format == -1)
-				fatal("Invalid --stack-format option");
-			break;
-		default:
-			fatal("Invalid option");
-		}
-	}
-
-	if (trace_output)
-		actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
-	if (geteuid())
-		fatal("rtla needs root permission");
-
-	/*
-	 * Auto analysis only happens if stop tracing, thus:
-	 */
-	if (!params->common.stop_us && !params->common.stop_total_us)
-		params->no_aa = 1;
-
-	if (params->no_aa && params->common.aa_only)
-		fatal("--no-aa and --aa-only are mutually exclusive!");
-
-	if (params->common.kernel_workload && params->common.user_workload)
-		fatal("--kernel-threads and --user-threads are mutually exclusive!");
-
-	/*
-	 * If auto-analysis or trace output is enabled, switch from BPF mode to
-	 * mixed mode
-	 */
-	if (params->mode == TRACING_MODE_BPF &&
-	    (params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
-	     params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
-	     !params->no_aa))
-		params->mode = TRACING_MODE_MIXED;
-
-	return &params->common;
-}
-
 /*
  * timerlat_top_apply_config - apply the top configs to the initialized tool
  */
diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 9cec5b3e02c8..cb187e7d48d1 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -22,7 +22,7 @@
 #include "common.h"
 
 #define MAX_MSG_LENGTH	1024
-int config_debug;
+bool config_debug;
 
 /*
  * err_msg - print an error message to the stderr
@@ -1011,32 +1011,6 @@ int auto_house_keeping(cpu_set_t *monitored_cpus)
 	return 1;
 }
 
-/**
- * parse_optional_arg - Parse optional argument value
- *
- * Parse optional argument value, which can be in the form of:
- * -sarg, -s/--long=arg, -s/--long arg
- *
- * Returns arg value if found, NULL otherwise.
- */
-char *parse_optional_arg(int argc, char **argv)
-{
-	if (optarg) {
-		if (optarg[0] == '=') {
-			/* skip the = */
-			return &optarg[1];
-		} else {
-			return optarg;
-		}
-	/* parse argument of form -s [arg] and --long [arg]*/
-	} else if (optind < argc && argv[optind][0] != '-') {
-		/* consume optind */
-		return argv[optind++];
-	} else {
-		return NULL;
-	}
-}
-
 /*
  * strtoi - convert string to integer with error checking
  *
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index 96fd72042717..2ba3333669bb 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -39,7 +39,7 @@ static inline bool str_has_prefix(const char *str, const char *prefix)
 	return strncmp(str, prefix, strlen(prefix)) == 0;
 }
 
-extern int config_debug;
+extern bool config_debug;
 void debug_msg(const char *fmt, ...);
 void err_msg(const char *fmt, ...);
 void fatal(const char *fmt, ...);
@@ -47,7 +47,6 @@ void fatal(const char *fmt, ...);
 long parse_seconds_duration(char *val);
 void get_duration(time_t start_time, char *output, int output_size);
 
-char *parse_optional_arg(int argc, char **argv);
 long long get_llong_from_str(char *start);
 
 static inline void
diff --git a/tools/tracing/rtla/tests/hwnoise.t b/tools/tracing/rtla/tests/hwnoise.t
index 23ce250a6852..cfe687ff5ee1 100644
--- a/tools/tracing/rtla/tests/hwnoise.t
+++ b/tools/tracing/rtla/tests/hwnoise.t
@@ -6,7 +6,7 @@ test_begin
 set_timeout 2m
 
 check "verify help page" \
-	"hwnoise --help" 0 "summary of hardware-related noise"
+	"hwnoise --help" 129 "Usage: rtla hwnoise"
 check "detect noise higher than one microsecond" \
 	"hwnoise -c 0 -T 1 -d 5s -q" 0
 check "set the automatic trace mode" \
-- 
2.53.0


^ permalink raw reply related

* [PATCH 2/3] tools subcmd: support optarg as separate argument
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar
  Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
	Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
	Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
	linux-perf-users
In-Reply-To: <20260320150651.51057-1-tglozar@redhat.com>

In addition to "-ovalue" and "--opt=value" syntax, allow also "-o value"
and "--opt value" for options with optional argument when the newly
added PARSE_OPT_OPTARG_ALLOW_NEXT flag is set.

This behavior is turned off by default since it does not make sense for
tools using non-option command line arguments. Consider the ambiguity
of "cmd -d x", where "-d x" can mean either "-d with argument of x" or
"-d without argument, followed by non-option argument x". This is not an
issue in the case that the tool takes no non-option arguments.

To implement this, a new local variable, force_defval, is created in
get_value(), along with a comment explaining the logic.

Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/lib/subcmd/parse-options.c | 53 +++++++++++++++++++++++++++-----
 tools/lib/subcmd/parse-options.h |  1 +
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c
index 555d617c1f50..664b2053bb77 100644
--- a/tools/lib/subcmd/parse-options.c
+++ b/tools/lib/subcmd/parse-options.c
@@ -72,6 +72,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 	const char *s, *arg = NULL;
 	const int unset = flags & OPT_UNSET;
 	int err;
+	bool force_defval = false;
 
 	if (unset && p->opt)
 		return opterror(opt, "takes no value", flags);
@@ -123,6 +124,42 @@ static int get_value(struct parse_opt_ctx_t *p,
 		}
 	}
 
+	if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+		if (!(p->flags & PARSE_OPT_OPTARG_ALLOW_NEXT)) {
+			/*
+			 * If the option has an optional argument, and the argument is not
+			 * provided in the option itself, do not attempt to get it from
+			 * the next argument, unless PARSE_OPT_OPTARG_ALLOW_NEXT is set.
+			 *
+			 * This prevents a non-option argument from being interpreted as an
+			 * optional argument of a preceding option, for example:
+			 *
+			 * $ cmd --opt val
+			 * -> is "val" argument of "--opt" or a separate non-option
+			 * argument?
+			 *
+			 * With PARSE_OPT_OPTARG_ALLOW_NEXT, "val" is interpreted as
+			 * the argument of "--opt", i.e. the same as "--opt=val".
+			 * Without PARSE_OPT_OPTARG_ALLOW_NEXT, --opt is interpreted
+			 * as having the default value, and "val" as a separate non-option
+			 * argument.
+			 *
+			 * PARSE_OPT_OPTARG_ALLOW_NEXT is useful for commands that take no
+			 * non-option arguments and want to allow more flexibility in
+			 * optional argument passing.
+			 */
+			force_defval = true;
+		}
+
+		if (p->argc <= 1 || p->argv[1][0] == '-') {
+			/*
+			 * If next argument is an option or does not exist,
+			 * use the default value.
+			 */
+			force_defval = true;
+		}
+	}
+
 	if (opt->flags & PARSE_OPT_NOBUILD) {
 		char reason[128];
 		bool noarg = false;
@@ -148,7 +185,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			noarg = true;
 		if (opt->flags & PARSE_OPT_NOARG)
 			noarg = true;
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+		if (force_defval)
 			noarg = true;
 
 		switch (opt->type) {
@@ -212,7 +249,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 		err = 0;
 		if (unset)
 			*(const char **)opt->value = NULL;
-		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+		else if (force_defval)
 			*(const char **)opt->value = (const char *)opt->defval;
 		else
 			err = get_arg(p, opt, flags, (const char **)opt->value);
@@ -244,7 +281,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
 		if (opt->flags & PARSE_OPT_NOARG)
 			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+		if (force_defval)
 			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
 		if (get_arg(p, opt, flags, &arg))
 			return -1;
@@ -255,7 +292,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			*(int *)opt->value = 0;
 			return 0;
 		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+		if (force_defval) {
 			*(int *)opt->value = opt->defval;
 			return 0;
 		}
@@ -271,7 +308,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			*(unsigned int *)opt->value = 0;
 			return 0;
 		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+		if (force_defval) {
 			*(unsigned int *)opt->value = opt->defval;
 			return 0;
 		}
@@ -289,7 +326,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			*(long *)opt->value = 0;
 			return 0;
 		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+		if (force_defval) {
 			*(long *)opt->value = opt->defval;
 			return 0;
 		}
@@ -305,7 +342,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			*(unsigned long *)opt->value = 0;
 			return 0;
 		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+		if (force_defval) {
 			*(unsigned long *)opt->value = opt->defval;
 			return 0;
 		}
@@ -321,7 +358,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 			*(u64 *)opt->value = 0;
 			return 0;
 		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+		if (force_defval) {
 			*(u64 *)opt->value = opt->defval;
 			return 0;
 		}
diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h
index 8e9147358a28..c573a0ca5ca6 100644
--- a/tools/lib/subcmd/parse-options.h
+++ b/tools/lib/subcmd/parse-options.h
@@ -33,6 +33,7 @@ enum parse_opt_flags {
 	PARSE_OPT_KEEP_ARGV0 = 4,
 	PARSE_OPT_KEEP_UNKNOWN = 8,
 	PARSE_OPT_NO_INTERNAL_HELP = 16,
+	PARSE_OPT_OPTARG_ALLOW_NEXT = 32,
 };
 
 enum parse_opt_option_flags {
-- 
2.53.0


^ permalink raw reply related

* [PATCH 1/3] rtla: Add libsubcmd dependency
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar
  Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
	Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
	Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
	linux-perf-users
In-Reply-To: <20260320150651.51057-1-tglozar@redhat.com>

In preparation to migrating RTLA to libsubcmd, build libsubcmd from the
appropriate directory next to the RTLA build proper, and link the
resulting object to RTLA.

libsubcmd uses str_error_r() and strlcpy() at several places. To support
these, also link the respective libraries from tools/lib.

For completeness, also add tools/include to include path. This will
allow other userspace function and macros shipped with the kernel to be
used in RTLA; perf and bpftool, two other users of libsubcmd, already do
that.

To prevent name conflict, rename RTLA's run_command() function to
run_tool_command(), and replace RTLA's own container_of implementation
with the one in tools/include/linux/container_of.h.

Assisted-by: Composer:composer-1
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/.gitignore  |  1 +
 tools/tracing/rtla/Makefile    | 53 +++++++++++++++++++++++++++++-----
 tools/tracing/rtla/src/rtla.c  |  8 ++---
 tools/tracing/rtla/src/utils.h |  6 ++--
 4 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/tools/tracing/rtla/.gitignore b/tools/tracing/rtla/.gitignore
index 4d39d64ac08c..123c2d5ed7ac 100644
--- a/tools/tracing/rtla/.gitignore
+++ b/tools/tracing/rtla/.gitignore
@@ -9,3 +9,4 @@ custom_filename.txt
 osnoise_irq_noise_hist.txt
 osnoise_trace.txt
 timerlat_trace.txt
+libsubcmd/
diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 45690ee14544..289e44c9664b 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -27,6 +27,24 @@ endif
 RTLA		:= $(OUTPUT)rtla
 RTLA_IN		:= $(RTLA)-in.o
 
+LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/
+ifneq ($(OUTPUT),)
+  LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd
+else
+  LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd
+endif
+LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a
+LIBSUBCMD_INCLUDES = -I$(LIBSUBCMD_OUTPUT)/include
+LIBSUBCMD_MAKEFLAGS = O=$(LIBSUBCMD_OUTPUT) DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir=
+
+TOOLS_INCLUDES = -I$(srctree)/tools/include
+
+LIB_STRING = $(OUTPUT)string.o
+LIB_STRING_SRC = $(srctree)/tools/lib/string.c
+
+LIB_STR_ERROR_R = $(OUTPUT)str_error_r.o
+LIB_STR_ERROR_R_SRC = $(srctree)/tools/lib/str_error_r.c
+
 VERSION		:= $(shell sh -c "make -sC ../../.. kernelversion | grep -v make")
 DOCSRC		:= ../../../Documentation/tools/rtla/
 
@@ -66,7 +84,7 @@ ifeq ($(config),1)
   include Makefile.config
 endif
 
-CFLAGS		+= $(INCLUDES) $(LIB_INCLUDES)
+CFLAGS		+= $(INCLUDES) $(LIB_INCLUDES) $(TOOLS_INCLUDES) $(LIBSUBCMD_INCLUDES)
 
 export CFLAGS OUTPUT srctree
 
@@ -93,20 +111,41 @@ tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
 	$(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o"
 endif
 
-$(RTLA): $(RTLA_IN)
-	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RTLA) $(RTLA_IN) $(EXTLIBS)
+$(RTLA): $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RTLA) $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R) $(EXTLIBS)
 
-static: $(RTLA_IN)
+static: $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R)
 	$(eval LDFLAGS += -static)
-	$(QUIET_LINK)$(CC) -static $(LDFLAGS) -o $(RTLA)-static $(RTLA_IN)  $(EXTLIBS)
+	$(QUIET_LINK)$(CC) -static $(LDFLAGS) -o $(RTLA)-static $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R) $(EXTLIBS)
 
 rtla.%: fixdep FORCE
 	make -f $(srctree)/tools/build/Makefile.build dir=. $@
 
-$(RTLA_IN): fixdep FORCE src/timerlat.skel.h
+$(RTLA_IN): fixdep FORCE src/timerlat.skel.h $(LIBSUBCMD_INCLUDES)
 	make $(build)=rtla
 
-clean: doc_clean fixdep-clean
+$(LIBSUBCMD_OUTPUT):
+	$(Q)$(MKDIR) -p $@
+
+$(LIBSUBCMD_INCLUDES): $(LIBSUBCMD_OUTPUT)
+	$(Q)$(MAKE) -C $(LIBSUBCMD_DIR) $(LIBSUBCMD_MAKEFLAGS) \
+		install_headers
+
+$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT)
+	$(Q)$(MAKE) -C $(LIBSUBCMD_DIR) $(LIBSUBCMD_MAKEFLAGS) \
+		$@
+
+$(LIB_STR_ERROR_R): $(LIB_STR_ERROR_R_SRC)
+	$(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIB_STRING): $(LIB_STRING_SRC)
+	$(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIBSUBCMD)-clean:
+	$(call QUIET_CLEAN, libsubcmd)
+	$(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT)
+
+clean: doc_clean fixdep-clean $(LIBSUBCMD)-clean
 	$(call QUIET_CLEAN, rtla)
 	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
index 7635c70123ab..845932f902ef 100644
--- a/tools/tracing/rtla/src/rtla.c
+++ b/tools/tracing/rtla/src/rtla.c
@@ -38,12 +38,12 @@ static void rtla_usage(int err)
 }
 
 /*
- * run_command - try to run a rtla tool command
+ * run_tool_command - try to run a rtla tool command
  *
  * It returns 0 if it fails. The tool's main will generally not
  * return as they should call exit().
  */
-int run_command(int argc, char **argv, int start_position)
+int run_tool_command(int argc, char **argv, int start_position)
 {
 	if (strcmp(argv[start_position], "osnoise") == 0) {
 		osnoise_main(argc-start_position, &argv[start_position]);
@@ -66,7 +66,7 @@ int main(int argc, char *argv[])
 	int retval;
 
 	/* is it an alias? */
-	retval = run_command(argc, argv, 0);
+	retval = run_tool_command(argc, argv, 0);
 	if (retval)
 		exit(0);
 
@@ -79,7 +79,7 @@ int main(int argc, char *argv[])
 		rtla_usage(0);
 	}
 
-	retval = run_command(argc, argv, 1);
+	retval = run_tool_command(argc, argv, 1);
 	if (retval)
 		exit(0);
 
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index e794ede64b2c..96fd72042717 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -7,6 +7,8 @@
 #include <stdbool.h>
 #include <stdlib.h>
 
+#include <linux/container_of.h>
+
 /*
  * '18446744073709551615\0'
  */
@@ -37,10 +39,6 @@ static inline bool str_has_prefix(const char *str, const char *prefix)
 	return strncmp(str, prefix, strlen(prefix)) == 0;
 }
 
-#define container_of(ptr, type, member)({			\
-	const typeof(((type *)0)->member) *__mptr = (ptr);	\
-	(type *)((char *)__mptr - offsetof(type, member)) ; })
-
 extern int config_debug;
 void debug_msg(const char *fmt, ...);
 void err_msg(const char *fmt, ...);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 0/3] rtla: Migrate to libsubcmd for command line option parsing
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar
  Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
	Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
	Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
	linux-perf-users

[ CC to linux-perf-users for the libsubcmd code change (second commit) ]

rtla currently uses its own implementation that uses getopt_long() to
parse command-line arguments.

Migrate rtla to use libsubcmd for command line argument parsing,
similarly to what is already done by other tools like perf, bpftool,
and objtool. Among other benefits, this allows help messages to be
generated automatically rather then having to by typed out manually
for each tool.

libsubcmd is extended with an option to parse optarg from separate
argument if a new flag is turned on. Without the flag, the old behavior
is preserved. That keeps the parsing working for tools that use
positional arguments, and allows RTLA to keep its flexible syntax for -C
and -t options and their long variants, --cgroup and --trace-output.

The new implementation is moved into a separate file, cli.c, together
with a tiny header counterpart, cli.h. This helps separate the parsing
logic, which has little in common with the rest of RTLA, in a separate
module.

Macros to generate struct option array fields for libsubcmd's
parse_args() are used to preserve the consolidation of argument parsing
code across different RTLA tools. Kernel and user threads are, as
an exception, treated as common, although they are currently implemented
for timerlat only, in line with earlier consolidation changes.

I expect more improvements to the code being possible in the future,
like creating macros for option groups to further deduplicate the code,
or reduce the amount of extra code in the _parse_args() functions.

Tomas Glozar (3):
  rtla: Add libsubcmd dependency
  tools subcmd: support optarg as separate argument
  rtla: Parse cmdline using libsubcmd

 tools/lib/subcmd/parse-options.c       |   53 +-
 tools/lib/subcmd/parse-options.h       |    1 +
 tools/tracing/rtla/.gitignore          |    1 +
 tools/tracing/rtla/Makefile            |   53 +-
 tools/tracing/rtla/src/Build           |    2 +-
 tools/tracing/rtla/src/cli.c           | 1207 ++++++++++++++++++++++++
 tools/tracing/rtla/src/cli.h           |    7 +
 tools/tracing/rtla/src/common.c        |  109 ---
 tools/tracing/rtla/src/common.h        |   26 +-
 tools/tracing/rtla/src/osnoise_hist.c  |  221 +----
 tools/tracing/rtla/src/osnoise_top.c   |  200 +---
 tools/tracing/rtla/src/rtla.c          |   89 --
 tools/tracing/rtla/src/timerlat.h      |    4 +-
 tools/tracing/rtla/src/timerlat_hist.c |  317 +------
 tools/tracing/rtla/src/timerlat_top.c  |  285 +-----
 tools/tracing/rtla/src/utils.c         |   28 +-
 tools/tracing/rtla/src/utils.h         |    9 +-
 tools/tracing/rtla/tests/hwnoise.t     |    2 +-
 18 files changed, 1331 insertions(+), 1283 deletions(-)
 create mode 100644 tools/tracing/rtla/src/cli.c
 create mode 100644 tools/tracing/rtla/src/cli.h
 delete mode 100644 tools/tracing/rtla/src/rtla.c

-- 
2.53.0

^ permalink raw reply

* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Mathieu Desnoyers @ 2026-03-20 14:20 UTC (permalink / raw)
  To: Harry Yoo (Oracle)
  Cc: Harry Yoo, Nathan Chancellor, Thomas Weißschuh,
	Michal Clapinski, Andrew Morton, Thomas Gleixner, Steven Rostedt,
	Masami Hiramatsu, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <7458d8fd-5922-4e0b-9cd5-91880282aaa3@efficios.com>

On 2026-03-20 09:31, Mathieu Desnoyers wrote:
> On 2026-03-20 09:21, Harry Yoo (Oracle) wrote:
>> On Fri, Mar 20, 2026 at 08:35:46AM -0400, Mathieu Desnoyers wrote:
>>> On 2026-03-20 00:17, Harry Yoo wrote:
>>> [...]
>>>>> [1]: https://lore.kernel.org/20260227153730.1556542-4- 
>>>>> mathieu.desnoyers@efficios.com/
>>>>
>>>> @Mathieu: In patch 1/3 description,
>>>>> Changes since v7:
>>>>> - Explicitly initialize the subsystem from start_kernel() right
>>>>>     after mm_core_init() so it is up and running before the 
>>>>> creation of
>>>>>     the first mm at boot.
>>>>
>>>> But how does this work when someone calls mm_cpumask() on init_mm 
>>>> early?
>>>> Looks like it will behave incorrectly because get_rss_stat_items_size()
>>>> returns zero?
>>>
>>> It doesn't work as expected at all. I missed that all users of 
>>> mm_cpumask()
>>> end up relying on get_rss_stat_items_size(), which now calls
>>> percpu_counter_tree_items_size(), which depends on initialization from
>>> percpu_counter_tree_subsystem_init().
>>>
>>> If you add a call to percpu_counter_tree_subsystem_init in
>>> arch/powerpc/kernel/setup_arch() just before:

[...]

One thing we could do to catch this kind of init sequence issue
is to add a WARN_ON_ONCE in percpu_counter_tree_items_size:

size_t percpu_counter_tree_items_size(void)
{
         if (WARN_ON_ONCE(!nr_cpus_order))
                 return 0;
         return counter_config->nr_items * sizeof(struct percpu_counter_tree_level_item);
}

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

^ permalink raw reply

* Re: [PATCH] coredump: add tracepoint for coredump events
From: Breno Leitao @ 2026-03-20 14:18 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, linux-fsdevel, linux-trace-kernel, bpf, kernel-team,
	Andrii Nakryiko, Alexander Viro, Jan Kara, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <20260320-habilitation-umworben-edeb157af1a3@brauner>

On Fri, Mar 20, 2026 at 02:21:48PM +0100, Christian Brauner wrote:
> On Fri, 20 Mar 2026 05:33:34 -0700, Breno Leitao wrote:
> > Coredump is a generally useful and interesting event in the lifetime
> > of a process. Add a tracepoint so it can be monitored through the
> > standard kernel tracing infrastructure.
> >
> > BPF-based crash monitoring is an advanced approach that
> > allows real-time crash interception: by attaching a BPF program at
> > this point, tools can use bpf_get_stack() with BPF_F_USER_STACK to
> > capture the user-space stack trace at the exact moment of the crash,
> > before the process is fully terminated, without waiting for a
> > coredump file to be written and parsed.
> >
> > [...]
>
> "stable" with a grain of salt. We make no such guarantees that it won't be
> moved around if needed.

Ack. At least tracepoints offer more stability compared to
fentry/function-based approaches which can be inlined, renamed, or
otherwise modified.

Thanks for reviewing this.
--breno

^ permalink raw reply

* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Mathieu Desnoyers @ 2026-03-20 13:31 UTC (permalink / raw)
  To: Harry Yoo (Oracle)
  Cc: Harry Yoo, Nathan Chancellor, Thomas Weißschuh,
	Michal Clapinski, Andrew Morton, Thomas Gleixner, Steven Rostedt,
	Masami Hiramatsu, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <ab1J9ODkX5iChu-C@hyeyoo>

On 2026-03-20 09:21, Harry Yoo (Oracle) wrote:
> On Fri, Mar 20, 2026 at 08:35:46AM -0400, Mathieu Desnoyers wrote:
>> On 2026-03-20 00:17, Harry Yoo wrote:
>> [...]
>>>> [1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/
>>>
>>> @Mathieu: In patch 1/3 description,
>>>> Changes since v7:
>>>> - Explicitly initialize the subsystem from start_kernel() right
>>>>     after mm_core_init() so it is up and running before the creation of
>>>>     the first mm at boot.
>>>
>>> But how does this work when someone calls mm_cpumask() on init_mm early?
>>> Looks like it will behave incorrectly because get_rss_stat_items_size()
>>> returns zero?
>>
>> It doesn't work as expected at all. I missed that all users of mm_cpumask()
>> end up relying on get_rss_stat_items_size(), which now calls
>> percpu_counter_tree_items_size(), which depends on initialization from
>> percpu_counter_tree_subsystem_init().
>>
>> If you add a call to percpu_counter_tree_subsystem_init in
>> arch/powerpc/kernel/setup_arch() just before:
>>
>>          VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm)));
>>          cpumask_set_cpu(smp_processor_id(), mm_cpumask(&init_mm));
>>
>> Does the warning go away ?
> 
> Hmm it goes away, but I'm not sure if it is it okay to use nr_cpu_ids
> before setup_nr_cpu_ids() is called?

AFAIU on powerpc setup_nr_cpu_ids() is called near the end of
smp_setup_cpu_maps(), which is called early in setup_arch,
at least before the two lines which use mm_cpumask.
  
>> Alternatively, would could use a lazy initialization invoking
>> percpu_counter_tree_subsystem_init from percpu_counter_tree_items_size
>> when the initialization is not already done.
> 
> So this probably isn't a way to go?

I'd favor explicit initialization, so the inter-dependencies are clear.

> Hmm perhaps we should treat init_mm as a special case in
> mm_cpus_allowed() and mm_cpumask().

I'd prefer not to go there if boot sequence permits and keep things
simple.

I think we're in a situation very similar to tree RCU, here is what
is done in rcu_init_geometry:

         static bool initialized;

         if (initialized) {
                 /*
                  * Warn if setup_nr_cpu_ids() had not yet been invoked,
                  * unless nr_cpus_ids == NR_CPUS, in which case who cares?
                  */
                 WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
                 return;
         }

         old_nr_cpu_ids = nr_cpu_ids;
         initialized = true;

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

^ permalink raw reply

* Re: [PATCH] coredump: add tracepoint for coredump events
From: Christian Brauner @ 2026-03-20 13:21 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Christian Brauner, linux-kernel, linux-fsdevel,
	linux-trace-kernel, bpf, kernel-team, Andrii Nakryiko,
	Alexander Viro, Jan Kara, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers
In-Reply-To: <20260320-coredump_tracepoint-v1-1-34864746cbb3@debian.org>

On Fri, 20 Mar 2026 05:33:34 -0700, Breno Leitao wrote:
> Coredump is a generally useful and interesting event in the lifetime
> of a process. Add a tracepoint so it can be monitored through the
> standard kernel tracing infrastructure.
> 
> BPF-based crash monitoring is an advanced approach that
> allows real-time crash interception: by attaching a BPF program at
> this point, tools can use bpf_get_stack() with BPF_F_USER_STACK to
> capture the user-space stack trace at the exact moment of the crash,
> before the process is fully terminated, without waiting for a
> coredump file to be written and parsed.
> 
> [...]

"stable" with a grain of salt. We make no such guarantees that it won't be
moved around if needed.

---

Applied to the vfs-7.1.misc branch of the vfs/vfs.git tree.
Patches in the vfs-7.1.misc branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.1.misc

[1/1] coredump: add tracepoint for coredump events
      https://git.kernel.org/vfs/vfs/c/8e69edaf49bc

^ permalink raw reply

* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Harry Yoo (Oracle) @ 2026-03-20 13:21 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Harry Yoo, Nathan Chancellor, Thomas Weißschuh,
	Michal Clapinski, Andrew Morton, Thomas Gleixner, Steven Rostedt,
	Masami Hiramatsu, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <7780a471-9d99-40a7-ade7-0c4594ac36c7@efficios.com>

On Fri, Mar 20, 2026 at 08:35:46AM -0400, Mathieu Desnoyers wrote:
> On 2026-03-20 00:17, Harry Yoo wrote:
> [...]
> > > [1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/
> > 
> > @Mathieu: In patch 1/3 description,
> > > Changes since v7:
> > > - Explicitly initialize the subsystem from start_kernel() right
> > >    after mm_core_init() so it is up and running before the creation of
> > >    the first mm at boot.
> > 
> > But how does this work when someone calls mm_cpumask() on init_mm early?
> > Looks like it will behave incorrectly because get_rss_stat_items_size()
> > returns zero?
> 
> It doesn't work as expected at all. I missed that all users of mm_cpumask()
> end up relying on get_rss_stat_items_size(), which now calls
> percpu_counter_tree_items_size(), which depends on initialization from
> percpu_counter_tree_subsystem_init().
> 
> If you add a call to percpu_counter_tree_subsystem_init in
> arch/powerpc/kernel/setup_arch() just before:
> 
>         VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm)));
>         cpumask_set_cpu(smp_processor_id(), mm_cpumask(&init_mm));
> 
> Does the warning go away ?

Hmm it goes away, but I'm not sure if it is it okay to use nr_cpu_ids
before setup_nr_cpu_ids() is called?

> Alternatively, would could use a lazy initialization invoking
> percpu_counter_tree_subsystem_init from percpu_counter_tree_items_size
> when the initialization is not already done.

So this probably isn't a way to go?

Hmm perhaps we should treat init_mm as a special case in
mm_cpus_allowed() and mm_cpumask().

-- 
Cheers,
Harry / Hyeonggon

^ permalink raw reply

* Re: [PATCH] coredump: add tracepoint for coredump events
From: Christian Brauner @ 2026-03-20 13:21 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Alexander Viro, Jan Kara, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-fsdevel,
	linux-trace-kernel, bpf, kernel-team, Andrii Nakryiko
In-Reply-To: <20260320-coredump_tracepoint-v1-1-34864746cbb3@debian.org>

On Fri, Mar 20, 2026 at 05:33:34AM -0700, Breno Leitao wrote:
> Coredump is a generally useful and interesting event in the lifetime
> of a process. Add a tracepoint so it can be monitored through the
> standard kernel tracing infrastructure.
> 
> BPF-based crash monitoring is an advanced approach that
> allows real-time crash interception: by attaching a BPF program at
> this point, tools can use bpf_get_stack() with BPF_F_USER_STACK to
> capture the user-space stack trace at the exact moment of the crash,
> before the process is fully terminated, without waiting for a
> coredump file to be written and parsed.
> 
> However, there is currently no stable kernel API for this use case.
> Existing tools rely on attaching fentry probes to do_coredump(),
> which is an internal function whose signature changes across kernel
> versions, breaking these tools.
> 
> Add a stable tracepoint that fires at the beginning of
> do_coredump(), providing BPF programs a reliable attachment point.
> At tracepoint time, the crashing process context is still live, so
> BPF programs can call bpf_get_stack() with BPF_F_USER_STACK to
> extract the user-space backtrace.
> 
> The tracepoint records:
>   - sig: signal number that triggered the coredump
>   - comm: process name
>   - pid: process PID
> 
> Example output:
> 
>   $ echo 1 > /sys/kernel/tracing/events/coredump/coredump/enable
>   $ sleep 999 &
>   $ kill -SEGV $!
>   $ cat /sys/kernel/tracing/trace
>   #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
>   #              | |         |   |||||     |         |
>              sleep-634     [036] .....   145.222206: coredump: sig=11 comm=sleep pid=634
> 
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  fs/coredump.c                   |  5 +++++
>  include/trace/events/coredump.h | 47 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 52 insertions(+)
> 
> diff --git a/fs/coredump.c b/fs/coredump.c
> index 29df8aa19e2e7..bb6fdb1f458e9 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -63,6 +63,9 @@
>  
>  #include <trace/events/sched.h>
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/coredump.h>
> +
>  static bool dump_vma_snapshot(struct coredump_params *cprm);
>  static void free_vma_snapshot(struct coredump_params *cprm);
>  
> @@ -1090,6 +1093,8 @@ static inline bool coredump_skip(const struct coredump_params *cprm,
>  static void do_coredump(struct core_name *cn, struct coredump_params *cprm,
>  			size_t **argv, int *argc, const struct linux_binfmt *binfmt)
>  {
> +	trace_coredump(cprm->siginfo->si_signo);
> +
>  	if (!coredump_parse(cn, cprm, argv, argc)) {
>  		coredump_report_failure("format_corename failed, aborting core");
>  		return;
> diff --git a/include/trace/events/coredump.h b/include/trace/events/coredump.h
> new file mode 100644
> index 0000000000000..59617eba3dbcf
> --- /dev/null
> +++ b/include/trace/events/coredump.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2026 Breno Leitao <leitao@debian.org>
> + */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM coredump
> +
> +#if !defined(_TRACE_COREDUMP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_COREDUMP_H
> +
> +#include <linux/sched.h>
> +#include <linux/tracepoint.h>
> +
> +/**
> + * coredump - called when a coredump starts
> + * @sig: signal number that triggered the coredump
> + *
> + * This tracepoint fires at the beginning of a coredump attempt,
> + * providing a stable interface for monitoring coredump events.
> + */
> +TRACE_EVENT(coredump,
> +
> +	TP_PROTO(int sig),
> +
> +	TP_ARGS(sig),
> +
> +	TP_STRUCT__entry(
> +		__field(int, sig)
> +		__array(char, comm, TASK_COMM_LEN)
> +		__field(pid_t, pid)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->sig = sig;
> +		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> +		__entry->pid = current->pid;

That's the TID as seen in the global pid namespace.
I assume this is what you want but worth noting.

^ permalink raw reply

* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Mathieu Desnoyers @ 2026-03-20 12:35 UTC (permalink / raw)
  To: Harry Yoo, Nathan Chancellor
  Cc: Thomas Weißschuh, Michal Clapinski, Andrew Morton,
	Thomas Gleixner, Steven Rostedt, Masami Hiramatsu, linux-mm,
	linux-trace-kernel, linux-kernel
In-Reply-To: <abzKcGiRSR_E8lLN@hyeyoo>

On 2026-03-20 00:17, Harry Yoo wrote:
[...]
>> [1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/
> 
> @Mathieu: In patch 1/3 description,
>> Changes since v7:
>> - Explicitly initialize the subsystem from start_kernel() right
>>    after mm_core_init() so it is up and running before the creation of
>>    the first mm at boot.
> 
> But how does this work when someone calls mm_cpumask() on init_mm early?
> Looks like it will behave incorrectly because get_rss_stat_items_size()
> returns zero?

It doesn't work as expected at all. I missed that all users of mm_cpumask()
end up relying on get_rss_stat_items_size(), which now calls
percpu_counter_tree_items_size(), which depends on initialization from
percpu_counter_tree_subsystem_init().

If you add a call to percpu_counter_tree_subsystem_init in
arch/powerpc/kernel/setup_arch() just before:

         VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm)));
         cpumask_set_cpu(smp_processor_id(), mm_cpumask(&init_mm));

Does the warning go away ?

Alternatively, would could use a lazy initialization invoking
percpu_counter_tree_subsystem_init from percpu_counter_tree_items_size
when the initialization is not already done.

Any preference ?

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

^ permalink raw reply

* [PATCH] coredump: add tracepoint for coredump events
From: Breno Leitao @ 2026-03-20 12:33 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-fsdevel, linux-trace-kernel, bpf, kernel-team,
	Andrii Nakryiko, Breno Leitao

Coredump is a generally useful and interesting event in the lifetime
of a process. Add a tracepoint so it can be monitored through the
standard kernel tracing infrastructure.

BPF-based crash monitoring is an advanced approach that
allows real-time crash interception: by attaching a BPF program at
this point, tools can use bpf_get_stack() with BPF_F_USER_STACK to
capture the user-space stack trace at the exact moment of the crash,
before the process is fully terminated, without waiting for a
coredump file to be written and parsed.

However, there is currently no stable kernel API for this use case.
Existing tools rely on attaching fentry probes to do_coredump(),
which is an internal function whose signature changes across kernel
versions, breaking these tools.

Add a stable tracepoint that fires at the beginning of
do_coredump(), providing BPF programs a reliable attachment point.
At tracepoint time, the crashing process context is still live, so
BPF programs can call bpf_get_stack() with BPF_F_USER_STACK to
extract the user-space backtrace.

The tracepoint records:
  - sig: signal number that triggered the coredump
  - comm: process name
  - pid: process PID

Example output:

  $ echo 1 > /sys/kernel/tracing/events/coredump/coredump/enable
  $ sleep 999 &
  $ kill -SEGV $!
  $ cat /sys/kernel/tracing/trace
  #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
  #              | |         |   |||||     |         |
             sleep-634     [036] .....   145.222206: coredump: sig=11 comm=sleep pid=634

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
 fs/coredump.c                   |  5 +++++
 include/trace/events/coredump.h | 47 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/fs/coredump.c b/fs/coredump.c
index 29df8aa19e2e7..bb6fdb1f458e9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -63,6 +63,9 @@
 
 #include <trace/events/sched.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/coredump.h>
+
 static bool dump_vma_snapshot(struct coredump_params *cprm);
 static void free_vma_snapshot(struct coredump_params *cprm);
 
@@ -1090,6 +1093,8 @@ static inline bool coredump_skip(const struct coredump_params *cprm,
 static void do_coredump(struct core_name *cn, struct coredump_params *cprm,
 			size_t **argv, int *argc, const struct linux_binfmt *binfmt)
 {
+	trace_coredump(cprm->siginfo->si_signo);
+
 	if (!coredump_parse(cn, cprm, argv, argc)) {
 		coredump_report_failure("format_corename failed, aborting core");
 		return;
diff --git a/include/trace/events/coredump.h b/include/trace/events/coredump.h
new file mode 100644
index 0000000000000..59617eba3dbcf
--- /dev/null
+++ b/include/trace/events/coredump.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Breno Leitao <leitao@debian.org>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM coredump
+
+#if !defined(_TRACE_COREDUMP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_COREDUMP_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+/**
+ * coredump - called when a coredump starts
+ * @sig: signal number that triggered the coredump
+ *
+ * This tracepoint fires at the beginning of a coredump attempt,
+ * providing a stable interface for monitoring coredump events.
+ */
+TRACE_EVENT(coredump,
+
+	TP_PROTO(int sig),
+
+	TP_ARGS(sig),
+
+	TP_STRUCT__entry(
+		__field(int, sig)
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+	),
+
+	TP_fast_assign(
+		__entry->sig = sig;
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+		__entry->pid = current->pid;
+	),
+
+	TP_printk("sig=%d comm=%s pid=%d",
+		  __entry->sig, __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_COREDUMP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

---
base-commit: b5d083a3ed1e2798396d5e491432e887da8d4a06
change-id: 20260320-coredump_tracepoint-4de4399ce1b6

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply related

* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Michał Cłapiński @ 2026-03-20 12:23 UTC (permalink / raw)
  To: Harry Yoo
  Cc: Nathan Chancellor, Mathieu Desnoyers, Thomas Weißschuh,
	Andrew Morton, Thomas Gleixner, Steven Rostedt, Masami Hiramatsu,
	linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <abzKcGiRSR_E8lLN@hyeyoo>

On Fri, Mar 20, 2026 at 5:18 AM Harry Yoo <harry.yoo@oracle.com> wrote:
>
> On Thu, Mar 19, 2026 at 04:37:45PM -0700, Nathan Chancellor wrote:
> > Hi all,
> >
> > I am not really sure whose bug this is, as it only appears when three
> > seemingly independent patch series are applied together, so I have added
> > the patch authors and their committers (along with the tracing
> > maintainers) to this thread. Feel free to expand or reduce that list as
> > necessary.
> >
> > Our continuous integration has noticed a crash when booting
> > ppc64_guest_defconfig in QEMU on the past few -next versions.
> >
> >   https://github.com/ClangBuiltLinux/continuous-integration2/actions/runs/23311154492/job/67811527112
> >
> > This does not appear to be clang related, as it can be reproduced with
> > GCC 15.2.0 as well. Through multiple bisects, I was able to land on
> > applying:
> >
> >   mm: improve RSS counter approximation accuracy for proc interfaces [1]
> >   vdso/datastore: Allocate data pages dynamically [2]
> >   kho: fix deferred init of kho scratch [3]
> >
> > and their dependent changes on top of 7.0-rc4 is enough to reproduce
> > this (at least on two of my machines with the same commands). I have
> > attached the diff from the result of the following 'git apply' commands
> > below, done in a linux-next checkout.
> >
> >   $ git checkout v7.0-rc4
> >   HEAD is now at f338e7738378 Linux 7.0-rc4
> >
> >   # [1]
> >   $ git diff 60ddf3eed4999bae440d1cf9e5868ccb3f308b64^..087dd6d2cc12c82945ab859194c32e8e977daae3 | git apply -3v
> >   ...
> >
> >   # [2]
> >   # Fix trivial conflict in init/main.c around headers
> >   $ git diff dc432ab7130bb39f5a351281a02d4bc61e85a14a^..05988dba11791ccbb458254484826b32f17f4ad2 | git apply -3v
> >   ...
> >
> >   # [3]
> >   # Fix conflict in kernel/liveupdate/kexec_handover.c due to lack of kho_mem_retrieve(), just add pfn_is_kho_scratch()
> >   $ git show 4a78467ffb537463486968232daef1e8a2f105e3 | git apply -3v
> >   ...
> >
> >   $ make -skj"$(nproc)" ARCH=powerpc CROSS_COMPILE=powerpc64-linux- mrproper ppc64_guest_defconfig vmlinux
> >
> >   $ curl -LSs https://github.com/ClangBuiltLinux/boot-utils/releases/download/20241120-044434/ppc64-rootfs.cpio.zst | zstd -d >rootfs.cpio
> >
> >   $ qemu-system-ppc64 \
> >       -display none \
> >       -nodefaults \
> >       -cpu power8 \
> >       -machine pseries \
> >       -vga none \
> >       -kernel vmlinux \
> >       -initrd rootfs.cpio \
> >       -m 1G \
> >       -serial mon:stdio
>
> Thanks, such a detailed steps to reproduce!
> Interestingly, the combination of my compiler (GCC 13.3.0) and
> QEMU (8.2.2) don't trigger this bug.
>
> >   [    0.000000][    T0] Linux version 7.0.0-rc4-dirty (nathan@framework-amd-ryzen-maxplus-395) (powerpc64-linux-gcc (GCC) 15.2.0, GNU ld (GNU Binutils) 2.45) #1 SMP PREEMPT Thu Mar 19 15:45:53 MST 2026
> >   ...
> >   [    0.216764][    T1] vgaarb: loaded
> >   [    0.217590][    T1] clocksource: Switched to clocksource timebase
> >   [    0.221007][   T12] BUG: Kernel NULL pointer dereference at 0x00000010
> >   [    0.221049][   T12] Faulting instruction address: 0xc00000000044947c
> >   [    0.221237][   T12] Oops: Kernel access of bad area, sig: 11 [#1]
> >   [    0.221276][   T12] BE PAGE_SIZE=64K MMU=Hash  SMP NR_CPUS=2048 NUMA pSeries
> >   [    0.221359][   T12] Modules linked in:
> >   [    0.221556][   T12] CPU: 0 UID: 0 PID: 12 Comm: kworker/u4:0 Not tainted 7.0.0-rc4-dirty #1 PREEMPTLAZY
> >   [    0.221631][   T12] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
> >   [    0.221765][   T12] Workqueue: trace_init_wq tracer_init_tracefs_work_func
> >   [    0.222065][   T12] NIP:  c00000000044947c LR: c00000000041a584 CTR: c00000000053aa90
> >   [    0.222084][   T12] REGS: c000000003bc7960 TRAP: 0380   Not tainted  (7.0.0-rc4-dirty)
> >   [    0.222111][   T12] MSR:  8000000000009032 <SF,EE,ME,IR,DR,RI>  CR: 44000204  XER: 00000000
> >   [    0.222287][   T12] CFAR: c000000000449420 IRQMASK: 0
> >   [    0.222287][   T12] GPR00: c00000000041a584 c000000003bc7c00 c000000001c08100 c000000002892f20
> >   [    0.222287][   T12] GPR04: c0000000019cfa68 c0000000019cfa60 0000000000000001 0000000000000064
> >   [    0.222287][   T12] GPR08: 0000000000000002 0000000000000000 c000000003bba000 0000000000000010
> >   [    0.222287][   T12] GPR12: c00000000053aa90 c000000002c50000 c000000001ab25f8 c000000001626690
> >   [    0.222287][   T12] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> >   [    0.222287][   T12] GPR20: c000000001624868 c000000001ab2708 c0000000019cfa08 c000000001a00d18
> >   [    0.222287][   T12] GPR24: c0000000019cfa18 fffffffffffffef7 c000000003051205 c0000000019cfa68
> >   [    0.222287][   T12] GPR28: 0000000000000000 c0000000019cfa60 c000000002894e90 0000000000000000
> >   [    0.222526][   T12] NIP [c00000000044947c] __find_event_file+0x9c/0x110
> >   [    0.222572][   T12] LR [c00000000041a584] init_tracer_tracefs+0x274/0xcc0
> >   [    0.222643][   T12] Call Trace:
> >   [    0.222690][   T12] [c000000003bc7c00] [c000000000b943b0] tracefs_create_file+0x1a0/0x2b0 (unreliable)
> >   [    0.222766][   T12] [c000000003bc7c50] [c00000000041a584] init_tracer_tracefs+0x274/0xcc0
> >   [    0.222791][   T12] [c000000003bc7dc0] [c000000002046f1c] tracer_init_tracefs_work_func+0x50/0x320
> >   [    0.222809][   T12] [c000000003bc7e50] [c000000000276958] process_one_work+0x1b8/0x530
> >   [    0.222828][   T12] [c000000003bc7f10] [c00000000027778c] worker_thread+0x1dc/0x3d0
> >   [    0.222883][   T12] [c000000003bc7f90] [c000000000284c44] kthread+0x194/0x1b0
> >   [    0.222900][   T12] [c000000003bc7fe0] [c00000000000cf30] start_kernel_thread+0x14/0x18
> >   [    0.222961][   T12] Code: 7c691b78 7f63db78 2c090000 40820018 e89c0000 49107f21 60000000 2c030000 41820048 ebff0000 7c3ff040 41820038 <e93f0010> 7fa3eb78 81490058 e8890018
> >   [    0.223190][   T12] ---[ end trace 0000000000000000 ]---
> >   ...
> >
> > Interestingly, turning on CONFIG_KASAN appears to hide this, maybe
> > pointing to some sort of memory corruption (or something timing
> > related)? If there is any other information I can provide, I am more
> > than happy to do so.
>
> I don't have much idea on how things end up causing
> NULL-pointer-deref... but let's point out suspicious things.
>
> > [1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/
>
> @Mathieu: In patch 1/3 description,
> > Changes since v7:
> > - Explicitly initialize the subsystem from start_kernel() right
> >   after mm_core_init() so it is up and running before the creation of
> >   the first mm at boot.
>
> But how does this work when someone calls mm_cpumask() on init_mm early?
> Looks like it will behave incorrectly because get_rss_stat_items_size()
> returns zero?
>
> While it doesn't crash on my environment, it triggers a two warnings
> (with -smp 2 option added). IIUC the cpu bit should have been set in
> setup_arch(), but at the wrong location. After the
> percpu_counter_tree_subsystem_init() function is called, the bit doesn't
> appear to be set.
>
> [    1.392787][    T1] ------------[ cut here ]------------
> [    1.392935][    T1] WARNING: arch/powerpc/mm/mmu_context.c:106 at switch_mm_irqs_off+0x190/0x1c0, CPU#0: swapper/0/1
> [    1.393187][    T1] Modules linked in:
> [    1.393458][    T1] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-rc4-next-20260319 #1 PREEMPTLAZY
> [    1.393600][    T1] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
> [    1.393711][    T1] NIP:  c00000000014e390 LR: c00000000014e30c CTR: 0000000000000000
> [    1.393752][    T1] REGS: c000000003def7b0 TRAP: 0700   Not tainted  (7.0.0-rc4-next-20260319)
> [    1.393807][    T1] MSR:  8000000002021032 <SF,VEC,ME,IR,DR,RI>  CR: 2800284a  XER: 00000000
> [    1.393944][    T1] CFAR: c00000000014e328 IRQMASK: 3
> [    1.393944][    T1] GPR00: c00000000014e36c c000000003defa50 c000000001bb8100 c0000000028d8c80
> [    1.393944][    T1] GPR04: c000000004ddc04a 000000000000000a 0000000022222222 2222222222222222
> [    1.393944][    T1] GPR08: 2222222222222222 0000000000000000 0000000000000001 0000000000008000
> [    1.393944][    T1] GPR12: c000000000521e80 c000000002c70000 c00000000000fff0 0000000000000000
> [    1.393944][    T1] GPR16: 0000000000000000 c00000000606c600 c000000003623ac0 0000000000000000
> [    1.393944][    T1] GPR20: c000000004c66300 c00000000606fc00 0000000000000001 0000000000000001
> [    1.393944][    T1] GPR24: c000000006069c00 c00000000272c500 0000000000000000 0000000000000000
> [    1.393944][    T1] GPR28: c000000003d68200 0000000000000000 c0000000028d8a80 c00000000272bd00
> [    1.394355][    T1] NIP [c00000000014e390] switch_mm_irqs_off+0x190/0x1c0
> [    1.394395][    T1] LR [c00000000014e30c] switch_mm_irqs_off+0x10c/0x1c0
> [    1.394519][    T1] Call Trace:
> [    1.394584][    T1] [c000000003defa50] [c00000000014e36c] switch_mm_irqs_off+0x16c/0x1c0 (unreliable)
> [    1.394676][    T1] [c000000003defab0] [c0000000006edbf0] begin_new_exec+0x534/0xf60
> [    1.394732][    T1] [c000000003defb20] [c000000000795538] load_elf_binary+0x494/0x1d1c
> [    1.394765][    T1] [c000000003defc70] [c0000000006eb910] bprm_execve+0x380/0x720
> [    1.394796][    T1] [c000000003defd00] [c0000000006ed5a8] kernel_execve+0x12c/0x1bc
> [    1.394831][    T1] [c000000003defd50] [c00000000000eda8] run_init_process+0xf8/0x160
> [    1.394864][    T1] [c000000003defde0] [c0000000000100b4] kernel_init+0xcc/0x268
> [    1.394899][    T1] [c000000003defe50] [c00000000000cf14] ret_from_kernel_user_thread+0x14/0x1c
> [    1.394946][    T1] ---- interrupt: 0 at 0x0
> [    1.395205][    T1] Code: 7fe4fb78 7f83e378 48009171 60000000 4bffff98 60000000 60000000 60000000 0fe00000 4bffff00 60000000 60000000 <0fe00000> 4bffff98 60000000 60000000
> [    1.395420][    T1] ---[ end trace 0000000000000000 ]---
> [    1.526024][   T67] mount (67) used greatest stack depth: 28432 bytes left
> [    1.605803][   T69] mount (69) used greatest stack depth: 27872 bytes left
> [    1.667853][   T71] mkdir (71) used greatest stack depth: 27248 bytes left
> Saving 256 bits of creditable seed for next boot
> [    1.926636][   T80] ------------[ cut here ]------------
> [    1.926719][   T80] WARNING: arch/powerpc/mm/mmu_context.c:51 at switch_mm_irqs_off+0x180/0x1c0, CPU#0: S01seedrng/80
> [    1.926782][   T80] Modules linked in:
> [    1.926910][   T80] CPU: 0 UID: 0 PID: 80 Comm: S01seedrng Tainted: G        W           7.0.0-rc4-next-20260319 #1 PREEMPTLAZY
> [    1.926990][   T80] Tainted: [W]=WARN
> [    1.927025][   T80] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
> [    1.927091][   T80] NIP:  c00000000014e380 LR: c00000000014e24c CTR: c000000000232894
> [    1.927131][   T80] REGS: c000000004d5f800 TRAP: 0700   Tainted: G        W            (7.0.0-rc4-next-20260319)
> [    1.927179][   T80] MSR:  8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 28002828  XER: 20000000
> [    1.927253][   T80] CFAR: c00000000014e280 IRQMASK: 1
> [    1.927253][   T80] GPR00: c0000000002328ec c000000004d5faa0 c000000001bb8100 0000000000000080
> [    1.927253][   T80] GPR04: c0000000028d8280 c000000004509c00 0000000000000002 c00000000272c700
> [    1.927253][   T80] GPR08: fffffffffffffffe c0000000028d8280 0000000000000000 0000000048002828
> [    1.927253][   T80] GPR12: c000000000232894 c000000002c70000 0000000000000000 0000000000000002
> [    1.927253][   T80] GPR16: 0000000000000000 000001002f0a2958 000001002f0a2950 ffffffffffffffff
> [    1.927253][   T80] GPR20: 0000000000000000 0000000000000000 c000000002ab1400 c00000000272c700
> [    1.927253][   T80] GPR24: 0000000000000000 c0000000028d8a80 0000000000000000 0000000000000000
> [    1.927253][   T80] GPR28: c000000004509c00 0000000000000000 c00000000272bd00 c0000000028d8280
> [    1.927629][   T80] NIP [c00000000014e380] switch_mm_irqs_off+0x180/0x1c0
> [    1.927678][   T80] LR [c00000000014e24c] switch_mm_irqs_off+0x4c/0x1c0
> [    1.927715][   T80] Call Trace:
> [    1.927737][   T80] [c000000004d5faa0] [c000000004d5faf0] 0xc000000004d5faf0 (unreliable)
> [    1.927804][   T80] [c000000004d5fb00] [c0000000002328ec] do_shoot_lazy_tlb+0x58/0x84
> [    1.927853][   T80] [c000000004d5fb30] [c000000000388304] smp_call_function_many_cond+0x6a0/0x8d8
> [    1.927902][   T80] [c000000004d5fc20] [c000000000388624] on_each_cpu_cond_mask+0x40/0x7c
> [    1.927943][   T80] [c000000004d5fc50] [c000000000232ad4] __mmdrop+0x88/0x2ec
> [    1.927986][   T80] [c000000004d5fce0] [c000000000242104] do_exit+0x350/0xde4
> [    1.928028][   T80] [c000000004d5fdb0] [c000000000242de0] do_group_exit+0x48/0xbc
> [    1.928072][   T80] [c000000004d5fdf0] [c000000000242e74] pid_child_should_wake+0x0/0x84
> [    1.928128][   T80] [c000000004d5fe10] [c000000000030218] system_call_exception+0x148/0x3c0
> [    1.928176][   T80] [c000000004d5fe50] [c00000000000c6d4] system_call_common+0xf4/0x258
> [    1.928217][   T80] ---- interrupt: c00 at 0x7fff8ade507c
> [    1.928253][   T80] NIP:  00007fff8ade507c LR: 00007fff8ade5034 CTR: 0000000000000000
> [    1.928291][   T80] REGS: c000000004d5fe80 TRAP: 0c00   Tainted: G        W            (7.0.0-rc4-next-20260319)
> [    1.928333][   T80] MSR:  800000000280f032 <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI>  CR: 24002824  XER: 00000000
> [    1.928413][   T80] IRQMASK: 0
> [    1.928413][   T80] GPR00: 00000000000000ea 00007fffe75beb50 00007fff8aed7300 0000000000000000
> [    1.928413][   T80] GPR04: 0000000000000000 00007fffe75beda0 00007fffe75bedb0 0000000000000000
> [    1.928413][   T80] GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [    1.928413][   T80] GPR12: 0000000000000000 00007fff8afaae00 00007fffca692568 0000000133cf0440
> [    1.928413][   T80] GPR16: 0000000000000000 000001002f0a2958 000001002f0a2950 ffffffffffffffff
> [    1.928413][   T80] GPR20: 0000000000000000 0000000000000000 00007fffe75bf838 00007fff8afa0000
> [    1.928413][   T80] GPR24: 0000000126911328 0000000000000001 00007fff8af9dc00 00007fffe75bf818
> [    1.928413][   T80] GPR28: 0000000000000003 fffffffffffff000 0000000000000000 00007fff8afa3e10
> [    1.928765][   T80] NIP [00007fff8ade507c] 0x7fff8ade507c
> [    1.928795][   T80] LR [00007fff8ade5034] 0x7fff8ade5034
> [    1.928835][   T80] ---- interrupt: c00
> [    1.928924][   T80] Code: 7c0803a6 4e800020 60000000 60000000 7fe4fb78 7f83e378 48009171 60000000 4bffff98 60000000 60000000 60000000 <0fe00000> 4bffff00 60000000 60000000
> [    1.929054][   T80] ---[ end trace 0000000000000000 ]---
>
> > [2]: https://lore.kernel.org/20260304-vdso-sparc64-generic-2-v6-3-d8eb3b0e1410@linutronix.de/
>
> > [3]: https://lore.kernel.org/20260311125539.4123672-2-mclapinski@google.com/
>
> @Michal: Something my AI buddy pointed out... (that I think is valid):
>
> > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > index df34797691bd..7363b5b0d22a 100644
> > --- a/mm/mm_init.c
> > +++ b/mm/mm_init.c
> > @@ -2078,9 +2082,11 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> >                       unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
> >                       unsigned long chunk_end = min(mo_pfn, epfn);
> >
> > -                     nr_pages += deferred_init_pages(zone, spfn, chunk_end);
>
> Previously, deferred_init_pages() returned nr of pages to add, which is
> (end_pfn (= chunk_end) - spfn).
>
> > -                     deferred_free_pages(spfn, chunk_end - spfn);
> > +                     // KHO scratch is MAX_ORDER_NR_PAGES aligned.
> > +                     if (!pfn_is_kho_scratch(spfn))
> > +                             deferred_init_pages(zone, spfn, chunk_end);
>
> But since the function is not always called with the change,
> the calculation is moved to...
>
> > +                     deferred_free_pages(spfn, chunk_end - spfn);
> >                       spfn = chunk_end;
> >
> >                       if (can_resched)
> > @@ -2088,6 +2094,7 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> >                       else
> >                               touch_nmi_watchdog();
> >               }
> > +             nr_pages += epfn - spfn;
>
> Here.
>
> But this is incorrect, because here we have:
> > static unsigned long __init
> > deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> >                            struct zone *zone, bool can_resched)
> > {
> >         int nid = zone_to_nid(zone);
> >         unsigned long nr_pages = 0;
> >         phys_addr_t start, end;
> >         u64 i = 0;
> >
> >         for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
> >                 unsigned long spfn = PFN_UP(start);
> >                 unsigned long epfn = PFN_DOWN(end);
> >
> >                 if (spfn >= end_pfn)
> >                         break;
> >
> >                 spfn = max(spfn, start_pfn);
> >                 epfn = min(epfn, end_pfn);
> >
> >                 while (spfn < epfn) {
>
> The loop condition is (spfn < epfn), and by the time the loop terminates...
>
> >                         unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
> >                         unsigned long chunk_end = min(mo_pfn, epfn);
> >
> >                         // KHO scratch is MAX_ORDER_NR_PAGES aligned.
> >                         if (!pfn_is_kho_scratch(spfn))
> >                                 deferred_init_pages(zone, spfn, chunk_end);
> >
> >                         deferred_free_pages(spfn, chunk_end - spfn);
> >                         spfn = chunk_end;
> >
> >                         if (can_resched)
> >                                 cond_resched();
> >                         else
> >                                 touch_nmi_watchdog();
> >                 }
> >                 nr_pages += epfn - spfn;
>
> epfn - spfn <= 0.
>
> So the number of pages returned by deferred_init_memmap_chunk() becomes
> incorrect.
>
> The equivalent translation of what's there before would be doing
> `nr_pages += chunk_end - spfn;` within the loop.

Good point, thank you. This patch has already been removed from mm-new.

> --
> Cheers,
> Harry / Hyeonggon

^ permalink raw reply

* Re: [PATCH v3 0/8] RDMA: Enable operation with DMA debug enabled
From: Marek Szyprowski @ 2026-03-20 11:09 UTC (permalink / raw)
  To: Leon Romanovsky, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel, Will Deacon,
	Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260316-dma-debug-overlap-v3-0-1dde90a7f08b@nvidia.com>

On 16.03.2026 20:06, Leon Romanovsky wrote:
> Add a new DMA_ATTR_REQUIRE_COHERENT attribute to the DMA API to mark
> mappings that must run on a DMA‑coherent system. Such buffers cannot
> use the SWIOTLB path, may overlap with CPU caches, and do not depend on
> explicit cache flushing.
>
> Mappings using this attribute are rejected on systems where cache
> side‑effects could lead to data corruption, and therefore do not need
> the cache‑overlap debugging logic. This series also includes fixes for
> DMA_ATTR_CPU_CACHE_CLEAN handling.
> Thanks.
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

Applied to dma-mapping-fixes. Thanks!

> ---
> Changes in v3:
> - Enriched commit messages and documentation
> - Added ROB tags
> - Link to v2: https://protect2.fireeye.com/v1/url?k=9c1ba148-fd90b40f-9c1a2a07-000babff99aa-86ebd022a97425b3&q=1&e=3c8e10cc-4c34-4bf6-aa9d-c339877d6a27&u=https%3A%2F%2Fpatch.msgid.link%2F20260311-dma-debug-overlap-v2-0-e00bc2ca346d%40nvidia.com
>
> Changes in v2:
> - Added DMA_ATTR_REQUIRE_COHERENT attribute
> - Added HMM patch which needs this attribute as well
> - Renamed DMA_ATTR_CPU_CACHE_CLEAN to be DMA_ATTR_DEBUGGING_IGNORE_CACHELINES
> - Link to v1: https://protect2.fireeye.com/v1/url?k=cc0590de-ad8e8599-cc041b91-000babff99aa-07e4da206b7e0d97&q=1&e=3c8e10cc-4c34-4bf6-aa9d-c339877d6a27&u=https%3A%2F%2Fpatch.msgid.link%2F20260307-dma-debug-overlap-v1-0-c034c38872af%40nvidia.com
>
> ---
> Leon Romanovsky (8):
>        dma-debug: Allow multiple invocations of overlapping entries
>        dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
>        dma-mapping: Clarify valid conditions for CPU cache line overlap
>        dma-mapping: Introduce DMA require coherency attribute
>        dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
>        iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
>        RDMA/umem: Tell DMA mapping that UMEM requires coherency
>        mm/hmm: Indicate that HMM requires DMA coherency
>
>   Documentation/core-api/dma-attributes.rst | 38 ++++++++++++++++++++++++-------
>   drivers/infiniband/core/umem.c            |  5 ++--
>   drivers/iommu/dma-iommu.c                 | 21 +++++++++++++----
>   drivers/virtio/virtio_ring.c              | 10 ++++----
>   include/linux/dma-mapping.h               | 15 ++++++++----
>   include/trace/events/dma.h                |  4 +++-
>   kernel/dma/debug.c                        |  9 ++++----
>   kernel/dma/direct.h                       |  7 +++---
>   kernel/dma/mapping.c                      |  6 +++++
>   mm/hmm.c                                  |  4 ++--
>   10 files changed, 86 insertions(+), 33 deletions(-)
> ---
> base-commit: 11439c4635edd669ae435eec308f4ab8a0804808
> change-id: 20260305-dma-debug-overlap-21487c3fa02c
>
> Best regards,
> --
> Leon Romanovsky <leonro@nvidia.com>
>
>
Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply

* Re: [PATCH v3 0/8] RDMA: Enable operation with DMA debug enabled
From: Marek Szyprowski @ 2026-03-20 11:08 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Robin Murphy, Michael S. Tsirkin, Petr Tesarik, Jonathan Corbet,
	Shuah Khan, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Jason Gunthorpe, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Joerg Roedel, Will Deacon, Andrew Morton,
	iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260318081858.GE61385@unreal>

Hi Leon,

On 18.03.2026 09:18, Leon Romanovsky wrote:
> On Wed, Mar 18, 2026 at 09:03:00AM +0100, Marek Szyprowski wrote:
>> On 17.03.2026 20:05, Leon Romanovsky wrote:
>>> On Mon, Mar 16, 2026 at 09:06:44PM +0200, Leon Romanovsky wrote:
>>>> Add a new DMA_ATTR_REQUIRE_COHERENT attribute to the DMA API to mark
>>>> mappings that must run on a DMA‑coherent system. Such buffers cannot
>>>> use the SWIOTLB path, may overlap with CPU caches, and do not depend on
>>>> explicit cache flushing.
>>>>
>>>> Mappings using this attribute are rejected on systems where cache
>>>> side‑effects could lead to data corruption, and therefore do not need
>>>> the cache‑overlap debugging logic. This series also includes fixes for
>>>> DMA_ATTR_CPU_CACHE_CLEAN handling.
>>>> Thanks.
>>> <...>
>>>
>>>> ---
>>>> Leon Romanovsky (8):
>>>>         dma-debug: Allow multiple invocations of overlapping entries
>>>>         dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
>>>>         dma-mapping: Clarify valid conditions for CPU cache line overlap
>>>>         dma-mapping: Introduce DMA require coherency attribute
>>>>         dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
>>>>         iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
>>>>         RDMA/umem: Tell DMA mapping that UMEM requires coherency
>>>>         mm/hmm: Indicate that HMM requires DMA coherency
>>>>
>>>>    Documentation/core-api/dma-attributes.rst | 38 ++++++++++++++++++++++++-------
>>>>    drivers/infiniband/core/umem.c            |  5 ++--
>>>>    drivers/iommu/dma-iommu.c                 | 21 +++++++++++++----
>>>>    drivers/virtio/virtio_ring.c              | 10 ++++----
>>>>    include/linux/dma-mapping.h               | 15 ++++++++----
>>>>    include/trace/events/dma.h                |  4 +++-
>>>>    kernel/dma/debug.c                        |  9 ++++----
>>>>    kernel/dma/direct.h                       |  7 +++---
>>>>    kernel/dma/mapping.c                      |  6 +++++
>>>>    mm/hmm.c                                  |  4 ++--
>>>>    10 files changed, 86 insertions(+), 33 deletions(-)
>>> Marek,
>>>
>>> Despite the "RDMA ..." tag in the subject, the diffstat clearly shows that
>>> you are the appropriate person to take this patch.
>> I plan to take the first 2 patches to the dma-mapping-fixes branch
>> (v7.0-rc) and the next to dma-mapping-for-next. Should I also take the
>> RDMA and HMM patches, or do You want a stable branch for merging them
>> via respective subsystem trees?
> I suggest taking all patches into the -fixes branch, as the "RDMA/..." patch
> also resolves the dmesg splat. With -fixes, there is no need to worry about
> a shared branch since we do not expect merge conflicts in that area.
>
> If you still prefer to split the series between -fixes and -next, it would be
> better to use a shared branch in that case. There are patches on the RDMA
> list targeted for -next that touch ib_umem_get().

Okay, I will merge all patches to the -fixes branch then.

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply

* Re: [PATCHv3 bpf-next 08/24] bpf: Add bpf_trampoline_multi_attach/detach functions
From: kernel test robot @ 2026-03-20 10:18 UTC (permalink / raw)
  To: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: llvm, oe-kbuild-all, bpf, linux-trace-kernel, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, Menglong Dong,
	Steven Rostedt
In-Reply-To: <20260316075138.465430-9-jolsa@kernel.org>

Hi Jiri,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Jiri-Olsa/ftrace-Add-ftrace_hash_count-function/20260316-160117
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20260316075138.465430-9-jolsa%40kernel.org
patch subject: [PATCHv3 bpf-next 08/24] bpf: Add bpf_trampoline_multi_attach/detach functions
config: x86_64-randconfig-075-20260320 (https://download.01.org/0day-ci/archive/20260320/202603201820.zsM5FRDS-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
rustc: rustc 1.88.0 (6b00bc388 2025-06-23)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260320/202603201820.zsM5FRDS-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603201820.zsM5FRDS-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/bpf/trampoline.c:1520:8: error: call to undeclared function 'btf_distill_func_proto'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1520 |         err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel);
         |               ^
   1 error generated.


vim +/btf_distill_func_proto +1520 kernel/bpf/trampoline.c

  1498	
  1499	static int bpf_get_btf_id_target(struct btf *btf, struct bpf_prog *prog, u32 btf_id,
  1500					 struct bpf_attach_target_info *tgt_info)
  1501	{
  1502		const struct btf_type *t;
  1503		unsigned long addr;
  1504		const char *tname;
  1505		int err;
  1506	
  1507		if (!btf_id || !btf)
  1508			return -EINVAL;
  1509		t = btf_type_by_id(btf, btf_id);
  1510		if (!t)
  1511			return -EINVAL;
  1512		tname = btf_name_by_offset(btf, t->name_off);
  1513		if (!tname)
  1514			return -EINVAL;
  1515		if (!btf_type_is_func(t))
  1516			return -EINVAL;
  1517		t = btf_type_by_id(btf, t->type);
  1518		if (!btf_type_is_func_proto(t))
  1519			return -EINVAL;
> 1520		err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel);
  1521		if (err < 0)
  1522			return err;
  1523		if (btf_is_module(btf)) {
  1524			/* The bpf program already holds refference to module. */
  1525			if (WARN_ON_ONCE(!prog->aux->mod))
  1526				return -EINVAL;
  1527			addr = find_kallsyms_symbol_value(prog->aux->mod, tname);
  1528		} else {
  1529			addr = kallsyms_lookup_name(tname);
  1530		}
  1531		if (!addr || !ftrace_location(addr))
  1532			return -ENOENT;
  1533		tgt_info->tgt_addr = addr;
  1534		return 0;
  1535	}
  1536	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Harry Yoo @ 2026-03-20  4:17 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: Mathieu Desnoyers, Thomas Weißschuh, Michal Clapinski,
	Andrew Morton, Thomas Gleixner, Steven Rostedt, Masami Hiramatsu,
	linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <20260319233745.GA769346@ax162>

On Thu, Mar 19, 2026 at 04:37:45PM -0700, Nathan Chancellor wrote:
> Hi all,
> 
> I am not really sure whose bug this is, as it only appears when three
> seemingly independent patch series are applied together, so I have added
> the patch authors and their committers (along with the tracing
> maintainers) to this thread. Feel free to expand or reduce that list as
> necessary.
> 
> Our continuous integration has noticed a crash when booting
> ppc64_guest_defconfig in QEMU on the past few -next versions.
> 
>   https://github.com/ClangBuiltLinux/continuous-integration2/actions/runs/23311154492/job/67811527112
> 
> This does not appear to be clang related, as it can be reproduced with
> GCC 15.2.0 as well. Through multiple bisects, I was able to land on
> applying:
> 
>   mm: improve RSS counter approximation accuracy for proc interfaces [1]
>   vdso/datastore: Allocate data pages dynamically [2]
>   kho: fix deferred init of kho scratch [3]
> 
> and their dependent changes on top of 7.0-rc4 is enough to reproduce
> this (at least on two of my machines with the same commands). I have
> attached the diff from the result of the following 'git apply' commands
> below, done in a linux-next checkout.
> 
>   $ git checkout v7.0-rc4
>   HEAD is now at f338e7738378 Linux 7.0-rc4
> 
>   # [1]
>   $ git diff 60ddf3eed4999bae440d1cf9e5868ccb3f308b64^..087dd6d2cc12c82945ab859194c32e8e977daae3 | git apply -3v
>   ...
> 
>   # [2]
>   # Fix trivial conflict in init/main.c around headers
>   $ git diff dc432ab7130bb39f5a351281a02d4bc61e85a14a^..05988dba11791ccbb458254484826b32f17f4ad2 | git apply -3v
>   ...
> 
>   # [3]
>   # Fix conflict in kernel/liveupdate/kexec_handover.c due to lack of kho_mem_retrieve(), just add pfn_is_kho_scratch()
>   $ git show 4a78467ffb537463486968232daef1e8a2f105e3 | git apply -3v
>   ...
> 
>   $ make -skj"$(nproc)" ARCH=powerpc CROSS_COMPILE=powerpc64-linux- mrproper ppc64_guest_defconfig vmlinux
> 
>   $ curl -LSs https://github.com/ClangBuiltLinux/boot-utils/releases/download/20241120-044434/ppc64-rootfs.cpio.zst | zstd -d >rootfs.cpio
> 
>   $ qemu-system-ppc64 \
>       -display none \
>       -nodefaults \
>       -cpu power8 \
>       -machine pseries \
>       -vga none \
>       -kernel vmlinux \
>       -initrd rootfs.cpio \
>       -m 1G \
>       -serial mon:stdio

Thanks, such a detailed steps to reproduce!
Interestingly, the combination of my compiler (GCC 13.3.0) and
QEMU (8.2.2) don't trigger this bug.

>   [    0.000000][    T0] Linux version 7.0.0-rc4-dirty (nathan@framework-amd-ryzen-maxplus-395) (powerpc64-linux-gcc (GCC) 15.2.0, GNU ld (GNU Binutils) 2.45) #1 SMP PREEMPT Thu Mar 19 15:45:53 MST 2026
>   ...
>   [    0.216764][    T1] vgaarb: loaded
>   [    0.217590][    T1] clocksource: Switched to clocksource timebase
>   [    0.221007][   T12] BUG: Kernel NULL pointer dereference at 0x00000010
>   [    0.221049][   T12] Faulting instruction address: 0xc00000000044947c
>   [    0.221237][   T12] Oops: Kernel access of bad area, sig: 11 [#1]
>   [    0.221276][   T12] BE PAGE_SIZE=64K MMU=Hash  SMP NR_CPUS=2048 NUMA pSeries
>   [    0.221359][   T12] Modules linked in:
>   [    0.221556][   T12] CPU: 0 UID: 0 PID: 12 Comm: kworker/u4:0 Not tainted 7.0.0-rc4-dirty #1 PREEMPTLAZY
>   [    0.221631][   T12] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
>   [    0.221765][   T12] Workqueue: trace_init_wq tracer_init_tracefs_work_func
>   [    0.222065][   T12] NIP:  c00000000044947c LR: c00000000041a584 CTR: c00000000053aa90
>   [    0.222084][   T12] REGS: c000000003bc7960 TRAP: 0380   Not tainted  (7.0.0-rc4-dirty)
>   [    0.222111][   T12] MSR:  8000000000009032 <SF,EE,ME,IR,DR,RI>  CR: 44000204  XER: 00000000
>   [    0.222287][   T12] CFAR: c000000000449420 IRQMASK: 0
>   [    0.222287][   T12] GPR00: c00000000041a584 c000000003bc7c00 c000000001c08100 c000000002892f20
>   [    0.222287][   T12] GPR04: c0000000019cfa68 c0000000019cfa60 0000000000000001 0000000000000064
>   [    0.222287][   T12] GPR08: 0000000000000002 0000000000000000 c000000003bba000 0000000000000010
>   [    0.222287][   T12] GPR12: c00000000053aa90 c000000002c50000 c000000001ab25f8 c000000001626690
>   [    0.222287][   T12] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>   [    0.222287][   T12] GPR20: c000000001624868 c000000001ab2708 c0000000019cfa08 c000000001a00d18
>   [    0.222287][   T12] GPR24: c0000000019cfa18 fffffffffffffef7 c000000003051205 c0000000019cfa68
>   [    0.222287][   T12] GPR28: 0000000000000000 c0000000019cfa60 c000000002894e90 0000000000000000
>   [    0.222526][   T12] NIP [c00000000044947c] __find_event_file+0x9c/0x110
>   [    0.222572][   T12] LR [c00000000041a584] init_tracer_tracefs+0x274/0xcc0
>   [    0.222643][   T12] Call Trace:
>   [    0.222690][   T12] [c000000003bc7c00] [c000000000b943b0] tracefs_create_file+0x1a0/0x2b0 (unreliable)
>   [    0.222766][   T12] [c000000003bc7c50] [c00000000041a584] init_tracer_tracefs+0x274/0xcc0
>   [    0.222791][   T12] [c000000003bc7dc0] [c000000002046f1c] tracer_init_tracefs_work_func+0x50/0x320
>   [    0.222809][   T12] [c000000003bc7e50] [c000000000276958] process_one_work+0x1b8/0x530
>   [    0.222828][   T12] [c000000003bc7f10] [c00000000027778c] worker_thread+0x1dc/0x3d0
>   [    0.222883][   T12] [c000000003bc7f90] [c000000000284c44] kthread+0x194/0x1b0
>   [    0.222900][   T12] [c000000003bc7fe0] [c00000000000cf30] start_kernel_thread+0x14/0x18
>   [    0.222961][   T12] Code: 7c691b78 7f63db78 2c090000 40820018 e89c0000 49107f21 60000000 2c030000 41820048 ebff0000 7c3ff040 41820038 <e93f0010> 7fa3eb78 81490058 e8890018
>   [    0.223190][   T12] ---[ end trace 0000000000000000 ]---
>   ...
>
> Interestingly, turning on CONFIG_KASAN appears to hide this, maybe
> pointing to some sort of memory corruption (or something timing
> related)? If there is any other information I can provide, I am more
> than happy to do so.

I don't have much idea on how things end up causing
NULL-pointer-deref... but let's point out suspicious things.

> [1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/

@Mathieu: In patch 1/3 description,
> Changes since v7:
> - Explicitly initialize the subsystem from start_kernel() right
>   after mm_core_init() so it is up and running before the creation of
>   the first mm at boot.

But how does this work when someone calls mm_cpumask() on init_mm early?
Looks like it will behave incorrectly because get_rss_stat_items_size()
returns zero?

While it doesn't crash on my environment, it triggers a two warnings
(with -smp 2 option added). IIUC the cpu bit should have been set in
setup_arch(), but at the wrong location. After the
percpu_counter_tree_subsystem_init() function is called, the bit doesn't
appear to be set.

[    1.392787][    T1] ------------[ cut here ]------------
[    1.392935][    T1] WARNING: arch/powerpc/mm/mmu_context.c:106 at switch_mm_irqs_off+0x190/0x1c0, CPU#0: swapper/0/1
[    1.393187][    T1] Modules linked in:
[    1.393458][    T1] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-rc4-next-20260319 #1 PREEMPTLAZY
[    1.393600][    T1] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
[    1.393711][    T1] NIP:  c00000000014e390 LR: c00000000014e30c CTR: 0000000000000000
[    1.393752][    T1] REGS: c000000003def7b0 TRAP: 0700   Not tainted  (7.0.0-rc4-next-20260319)
[    1.393807][    T1] MSR:  8000000002021032 <SF,VEC,ME,IR,DR,RI>  CR: 2800284a  XER: 00000000
[    1.393944][    T1] CFAR: c00000000014e328 IRQMASK: 3
[    1.393944][    T1] GPR00: c00000000014e36c c000000003defa50 c000000001bb8100 c0000000028d8c80
[    1.393944][    T1] GPR04: c000000004ddc04a 000000000000000a 0000000022222222 2222222222222222
[    1.393944][    T1] GPR08: 2222222222222222 0000000000000000 0000000000000001 0000000000008000
[    1.393944][    T1] GPR12: c000000000521e80 c000000002c70000 c00000000000fff0 0000000000000000
[    1.393944][    T1] GPR16: 0000000000000000 c00000000606c600 c000000003623ac0 0000000000000000
[    1.393944][    T1] GPR20: c000000004c66300 c00000000606fc00 0000000000000001 0000000000000001
[    1.393944][    T1] GPR24: c000000006069c00 c00000000272c500 0000000000000000 0000000000000000
[    1.393944][    T1] GPR28: c000000003d68200 0000000000000000 c0000000028d8a80 c00000000272bd00
[    1.394355][    T1] NIP [c00000000014e390] switch_mm_irqs_off+0x190/0x1c0
[    1.394395][    T1] LR [c00000000014e30c] switch_mm_irqs_off+0x10c/0x1c0
[    1.394519][    T1] Call Trace:
[    1.394584][    T1] [c000000003defa50] [c00000000014e36c] switch_mm_irqs_off+0x16c/0x1c0 (unreliable)
[    1.394676][    T1] [c000000003defab0] [c0000000006edbf0] begin_new_exec+0x534/0xf60
[    1.394732][    T1] [c000000003defb20] [c000000000795538] load_elf_binary+0x494/0x1d1c
[    1.394765][    T1] [c000000003defc70] [c0000000006eb910] bprm_execve+0x380/0x720
[    1.394796][    T1] [c000000003defd00] [c0000000006ed5a8] kernel_execve+0x12c/0x1bc
[    1.394831][    T1] [c000000003defd50] [c00000000000eda8] run_init_process+0xf8/0x160
[    1.394864][    T1] [c000000003defde0] [c0000000000100b4] kernel_init+0xcc/0x268
[    1.394899][    T1] [c000000003defe50] [c00000000000cf14] ret_from_kernel_user_thread+0x14/0x1c
[    1.394946][    T1] ---- interrupt: 0 at 0x0
[    1.395205][    T1] Code: 7fe4fb78 7f83e378 48009171 60000000 4bffff98 60000000 60000000 60000000 0fe00000 4bffff00 60000000 60000000 <0fe00000> 4bffff98 60000000 60000000
[    1.395420][    T1] ---[ end trace 0000000000000000 ]---
[    1.526024][   T67] mount (67) used greatest stack depth: 28432 bytes left
[    1.605803][   T69] mount (69) used greatest stack depth: 27872 bytes left
[    1.667853][   T71] mkdir (71) used greatest stack depth: 27248 bytes left
Saving 256 bits of creditable seed for next boot
[    1.926636][   T80] ------------[ cut here ]------------
[    1.926719][   T80] WARNING: arch/powerpc/mm/mmu_context.c:51 at switch_mm_irqs_off+0x180/0x1c0, CPU#0: S01seedrng/80
[    1.926782][   T80] Modules linked in:
[    1.926910][   T80] CPU: 0 UID: 0 PID: 80 Comm: S01seedrng Tainted: G        W           7.0.0-rc4-next-20260319 #1 PREEMPTLAZY
[    1.926990][   T80] Tainted: [W]=WARN
[    1.927025][   T80] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
[    1.927091][   T80] NIP:  c00000000014e380 LR: c00000000014e24c CTR: c000000000232894
[    1.927131][   T80] REGS: c000000004d5f800 TRAP: 0700   Tainted: G        W            (7.0.0-rc4-next-20260319)
[    1.927179][   T80] MSR:  8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 28002828  XER: 20000000
[    1.927253][   T80] CFAR: c00000000014e280 IRQMASK: 1
[    1.927253][   T80] GPR00: c0000000002328ec c000000004d5faa0 c000000001bb8100 0000000000000080
[    1.927253][   T80] GPR04: c0000000028d8280 c000000004509c00 0000000000000002 c00000000272c700
[    1.927253][   T80] GPR08: fffffffffffffffe c0000000028d8280 0000000000000000 0000000048002828
[    1.927253][   T80] GPR12: c000000000232894 c000000002c70000 0000000000000000 0000000000000002
[    1.927253][   T80] GPR16: 0000000000000000 000001002f0a2958 000001002f0a2950 ffffffffffffffff
[    1.927253][   T80] GPR20: 0000000000000000 0000000000000000 c000000002ab1400 c00000000272c700
[    1.927253][   T80] GPR24: 0000000000000000 c0000000028d8a80 0000000000000000 0000000000000000
[    1.927253][   T80] GPR28: c000000004509c00 0000000000000000 c00000000272bd00 c0000000028d8280
[    1.927629][   T80] NIP [c00000000014e380] switch_mm_irqs_off+0x180/0x1c0
[    1.927678][   T80] LR [c00000000014e24c] switch_mm_irqs_off+0x4c/0x1c0
[    1.927715][   T80] Call Trace:
[    1.927737][   T80] [c000000004d5faa0] [c000000004d5faf0] 0xc000000004d5faf0 (unreliable)
[    1.927804][   T80] [c000000004d5fb00] [c0000000002328ec] do_shoot_lazy_tlb+0x58/0x84
[    1.927853][   T80] [c000000004d5fb30] [c000000000388304] smp_call_function_many_cond+0x6a0/0x8d8
[    1.927902][   T80] [c000000004d5fc20] [c000000000388624] on_each_cpu_cond_mask+0x40/0x7c
[    1.927943][   T80] [c000000004d5fc50] [c000000000232ad4] __mmdrop+0x88/0x2ec
[    1.927986][   T80] [c000000004d5fce0] [c000000000242104] do_exit+0x350/0xde4
[    1.928028][   T80] [c000000004d5fdb0] [c000000000242de0] do_group_exit+0x48/0xbc
[    1.928072][   T80] [c000000004d5fdf0] [c000000000242e74] pid_child_should_wake+0x0/0x84
[    1.928128][   T80] [c000000004d5fe10] [c000000000030218] system_call_exception+0x148/0x3c0
[    1.928176][   T80] [c000000004d5fe50] [c00000000000c6d4] system_call_common+0xf4/0x258
[    1.928217][   T80] ---- interrupt: c00 at 0x7fff8ade507c
[    1.928253][   T80] NIP:  00007fff8ade507c LR: 00007fff8ade5034 CTR: 0000000000000000
[    1.928291][   T80] REGS: c000000004d5fe80 TRAP: 0c00   Tainted: G        W            (7.0.0-rc4-next-20260319)
[    1.928333][   T80] MSR:  800000000280f032 <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI>  CR: 24002824  XER: 00000000
[    1.928413][   T80] IRQMASK: 0
[    1.928413][   T80] GPR00: 00000000000000ea 00007fffe75beb50 00007fff8aed7300 0000000000000000
[    1.928413][   T80] GPR04: 0000000000000000 00007fffe75beda0 00007fffe75bedb0 0000000000000000
[    1.928413][   T80] GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[    1.928413][   T80] GPR12: 0000000000000000 00007fff8afaae00 00007fffca692568 0000000133cf0440
[    1.928413][   T80] GPR16: 0000000000000000 000001002f0a2958 000001002f0a2950 ffffffffffffffff
[    1.928413][   T80] GPR20: 0000000000000000 0000000000000000 00007fffe75bf838 00007fff8afa0000
[    1.928413][   T80] GPR24: 0000000126911328 0000000000000001 00007fff8af9dc00 00007fffe75bf818
[    1.928413][   T80] GPR28: 0000000000000003 fffffffffffff000 0000000000000000 00007fff8afa3e10
[    1.928765][   T80] NIP [00007fff8ade507c] 0x7fff8ade507c
[    1.928795][   T80] LR [00007fff8ade5034] 0x7fff8ade5034
[    1.928835][   T80] ---- interrupt: c00
[    1.928924][   T80] Code: 7c0803a6 4e800020 60000000 60000000 7fe4fb78 7f83e378 48009171 60000000 4bffff98 60000000 60000000 60000000 <0fe00000> 4bffff00 60000000 60000000
[    1.929054][   T80] ---[ end trace 0000000000000000 ]---

> [2]: https://lore.kernel.org/20260304-vdso-sparc64-generic-2-v6-3-d8eb3b0e1410@linutronix.de/

> [3]: https://lore.kernel.org/20260311125539.4123672-2-mclapinski@google.com/

@Michal: Something my AI buddy pointed out... (that I think is valid):

> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index df34797691bd..7363b5b0d22a 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -2078,9 +2082,11 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>  			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
>  			unsigned long chunk_end = min(mo_pfn, epfn);
>  
> -			nr_pages += deferred_init_pages(zone, spfn, chunk_end);

Previously, deferred_init_pages() returned nr of pages to add, which is
(end_pfn (= chunk_end) - spfn).

> -			deferred_free_pages(spfn, chunk_end - spfn);
> +			// KHO scratch is MAX_ORDER_NR_PAGES aligned.
> +			if (!pfn_is_kho_scratch(spfn))
> +				deferred_init_pages(zone, spfn, chunk_end);

But since the function is not always called with the change,
the calculation is moved to...

> +			deferred_free_pages(spfn, chunk_end - spfn);
>  			spfn = chunk_end;
>  
>  			if (can_resched)
> @@ -2088,6 +2094,7 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>  			else
>  				touch_nmi_watchdog();
>  		}
> +		nr_pages += epfn - spfn;

Here.

But this is incorrect, because here we have:
> static unsigned long __init
> deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>                            struct zone *zone, bool can_resched)
> {
>         int nid = zone_to_nid(zone);
>         unsigned long nr_pages = 0;
>         phys_addr_t start, end;
>         u64 i = 0;
> 
>         for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
>                 unsigned long spfn = PFN_UP(start);
>                 unsigned long epfn = PFN_DOWN(end);
> 
>                 if (spfn >= end_pfn)
>                         break;
> 
>                 spfn = max(spfn, start_pfn);
>                 epfn = min(epfn, end_pfn);
> 
>                 while (spfn < epfn) {

The loop condition is (spfn < epfn), and by the time the loop terminates...

>                         unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
>                         unsigned long chunk_end = min(mo_pfn, epfn);
> 
>                         // KHO scratch is MAX_ORDER_NR_PAGES aligned.
>                         if (!pfn_is_kho_scratch(spfn))
>                                 deferred_init_pages(zone, spfn, chunk_end);
> 
>                         deferred_free_pages(spfn, chunk_end - spfn);
>                         spfn = chunk_end;
> 
>                         if (can_resched)
>                                 cond_resched();
>                         else
>                                 touch_nmi_watchdog();
>                 }
>                 nr_pages += epfn - spfn;

epfn - spfn <= 0.

So the number of pages returned by deferred_init_memmap_chunk() becomes
incorrect.

The equivalent translation of what's there before would be doing
`nr_pages += chunk_end - spfn;` within the loop.

-- 
Cheers,
Harry / Hyeonggon

^ permalink raw reply

* [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run4 (2)
From: Qing Wang @ 2026-03-20  2:41 UTC (permalink / raw)
  To: syzbot+ca51b6e7e751edd6bbfd, Alexei Starovoitov, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: bpf, linux-kernel, linux-trace-kernel, syzkaller-bugs
In-Reply-To: <69bc31f8.050a0220.18f14c.0051.GAE@google.com>

There was a fix patch [1] for this issue, and it is the same as syz-AI's
analysis.
 [1] https://lore.kernel.org/all/20260304092345.233522-1-wangqing7171@gmail.com/T/

Some similar issues which have syz reproducer:
  https://syzkaller.appspot.com/bug?extid=9ea7c90be2b24e189592
  https://syzkaller.appspot.com/bug?extid=b4c5ad098c821bf8d8bc

Welcome to review and comment this patch.
--
Qing

^ permalink raw reply

* NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Nathan Chancellor @ 2026-03-19 23:37 UTC (permalink / raw)
  To: Mathieu Desnoyers, Thomas Weißschuh, Michal Clapinski
  Cc: Andrew Morton, Thomas Gleixner, Steven Rostedt, Masami Hiramatsu,
	linux-mm, linux-trace-kernel, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 5881 bytes --]

Hi all,

I am not really sure whose bug this is, as it only appears when three
seemingly independent patch series are applied together, so I have added
the patch authors and their committers (along with the tracing
maintainers) to this thread. Feel free to expand or reduce that list as
necessary.

Our continuous integration has noticed a crash when booting
ppc64_guest_defconfig in QEMU on the past few -next versions.

  https://github.com/ClangBuiltLinux/continuous-integration2/actions/runs/23311154492/job/67811527112

This does not appear to be clang related, as it can be reproduced with
GCC 15.2.0 as well. Through multiple bisects, I was able to land on
applying:

  mm: improve RSS counter approximation accuracy for proc interfaces [1]
  vdso/datastore: Allocate data pages dynamically [2]
  kho: fix deferred init of kho scratch [3]

and their dependent changes on top of 7.0-rc4 is enough to reproduce
this (at least on two of my machines with the same commands). I have
attached the diff from the result of the following 'git apply' commands
below, done in a linux-next checkout.

  $ git checkout v7.0-rc4
  HEAD is now at f338e7738378 Linux 7.0-rc4

  # [1]
  $ git diff 60ddf3eed4999bae440d1cf9e5868ccb3f308b64^..087dd6d2cc12c82945ab859194c32e8e977daae3 | git apply -3v
  ...

  # [2]
  # Fix trivial conflict in init/main.c around headers
  $ git diff dc432ab7130bb39f5a351281a02d4bc61e85a14a^..05988dba11791ccbb458254484826b32f17f4ad2 | git apply -3v
  ...

  # [3]
  # Fix conflict in kernel/liveupdate/kexec_handover.c due to lack of kho_mem_retrieve(), just add pfn_is_kho_scratch()
  $ git show 4a78467ffb537463486968232daef1e8a2f105e3 | git apply -3v
  ...

  $ make -skj"$(nproc)" ARCH=powerpc CROSS_COMPILE=powerpc64-linux- mrproper ppc64_guest_defconfig vmlinux

  $ curl -LSs https://github.com/ClangBuiltLinux/boot-utils/releases/download/20241120-044434/ppc64-rootfs.cpio.zst | zstd -d >rootfs.cpio

  $ qemu-system-ppc64 \
      -display none \
      -nodefaults \
      -cpu power8 \
      -machine pseries \
      -vga none \
      -kernel vmlinux \
      -initrd rootfs.cpio \
      -m 1G \
      -serial mon:stdio
  ...
  [    0.000000][    T0] Linux version 7.0.0-rc4-dirty (nathan@framework-amd-ryzen-maxplus-395) (powerpc64-linux-gcc (GCC) 15.2.0, GNU ld (GNU Binutils) 2.45) #1 SMP PREEMPT Thu Mar 19 15:45:53 MST 2026
  ...
  [    0.216764][    T1] vgaarb: loaded
  [    0.217590][    T1] clocksource: Switched to clocksource timebase
  [    0.221007][   T12] BUG: Kernel NULL pointer dereference at 0x00000010
  [    0.221049][   T12] Faulting instruction address: 0xc00000000044947c
  [    0.221237][   T12] Oops: Kernel access of bad area, sig: 11 [#1]
  [    0.221276][   T12] BE PAGE_SIZE=64K MMU=Hash  SMP NR_CPUS=2048 NUMA pSeries
  [    0.221359][   T12] Modules linked in:
  [    0.221556][   T12] CPU: 0 UID: 0 PID: 12 Comm: kworker/u4:0 Not tainted 7.0.0-rc4-dirty #1 PREEMPTLAZY
  [    0.221631][   T12] Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD pSeries
  [    0.221765][   T12] Workqueue: trace_init_wq tracer_init_tracefs_work_func
  [    0.222065][   T12] NIP:  c00000000044947c LR: c00000000041a584 CTR: c00000000053aa90
  [    0.222084][   T12] REGS: c000000003bc7960 TRAP: 0380   Not tainted  (7.0.0-rc4-dirty)
  [    0.222111][   T12] MSR:  8000000000009032 <SF,EE,ME,IR,DR,RI>  CR: 44000204  XER: 00000000
  [    0.222287][   T12] CFAR: c000000000449420 IRQMASK: 0
  [    0.222287][   T12] GPR00: c00000000041a584 c000000003bc7c00 c000000001c08100 c000000002892f20
  [    0.222287][   T12] GPR04: c0000000019cfa68 c0000000019cfa60 0000000000000001 0000000000000064
  [    0.222287][   T12] GPR08: 0000000000000002 0000000000000000 c000000003bba000 0000000000000010
  [    0.222287][   T12] GPR12: c00000000053aa90 c000000002c50000 c000000001ab25f8 c000000001626690
  [    0.222287][   T12] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  [    0.222287][   T12] GPR20: c000000001624868 c000000001ab2708 c0000000019cfa08 c000000001a00d18
  [    0.222287][   T12] GPR24: c0000000019cfa18 fffffffffffffef7 c000000003051205 c0000000019cfa68
  [    0.222287][   T12] GPR28: 0000000000000000 c0000000019cfa60 c000000002894e90 0000000000000000
  [    0.222526][   T12] NIP [c00000000044947c] __find_event_file+0x9c/0x110
  [    0.222572][   T12] LR [c00000000041a584] init_tracer_tracefs+0x274/0xcc0
  [    0.222643][   T12] Call Trace:
  [    0.222690][   T12] [c000000003bc7c00] [c000000000b943b0] tracefs_create_file+0x1a0/0x2b0 (unreliable)
  [    0.222766][   T12] [c000000003bc7c50] [c00000000041a584] init_tracer_tracefs+0x274/0xcc0
  [    0.222791][   T12] [c000000003bc7dc0] [c000000002046f1c] tracer_init_tracefs_work_func+0x50/0x320
  [    0.222809][   T12] [c000000003bc7e50] [c000000000276958] process_one_work+0x1b8/0x530
  [    0.222828][   T12] [c000000003bc7f10] [c00000000027778c] worker_thread+0x1dc/0x3d0
  [    0.222883][   T12] [c000000003bc7f90] [c000000000284c44] kthread+0x194/0x1b0
  [    0.222900][   T12] [c000000003bc7fe0] [c00000000000cf30] start_kernel_thread+0x14/0x18
  [    0.222961][   T12] Code: 7c691b78 7f63db78 2c090000 40820018 e89c0000 49107f21 60000000 2c030000 41820048 ebff0000 7c3ff040 41820038 <e93f0010> 7fa3eb78 81490058 e8890018
  [    0.223190][   T12] ---[ end trace 0000000000000000 ]---
  ...

Interestingly, turning on CONFIG_KASAN appears to hide this, maybe
pointing to some sort of memory corruption (or something timing
related)? If there is any other information I can provide, I am more
than happy to do so.

[1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/
[2]: https://lore.kernel.org/20260304-vdso-sparc64-generic-2-v6-3-d8eb3b0e1410@linutronix.de/
[3]: https://lore.kernel.org/20260311125539.4123672-2-mclapinski@google.com/

Cheers,
Nathan

[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 77979 bytes --]

diff --git a/Documentation/core-api/percpu-counter-tree.rst b/Documentation/core-api/percpu-counter-tree.rst
new file mode 100644
index 000000000000..196da056e7b4
--- /dev/null
+++ b/Documentation/core-api/percpu-counter-tree.rst
@@ -0,0 +1,75 @@
+========================================
+The Hierarchical Per-CPU Counters (HPCC)
+========================================
+
+:Author: Mathieu Desnoyers
+
+Introduction
+============
+
+Counters come in many varieties, each with their own trade offs:
+
+ * A global atomic counter provides a fast read access to the current
+   sum, at the expense of cache-line bouncing on updates. This leads to
+   poor performance of frequent updates from various cores on large SMP
+   systems.
+
+ * A per-cpu split counter provides fast updates to per-cpu counters,
+   at the expense of a slower aggregation (sum). The sum operation needs
+   to iterate over all per-cpu counters to calculate the current total.
+
+The hierarchical per-cpu counters attempt to provide the best of both
+worlds (fast updates, and fast sum) by relaxing requirements on the sum
+accuracy. It allows quickly querying an approximated sum value, along
+with the possible min/max ranges of the associated precise sum. The
+exact precise sum can still be calculated with an iteration on all
+per-cpu counter, but the availability of an approximated sum value with
+possible precise sum min/max ranges allows eliminating candidates which
+are certainly outside of a known target range without the overhead of
+precise sums.
+
+Overview
+========
+
+The herarchical per-cpu counters are organized as a tree with the tree
+root at the bottom (last level) and the first level of the tree
+consisting of per-cpu counters.
+
+The intermediate tree levels contain carry propagation counters. When
+reaching a threshold (batch size), the carry is propagated down the
+tree.
+
+This allows reading an approximated value at the root, which has a
+bounded accuracy (minimum/maximum possible precise sum range) determined
+by the tree topology.
+
+Use Cases
+=========
+
+Use cases HPCC is meant to handle invove tracking resources which are
+used across many CPUs to quickly sum as feedback for decision making to
+apply throttling, quota limits, sort tasks, and perform memory or task
+migration decisions. When considering approximated sums within the
+accuracy range of the decision threshold, the user can either:
+
+ * Be conservative and fast: Consider that the sum has reached the
+   limit as soon as the given limit is within the approximation range.
+
+ * Be aggressive and fast: Consider that the sum is over the
+   limit only when the approximation range is over the given limit.
+
+ * Be precise and slow: Do a precise comparison with the limit, which
+   requires a precise sum when the limit is within the approximated
+   range.
+
+One use-case for these hierarchical counters is to implement a two-pass
+algorithm to speed up sorting picking a maximum/minimunm sum value from
+a set. A first pass compares the approximated values, and then a second
+pass only needs the precise sum for counter trees which are within the
+possible precise sum range of the counter tree chosen by the first pass.
+
+Functions and structures
+========================
+
+.. kernel-doc:: include/linux/percpu_counter_tree.h
+.. kernel-doc:: lib/percpu_counter_tree.c
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index ac4129d1d741..612a6da6127a 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -35,6 +35,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
 int kho_add_subtree(const char *name, void *fdt);
 void kho_remove_subtree(void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
+bool pfn_is_kho_scratch(unsigned long pfn);
 
 void kho_memory_init(void);
 
@@ -109,6 +110,11 @@ static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
 	return -EOPNOTSUPP;
 }
 
+static inline bool pfn_is_kho_scratch(unsigned long pfn)
+{
+	return false;
+}
+
 static inline void kho_memory_init(void) { }
 
 static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6ec5e9ac0699..3e217414e12d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -614,11 +614,9 @@ static inline void memtest_report_meminfo(struct seq_file *m) { }
 #ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
 void memblock_set_kho_scratch_only(void);
 void memblock_clear_kho_scratch_only(void);
-void memmap_init_kho_scratch_pages(void);
 #else
 static inline void memblock_set_kho_scratch_only(void) { }
 static inline void memblock_clear_kho_scratch_only(void) { }
-static inline void memmap_init_kho_scratch_pages(void) {}
 #endif
 
 #endif /* _LINUX_MEMBLOCK_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..b2e478b14c87 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3057,38 +3057,47 @@ static inline bool get_user_page_fast_only(unsigned long addr,
 {
 	return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
 }
+
+static inline struct percpu_counter_tree_level_item *get_rss_stat_items(struct mm_struct *mm)
+{
+	unsigned long ptr = (unsigned long)mm;
+
+	ptr += offsetof(struct mm_struct, flexible_array);
+	return (struct percpu_counter_tree_level_item *)ptr;
+}
+
 /*
  * per-process(per-mm_struct) statistics.
  */
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-	return percpu_counter_read_positive(&mm->rss_stat[member]);
+	return percpu_counter_tree_approximate_sum_positive(&mm->rss_stat[member]);
 }
 
 static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
 {
-	return percpu_counter_sum_positive(&mm->rss_stat[member]);
+	return percpu_counter_tree_precise_sum_positive(&mm->rss_stat[member]);
 }
 
 void mm_trace_rss_stat(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-	percpu_counter_add(&mm->rss_stat[member], value);
+	percpu_counter_tree_add(&mm->rss_stat[member], value);
 
 	mm_trace_rss_stat(mm, member);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-	percpu_counter_inc(&mm->rss_stat[member]);
+	percpu_counter_tree_add(&mm->rss_stat[member], 1);
 
 	mm_trace_rss_stat(mm, member);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-	percpu_counter_dec(&mm->rss_stat[member]);
+	percpu_counter_tree_add(&mm->rss_stat[member], -1);
 
 	mm_trace_rss_stat(mm, member);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cc8ae722886..1a808d78245d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -18,7 +18,7 @@
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
-#include <linux/percpu_counter.h>
+#include <linux/percpu_counter_tree.h>
 #include <linux/types.h>
 #include <linux/rseq_types.h>
 #include <linux/bitmap.h>
@@ -1118,6 +1118,19 @@ typedef struct {
 	DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
 } __private mm_flags_t;
 
+/*
+ * The alignment of the mm_struct flexible array is based on the largest
+ * alignment of its content:
+ * __alignof__(struct percpu_counter_tree_level_item) provides a
+ * cacheline aligned alignment on SMP systems, else alignment on
+ * unsigned long on UP systems.
+ */
+#ifdef CONFIG_SMP
+# define __mm_struct_flexible_array_aligned	__aligned(__alignof__(struct percpu_counter_tree_level_item))
+#else
+# define __mm_struct_flexible_array_aligned	__aligned(__alignof__(unsigned long))
+#endif
+
 struct kioctx_table;
 struct iommu_mm_data;
 struct mm_struct {
@@ -1263,7 +1276,7 @@ struct mm_struct {
 		unsigned long saved_e_flags;
 #endif
 
-		struct percpu_counter rss_stat[NR_MM_COUNTERS];
+		struct percpu_counter_tree rss_stat[NR_MM_COUNTERS];
 
 		struct linux_binfmt *binfmt;
 
@@ -1374,10 +1387,13 @@ struct mm_struct {
 	} __randomize_layout;
 
 	/*
-	 * The mm_cpumask needs to be at the end of mm_struct, because it
-	 * is dynamically sized based on nr_cpu_ids.
+	 * The rss hierarchical counter items, mm_cpumask, and mm_cid
+	 * masks need to be at the end of mm_struct, because they are
+	 * dynamically sized based on nr_cpu_ids.
+	 * The content of the flexible array needs to be placed in
+	 * decreasing alignment requirement order.
 	 */
-	char flexible_array[] __aligned(__alignof__(unsigned long));
+	char flexible_array[] __mm_struct_flexible_array_aligned;
 };
 
 /* Copy value to the first system word of mm flags, non-atomically. */
@@ -1414,24 +1430,30 @@ static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm,
 			 MT_FLAGS_USE_RCU)
 extern struct mm_struct init_mm;
 
-#define MM_STRUCT_FLEXIBLE_ARRAY_INIT				\
-{								\
-	[0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0	\
+#define MM_STRUCT_FLEXIBLE_ARRAY_INIT									\
+{													\
+	[0 ... (PERCPU_COUNTER_TREE_ITEMS_STATIC_SIZE * NR_MM_COUNTERS) + sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0	\
 }
 
-/* Pointer magic because the dynamic array size confuses some compilers. */
-static inline void mm_init_cpumask(struct mm_struct *mm)
+static inline size_t get_rss_stat_items_size(void)
 {
-	unsigned long cpu_bitmap = (unsigned long)mm;
-
-	cpu_bitmap += offsetof(struct mm_struct, flexible_array);
-	cpumask_clear((struct cpumask *)cpu_bitmap);
+	return percpu_counter_tree_items_size() * NR_MM_COUNTERS;
 }
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
-	return (struct cpumask *)&mm->flexible_array;
+	unsigned long ptr = (unsigned long)mm;
+
+	ptr += offsetof(struct mm_struct, flexible_array);
+	/* Skip RSS stats counters. */
+	ptr += get_rss_stat_items_size();
+	return (struct cpumask *)ptr;
+}
+
+static inline void mm_init_cpumask(struct mm_struct *mm)
+{
+	cpumask_clear((struct cpumask *)mm_cpumask(mm));
 }
 
 #ifdef CONFIG_LRU_GEN
@@ -1523,6 +1545,8 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
 	unsigned long bitmap = (unsigned long)mm;
 
 	bitmap += offsetof(struct mm_struct, flexible_array);
+	/* Skip RSS stats counters. */
+	bitmap += get_rss_stat_items_size();
 	/* Skip cpu_bitmap */
 	bitmap += cpumask_size();
 	return (struct cpumask *)bitmap;
diff --git a/include/linux/percpu_counter_tree.h b/include/linux/percpu_counter_tree.h
new file mode 100644
index 000000000000..828c763edd4a
--- /dev/null
+++ b/include/linux/percpu_counter_tree.h
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR MIT */
+/* SPDX-FileCopyrightText: 2025 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */
+
+#ifndef _PERCPU_COUNTER_TREE_H
+#define _PERCPU_COUNTER_TREE_H
+
+#include <linux/preempt.h>
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+
+#ifdef CONFIG_SMP
+
+#if NR_CPUS == (1U << 0)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	0
+#elif NR_CPUS <= (1U << 1)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	1
+#elif NR_CPUS <= (1U << 2)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	3
+#elif NR_CPUS <= (1U << 3)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	7
+#elif NR_CPUS <= (1U << 4)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	7
+#elif NR_CPUS <= (1U << 5)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	11
+#elif NR_CPUS <= (1U << 6)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	21
+#elif NR_CPUS <= (1U << 7)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	21
+#elif NR_CPUS <= (1U << 8)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	37
+#elif NR_CPUS <= (1U << 9)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	73
+#elif NR_CPUS <= (1U << 10)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	149
+#elif NR_CPUS <= (1U << 11)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	293
+#elif NR_CPUS <= (1U << 12)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	585
+#elif NR_CPUS <= (1U << 13)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	1173
+#elif NR_CPUS <= (1U << 14)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	2341
+#elif NR_CPUS <= (1U << 15)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	4681
+#elif NR_CPUS <= (1U << 16)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	4681
+#elif NR_CPUS <= (1U << 17)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	8777
+#elif NR_CPUS <= (1U << 18)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	17481
+#elif NR_CPUS <= (1U << 19)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	34953
+#elif NR_CPUS <= (1U << 20)
+# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS	69905
+#else
+# error "Unsupported number of CPUs."
+#endif
+
+struct percpu_counter_tree_level_item {
+	atomic_long_t count;		/*
+					 * Count the number of carry for this tree item.
+					 * The carry counter is kept at the order of the
+					 * carry accounted for at this tree level.
+					 */
+} ____cacheline_aligned_in_smp;
+
+#define PERCPU_COUNTER_TREE_ITEMS_STATIC_SIZE	\
+	(PERCPU_COUNTER_TREE_STATIC_NR_ITEMS * sizeof(struct percpu_counter_tree_level_item))
+
+struct percpu_counter_tree {
+	/* Fast-path fields. */
+	unsigned long __percpu *level0;	/* Pointer to per-CPU split counters (tree level 0). */
+	unsigned long level0_bit_mask;	/* Bit mask to apply to detect carry propagation from tree level 0. */
+	union {
+		unsigned long *i;	/* Approximate sum for single-CPU topology. */
+		atomic_long_t *a;	/* Approximate sum for SMP topology.  */
+	} approx_sum;
+	long bias;			/* Bias to apply to counter precise and approximate values. */
+
+	/* Slow-path fields. */
+	struct percpu_counter_tree_level_item *items;	/* Array of tree items for levels 1 to N. */
+	unsigned long batch_size;	/*
+					 * The batch size is the increment step at level 0 which
+					 * triggers a carry propagation. The batch size is required
+					 * to be greater than 1, and a power of 2.
+					 */
+	/*
+	 * The tree approximate sum is guaranteed to be within this accuracy range:
+	 * (precise_sum - approx_accuracy_range.under) <= approx_sum <= (precise_sum + approx_accuracy_range.over).
+	 * This accuracy is derived from the hardware topology and the tree batch_size.
+	 * The "under" accuracy is larger than the "over" accuracy because the negative range of a
+	 * two's complement signed integer is one unit larger than the positive range. This delta
+	 * is summed for each tree item, which leads to a significantly larger "under" accuracy range
+	 * compared to the "over" accuracy range.
+	 */
+	struct {
+		unsigned long under;
+		unsigned long over;
+	} approx_accuracy_range;
+};
+
+size_t percpu_counter_tree_items_size(void);
+int percpu_counter_tree_init_many(struct percpu_counter_tree *counters, struct percpu_counter_tree_level_item *items,
+				  unsigned int nr_counters, unsigned long batch_size, gfp_t gfp_flags);
+int percpu_counter_tree_init(struct percpu_counter_tree *counter, struct percpu_counter_tree_level_item *items,
+			     unsigned long batch_size, gfp_t gfp_flags);
+void percpu_counter_tree_destroy_many(struct percpu_counter_tree *counter, unsigned int nr_counters);
+void percpu_counter_tree_destroy(struct percpu_counter_tree *counter);
+void percpu_counter_tree_add(struct percpu_counter_tree *counter, long inc);
+long percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter);
+int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b);
+int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tree *counter, long v);
+int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b);
+int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *counter, long v);
+void percpu_counter_tree_set(struct percpu_counter_tree *counter, long v);
+int percpu_counter_tree_subsystem_init(void);
+
+/**
+ * percpu_counter_tree_approximate_sum() - Return approximate counter sum.
+ * @counter: The counter to sum.
+ *
+ * Querying the approximate sum is fast, but it is only accurate within
+ * the bounds delimited by percpu_counter_tree_approximate_accuracy_range().
+ * This is meant to be used when speed is preferred over accuracy.
+ *
+ * Return: The current approximate counter sum.
+ */
+static inline
+long percpu_counter_tree_approximate_sum(struct percpu_counter_tree *counter)
+{
+	unsigned long v;
+
+	if (!counter->level0_bit_mask)
+		v = READ_ONCE(*counter->approx_sum.i);
+	else
+		v = atomic_long_read(counter->approx_sum.a);
+	return (long) (v + (unsigned long)READ_ONCE(counter->bias));
+}
+
+/**
+ * percpu_counter_tree_approximate_accuracy_range - Query the accuracy range for a counter tree.
+ * @counter: Counter to query.
+ * @under: Pointer to a variable to be incremented of the approximation
+ *         accuracy range below the precise sum.
+ * @over: Pointer to a variable to be incremented of the approximation
+ *        accuracy range above the precise sum.
+ *
+ * Query the accuracy range limits for the counter.
+ * Because of two's complement binary representation, the "under" range is typically
+ * slightly larger than the "over" range.
+ * Those values are derived from the hardware topology and the counter tree batch size.
+ * They are invariant for a given counter tree.
+ * Using this function should not be typically required, see the following functions instead:
+ * * percpu_counter_tree_approximate_compare(),
+ * * percpu_counter_tree_approximate_compare_value(),
+ * * percpu_counter_tree_precise_compare(),
+ * * percpu_counter_tree_precise_compare_value().
+ */
+static inline
+void percpu_counter_tree_approximate_accuracy_range(struct percpu_counter_tree *counter,
+						    unsigned long *under, unsigned long *over)
+{
+	*under += counter->approx_accuracy_range.under;
+	*over += counter->approx_accuracy_range.over;
+}
+
+#else	/* !CONFIG_SMP */
+
+#define PERCPU_COUNTER_TREE_ITEMS_STATIC_SIZE	0
+
+struct percpu_counter_tree_level_item;
+
+struct percpu_counter_tree {
+	atomic_long_t count;
+};
+
+static inline
+size_t percpu_counter_tree_items_size(void)
+{
+	return 0;
+}
+
+static inline
+int percpu_counter_tree_init_many(struct percpu_counter_tree *counters, struct percpu_counter_tree_level_item *items,
+				  unsigned int nr_counters, unsigned long batch_size, gfp_t gfp_flags)
+{
+	for (unsigned int i = 0; i < nr_counters; i++)
+		atomic_long_set(&counters[i].count, 0);
+	return 0;
+}
+
+static inline
+int percpu_counter_tree_init(struct percpu_counter_tree *counter, struct percpu_counter_tree_level_item *items,
+			     unsigned long batch_size, gfp_t gfp_flags)
+{
+	return percpu_counter_tree_init_many(counter, items, 1, batch_size, gfp_flags);
+}
+
+static inline
+void percpu_counter_tree_destroy_many(struct percpu_counter_tree *counter, unsigned int nr_counters)
+{
+}
+
+static inline
+void percpu_counter_tree_destroy(struct percpu_counter_tree *counter)
+{
+}
+
+static inline
+long percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter)
+{
+	return atomic_long_read(&counter->count);
+}
+
+static inline
+int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b)
+{
+	long count_a = percpu_counter_tree_precise_sum(a),
+	     count_b = percpu_counter_tree_precise_sum(b);
+
+	if (count_a == count_b)
+		return 0;
+	if (count_a < count_b)
+		return -1;
+	return 1;
+}
+
+static inline
+int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *counter, long v)
+{
+	long count = percpu_counter_tree_precise_sum(counter);
+
+	if (count == v)
+		return 0;
+	if (count < v)
+		return -1;
+	return 1;
+}
+
+static inline
+int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b)
+{
+	return percpu_counter_tree_precise_compare(a, b);
+}
+
+static inline
+int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tree *counter, long v)
+{
+	return percpu_counter_tree_precise_compare_value(counter, v);
+}
+
+static inline
+void percpu_counter_tree_set(struct percpu_counter_tree *counter, long v)
+{
+	atomic_long_set(&counter->count, v);
+}
+
+static inline
+void percpu_counter_tree_approximate_accuracy_range(struct percpu_counter_tree *counter,
+						    unsigned long *under, unsigned long *over)
+{
+}
+
+static inline
+void percpu_counter_tree_add(struct percpu_counter_tree *counter, long inc)
+{
+	atomic_long_add(inc, &counter->count);
+}
+
+static inline
+long percpu_counter_tree_approximate_sum(struct percpu_counter_tree *counter)
+{
+	return percpu_counter_tree_precise_sum(counter);
+}
+
+static inline
+int percpu_counter_tree_subsystem_init(void)
+{
+	return 0;
+}
+
+#endif	/* CONFIG_SMP */
+
+/**
+ * percpu_counter_tree_approximate_sum_positive() - Return a positive approximate counter sum.
+ * @counter: The counter to sum.
+ *
+ * Return an approximate counter sum which is guaranteed to be greater
+ * or equal to 0.
+ *
+ * Return: The current positive approximate counter sum.
+ */
+static inline
+long percpu_counter_tree_approximate_sum_positive(struct percpu_counter_tree *counter)
+{
+	long v = percpu_counter_tree_approximate_sum(counter);
+	return v > 0 ? v : 0;
+}
+
+/**
+ * percpu_counter_tree_precise_sum_positive() - Return a positive precise counter sum.
+ * @counter: The counter to sum.
+ *
+ * Return a precise counter sum which is guaranteed to be greater
+ * or equal to 0.
+ *
+ * Return: The current positive precise counter sum.
+ */
+static inline
+long percpu_counter_tree_precise_sum_positive(struct percpu_counter_tree *counter)
+{
+	long v = percpu_counter_tree_precise_sum(counter);
+	return v > 0 ? v : 0;
+}
+
+/**
+ * percpu_counter_tree_approximate_min_max_range() - Return the approximation min and max precise values.
+ * @approx_sum: Approximated sum.
+ * @under: Tree accuracy range (under).
+ * @over: Tree accuracy range (over).
+ * @precise_min: Minimum possible value for precise sum (output).
+ * @precise_max: Maximum possible value for precise sum (output).
+ *
+ * Calculate the minimum and maximum precise values for a given
+ * approximation and (under, over) accuracy range.
+ *
+ * The range of the approximation as a function of the precise sum is expressed as:
+ *
+ *   approx_sum >= precise_sum - approx_accuracy_range.under
+ *   approx_sum <= precise_sum + approx_accuracy_range.over
+ *
+ * Therefore, the range of the precise sum as a function of the approximation is expressed as:
+ *
+ *   precise_sum <= approx_sum + approx_accuracy_range.under
+ *   precise_sum >= approx_sum - approx_accuracy_range.over
+ */
+static inline
+void percpu_counter_tree_approximate_min_max_range(long approx_sum, unsigned long under, unsigned long over,
+						   long *precise_min, long *precise_max)
+{
+	*precise_min = approx_sum - over;
+	*precise_max = approx_sum + under;
+}
+
+/**
+ * percpu_counter_tree_approximate_min_max() - Return the tree approximation, min and max possible precise values.
+ * @counter: The counter to sum.
+ * @approx_sum: Approximate sum (output).
+ * @precise_min: Minimum possible value for precise sum (output).
+ * @precise_max: Maximum possible value for precise sum (output).
+ *
+ * Return the approximate sum, minimum and maximum precise values for
+ * a counter.
+ */
+static inline
+void percpu_counter_tree_approximate_min_max(struct percpu_counter_tree *counter,
+					     long *approx_sum, long *precise_min, long *precise_max)
+{
+	unsigned long under = 0, over = 0;
+	long v = percpu_counter_tree_approximate_sum(counter);
+
+	percpu_counter_tree_approximate_accuracy_range(counter, &under, &over);
+	percpu_counter_tree_approximate_min_max_range(v, under, over, precise_min, precise_max);
+	*approx_sum = v;
+}
+
+#endif  /* _PERCPU_COUNTER_TREE_H */
diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h
index a91fa24b06e0..0b530428db71 100644
--- a/include/linux/vdso_datastore.h
+++ b/include/linux/vdso_datastore.h
@@ -2,9 +2,15 @@
 #ifndef _LINUX_VDSO_DATASTORE_H
 #define _LINUX_VDSO_DATASTORE_H
 
+#ifdef CONFIG_HAVE_GENERIC_VDSO
 #include <linux/mm_types.h>
 
 extern const struct vm_special_mapping vdso_vvar_mapping;
 struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr);
 
+void __init vdso_setup_data_pages(void);
+#else /* !CONFIG_HAVE_GENERIC_VDSO */
+static inline void vdso_setup_data_pages(void) { }
+#endif /* CONFIG_HAVE_GENERIC_VDSO */
+
 #endif /* _LINUX_VDSO_DATASTORE_H */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index cd7920c81f85..290ccb9fd25d 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -448,7 +448,7 @@ TRACE_EVENT(rss_stat,
 		 */
 		__entry->curr = current->mm == mm && !(current->flags & PF_KTHREAD);
 		__entry->member = member;
-		__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
+		__entry->size = (percpu_counter_tree_approximate_sum_positive(&mm->rss_stat[member])
 							    << PAGE_SHIFT);
 	),
 
diff --git a/init/main.c b/init/main.c
index 1cb395dd94e4..453ac9dff2da 100644
--- a/init/main.c
+++ b/init/main.c
@@ -105,6 +105,8 @@
 #include <linux/ptdump.h>
 #include <linux/time_namespace.h>
 #include <linux/unaligned.h>
+#include <linux/percpu_counter_tree.h>
+#include <linux/vdso_datastore.h>
 #include <net/net_namespace.h>
 
 #include <asm/io.h>
@@ -1067,6 +1069,7 @@ void start_kernel(void)
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
+	percpu_counter_tree_subsystem_init();
 	mm_core_init();
 	maple_tree_init();
 	poking_init();
@@ -1119,6 +1122,7 @@ void start_kernel(void)
 	srcu_init();
 	hrtimers_init();
 	softirq_init();
+	vdso_setup_data_pages();
 	timekeeping_init();
 	time_init();
 
diff --git a/kernel/fork.c b/kernel/fork.c
index bc2bf58b93b6..0de4c8727055 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -134,6 +134,11 @@
  */
 #define MAX_THREADS FUTEX_TID_MASK
 
+/*
+ * Batch size of rss stat approximation
+ */
+#define RSS_STAT_BATCH_SIZE	32
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -627,14 +632,12 @@ static void check_mm(struct mm_struct *mm)
 			 "Please make sure 'struct resident_page_types[]' is updated as well");
 
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = percpu_counter_sum(&mm->rss_stat[i]);
-
-		if (unlikely(x)) {
+		if (unlikely(percpu_counter_tree_precise_compare_value(&mm->rss_stat[i], 0) != 0))
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
-				 mm, resident_page_types[i], x,
+				 mm, resident_page_types[i],
+				 percpu_counter_tree_precise_sum(&mm->rss_stat[i]),
 				 current->comm,
 				 task_pid_nr(current));
-		}
 	}
 
 	if (mm_pgtables_bytes(mm))
@@ -732,7 +735,7 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
-	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+	percpu_counter_tree_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
 	free_mm(mm);
 }
@@ -1125,8 +1128,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm, p))
 		goto fail_cid;
 
-	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
-				     NR_MM_COUNTERS))
+	if (percpu_counter_tree_init_many(mm->rss_stat, get_rss_stat_items(mm),
+					  NR_MM_COUNTERS, RSS_STAT_BATCH_SIZE,
+					  GFP_KERNEL_ACCOUNT))
 		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
@@ -3008,7 +3012,7 @@ void __init mm_cache_init(void)
 	 * dynamically sized based on the maximum CPU number this system
 	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
-	mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
+	mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size() + get_rss_stat_items_size();
 
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
 			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index cc68a3692905..ce2786faf044 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -1333,6 +1333,23 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
 }
 EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
 
+bool pfn_is_kho_scratch(unsigned long pfn)
+{
+	unsigned int i;
+	phys_addr_t scratch_start, scratch_end, phys = __pfn_to_phys(pfn);
+
+	for (i = 0; i < kho_scratch_cnt; i++) {
+		scratch_start = kho_scratch[i].addr;
+		scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
+
+		if (scratch_start <= phys && phys < scratch_end)
+			return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(pfn_is_kho_scratch);
+
 static __init int kho_out_fdt_setup(void)
 {
 	void *root = kho_out.fdt;
@@ -1421,12 +1438,27 @@ static __init int kho_init(void)
 }
 fs_initcall(kho_init);
 
+static void __init kho_init_scratch_pages(void)
+{
+	if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
+		return;
+
+	for (int i = 0; i < kho_scratch_cnt; i++) {
+		unsigned long pfn = PFN_DOWN(kho_scratch[i].addr);
+		unsigned long end_pfn = PFN_UP(kho_scratch[i].addr + kho_scratch[i].size);
+		int nid = early_pfn_to_nid(pfn);
+
+		for (; pfn < end_pfn; pfn++)
+			init_deferred_page(pfn, nid);
+	}
+}
+
 static void __init kho_release_scratch(void)
 {
 	phys_addr_t start, end;
 	u64 i;
 
-	memmap_init_kho_scratch_pages();
+	kho_init_scratch_pages();
 
 	/*
 	 * Mark scratch mem as CMA before we return it. That way we
@@ -1453,6 +1485,7 @@ void __init kho_memory_init(void)
 		kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys));
 	} else {
 		kho_reserve_scratch();
+		kho_init_scratch_pages();
 	}
 }
 
diff --git a/lib/Kconfig b/lib/Kconfig
index 0f2fb9610647..0b8241e5b548 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -52,6 +52,18 @@ config PACKING_KUNIT_TEST
 
 	  When in doubt, say N.
 
+config PERCPU_COUNTER_TREE_TEST
+	tristate "Hierarchical Per-CPU counter test" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  This builds Kunit tests for the hierarchical per-cpu counters.
+
+	  For more information on KUnit and unit tests in general,
+	  please refer to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+	  When in doubt, say N.
+
 config BITREVERSE
 	tristate
 
diff --git a/lib/Makefile b/lib/Makefile
index 1b9ee167517f..abc32420b581 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -181,6 +181,7 @@ obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 obj-$(CONFIG_SMP) += percpu_counter.o
+obj-$(CONFIG_SMP) += percpu_counter_tree.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
 
diff --git a/lib/percpu_counter_tree.c b/lib/percpu_counter_tree.c
new file mode 100644
index 000000000000..beb1144e6450
--- /dev/null
+++ b/lib/percpu_counter_tree.c
@@ -0,0 +1,702 @@
+// SPDX-License-Identifier: GPL-2.0+ OR MIT
+// SPDX-FileCopyrightText: 2025 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+/*
+ * Split Counters With Tree Approximation Propagation
+ *
+ * * Propagation diagram when reaching batch size thresholds (± batch size):
+ *
+ * Example diagram for 8 CPUs:
+ *
+ * log2(8) = 3 levels
+ *
+ * At each level, each pair propagates its values to the next level when
+ * reaching the batch size thresholds.
+ *
+ * Counters at levels 0, 1, 2 can be kept on a single byte ([-128 .. +127] range),
+ * although it may be relevant to keep them on "long" counters for
+ * simplicity. (complexity vs memory footprint tradeoff)
+ *
+ * Counter at level 3 can be kept on a "long" counter.
+ *
+ * Level 0:  0    1    2    3    4    5    6    7
+ *           |   /     |   /     |   /     |   /
+ *           |  /      |  /      |  /      |  /
+ *           | /       | /       | /       | /
+ * Level 1:  0         1         2         3
+ *           |       /           |       /
+ *           |    /              |    /
+ *           | /                 | /
+ * Level 2:  0                   1
+ *           |               /
+ *           |         /
+ *           |   /
+ * Level 3:  0
+ *
+ * * Approximation accuracy:
+ *
+ * BATCH(level N): Level N batch size.
+ *
+ * Example for BATCH(level 0) = 32.
+ *
+ * BATCH(level 0) =  32
+ * BATCH(level 1) =  64
+ * BATCH(level 2) = 128
+ * BATCH(level N) = BATCH(level 0) * 2^N
+ *
+ *            per-counter     global
+ *            accuracy        accuracy
+ * Level 0:   [ -32 ..  +31]  ±256  (8 * 32)
+ * Level 1:   [ -64 ..  +63]  ±256  (4 * 64)
+ * Level 2:   [-128 .. +127]  ±256  (2 * 128)
+ * Total:      ------         ±768  (log2(nr_cpu_ids) * BATCH(level 0) * nr_cpu_ids)
+ *
+ * Note that the global accuracy can be calculated more precisely
+ * by taking into account that the positive accuracy range is
+ * 31 rather than 32.
+ *
+ * -----
+ *
+ * Approximate Sum Carry Propagation
+ *
+ * Let's define a number of counter bits for each level, e.g.:
+ *
+ * log2(BATCH(level 0)) = log2(32) = 5
+ * Let's assume, for this example, a 32-bit architecture (sizeof(long) == 4).
+ *
+ *               nr_bit        value_mask                      range
+ * Level 0:      5 bits        v                             0 ..  +31
+ * Level 1:      1 bit        (v & ~((1UL << 5) - 1))        0 ..  +63
+ * Level 2:      1 bit        (v & ~((1UL << 6) - 1))        0 .. +127
+ * Level 3:     25 bits       (v & ~((1UL << 7) - 1))        0 .. 2^32-1
+ *
+ * Note: Use a "long" per-cpu counter at level 0 to allow precise sum.
+ *
+ * Note: Use cacheline aligned counters at levels above 0 to prevent false sharing.
+ *       If memory footprint is an issue, a specialized allocator could be used
+ *       to eliminate padding.
+ *
+ * Example with expanded values:
+ *
+ * counter_add(counter, inc):
+ *
+ *         if (!inc)
+ *                 return;
+ *
+ *         res = percpu_add_return(counter @ Level 0, inc);
+ *         orig = res - inc;
+ *         if (inc < 0) {
+ *                 inc = -(-inc & ~0b00011111);  // Clear used bits
+ *                 // xor bit 5: underflow
+ *                 if ((inc ^ orig ^ res) & 0b00100000)
+ *                         inc -= 0b00100000;
+ *         } else {
+ *                 inc &= ~0b00011111;           // Clear used bits
+ *                 // xor bit 5: overflow
+ *                 if ((inc ^ orig ^ res) & 0b00100000)
+ *                         inc += 0b00100000;
+ *         }
+ *         if (!inc)
+ *                 return;
+ *
+ *         res = atomic_long_add_return(counter @ Level 1, inc);
+ *         orig = res - inc;
+ *         if (inc < 0) {
+ *                 inc = -(-inc & ~0b00111111);  // Clear used bits
+ *                 // xor bit 6: underflow
+ *                 if ((inc ^ orig ^ res) & 0b01000000)
+ *                         inc -= 0b01000000;
+ *         } else {
+ *                 inc &= ~0b00111111;           // Clear used bits
+ *                 // xor bit 6: overflow
+ *                 if ((inc ^ orig ^ res) & 0b01000000)
+ *                         inc += 0b01000000;
+ *         }
+ *         if (!inc)
+ *                 return;
+ *
+ *         res = atomic_long_add_return(counter @ Level 2, inc);
+ *         orig = res - inc;
+ *         if (inc < 0) {
+ *                 inc = -(-inc & ~0b01111111);  // Clear used bits
+ *                 // xor bit 7: underflow
+ *                 if ((inc ^ orig ^ res) & 0b10000000)
+ *                         inc -= 0b10000000;
+ *         } else {
+ *                 inc &= ~0b01111111;           // Clear used bits
+ *                 // xor bit 7: overflow
+ *                 if ((inc ^ orig ^ res) & 0b10000000)
+ *                         inc += 0b10000000;
+ *         }
+ *         if (!inc)
+ *                 return;
+ *
+ *         atomic_long_add(counter @ Level 3, inc);
+ */
+
+#include <linux/percpu_counter_tree.h>
+#include <linux/cpumask.h>
+#include <linux/atomic.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/math.h>
+
+#define MAX_NR_LEVELS 5
+
+/*
+ * The counter configuration is selected at boot time based on the
+ * hardware topology.
+ */
+struct counter_config {
+	unsigned int nr_items;				/*
+							 * nr_items is the number of items in the tree for levels 1
+							 * up to and including the final level (approximate sum).
+							 * It excludes the level 0 per-CPU counters.
+							 */
+	unsigned char nr_levels;			/*
+							 * nr_levels is the number of hierarchical counter tree levels.
+							 * It excludes the final level (approximate sum).
+							 */
+	unsigned char n_arity_order[MAX_NR_LEVELS];	/*
+							 * n-arity of tree nodes for each level from
+							 * 0 to (nr_levels - 1).
+							 */
+};
+
+static const struct counter_config per_nr_cpu_order_config[] = {
+	[0] =	{ .nr_items = 0,	.nr_levels = 0,		.n_arity_order = { 0 } },
+	[1] =	{ .nr_items = 1,	.nr_levels = 1,		.n_arity_order = { 1 } },
+	[2] =	{ .nr_items = 3,	.nr_levels = 2,		.n_arity_order = { 1, 1 } },
+	[3] =	{ .nr_items = 7,	.nr_levels = 3,		.n_arity_order = { 1, 1, 1 } },
+	[4] =	{ .nr_items = 7,	.nr_levels = 3,		.n_arity_order = { 2, 1, 1 } },
+	[5] =	{ .nr_items = 11,	.nr_levels = 3,		.n_arity_order = { 2, 2, 1 } },
+	[6] =	{ .nr_items = 21,	.nr_levels = 3,		.n_arity_order = { 2, 2, 2 } },
+	[7] =	{ .nr_items = 21,	.nr_levels = 3,		.n_arity_order = { 3, 2, 2 } },
+	[8] =	{ .nr_items = 37,	.nr_levels = 3,		.n_arity_order = { 3, 3, 2 } },
+	[9] =	{ .nr_items = 73,	.nr_levels = 3,		.n_arity_order = { 3, 3, 3 } },
+	[10] =	{ .nr_items = 149,	.nr_levels = 4,		.n_arity_order = { 3, 3, 2, 2 } },
+	[11] =	{ .nr_items = 293,	.nr_levels = 4,		.n_arity_order = { 3, 3, 3, 2 } },
+	[12] =	{ .nr_items = 585,	.nr_levels = 4,		.n_arity_order = { 3, 3, 3, 3 } },
+	[13] =	{ .nr_items = 1173,	.nr_levels = 5,		.n_arity_order = { 3, 3, 3, 2, 2 } },
+	[14] =	{ .nr_items = 2341,	.nr_levels = 5,		.n_arity_order = { 3, 3, 3, 3, 2 } },
+	[15] =	{ .nr_items = 4681,	.nr_levels = 5,		.n_arity_order = { 3, 3, 3, 3, 3 } },
+	[16] =	{ .nr_items = 4681,	.nr_levels = 5,		.n_arity_order = { 4, 3, 3, 3, 3 } },
+	[17] =	{ .nr_items = 8777,	.nr_levels = 5,		.n_arity_order = { 4, 4, 3, 3, 3 } },
+	[18] =	{ .nr_items = 17481,	.nr_levels = 5,		.n_arity_order = { 4, 4, 4, 3, 3 } },
+	[19] =	{ .nr_items = 34953,	.nr_levels = 5,		.n_arity_order = { 4, 4, 4, 4, 3 } },
+	[20] =	{ .nr_items = 69905,	.nr_levels = 5,		.n_arity_order = { 4, 4, 4, 4, 4 } },
+};
+
+static const struct counter_config *counter_config;	/* Hierarchical counter configuration for the hardware topology. */
+static unsigned int nr_cpus_order;			/* Order of nr_cpu_ids. */
+static unsigned long accuracy_multiplier;		/* Calculate accuracy for a given batch size (multiplication factor). */
+
+static
+int __percpu_counter_tree_init(struct percpu_counter_tree *counter,
+			       unsigned long batch_size, gfp_t gfp_flags,
+			       unsigned long __percpu *level0,
+			       struct percpu_counter_tree_level_item *items)
+{
+	/* Batch size must be greater than 1, and a power of 2. */
+	if (WARN_ON(batch_size <= 1 || (batch_size & (batch_size - 1))))
+		return -EINVAL;
+	counter->batch_size = batch_size;
+	counter->bias = 0;
+	counter->level0 = level0;
+	counter->items = items;
+	if (!nr_cpus_order) {
+		counter->approx_sum.i = per_cpu_ptr(counter->level0, 0);
+		counter->level0_bit_mask = 0;
+	} else {
+		counter->approx_sum.a = &counter->items[counter_config->nr_items - 1].count;
+		counter->level0_bit_mask = 1UL << get_count_order(batch_size);
+	}
+	/*
+	 * Each tree item signed integer has a negative range which is
+	 * one unit greater than the positive range.
+	 */
+	counter->approx_accuracy_range.under = batch_size * accuracy_multiplier;
+	counter->approx_accuracy_range.over = (batch_size - 1) * accuracy_multiplier;
+	return 0;
+}
+
+/**
+ * percpu_counter_tree_init_many() - Initialize many per-CPU counter trees.
+ * @counters: An array of @nr_counters counters to initialize.
+ *	      Their memory is provided by the caller.
+ * @items: Pointer to memory area where to store tree items.
+ *	   This memory is provided by the caller.
+ *	   Its size needs to be at least @nr_counters * percpu_counter_tree_items_size().
+ * @nr_counters: The number of counter trees to initialize
+ * @batch_size: The batch size is the increment step at level 0 which triggers a
+ * 		carry propagation.
+ *		The batch size is required to be greater than 1, and a power of 2.
+ * @gfp_flags: gfp flags to pass to the per-CPU allocator.
+ *
+ * Initialize many per-CPU counter trees using a single per-CPU
+ * allocator invocation for @nr_counters counters.
+ *
+ * Return:
+ * * %0: Success
+ * * %-EINVAL:		- Invalid @batch_size argument
+ * * %-ENOMEM:		- Out of memory
+ */
+int percpu_counter_tree_init_many(struct percpu_counter_tree *counters, struct percpu_counter_tree_level_item *items,
+				  unsigned int nr_counters, unsigned long batch_size, gfp_t gfp_flags)
+{
+	void __percpu *level0, *level0_iter;
+	size_t counter_size = sizeof(*counters->level0),
+	       items_size = percpu_counter_tree_items_size();
+	void *items_iter;
+	unsigned int i;
+	int ret;
+
+	memset(items, 0, items_size * nr_counters);
+	level0 = __alloc_percpu_gfp(nr_counters * counter_size,
+				    __alignof__(*counters->level0), gfp_flags);
+	if (!level0)
+		return -ENOMEM;
+	level0_iter = level0;
+	items_iter = items;
+	for (i = 0; i < nr_counters; i++) {
+		ret = __percpu_counter_tree_init(&counters[i], batch_size, gfp_flags, level0_iter, items_iter);
+		if (ret)
+			goto free_level0;
+		level0_iter += counter_size;
+		items_iter += items_size;
+	}
+	return 0;
+
+free_level0:
+	free_percpu(level0);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_init_many);
+
+/**
+ * percpu_counter_tree_init() - Initialize one per-CPU counter tree.
+ * @counter: Counter to initialize.
+ *	     Its memory is provided by the caller.
+ * @items: Pointer to memory area where to store tree items.
+ *	   This memory is provided by the caller.
+ *	   Its size needs to be at least percpu_counter_tree_items_size().
+ * @batch_size: The batch size is the increment step at level 0 which triggers a
+ * 		carry propagation.
+ *		The batch size is required to be greater than 1, and a power of 2.
+ * @gfp_flags: gfp flags to pass to the per-CPU allocator.
+ *
+ * Initialize one per-CPU counter tree.
+ *
+ * Return:
+ * * %0: Success
+ * * %-EINVAL:		- Invalid @batch_size argument
+ * * %-ENOMEM:		- Out of memory
+ */
+int percpu_counter_tree_init(struct percpu_counter_tree *counter, struct percpu_counter_tree_level_item *items,
+			     unsigned long batch_size, gfp_t gfp_flags)
+{
+	return percpu_counter_tree_init_many(counter, items, 1, batch_size, gfp_flags);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_init);
+
+/**
+ * percpu_counter_tree_destroy_many() - Destroy many per-CPU counter trees.
+ * @counters: Array of counters trees to destroy.
+ * @nr_counters: The number of counter trees to destroy.
+ *
+ * Release internal resources allocated for @nr_counters per-CPU counter trees.
+ */
+
+void percpu_counter_tree_destroy_many(struct percpu_counter_tree *counters, unsigned int nr_counters)
+{
+	free_percpu(counters->level0);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_destroy_many);
+
+/**
+ * percpu_counter_tree_destroy() - Destroy one per-CPU counter tree.
+ * @counter: Counter to destroy.
+ *
+ * Release internal resources allocated for one per-CPU counter tree.
+ */
+void percpu_counter_tree_destroy(struct percpu_counter_tree *counter)
+{
+	return percpu_counter_tree_destroy_many(counter, 1);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_destroy);
+
+static
+long percpu_counter_tree_carry(long orig, long res, long inc, unsigned long bit_mask)
+{
+	if (inc < 0) {
+		inc = -(-inc & ~(bit_mask - 1));
+		/*
+		 * xor bit_mask: underflow.
+		 *
+		 * If inc has bit set, decrement an additional bit if
+		 * there is _no_ bit transition between orig and res.
+		 * Else, inc has bit cleared, decrement an additional
+		 * bit if there is a bit transition between orig and
+		 * res.
+		 */
+		if ((inc ^ orig ^ res) & bit_mask)
+			inc -= bit_mask;
+	} else {
+		inc &= ~(bit_mask - 1);
+		/*
+		 * xor bit_mask: overflow.
+		 *
+		 * If inc has bit set, increment an additional bit if
+		 * there is _no_ bit transition between orig and res.
+		 * Else, inc has bit cleared, increment an additional
+		 * bit if there is a bit transition between orig and
+		 * res.
+		 */
+		if ((inc ^ orig ^ res) & bit_mask)
+			inc += bit_mask;
+	}
+	return inc;
+}
+
+/*
+ * It does not matter through which path the carry propagates up the
+ * tree, therefore there is no need to disable preemption because the
+ * cpu number is only used to favor cache locality.
+ */
+static
+void percpu_counter_tree_add_slowpath(struct percpu_counter_tree *counter, long inc)
+{
+	unsigned int level_items, nr_levels = counter_config->nr_levels,
+		     level, n_arity_order;
+	unsigned long bit_mask;
+	struct percpu_counter_tree_level_item *item = counter->items;
+	unsigned int cpu = raw_smp_processor_id();
+
+	WARN_ON_ONCE(!nr_cpus_order);	/* Should never be called for 1 cpu. */
+
+	n_arity_order = counter_config->n_arity_order[0];
+	bit_mask = counter->level0_bit_mask << n_arity_order;
+	level_items = 1U << (nr_cpus_order - n_arity_order);
+
+	for (level = 1; level < nr_levels; level++) {
+		/*
+		 * For the purpose of carry propagation, the
+		 * intermediate level counters only need to keep track
+		 * of the bits relevant for carry propagation. We
+		 * therefore don't care about higher order bits.
+		 * Note that this optimization is unwanted if the
+		 * intended use is to track counters within intermediate
+		 * levels of the topology.
+		 */
+		if (abs(inc) & (bit_mask - 1)) {
+			atomic_long_t *count = &item[cpu & (level_items - 1)].count;
+			unsigned long orig, res;
+
+			res = atomic_long_add_return_relaxed(inc, count);
+			orig = res - inc;
+			inc = percpu_counter_tree_carry(orig, res, inc, bit_mask);
+			if (likely(!inc))
+				return;
+		}
+		item += level_items;
+		n_arity_order = counter_config->n_arity_order[level];
+		level_items >>= n_arity_order;
+		bit_mask <<= n_arity_order;
+	}
+	atomic_long_add(inc, counter->approx_sum.a);
+}
+
+/**
+ * percpu_counter_tree_add() - Add to a per-CPU counter tree.
+ * @counter: Counter added to.
+ * @inc: Increment value (either positive or negative).
+ *
+ * Add @inc to a per-CPU counter tree. This is a fast-path which will
+ * typically increment per-CPU counters as long as there is no carry
+ * greater or equal to the counter tree batch size.
+ */
+void percpu_counter_tree_add(struct percpu_counter_tree *counter, long inc)
+{
+	unsigned long bit_mask = counter->level0_bit_mask, orig, res;
+
+	res = this_cpu_add_return(*counter->level0, inc);
+	orig = res - inc;
+	inc = percpu_counter_tree_carry(orig, res, inc, bit_mask);
+	if (likely(!inc))
+		return;
+	percpu_counter_tree_add_slowpath(counter, inc);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_add);
+
+static
+long percpu_counter_tree_precise_sum_unbiased(struct percpu_counter_tree *counter)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		sum += *per_cpu_ptr(counter->level0, cpu);
+	return (long) sum;
+}
+
+/**
+ * percpu_counter_tree_precise_sum() - Return precise counter sum.
+ * @counter: The counter to sum.
+ *
+ * Querying the precise sum is relatively expensive because it needs to
+ * iterate over all CPUs.
+ * This is meant to be used when accuracy is preferred over speed.
+ *
+ * Return: The current precise counter sum.
+ */
+long percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter)
+{
+	return percpu_counter_tree_precise_sum_unbiased(counter) + READ_ONCE(counter->bias);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_precise_sum);
+
+static
+int compare_delta(long delta, unsigned long accuracy_pos, unsigned long accuracy_neg)
+{
+	if (delta >= 0) {
+		if (delta <= accuracy_pos)
+			return 0;
+		else
+			return 1;
+	} else {
+		if (-delta <= accuracy_neg)
+			return 0;
+		else
+			return -1;
+	}
+}
+
+/**
+ * percpu_counter_tree_approximate_compare - Approximated comparison of two counter trees.
+ * @a: First counter to compare.
+ * @b: Second counter to compare.
+ *
+ * Evaluate an approximate comparison of two counter trees.
+ * This approximation comparison is fast, and provides an accurate
+ * answer if the counters are found to be either less than or greater
+ * than the other. However, if the approximated comparison returns
+ * 0, the counters respective sums are found to be within the two
+ * counters accuracy range.
+ *
+ * Return:
+ * * %0		- Counters @a and @b do not differ by more than the sum of their respective
+ *                accuracy ranges.
+ * * %-1	- Counter @a less than counter @b.
+ * * %1		- Counter @a is greater than counter @b.
+ */
+int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b)
+{
+	return compare_delta(percpu_counter_tree_approximate_sum(a) - percpu_counter_tree_approximate_sum(b),
+			     a->approx_accuracy_range.over + b->approx_accuracy_range.under,
+			     a->approx_accuracy_range.under + b->approx_accuracy_range.over);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_approximate_compare);
+
+/**
+ * percpu_counter_tree_approximate_compare_value - Approximated comparison of a counter tree against a given value.
+ * @counter: Counter to compare.
+ * @v: Value to compare.
+ *
+ * Evaluate an approximate comparison of a counter tree against a given value.
+ * This approximation comparison is fast, and provides an accurate
+ * answer if the counter is found to be either less than or greater
+ * than the value. However, if the approximated comparison returns
+ * 0, the value is within the counter accuracy range.
+ *
+ * Return:
+ * * %0		- The value @v is within the accuracy range of the counter.
+ * * %-1	- The value @v is less than the counter.
+ * * %1		- The value @v is greater than the counter.
+ */
+int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tree *counter, long v)
+{
+	return compare_delta(v - percpu_counter_tree_approximate_sum(counter),
+			     counter->approx_accuracy_range.under,
+			     counter->approx_accuracy_range.over);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_approximate_compare_value);
+
+/**
+ * percpu_counter_tree_precise_compare - Precise comparison of two counter trees.
+ * @a: First counter to compare.
+ * @b: Second counter to compare.
+ *
+ * Evaluate a precise comparison of two counter trees.
+ * As an optimization, it uses the approximate counter comparison
+ * to quickly compare counters which are far apart. Only cases where
+ * counter sums are within the accuracy range require precise counter
+ * sums.
+ *
+ * Return:
+ * * %0		- Counters are equal.
+ * * %-1	- Counter @a less than counter @b.
+ * * %1		- Counter @a is greater than counter @b.
+ */
+int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b)
+{
+	long count_a = percpu_counter_tree_approximate_sum(a),
+	     count_b = percpu_counter_tree_approximate_sum(b);
+	unsigned long accuracy_a, accuracy_b;
+	long delta = count_a - count_b;
+	int res;
+
+	res = compare_delta(delta,
+			    a->approx_accuracy_range.over + b->approx_accuracy_range.under,
+			    a->approx_accuracy_range.under + b->approx_accuracy_range.over);
+	/* The values are distanced enough for an accurate approximated comparison. */
+	if (res)
+		return res;
+
+	/*
+	 * The approximated comparison is within the accuracy range, therefore at least one
+	 * precise sum is needed. Sum the counter which has the largest accuracy first.
+	 */
+	if (delta >= 0) {
+		accuracy_a = a->approx_accuracy_range.under;
+		accuracy_b = b->approx_accuracy_range.over;
+	} else {
+		accuracy_a = a->approx_accuracy_range.over;
+		accuracy_b = b->approx_accuracy_range.under;
+	}
+	if (accuracy_b < accuracy_a) {
+		count_a = percpu_counter_tree_precise_sum(a);
+		res = compare_delta(count_a - count_b,
+				    b->approx_accuracy_range.under,
+				    b->approx_accuracy_range.over);
+		if (res)
+			return res;
+		/* Precise sum of second counter is required. */
+		count_b = percpu_counter_tree_precise_sum(b);
+	} else {
+		count_b = percpu_counter_tree_precise_sum(b);
+		res = compare_delta(count_a - count_b,
+				    a->approx_accuracy_range.over,
+				    a->approx_accuracy_range.under);
+		if (res)
+			return res;
+		/* Precise sum of second counter is required. */
+		count_a = percpu_counter_tree_precise_sum(a);
+	}
+	if (count_a - count_b < 0)
+		return -1;
+	if (count_a - count_b > 0)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_precise_compare);
+
+/**
+ * percpu_counter_tree_precise_compare_value - Precise comparison of a counter tree against a given value.
+ * @counter: Counter to compare.
+ * @v: Value to compare.
+ *
+ * Evaluate a precise comparison of a counter tree against a given value.
+ * As an optimization, it uses the approximate counter comparison
+ * to quickly identify whether the counter and value are far apart.
+ * Only cases where the value is within the counter accuracy range
+ * require a precise counter sum.
+ *
+ * Return:
+ * * %0		- The value @v is equal to the counter.
+ * * %-1	- The value @v is less than the counter.
+ * * %1		- The value @v is greater than the counter.
+ */
+int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *counter, long v)
+{
+	long count = percpu_counter_tree_approximate_sum(counter);
+	int res;
+
+	res = compare_delta(v - count,
+			    counter->approx_accuracy_range.under,
+			    counter->approx_accuracy_range.over);
+	/* The values are distanced enough for an accurate approximated comparison. */
+	if (res)
+		return res;
+
+	/* Precise sum is required. */
+	count = percpu_counter_tree_precise_sum(counter);
+	if (v - count < 0)
+		return -1;
+	if (v - count > 0)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_precise_compare_value);
+
+static
+void percpu_counter_tree_set_bias(struct percpu_counter_tree *counter, long bias)
+{
+	WRITE_ONCE(counter->bias, bias);
+}
+
+/**
+ * percpu_counter_tree_set - Set the counter tree sum to a given value.
+ * @counter: Counter to set.
+ * @v: Value to set.
+ *
+ * Set the counter sum to a given value. It can be useful for instance
+ * to reset the counter sum to 0. Note that even after setting the
+ * counter sum to a given value, the counter sum approximation can
+ * return any value within the accuracy range around that value.
+ */
+void percpu_counter_tree_set(struct percpu_counter_tree *counter, long v)
+{
+	percpu_counter_tree_set_bias(counter,
+				     v - percpu_counter_tree_precise_sum_unbiased(counter));
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_set);
+
+/*
+ * percpu_counter_tree_items_size - Query the size required for counter tree items.
+ *
+ * Query the size of the memory area required to hold the counter tree
+ * items. This depends on the hardware topology and is invariant after
+ * boot.
+ *
+ * Return: Size required to hold tree items.
+ */
+size_t percpu_counter_tree_items_size(void)
+{
+	if (!nr_cpus_order)
+		return 0;
+	return counter_config->nr_items * sizeof(struct percpu_counter_tree_level_item);
+}
+EXPORT_SYMBOL_GPL(percpu_counter_tree_items_size);
+
+static void __init calculate_accuracy_topology(void)
+{
+	unsigned int nr_levels = counter_config->nr_levels, level;
+	unsigned int level_items = 1U << nr_cpus_order;
+	unsigned long batch_size = 1;
+
+	for (level = 0; level < nr_levels; level++) {
+		unsigned int n_arity_order = counter_config->n_arity_order[level];
+
+		/*
+		 * The accuracy multiplier is derived from a batch size of 1
+		 * to speed up calculating the accuracy at tree initialization.
+		 */
+		accuracy_multiplier += batch_size * level_items;
+		batch_size <<= n_arity_order;
+		level_items >>= n_arity_order;
+	}
+}
+
+int __init percpu_counter_tree_subsystem_init(void)
+{
+	nr_cpus_order = get_count_order(nr_cpu_ids);
+	if (WARN_ON_ONCE(nr_cpus_order >= ARRAY_SIZE(per_nr_cpu_order_config))) {
+		printk(KERN_ERR "Unsupported number of CPUs (%u)\n", nr_cpu_ids);
+		return -1;
+	}
+	counter_config = &per_nr_cpu_order_config[nr_cpus_order];
+	calculate_accuracy_topology();
+	return 0;
+}
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index 05f74edbc62b..d282aa23d273 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -56,4 +56,6 @@ obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o
 obj-$(CONFIG_RATELIMIT_KUNIT_TEST) += test_ratelimit.o
 obj-$(CONFIG_UUID_KUNIT_TEST) += uuid_kunit.o
 
+obj-$(CONFIG_PERCPU_COUNTER_TREE_TEST) += percpu_counter_tree_kunit.o
+
 obj-$(CONFIG_TEST_RUNTIME_MODULE)		+= module/
diff --git a/lib/tests/percpu_counter_tree_kunit.c b/lib/tests/percpu_counter_tree_kunit.c
new file mode 100644
index 000000000000..a79176655c4b
--- /dev/null
+++ b/lib/tests/percpu_counter_tree_kunit.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0+ OR MIT
+// SPDX-FileCopyrightText: 2026 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+#include <kunit/test.h>
+#include <linux/percpu_counter_tree.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/random.h>
+
+struct multi_thread_test_data {
+	long increment;
+	int nr_inc;
+	int counter_index;
+};
+
+#define NR_COUNTERS	2
+
+/* Hierarchical per-CPU counter instances. */
+static struct percpu_counter_tree counter[NR_COUNTERS];
+static struct percpu_counter_tree_level_item *items;
+
+/* Global atomic counters for validation. */
+static atomic_long_t global_counter[NR_COUNTERS];
+
+static DECLARE_WAIT_QUEUE_HEAD(kernel_threads_wq);
+static atomic_t kernel_threads_to_run;
+
+static void complete_work(void)
+{
+	if (atomic_dec_and_test(&kernel_threads_to_run))
+		wake_up(&kernel_threads_wq);
+}
+
+static void hpcc_print_info(struct kunit *test)
+{
+	kunit_info(test, "Running test with %d CPUs\n", num_online_cpus());
+}
+
+static void add_to_counter(int counter_index, unsigned int nr_inc, long increment)
+{
+	unsigned int i;
+
+	for (i = 0; i < nr_inc; i++) {
+		percpu_counter_tree_add(&counter[counter_index], increment);
+		atomic_long_add(increment, &global_counter[counter_index]);
+	}
+}
+
+static void check_counters(struct kunit *test)
+{
+	int counter_index;
+
+	/* Compare each counter with its global counter. */
+	for (counter_index = 0; counter_index < NR_COUNTERS; counter_index++) {
+		long v = atomic_long_read(&global_counter[counter_index]);
+		long approx_sum = percpu_counter_tree_approximate_sum(&counter[counter_index]);
+		unsigned long under_accuracy = 0, over_accuracy = 0;
+		long precise_min, precise_max;
+
+		/* Precise comparison. */
+		KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&counter[counter_index]), v);
+		KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_precise_compare_value(&counter[counter_index], v));
+
+		/* Approximate comparison. */
+		KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&counter[counter_index], v));
+
+		/* Accuracy limits checks. */
+		percpu_counter_tree_approximate_accuracy_range(&counter[counter_index], &under_accuracy, &over_accuracy);
+
+		KUNIT_EXPECT_GE(test, (long)(approx_sum - (v - under_accuracy)), 0);
+		KUNIT_EXPECT_LE(test, (long)(approx_sum - (v + over_accuracy)), 0);
+		KUNIT_EXPECT_GT(test, (long)(approx_sum - (v - under_accuracy - 1)), 0);
+		KUNIT_EXPECT_LT(test, (long)(approx_sum - (v + over_accuracy + 1)), 0);
+
+		/* Precise min/max range check. */
+		percpu_counter_tree_approximate_min_max_range(approx_sum, under_accuracy, over_accuracy, &precise_min, &precise_max);
+
+		KUNIT_EXPECT_GE(test, v - precise_min, 0);
+		KUNIT_EXPECT_LE(test, v - precise_max, 0);
+		KUNIT_EXPECT_GT(test, v - (precise_min - 1), 0);
+		KUNIT_EXPECT_LT(test, v - (precise_max + 1), 0);
+	}
+	/* Compare each counter with the second counter. */
+	KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&counter[0]), percpu_counter_tree_precise_sum(&counter[1]));
+	KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_precise_compare(&counter[0], &counter[1]));
+	KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare(&counter[0], &counter[1]));
+}
+
+static int multi_thread_worker_fn(void *data)
+{
+	struct multi_thread_test_data *td = data;
+
+	add_to_counter(td->counter_index, td->nr_inc, td->increment);
+	complete_work();
+	kfree(td);
+	return 0;
+}
+
+static void test_run_on_specific_cpu(struct kunit *test, int target_cpu, int counter_index, unsigned int nr_inc, long increment)
+{
+	struct task_struct *task;
+	struct multi_thread_test_data *td = kzalloc(sizeof(struct multi_thread_test_data), GFP_KERNEL);
+
+	KUNIT_EXPECT_PTR_NE(test, td, NULL);
+	td->increment = increment;
+	td->nr_inc = nr_inc;
+	td->counter_index = counter_index;
+	atomic_inc(&kernel_threads_to_run);
+	task = kthread_run_on_cpu(multi_thread_worker_fn, td, target_cpu, "kunit_multi_thread_worker");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, task);
+}
+
+static void init_kthreads(void)
+{
+	atomic_set(&kernel_threads_to_run, 1);
+}
+
+static void fini_kthreads(void)
+{
+	/* Release our own reference. */
+	complete_work();
+	/* Wait for all others threads to run. */
+	wait_event(kernel_threads_wq, (atomic_read(&kernel_threads_to_run) == 0));
+}
+
+static void test_sync_kthreads(void)
+{
+	fini_kthreads();
+	init_kthreads();
+}
+
+static void init_counters(struct kunit *test, unsigned long batch_size)
+{
+	int i, ret;
+
+	items = kzalloc(percpu_counter_tree_items_size() * NR_COUNTERS, GFP_KERNEL);
+	KUNIT_EXPECT_PTR_NE(test, items, NULL);
+	ret = percpu_counter_tree_init_many(counter, items, NR_COUNTERS, batch_size, GFP_KERNEL);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	for (i = 0; i < NR_COUNTERS; i++)
+		atomic_long_set(&global_counter[i], 0);
+}
+
+static void fini_counters(void)
+{
+	percpu_counter_tree_destroy_many(counter, NR_COUNTERS);
+	kfree(items);
+}
+
+enum up_test_inc_type {
+	INC_ONE,
+	INC_MINUS_ONE,
+	INC_RANDOM,
+};
+
+/*
+ * Single-threaded tests. Those use many threads to run on various CPUs,
+ * but synchronize for completion of each thread before running the
+ * next, effectively making sure there are no concurrent updates.
+ */
+static void do_hpcc_test_single_thread(struct kunit *test, int _cpu0, int _cpu1, enum up_test_inc_type type)
+{
+	unsigned long batch_size_order = 5;
+	int cpu0 = _cpu0;
+	int cpu1 = _cpu1;
+	int i;
+
+	init_counters(test, 1UL << batch_size_order);
+	init_kthreads();
+	for (i = 0; i < 10000; i++) {
+		long increment;
+
+		switch (type) {
+		case INC_ONE:
+			increment = 1;
+			break;
+		case INC_MINUS_ONE:
+			increment = -1;
+			break;
+		case INC_RANDOM:
+			increment = (long) get_random_long() % 50000;
+			break;
+		}
+		if (_cpu0 < 0)
+			cpu0 = cpumask_any_distribute(cpu_online_mask);
+		if (_cpu1 < 0)
+			cpu1 = cpumask_any_distribute(cpu_online_mask);
+		test_run_on_specific_cpu(test, cpu0, 0, 1, increment);
+		test_sync_kthreads();
+		test_run_on_specific_cpu(test, cpu1, 1, 1, increment);
+		test_sync_kthreads();
+		check_counters(test);
+	}
+	fini_kthreads();
+	fini_counters();
+}
+
+static void hpcc_test_single_thread_first(struct kunit *test)
+{
+	int cpu = cpumask_first(cpu_online_mask);
+
+	do_hpcc_test_single_thread(test, cpu, cpu, INC_ONE);
+	do_hpcc_test_single_thread(test, cpu, cpu, INC_MINUS_ONE);
+	do_hpcc_test_single_thread(test, cpu, cpu, INC_RANDOM);
+}
+
+static void hpcc_test_single_thread_first_random(struct kunit *test)
+{
+	int cpu = cpumask_first(cpu_online_mask);
+
+	do_hpcc_test_single_thread(test, cpu, -1, INC_ONE);
+	do_hpcc_test_single_thread(test, cpu, -1, INC_MINUS_ONE);
+	do_hpcc_test_single_thread(test, cpu, -1, INC_RANDOM);
+}
+
+static void hpcc_test_single_thread_random(struct kunit *test)
+{
+	do_hpcc_test_single_thread(test, -1, -1, INC_ONE);
+	do_hpcc_test_single_thread(test, -1, -1, INC_MINUS_ONE);
+	do_hpcc_test_single_thread(test, -1, -1, INC_RANDOM);
+}
+
+/* Multi-threaded SMP tests. */
+
+static void do_hpcc_multi_thread_increment_each_cpu(struct kunit *test, unsigned long batch_size, unsigned int nr_inc, long increment)
+{
+	int cpu;
+
+	init_counters(test, batch_size);
+	init_kthreads();
+	for_each_online_cpu(cpu) {
+		test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment);
+		test_run_on_specific_cpu(test, cpu, 1, nr_inc, increment);
+	}
+	fini_kthreads();
+	check_counters(test);
+	fini_counters();
+}
+
+static void do_hpcc_multi_thread_increment_even_cpus(struct kunit *test, unsigned long batch_size, unsigned int nr_inc, long increment)
+{
+	int cpu;
+
+	init_counters(test, batch_size);
+	init_kthreads();
+	for_each_online_cpu(cpu) {
+		test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment);
+		test_run_on_specific_cpu(test, cpu & ~1, 1, nr_inc, increment); /* even cpus. */
+	}
+	fini_kthreads();
+	check_counters(test);
+	fini_counters();
+}
+
+static void do_hpcc_multi_thread_increment_single_cpu(struct kunit *test, unsigned long batch_size, unsigned int nr_inc, long increment)
+{
+	int cpu;
+
+	init_counters(test, batch_size);
+	init_kthreads();
+	for_each_online_cpu(cpu) {
+		test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment);
+		test_run_on_specific_cpu(test, cpumask_first(cpu_online_mask), 1, nr_inc, increment);
+	}
+	fini_kthreads();
+	check_counters(test);
+	fini_counters();
+}
+
+static void do_hpcc_multi_thread_increment_random_cpu(struct kunit *test, unsigned long batch_size, unsigned int nr_inc, long increment)
+{
+	int cpu;
+
+	init_counters(test, batch_size);
+	init_kthreads();
+	for_each_online_cpu(cpu) {
+		test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment);
+		test_run_on_specific_cpu(test, cpumask_any_distribute(cpu_online_mask), 1, nr_inc, increment);
+	}
+	fini_kthreads();
+	check_counters(test);
+	fini_counters();
+}
+
+static void hpcc_test_multi_thread_batch_increment(struct kunit *test)
+{
+	unsigned long batch_size_order;
+
+	for (batch_size_order = 2; batch_size_order < 10; batch_size_order++) {
+		unsigned int nr_inc;
+
+		for (nr_inc = 1; nr_inc < 1024; nr_inc *= 2) {
+			long increment;
+
+			for (increment = 1; increment < 100000; increment *= 10) {
+				do_hpcc_multi_thread_increment_each_cpu(test, 1UL << batch_size_order, nr_inc, increment);
+				do_hpcc_multi_thread_increment_even_cpus(test, 1UL << batch_size_order, nr_inc, increment);
+				do_hpcc_multi_thread_increment_single_cpu(test, 1UL << batch_size_order, nr_inc, increment);
+				do_hpcc_multi_thread_increment_random_cpu(test, 1UL << batch_size_order, nr_inc, increment);
+			}
+		}
+	}
+}
+
+static void hpcc_test_multi_thread_random_walk(struct kunit *test)
+{
+	unsigned long batch_size_order = 5;
+	int loop;
+
+	for (loop = 0; loop < 100; loop++) {
+		int i;
+
+		init_counters(test, 1UL << batch_size_order);
+		init_kthreads();
+		for (i = 0; i < 1000; i++) {
+			long increment = (long) get_random_long() % 512;
+			unsigned int nr_inc = ((unsigned long) get_random_long()) % 1024;
+
+			test_run_on_specific_cpu(test, cpumask_any_distribute(cpu_online_mask), 0, nr_inc, increment);
+			test_run_on_specific_cpu(test, cpumask_any_distribute(cpu_online_mask), 1, nr_inc, increment);
+		}
+		fini_kthreads();
+		check_counters(test);
+		fini_counters();
+	}
+}
+
+static void hpcc_test_init_one(struct kunit *test)
+{
+	struct percpu_counter_tree pct;
+	struct percpu_counter_tree_level_item *counter_items;
+	int ret;
+
+	counter_items = kzalloc(percpu_counter_tree_items_size(), GFP_KERNEL);
+	KUNIT_EXPECT_PTR_NE(test, counter_items, NULL);
+	ret = percpu_counter_tree_init(&pct, counter_items, 32, GFP_KERNEL);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	percpu_counter_tree_destroy(&pct);
+	kfree(counter_items);
+}
+
+static void hpcc_test_set(struct kunit *test)
+{
+	static long values[] = {
+		5, 100, 127, 128, 255, 256, 4095, 4096, 500000, 0,
+		-5, -100, -127, -128, -255, -256, -4095, -4096, -500000,
+	};
+	struct percpu_counter_tree pct;
+	struct percpu_counter_tree_level_item *counter_items;
+	int i, ret;
+
+	counter_items = kzalloc(percpu_counter_tree_items_size(), GFP_KERNEL);
+	KUNIT_EXPECT_PTR_NE(test, counter_items, NULL);
+	ret = percpu_counter_tree_init(&pct, counter_items, 32, GFP_KERNEL);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	for (i = 0; i < ARRAY_SIZE(values); i++) {
+		long v = values[i];
+
+		percpu_counter_tree_set(&pct, v);
+		KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&pct), v);
+		KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&pct, v));
+
+		percpu_counter_tree_add(&pct, v);
+		KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&pct), 2 * v);
+		KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&pct, 2 * v));
+
+		percpu_counter_tree_add(&pct, -2 * v);
+		KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&pct), 0);
+		KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&pct, 0));
+	}
+
+	percpu_counter_tree_destroy(&pct);
+	kfree(counter_items);
+}
+
+static struct kunit_case hpcc_test_cases[] = {
+	KUNIT_CASE(hpcc_print_info),
+	KUNIT_CASE(hpcc_test_single_thread_first),
+	KUNIT_CASE(hpcc_test_single_thread_first_random),
+	KUNIT_CASE(hpcc_test_single_thread_random),
+	KUNIT_CASE(hpcc_test_multi_thread_batch_increment),
+	KUNIT_CASE(hpcc_test_multi_thread_random_walk),
+	KUNIT_CASE(hpcc_test_init_one),
+	KUNIT_CASE(hpcc_test_set),
+	{}
+};
+
+static struct kunit_suite hpcc_test_suite = {
+	.name = "percpu_counter_tree",
+	.test_cases = hpcc_test_cases,
+};
+
+kunit_test_suite(hpcc_test_suite);
+
+MODULE_DESCRIPTION("Test cases for hierarchical per-CPU counters");
+MODULE_LICENSE("Dual MIT/GPL");
diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c
index a565c30c71a0..faebf5b7cd6e 100644
--- a/lib/vdso/datastore.c
+++ b/lib/vdso/datastore.c
@@ -1,64 +1,92 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <linux/linkage.h>
-#include <linux/mmap_lock.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/time_namespace.h>
 #include <linux/types.h>
 #include <linux/vdso_datastore.h>
 #include <vdso/datapage.h>
 
-/*
- * The vDSO data page.
- */
+static u8 vdso_initdata[VDSO_NR_PAGES * PAGE_SIZE] __aligned(PAGE_SIZE) __initdata = {};
+
 #ifdef CONFIG_GENERIC_GETTIMEOFDAY
-static union {
-	struct vdso_time_data	data;
-	u8			page[PAGE_SIZE];
-} vdso_time_data_store __page_aligned_data;
-struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data;
-static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE);
+struct vdso_time_data *vdso_k_time_data __refdata =
+	(void *)&vdso_initdata[VDSO_TIME_PAGE_OFFSET * PAGE_SIZE];
+
+static_assert(sizeof(struct vdso_time_data) <= PAGE_SIZE);
 #endif /* CONFIG_GENERIC_GETTIMEOFDAY */
 
 #ifdef CONFIG_VDSO_GETRANDOM
-static union {
-	struct vdso_rng_data	data;
-	u8			page[PAGE_SIZE];
-} vdso_rng_data_store __page_aligned_data;
-struct vdso_rng_data *vdso_k_rng_data = &vdso_rng_data_store.data;
-static_assert(sizeof(vdso_rng_data_store) == PAGE_SIZE);
+struct vdso_rng_data *vdso_k_rng_data __refdata =
+	(void *)&vdso_initdata[VDSO_RNG_PAGE_OFFSET * PAGE_SIZE];
+
+static_assert(sizeof(struct vdso_rng_data) <= PAGE_SIZE);
 #endif /* CONFIG_VDSO_GETRANDOM */
 
 #ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA
-static union {
-	struct vdso_arch_data	data;
-	u8			page[VDSO_ARCH_DATA_SIZE];
-} vdso_arch_data_store __page_aligned_data;
-struct vdso_arch_data *vdso_k_arch_data = &vdso_arch_data_store.data;
+struct vdso_arch_data *vdso_k_arch_data __refdata =
+	(void *)&vdso_initdata[VDSO_ARCH_PAGES_START * PAGE_SIZE];
 #endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */
 
+void __init vdso_setup_data_pages(void)
+{
+	unsigned int order = get_order(VDSO_NR_PAGES * PAGE_SIZE);
+	struct page *pages;
+
+	/*
+	 * Allocate the data pages dynamically. SPARC does not support mapping
+	 * static pages to be mapped into userspace.
+	 * It is also a requirement for mlockall() support.
+	 *
+	 * Do not use folios. In time namespaces the pages are mapped in a different order
+	 * to userspace, which is not handled by the folio optimizations in finish_fault().
+	 */
+	pages = alloc_pages(GFP_KERNEL, order);
+	if (!pages)
+		panic("Unable to allocate VDSO storage pages");
+
+	/* The pages are mapped one-by-one into userspace and each one needs to be refcounted. */
+	split_page(pages, order);
+
+	/* Move the data already written by other subsystems to the new pages */
+	memcpy(page_address(pages), vdso_initdata, VDSO_NR_PAGES * PAGE_SIZE);
+
+	if (IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY))
+		vdso_k_time_data = page_address(pages + VDSO_TIME_PAGE_OFFSET);
+
+	if (IS_ENABLED(CONFIG_VDSO_GETRANDOM))
+		vdso_k_rng_data = page_address(pages + VDSO_RNG_PAGE_OFFSET);
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA))
+		vdso_k_arch_data = page_address(pages + VDSO_ARCH_PAGES_START);
+}
+
 static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 			     struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct page *timens_page = find_timens_vvar_page(vma);
-	unsigned long addr, pfn;
-	vm_fault_t err;
+	struct page *page, *timens_page;
+
+	timens_page = find_timens_vvar_page(vma);
 
 	switch (vmf->pgoff) {
 	case VDSO_TIME_PAGE_OFFSET:
 		if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY))
 			return VM_FAULT_SIGBUS;
-		pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data));
+		page = virt_to_page(vdso_k_time_data);
 		if (timens_page) {
 			/*
 			 * Fault in VVAR page too, since it will be accessed
 			 * to get clock data anyway.
 			 */
+			unsigned long addr;
+			vm_fault_t err;
+
 			addr = vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE;
-			err = vmf_insert_pfn(vma, addr, pfn);
+			err = vmf_insert_page(vma, addr, page);
 			if (unlikely(err & VM_FAULT_ERROR))
 				return err;
-			pfn = page_to_pfn(timens_page);
+			page = timens_page;
 		}
 		break;
 	case VDSO_TIMENS_PAGE_OFFSET:
@@ -71,24 +99,25 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 		 */
 		if (!IS_ENABLED(CONFIG_TIME_NS) || !timens_page)
 			return VM_FAULT_SIGBUS;
-		pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data));
+		page = virt_to_page(vdso_k_time_data);
 		break;
 	case VDSO_RNG_PAGE_OFFSET:
 		if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM))
 			return VM_FAULT_SIGBUS;
-		pfn = __phys_to_pfn(__pa_symbol(vdso_k_rng_data));
+		page = virt_to_page(vdso_k_rng_data);
 		break;
 	case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END:
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA))
 			return VM_FAULT_SIGBUS;
-		pfn = __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) +
-			vmf->pgoff - VDSO_ARCH_PAGES_START;
+		page = virt_to_page(vdso_k_arch_data) + vmf->pgoff - VDSO_ARCH_PAGES_START;
 		break;
 	default:
 		return VM_FAULT_SIGBUS;
 	}
 
-	return vmf_insert_pfn(vma, vmf->address, pfn);
+	get_page(page);
+	vmf->page = page;
+	return 0;
 }
 
 const struct vm_special_mapping vdso_vvar_mapping = {
@@ -100,7 +129,7 @@ struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned
 {
 	return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE,
 					VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP |
-					VM_PFNMAP | VM_SEALED_SYSMAP,
+					VM_MIXEDMAP | VM_SEALED_SYSMAP,
 					&vdso_vvar_mapping);
 }
 
diff --git a/mm/memblock.c b/mm/memblock.c
index b3ddfdec7a80..ae6a5af46bd7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -959,28 +959,6 @@ __init void memblock_clear_kho_scratch_only(void)
 {
 	kho_scratch_only = false;
 }
-
-__init void memmap_init_kho_scratch_pages(void)
-{
-	phys_addr_t start, end;
-	unsigned long pfn;
-	int nid;
-	u64 i;
-
-	if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
-		return;
-
-	/*
-	 * Initialize struct pages for free scratch memory.
-	 * The struct pages for reserved scratch memory will be set up in
-	 * reserve_bootmem_region()
-	 */
-	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
-			     MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
-		for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
-			init_deferred_page(pfn, nid);
-	}
-}
 #endif
 
 /**
diff --git a/mm/mm_init.c b/mm/mm_init.c
index df34797691bd..7363b5b0d22a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -786,7 +786,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
 	for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
 		struct page *page = pfn_to_page(pfn);
 
-		__init_deferred_page(pfn, nid);
+		if (!pfn_is_kho_scratch(pfn))
+			__init_deferred_page(pfn, nid);
 
 		/*
 		 * no need for atomic set_bit because the struct
@@ -1996,9 +1997,12 @@ static void __init deferred_free_pages(unsigned long pfn,
 
 	/* Free a large naturally-aligned chunk if possible */
 	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
-		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
+		for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
+			if (pfn_is_kho_scratch(page_to_pfn(page + i)))
+				continue;
 			init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
 					false);
+		}
 		__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
 		return;
 	}
@@ -2007,7 +2011,7 @@ static void __init deferred_free_pages(unsigned long pfn,
 	accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
 
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
-		if (pageblock_aligned(pfn))
+		if (pageblock_aligned(pfn) && !pfn_is_kho_scratch(pfn))
 			init_pageblock_migratetype(page, MIGRATE_MOVABLE,
 					false);
 		__free_pages_core(page, 0, MEMINIT_EARLY);
@@ -2078,9 +2082,11 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
 			unsigned long chunk_end = min(mo_pfn, epfn);
 
-			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
-			deferred_free_pages(spfn, chunk_end - spfn);
+			// KHO scratch is MAX_ORDER_NR_PAGES aligned.
+			if (!pfn_is_kho_scratch(spfn))
+				deferred_init_pages(zone, spfn, chunk_end);
 
+			deferred_free_pages(spfn, chunk_end - spfn);
 			spfn = chunk_end;
 
 			if (can_resched)
@@ -2088,6 +2094,7 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 			else
 				touch_nmi_watchdog();
 		}
+		nr_pages += epfn - spfn;
 	}
 
 	return nr_pages;

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox