Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v5 4/7] bootconfig: clean build-time tools/bootconfig from make clean
From: Breno Leitao @ 2026-06-17 11:23 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton, Nathan Chancellor, paulmck,
	Nicolas Schier
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, linux-kernel, linux-trace-kernel, linux-kbuild,
	bpf, Breno Leitao, kernel-team
In-Reply-To: <20260617-bootconfig_using_tools-v5-0-fd589a9cc5e3@debian.org>

The previous patch builds tools/bootconfig during 'make prepare' to
render the embedded bootconfig cmdline, but nothing removes it on
'make clean', leaving the compiled tool and its objects behind.

Wire a bootconfig_clean hook into the top-level clean target so the
compiled tool and its objects are removed by make clean, matching the
prepare-wired tools/objtool and tools/bpf/resolve_btfids.

The hook runs tools/bootconfig's Makefile via $(MAKE), which the kernel
build invokes with -rR (MAKEFLAGS += -rR). -rR drops the built-in $(RM)
variable, so the existing "$(RM) -f ..." clean recipe would expand to a
bare "-f ..." and fail. Spell the recipe with a literal "rm -f" so it
keeps working both standalone and when invoked from Kbuild.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 Makefile                  | 13 ++++++++++++-
 tools/bootconfig/Makefile |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index a7abb3f9a6264..a6e13fa1c1dc1 100644
--- a/Makefile
+++ b/Makefile
@@ -1586,6 +1586,17 @@ ifneq ($(wildcard $(objtool_O)),)
 	$(Q)$(MAKE) -sC $(abs_srctree)/tools/objtool O=$(objtool_O) srctree=$(abs_srctree) $(patsubst objtool_%,%,$@)
 endif
 
+PHONY += bootconfig_clean
+
+bootconfig_O = $(abspath $(objtree))/tools/bootconfig
+
+# tools/bootconfig is only built (via the prepare hook above) when
+# CONFIG_BOOT_CONFIG_EMBED_CMDLINE is set; skip its clean otherwise.
+bootconfig_clean:
+ifneq ($(wildcard $(bootconfig_O)),)
+	$(Q)$(MAKE) -sC $(srctree)/tools/bootconfig O=$(bootconfig_O) clean
+endif
+
 tools/: FORCE
 	$(Q)mkdir -p $(objtree)/tools
 	$(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/
@@ -1756,7 +1767,7 @@ vmlinuxclean:
 	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean
 	$(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean)
 
-clean: archclean vmlinuxclean resolve_btfids_clean objtool_clean
+clean: archclean vmlinuxclean resolve_btfids_clean objtool_clean bootconfig_clean
 
 # mrproper - Delete all generated files, including .config
 #
diff --git a/tools/bootconfig/Makefile b/tools/bootconfig/Makefile
index 4e82fd9553cde..3cb8066d5141b 100644
--- a/tools/bootconfig/Makefile
+++ b/tools/bootconfig/Makefile
@@ -27,4 +27,4 @@ install: $(ALL_PROGRAMS)
 	install $(OUTPUT)bootconfig $(DESTDIR)$(bindir)
 
 clean:
-	$(RM) -f $(OUTPUT)*.o $(ALL_PROGRAMS)
+	rm -f $(OUTPUT)*.o $(ALL_PROGRAMS)

-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v5 5/7] bootconfig: add xbc_prepend_embedded_cmdline() helper
From: Breno Leitao @ 2026-06-17 11:23 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton, Nathan Chancellor, paulmck,
	Nicolas Schier
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, linux-kernel, linux-trace-kernel, linux-kbuild,
	bpf, Breno Leitao, kernel-team
In-Reply-To: <20260617-bootconfig_using_tools-v5-0-fd589a9cc5e3@debian.org>

Add a helper that prepends the build-time-rendered embedded bootconfig
"kernel" subtree (embedded_kernel_cmdline[] from embedded-cmdline.S) to
a cmdline buffer with a separating space. Architectures call this from
setup_arch() before parse_early_param() so early_param() handlers
(mem=, earlycon=, loglevel=, ...) see values supplied via the embedded
bootconfig.

The in-place prepend (shift the existing string right, then drop the
embedded string in front) is factored into a small str_prepend() helper.

On overflow the helper logs an error and leaves the cmdline untouched
rather than panicking. Booting without the embedded values is better
than refusing to boot, and the error tells the user why their embedded
keys are missing.

The helper records whether it actually prepended, exposed via
xbc_embedded_cmdline_applied(). setup_boot_config() uses this to decide
whether the runtime "kernel" render would duplicate keys already folded
into boot_command_line.

When CONFIG_BOOT_CONFIG_EMBED_CMDLINE=n, the public declaration in
<linux/bootconfig.h> resolves to a no-op stub so callers compile
unchanged.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 include/linux/bootconfig.h |  9 ++++++
 lib/bootconfig.c           | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 1c7f3b74ffcf3..c186137f87ac5 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -308,4 +308,13 @@ static inline const char *xbc_get_embedded_bootconfig(size_t *size)
 }
 #endif
 
+/* Build-time-rendered bootconfig cmdline prepended in setup_arch() */
+#ifdef CONFIG_BOOT_CONFIG_EMBED_CMDLINE
+void __init xbc_prepend_embedded_cmdline(char *dst, size_t size);
+bool __init xbc_embedded_cmdline_applied(void);
+#else
+static inline void xbc_prepend_embedded_cmdline(char *dst, size_t size) { }
+static inline bool xbc_embedded_cmdline_applied(void) { return false; }
+#endif
+
 #endif
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 926094d97397e..f66be0b2dc241 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
+#include <linux/printk.h>
 #include <linux/sprintf.h>
 #include <linux/memblock.h>
 #include <linux/string.h>
@@ -34,6 +35,83 @@ const char * __init xbc_get_embedded_bootconfig(size_t *size)
 	return (*size) ? embedded_bootconfig_data : NULL;
 }
 #endif
+
+#ifdef CONFIG_BOOT_CONFIG_EMBED_CMDLINE
+/* embedded_kernel_cmdline is defined in embedded-cmdline.S */
+extern __visible const char embedded_kernel_cmdline[];
+extern __visible const char embedded_kernel_cmdline_end[];
+
+/* Set once the embedded cmdline has actually been prepended. */
+static bool xbc_cmdline_applied __initdata;
+
+/*
+ * str_prepend() - Prepend @src in front of the string in @dst, in place
+ * @dst: NUL-terminated destination buffer, currently @dst_len bytes long
+ * @dst_len: length of the current @dst string (excluding its NUL)
+ * @src: bytes to prepend (not NUL-terminated)
+ * @src_len: number of bytes from @src to prepend
+ *
+ * The caller must guarantee @dst has room for src_len + dst_len + 1 bytes.
+ * Moving dst_len + 1 bytes carries @dst's NUL terminator too, so an empty
+ * @dst needs no special case.
+ */
+static void __init str_prepend(char *dst, size_t dst_len,
+			       const char *src, size_t src_len)
+{
+	memmove(dst + src_len, dst, dst_len + 1);
+	memcpy(dst, src, src_len);
+}
+
+/**
+ * xbc_prepend_embedded_cmdline() - Prepend embedded bootconfig cmdline
+ * @dst: cmdline buffer to prepend into (must already contain a NUL byte)
+ * @size: total capacity of @dst in bytes
+ *
+ * Prepend the build-time-rendered "kernel" subtree of the embedded
+ * bootconfig to @dst. The rendered string already ends with a single
+ * space (the xbc_snprint_cmdline() invariant), which serves as the
+ * separator between the embedded keys and any existing content of @dst.
+ * On overflow, log an error and leave @dst untouched rather than
+ * silently truncating: booting without the embedded values is better
+ * than refusing to boot, and the error message tells the user why
+ * their embedded keys are missing.
+ *
+ * Intended to be called from setup_arch() before parse_early_param() so
+ * that early_param() handlers see the embedded values.
+ */
+void __init xbc_prepend_embedded_cmdline(char *dst, size_t size)
+{
+	size_t embed_len = embedded_kernel_cmdline_end - embedded_kernel_cmdline;
+	size_t dst_len;
+
+	if (!size || embed_len <= 1)	/* trailing NUL only */
+		return;
+	embed_len--;			/* exclude trailing NUL byte */
+
+	dst_len = strnlen(dst, size);
+	if (embed_len + dst_len + 1 > size) {
+		pr_err("embedded bootconfig cmdline (%zu bytes) does not fit in COMMAND_LINE_SIZE with %zu bytes already used; ignoring embedded values\n",
+		       embed_len, dst_len);
+		return;
+	}
+
+	str_prepend(dst, dst_len, embedded_kernel_cmdline, embed_len);
+	xbc_cmdline_applied = true;
+}
+
+/**
+ * xbc_embedded_cmdline_applied() - Did the embedded cmdline get prepended?
+ *
+ * Return true if xbc_prepend_embedded_cmdline() actually prepended the
+ * embedded "kernel" subtree. setup_boot_config() uses this to avoid
+ * rendering the same keys a second time.
+ */
+bool __init xbc_embedded_cmdline_applied(void)
+{
+	return xbc_cmdline_applied;
+}
+#endif
+
 #endif
 
 /*

-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v5 6/7] Documentation: bootconfig: document build-time cmdline rendering
From: Breno Leitao @ 2026-06-17 11:23 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton, Nathan Chancellor, paulmck,
	Nicolas Schier
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, linux-kernel, linux-trace-kernel, linux-kbuild,
	bpf, Breno Leitao, kernel-team
In-Reply-To: <20260617-bootconfig_using_tools-v5-0-fd589a9cc5e3@debian.org>

Add a section describing CONFIG_BOOT_CONFIG_EMBED_CMDLINE: what it
does (renders the embedded "kernel" subtree to a flat cmdline at
build time so early_param() handlers see the values), what it
requires (BOOT_CONFIG_EMBED, a non-empty BOOT_CONFIG_EMBED_FILE,
and ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG -- currently x86 only),
the bootconfig opt-in semantics, the initrd-vs-embedded precedence,
and the soft-error overflow behavior.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 Documentation/admin-guide/bootconfig.rst | 81 ++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index f712758472d5c..4a7e90c21f968 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -234,6 +234,87 @@ Kconfig option selected.
 Note that even if you set this option, you can override the embedded
 bootconfig by another bootconfig which attached to the initrd.
 
+Rendering Embedded kernel.* Keys at Build Time
+----------------------------------------------
+
+By default, the embedded bootconfig (``CONFIG_BOOT_CONFIG_EMBED=y``) is
+parsed at runtime, after ``parse_early_param()`` has already run. Early
+parameter handlers (``mem=``, ``earlycon=``, ``loglevel=``, ...) therefore
+cannot see values supplied via the embedded ``kernel`` subtree.
+
+``CONFIG_BOOT_CONFIG_EMBED_CMDLINE`` resolves this by rendering the
+``kernel`` subtree of ``CONFIG_BOOT_CONFIG_EMBED_FILE`` into a flat cmdline
+string at kernel build time (via ``tools/bootconfig -C``) and prepending
+it to ``boot_command_line`` during early architecture setup, so the keys
+are visible to ``parse_early_param()``.
+
+The option requires ``CONFIG_BOOT_CONFIG_EMBED=y``, a non-empty
+``CONFIG_BOOT_CONFIG_EMBED_FILE``, and an architecture that selects
+``CONFIG_ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG``. Currently only x86
+selects it; on other architectures the embedded bootconfig still works,
+but only through the late runtime parser.
+
+The same ``bootconfig`` opt-in applies as elsewhere: the rendered keys
+are prepended only when ``bootconfig`` (in any form) appears on the
+kernel command line, or when ``CONFIG_BOOT_CONFIG_FORCE`` is set, which
+defaults to ``y`` when ``CONFIG_BOOT_CONFIG_EMBED`` is set.
+
+For example, given::
+
+ kernel {
+   loglevel = 7
+   mem = 4G
+ }
+
+the kernel boots as if ``loglevel=7 mem=4G`` had been prepended to the
+bootloader command line, with the values visible to early-parsed
+handlers. Comma-separated values are still expanded into multiple
+cmdline entries per the bootconfig array convention -- the embedded
+``kernel.earlycon = "uart8250,io,0x3f8"`` must be quoted to land as a
+single ``earlycon=`` entry, exactly as for the runtime parser.
+
+If the rendered string would not fit in ``COMMAND_LINE_SIZE`` together
+with the existing command line, the prepend is skipped and an error is
+logged, so an oversized embedded bootconfig cannot brick a boot.
+
+Interaction with other command line and bootconfig sources
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With ``CONFIG_BOOT_CONFIG_EMBED_CMDLINE=y`` the rendered ``kernel``
+subtree behaves like a build-time command line (similar to
+``CONFIG_CMDLINE``), not like a bootconfig source. It is prepended to
+``boot_command_line`` in ``setup_arch()``, before ``parse_early_param()``
+and long before the runtime parser looks at an initrd. Options can reach
+the kernel from up to four places:
+
+- Bootloader command line: the arguments the boot loader passes. The
+  embedded cmdline is prepended in front of them, so for last-one-wins
+  parameters a bootloader option still overrides the embedded value.
+  Visible in /proc/cmdline.
+- Embedded cmdline (this option): the rendered ``kernel`` subtree,
+  prepended early so it is seen by ``parse_early_param()``. Visible in
+  /proc/cmdline.
+- Initrd bootconfig: parsed late in ``setup_boot_config()``; its
+  ``kernel`` keys are placed ahead of ``boot_command_line``, i.e. before
+  the embedded cmdline, so last-wins favors the embedded values. As a
+  bootconfig source, an initrd bootconfig still replaces the embedded
+  bootconfig. Visible in /proc/cmdline and /proc/bootconfig.
+- Embedded bootconfig (runtime): parsed late, only when no initrd
+  bootconfig is present. Visible in /proc/cmdline and /proc/bootconfig.
+
+So with this option the embedded ``kernel.*`` values take precedence
+over an initrd bootconfig's ``kernel.*`` values: for early parameters
+the initrd is not parsed yet, and for ordinary parameters the embedded
+keys land later in the command line. If you need an initrd bootconfig to
+override the embedded ``kernel.*`` keys, leave this option off and rely
+on the runtime parser.
+
+The rendered string is part of the command line, so it appears in
+/proc/cmdline. It is deliberately not shown in /proc/bootconfig: that
+file keeps reporting the parsed bootconfig tree -- the initrd bootconfig
+if present, otherwise the embedded bootconfig -- independent of whether
+build-time cmdline rendering is enabled.
+
 Kernel parameters via Boot Config
 =================================
 

-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v5 7/7] x86/setup: prepend embedded bootconfig cmdline before parse_early_param
From: Breno Leitao @ 2026-06-17 11:23 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton, Nathan Chancellor, paulmck,
	Nicolas Schier
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, linux-kernel, linux-trace-kernel, linux-kbuild,
	bpf, Breno Leitao, kernel-team
In-Reply-To: <20260617-bootconfig_using_tools-v5-0-fd589a9cc5e3@debian.org>

Call xbc_prepend_embedded_cmdline() in setup_arch() right after the
CONFIG_CMDLINE merge and before strscpy(command_line, ...) so the
build-time-rendered embedded bootconfig "kernel" subtree is part of
boot_command_line by the time parse_early_param() runs. early_param()
handlers (mem=, earlycon=, loglevel=, ...) now see values supplied via
CONFIG_BOOT_CONFIG_EMBED_FILE without parsing bootconfig at runtime.

Gate the prepend on the same opt-in the runtime parser uses: prepend
when "bootconfig" is present on the command line, or when
CONFIG_BOOT_CONFIG_FORCE is set. setup_boot_config()'s parse_args()
loop treats any presence of the "bootconfig" key as opt-in regardless
of value, so check both cmdline_find_option_bool() (matches the bare
key) and cmdline_find_option() (matches "bootconfig=<anything>").
Without the latter check, "bootconfig=0" would skip the early prepend
yet still trigger the late runtime apply, leaving the embedded keys
invisible to early_param() but applied to saved_command_line.

The prepend necessarily runs before setup_boot_config() detects an
initrd bootconfig, so an initrd cannot override the embedded "kernel"
keys for early_param(). This is intentional: the embedded cmdline acts
like a build-time CONFIG_CMDLINE. An initrd bootconfig's "kernel" keys
never reached early_param() anyway (they apply late via
extra_command_line), so nothing is lost -- the initrd keys still apply
late, with last-wins keeping the embedded values in effect.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 arch/x86/Kconfig        |  1 +
 arch/x86/kernel/setup.c | 27 +++++++++++++++++++++++++++
 init/main.c             | 25 ++++++++++++++++++++++---
 3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0de23e6471973..8ab11199c16d5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -127,6 +127,7 @@ config X86
 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
 	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
 	select ARCH_SUPPORTS_CFI		if X86_64
+	select ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG
 	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI
 	select ARCH_SUPPORTS_LTO_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG_THIN
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 46882ce79c3a4..d69ba84c203f1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -6,6 +6,7 @@
  * parts of early kernel initialization.
  */
 #include <linux/acpi.h>
+#include <linux/bootconfig.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/crash_dump.h>
@@ -36,6 +37,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/bugs.h>
 #include <asm/cacheinfo.h>
+#include <asm/cmdline.h>
 #include <asm/coco.h>
 #include <asm/cpu.h>
 #include <asm/efi.h>
@@ -924,6 +926,31 @@ void __init setup_arch(char **cmdline_p)
 	builtin_cmdline_added = true;
 #endif
 
+	/*
+	 * Match the runtime bootconfig parser's opt-in: only fold the
+	 * embedded kernel.* keys into the cmdline when "bootconfig" is
+	 * present on the command line, or CONFIG_BOOT_CONFIG_FORCE is set.
+	 * setup_boot_config()'s parse_args() loop treats any presence of
+	 * the "bootconfig" key as opt-in (bare, =0, =1, ...), so check both
+	 * forms here: cmdline_find_option_bool() matches the bare key,
+	 * cmdline_find_option() matches "bootconfig=<anything>". Without
+	 * the second check, "bootconfig=0" would skip the early prepend
+	 * but still trigger the late runtime apply -- a split-brain state.
+	 * CONFIG_BOOT_CONFIG_FORCE defaults to y when BOOT_CONFIG_EMBED is
+	 * set, so on the default config the embedded keys are applied
+	 * unconditionally.
+	 */
+	{
+		char buf[8];
+
+		if (IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE) ||
+		    cmdline_find_option_bool(boot_command_line, "bootconfig") ||
+		    cmdline_find_option(boot_command_line, "bootconfig",
+					buf, sizeof(buf)) >= 0)
+			xbc_prepend_embedded_cmdline(boot_command_line,
+						     COMMAND_LINE_SIZE);
+	}
+
 	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
diff --git a/init/main.c b/init/main.c
index e363232b428b4..2ecb6aa536dd1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -378,12 +378,15 @@ static void __init setup_boot_config(void)
 	int pos, ret;
 	size_t size;
 	char *err;
+	bool from_embedded = false;
 
 	/* Cut out the bootconfig data even if we have no bootconfig option */
 	data = get_boot_config_from_initrd(&size);
 	/* If there is no bootconfig in initrd, try embedded one. */
-	if (!data)
+	if (!data) {
 		data = xbc_get_embedded_bootconfig(&size);
+		from_embedded = true;
+	}
 
 	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
 	err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
@@ -421,8 +424,24 @@ static void __init setup_boot_config(void)
 	} else {
 		xbc_get_info(&ret, NULL);
 		pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
-		/* keys starting with "kernel." are passed via cmdline */
-		extra_command_line = xbc_make_cmdline("kernel");
+		/*
+		 * keys starting with "kernel." are passed via cmdline. When
+		 * this bootconfig came from the embedded source and
+		 * setup_arch() already prepended the rendered "kernel" subtree
+		 * to boot_command_line, rendering again here would duplicate
+		 * the keys in saved_command_line and make accumulating handlers
+		 * (console=, earlycon=, ...) re-register the same value. Skip
+		 * only when the prepend really happened.
+		 *
+		 * On arches that do not select ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG,
+		 * CONFIG_BOOT_CONFIG_EMBED_CMDLINE is unselectable and
+		 * xbc_embedded_cmdline_applied() collapses to a stub returning
+		 * false, so this path still runs and the embedded "kernel"
+		 * keys reach the cmdline via the runtime parser exactly as
+		 * before this series.
+		 */
+		if (!from_embedded || !xbc_embedded_cmdline_applied())
+			extra_command_line = xbc_make_cmdline("kernel");
 		/* Also, "init." keys are init arguments */
 		extra_init_args = xbc_make_cmdline("init");
 	}

-- 
2.53.0-Meta


^ permalink raw reply related

* [RFC PATCH v2 0/4] tracing/osnoise: Track IPIs
From: Valentin Schneider @ 2026-06-17 13:17 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Tomas Glozar,
	Costa Shulyupin, Crystal Wood, John Kacur, Ivan Pravdin,
	Jonathan Corbet

Hi folks,

So I've seen a few times now reports of latency spikes caused by IPIs, usually
because of isolation misconfiguration, but only detected at the tail end of
e.g. a 24h timerlat run.

It's not because those IPIs are rare, but rather that they don't by themselves
cause a monitored CPU to reach the latency threshold; it's usually a combined
interference that gets us there.

I'd like to make it easier to detect such misconfigurations and thus IPIs
hitting supposedly-isolated CPUs. I initially kludged a timerlat option to stop
tracing as soon as an IPI was sent to a monitored CPU, regardless of the latency
threshold. It sort of did the trick, but Tomáš convinced me timerlat wasn't
really the place for that.

So here's IPI tracking added to osnoise. This time around fully in userspace, as
Tomáš pointed out to me that this will make it a lot easier to deploy to older
kernels.

Based on top of linux/next at 'next-20260616' to have the latest libsubcmd
changes.
  
Cheers,
Valentin

Revisions
=========

v1 -> v2
++++++++

o Dropped the in-kernel osnoise_sample changes and made it all userspace

Valentin Schneider (4):
  rtla/osnoise: Add IPI tracking cmdline option
  rtla/osnoise: Record IPI count in osnoise top
  rtla/osnoise: Trace IPI events when recording a trace file
  rtla/osnoise: Leverage IPI event filters when tracing a subset of CPUs

 Documentation/tools/rtla/rtla-osnoise-top.rst |   4 +
 tools/tracing/rtla/src/cli.c                  |   1 +
 tools/tracing/rtla/src/cli_p.h                |   3 +
 tools/tracing/rtla/src/common.c               |   2 +-
 tools/tracing/rtla/src/common.h               |   3 +-
 tools/tracing/rtla/src/osnoise.c              |  17 +-
 tools/tracing/rtla/src/osnoise_top.c          | 153 +++++++++++++++++-
 7 files changed, 179 insertions(+), 4 deletions(-)

--
2.54.0


^ permalink raw reply

* [RFC PATCH v2 1/4] rtla/osnoise: Add IPI tracking cmdline option
From: Valentin Schneider @ 2026-06-17 13:17 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Tomas Glozar, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Costa Shulyupin, Crystal Wood, John Kacur, Ivan Pravdin,
	Jonathan Corbet
In-Reply-To: <20260617131803.2988989-1-vschneid@redhat.com>

Later commits will add IPI tracking to osnoise top. To avoid breaking
existing scripts, this new feature will be gated behind a new -i option.

Suggested-by: Tomas Glozar <tglozar@redhat.com>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 Documentation/tools/rtla/rtla-osnoise-top.rst | 4 ++++
 tools/tracing/rtla/src/cli.c                  | 1 +
 tools/tracing/rtla/src/cli_p.h                | 3 +++
 tools/tracing/rtla/src/common.h               | 1 +
 4 files changed, 9 insertions(+)

diff --git a/Documentation/tools/rtla/rtla-osnoise-top.rst b/Documentation/tools/rtla/rtla-osnoise-top.rst
index b91c02ac2bbe1..98f77f8971a69 100644
--- a/Documentation/tools/rtla/rtla-osnoise-top.rst
+++ b/Documentation/tools/rtla/rtla-osnoise-top.rst
@@ -28,6 +28,10 @@ OPTIONS
 =======
 .. include:: common_osnoise_options.txt
 
+**-i**, **--ipi**
+
+	Track sources of IPIs.
+
 .. include:: common_top_options.txt
 
 .. include:: common_options.txt
diff --git a/tools/tracing/rtla/src/cli.c b/tools/tracing/rtla/src/cli.c
index c5279c9875310..eb1e76a6b0dea 100644
--- a/tools/tracing/rtla/src/cli.c
+++ b/tools/tracing/rtla/src/cli.c
@@ -78,6 +78,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		RTLA_OPT_STOP_TOTAL('S', "stop-total", "total sample"),
 		OSNOISE_OPT_THRESHOLD,
 		RTLA_OPT_TRACE_OUTPUT("osnoise", opt_osnoise_trace_output_cb),
+		OSNOISE_OPT_IPI,
 
 	OPT_GROUP("Event Configuration:"),
 		RTLA_OPT_EVENT,
diff --git a/tools/tracing/rtla/src/cli_p.h b/tools/tracing/rtla/src/cli_p.h
index 3c939de9abf02..7d3f982cfabdb 100644
--- a/tools/tracing/rtla/src/cli_p.h
+++ b/tools/tracing/rtla/src/cli_p.h
@@ -305,6 +305,9 @@ static int opt_filter_cb(const struct option *opt, const char *arg, int unset)
 	"the minimum delta to be considered a noise", \
 	opt_llong_callback)
 
+#define OSNOISE_OPT_IPI OPT_BOOLEAN('i', "ipi", &params->common.ipi, \
+	"track sources of IPIs")
+
 /*
  * Callback functions for command line options for osnoise tools
  */
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 04b287a03f6d4..045253230fcf2 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -108,6 +108,7 @@ struct common_params {
 	bool			kernel_workload;
 	bool			user_data;
 	bool			aa_only;
+	bool			ipi;
 
 	struct actions		threshold_actions;
 	struct actions		end_actions;
-- 
2.54.0


^ permalink raw reply related

* [RFC PATCH v2 2/4] rtla/osnoise: Record IPI count in osnoise top
From: Valentin Schneider @ 2026-06-17 13:17 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Tomas Glozar,
	Costa Shulyupin, Crystal Wood, John Kacur, Ivan Pravdin,
	Jonathan Corbet
In-Reply-To: <20260617131803.2988989-1-vschneid@redhat.com>

Leverage the ipi_send_cpu and ipi_send_cpumask trace events to record the
count of IPIs sent to monitored CPUs. These interferences are already
accounted for by the IRQ count, but this split gives a better overall picture.

This uses the newly added -i cmdline option.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 tools/tracing/rtla/src/osnoise_top.c | 124 ++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 1 deletion(-)

diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 512a6299cb018..5b462a3543b97 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -8,6 +8,7 @@
 #include <string.h>
 #include <signal.h>
 #include <unistd.h>
+#include <errno.h>
 #include <stdio.h>
 #include <time.h>
 
@@ -25,6 +26,7 @@ struct osnoise_top_cpu {
 	unsigned long long	irq_count;
 	unsigned long long	softirq_count;
 	unsigned long long	thread_count;
+	unsigned long long	ipi_count;
 
 	int			sum_cycles;
 };
@@ -70,6 +72,91 @@ static struct osnoise_top_data *osnoise_alloc_top(void)
 	return NULL;
 }
 
+static void account_ipi(struct osnoise_tool *tool,
+			unsigned long long src_cpu, unsigned long long dst_cpu)
+{
+	struct osnoise_top_cpu *cpu_data;
+	struct osnoise_top_data *data;
+	unsigned long long inc = 1;
+
+	data = tool->data;
+	cpu_data = &data->cpu_data[dst_cpu];
+
+	update_sum(&cpu_data->ipi_count, &inc);
+}
+
+/*
+ * osnoise_ipi_cpu_handler - this is the handler for single CPU IPI events.
+ */
+static int
+osnoise_ipi_cpu_handler(struct trace_seq *s, struct tep_record *record,
+		     struct tep_event *event, void *context)
+{
+	struct osnoise_tool *tool;
+	struct osnoise_params *params;
+	unsigned long long src_cpu, dst_cpu;
+	struct trace_instance *trace = context;
+
+	tool = container_of(trace, struct osnoise_tool, trace);
+	params = to_osnoise_params(tool->params);
+
+	src_cpu = record->cpu;
+	tep_get_field_val(s, event, "cpu", record, &dst_cpu, 1);
+
+	if (CPU_ISSET(dst_cpu, &params->common.monitored_cpus))
+		account_ipi(tool, src_cpu, dst_cpu);
+
+	return 0;
+}
+
+static cpu_set_t cpumask_tmp_cpus;
+
+/*
+ * osnoise_ipi_cpumask_handler - this is the handler for broadcasted IPI events.
+ */
+static int
+osnoise_ipi_cpumask_handler(struct trace_seq *s, struct tep_record *record,
+			 struct tep_event *event, void *context)
+{
+	struct trace_instance *trace = context;
+	struct osnoise_tool *tool;
+	struct osnoise_params *params;
+	struct tep_format_field *field;
+	unsigned long long src_cpu;
+	cpu_set_t *event_cpus;
+	int len;
+
+	tool = container_of(trace, struct osnoise_tool, trace);
+	params = to_osnoise_params(tool->params);
+
+	src_cpu = record->cpu;
+
+	field = tep_find_field(event, "cpumask");
+	if (!field)
+		return 0;
+
+	event_cpus = tep_get_field_raw(s, event, "cpumask", record, &len, 1);
+	if (!event_cpus) {
+		err_msg("Failed to get cpumask field\n");
+		return 0;
+	}
+
+	CPU_AND(&cpumask_tmp_cpus, event_cpus, &params->common.monitored_cpus);
+
+	/*
+	 * Computing the mask weight is overkill but there is no leaner option
+	 * provided by glibc, e.g cpumask_first() or somesuch.
+	 */
+	if (CPU_COUNT(&cpumask_tmp_cpus)) {
+		for (int cpu = 0; cpu < nr_cpus; cpu++) {
+			if (CPU_ISSET(cpu, &cpumask_tmp_cpus))
+				account_ipi(tool, src_cpu, cpu);
+		}
+	}
+
+	return 0;
+}
+
 /*
  * osnoise_top_handler - this is the handler for osnoise tracer events
  */
@@ -164,6 +251,8 @@ static void osnoise_top_header(struct osnoise_tool *top)
 		goto eol;
 
 	trace_seq_printf(s, "          IRQ      Softirq       Thread");
+	if (params->common.ipi)
+		trace_seq_printf(s, "          IPI");
 
 eol:
 	if (pretty)
@@ -218,7 +307,13 @@ static void osnoise_top_print(struct osnoise_tool *tool, int cpu)
 
 	trace_seq_printf(s, "%12llu ", cpu_data->irq_count);
 	trace_seq_printf(s, "%12llu ", cpu_data->softirq_count);
-	trace_seq_printf(s, "%12llu\n", cpu_data->thread_count);
+	trace_seq_printf(s, "%12llu", cpu_data->thread_count);
+	if (!params->common.ipi) {
+		trace_seq_printf(s, "\n");
+		return;
+	}
+
+	trace_seq_printf(s, " %12llu\n", cpu_data->ipi_count);
 }
 
 /*
@@ -281,6 +376,7 @@ osnoise_top_apply_config(struct osnoise_tool *tool)
 struct osnoise_tool *osnoise_init_top(struct common_params *params)
 {
 	struct osnoise_tool *tool;
+	int retval;
 
 	tool = osnoise_init_tool("osnoise_top");
 	if (!tool)
@@ -295,7 +391,33 @@ struct osnoise_tool *osnoise_init_top(struct common_params *params)
 	tep_register_event_handler(tool->trace.tep, -1, "ftrace", "osnoise",
 				   osnoise_top_handler, NULL);
 
+	if (!params->ipi)
+		goto out;
+
+	retval = tracefs_event_enable(tool->trace.inst, "ipi", "ipi_send_cpu");
+	if (retval < 0 && !errno) {
+		err_msg("Could not find ipi_send_cpu event\n");
+		goto out_err;
+	}
+
+	retval = tracefs_event_enable(tool->trace.inst, "ipi", "ipi_send_cpumask");
+	if (retval < 0 && !errno) {
+		err_msg("Could not find ipi_send_cpumask event\n");
+		goto out_err;
+	}
+
+	tep_register_event_handler(tool->trace.tep, -1, "ipi", "ipi_send_cpu",
+				   osnoise_ipi_cpu_handler, NULL);
+
+	tep_register_event_handler(tool->trace.tep, -1, "ipi", "ipi_send_cpumask",
+				   osnoise_ipi_cpumask_handler, NULL);
+
+out:
 	return tool;
+out_err:
+	osnoise_free_top_tool(tool);
+	osnoise_destroy_tool(tool);
+	return NULL;
 }
 
 struct tool_ops osnoise_top_ops = {
-- 
2.54.0


^ permalink raw reply related

* [RFC PATCH v2 3/4] rtla/osnoise: Trace IPI events when recording a trace file
From: Valentin Schneider @ 2026-06-17 13:17 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Tomas Glozar,
	Costa Shulyupin, Crystal Wood, John Kacur, Ivan Pravdin,
	Jonathan Corbet
In-Reply-To: <20260617131803.2988989-1-vschneid@redhat.com>

IPIs can now be monitored and accounted by osnoise top. When that is
the case, also record them when saving a trace file.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 tools/tracing/rtla/src/common.c  |  2 +-
 tools/tracing/rtla/src/common.h  |  2 +-
 tools/tracing/rtla/src/osnoise.c | 17 ++++++++++++++++-
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index d0a8a6edbf0cb..dd302427557ca 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -204,7 +204,7 @@ int run_tool(struct tool_ops *ops, int argc, char *argv[])
 
 	if (params->threshold_actions.present[ACTION_TRACE_OUTPUT] ||
 	    params->end_actions.present[ACTION_TRACE_OUTPUT]) {
-		tool->record = osnoise_init_trace_tool(ops->tracer);
+		tool->record = osnoise_init_trace_tool(params, ops->tracer);
 		if (!tool->record) {
 			err_msg("Failed to enable the trace instance\n");
 			goto out_free;
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 045253230fcf2..421e06e10f3f1 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -178,7 +178,7 @@ int osnoise_set_workload(struct osnoise_context *context, bool onoff);
 
 void osnoise_destroy_tool(struct osnoise_tool *top);
 struct osnoise_tool *osnoise_init_tool(char *tool_name);
-struct osnoise_tool *osnoise_init_trace_tool(const char *tracer);
+struct osnoise_tool *osnoise_init_trace_tool(struct common_params *params, const char *tracer);
 bool osnoise_trace_is_off(struct osnoise_tool *tool, struct osnoise_tool *record);
 int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us);
 int osnoise_set_stop_total_us(struct osnoise_context *context,
diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
index 4ff5dad013b10..281f6f57d15af 100644
--- a/tools/tracing/rtla/src/osnoise.c
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -1181,7 +1181,8 @@ struct osnoise_tool *osnoise_init_tool(char *tool_name)
 /*
  * osnoise_init_trace_tool - init a tracer instance to trace osnoise events
  */
-struct osnoise_tool *osnoise_init_trace_tool(const char *tracer)
+struct osnoise_tool *osnoise_init_trace_tool(struct common_params *params,
+					     const char *tracer)
 {
 	struct osnoise_tool *trace;
 	int retval;
@@ -1196,6 +1197,20 @@ struct osnoise_tool *osnoise_init_trace_tool(const char *tracer)
 		goto out_err;
 	}
 
+	if (params->ipi) {
+		retval = tracefs_event_enable(trace->trace.inst, "ipi", "ipi_send_cpu");
+		if (retval < 0 && !errno) {
+			err_msg("Could not find ipi_send_cpu event\n");
+			goto out_err;
+		}
+
+		retval = tracefs_event_enable(trace->trace.inst, "ipi", "ipi_send_cpumask");
+		if (retval < 0 && !errno) {
+			err_msg("Could not find ipi_send_cpumask event\n");
+			goto out_err;
+		}
+	}
+
 	retval = enable_tracer_by_name(trace->trace.inst, tracer);
 	if (retval) {
 		err_msg("Could not enable %s tracer for tracing\n", tracer);
-- 
2.54.0


^ permalink raw reply related

* [RFC PATCH v2 4/4] rtla/osnoise: Leverage IPI event filters when tracing a subset of CPUs
From: Valentin Schneider @ 2026-06-17 13:17 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Tomas Glozar,
	Costa Shulyupin, Crystal Wood, John Kacur, Ivan Pravdin,
	Jonathan Corbet
In-Reply-To: <20260617131803.2988989-1-vschneid@redhat.com>

Instead of post-processing the events in the tracefs_iterate_raw_events()
callbacks, leverage the kernel event filtering infrastructure to only emit
IPI events if they target CPUs that are being traced, as specified by the
-c cmdline option.

Note that some post-processing is still required for the ipi_send_cpumask
event, as the event being emitted means *some* CPUs targeted by that event
are monitored, but not all of them - userspace has to recompute that
intersection.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 tools/tracing/rtla/src/osnoise_top.c | 37 +++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 5b462a3543b97..8040521710884 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -93,18 +93,15 @@ osnoise_ipi_cpu_handler(struct trace_seq *s, struct tep_record *record,
 		     struct tep_event *event, void *context)
 {
 	struct osnoise_tool *tool;
-	struct osnoise_params *params;
 	unsigned long long src_cpu, dst_cpu;
 	struct trace_instance *trace = context;
 
 	tool = container_of(trace, struct osnoise_tool, trace);
-	params = to_osnoise_params(tool->params);
 
 	src_cpu = record->cpu;
 	tep_get_field_val(s, event, "cpu", record, &dst_cpu, 1);
 
-	if (CPU_ISSET(dst_cpu, &params->common.monitored_cpus))
-		account_ipi(tool, src_cpu, dst_cpu);
+	account_ipi(tool, src_cpu, dst_cpu);
 
 	return 0;
 }
@@ -141,6 +138,11 @@ osnoise_ipi_cpumask_handler(struct trace_seq *s, struct tep_record *record,
 		return 0;
 	}
 
+	/*
+	 * Despite already filtering for such an intersection, we need to compute
+	 * the intersection here as the @cpumask field may contain non-monitored
+	 * CPUs.
+	 */
 	CPU_AND(&cpumask_tmp_cpus, event_cpus, &params->common.monitored_cpus);
 
 	/*
@@ -406,6 +408,33 @@ struct osnoise_tool *osnoise_init_top(struct common_params *params)
 		goto out_err;
 	}
 
+	/*
+	 * If tracing on a subset of possible CPUs, leverage the kernel filtering
+	 * infrastructure to only generate events on traced CPUs.
+	 */
+	if (params->cpus) {
+		char filter[MAX_PATH];
+
+		snprintf(filter, ARRAY_SIZE(filter), "cpu & CPUS{%s}\n", params->cpus);
+		retval = tracefs_event_file_write(tool->trace.inst,
+						  "ipi", "ipi_send_cpu", "filter",
+						  filter);
+		if (retval) {
+			err_msg("Could not set ipi_send_cpu CPU filter\n");
+			goto out_err;
+		}
+
+
+		snprintf(filter, ARRAY_SIZE(filter), "cpumask & CPUS{%s}\n", params->cpus);
+		retval = tracefs_event_file_write(tool->trace.inst,
+						  "ipi", "ipi_send_cpumask", "filter",
+						  filter);
+		if (retval) {
+			err_msg("Could not set ipi_send_cpumask CPU filter\n");
+			goto out_err;
+		}
+	}
+
 	tep_register_event_handler(tool->trace.tep, -1, "ipi", "ipi_send_cpu",
 				   osnoise_ipi_cpu_handler, NULL);
 
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH] tracing: ring-buffer: allowlist clang-generated symbols
From: Vincent Donnefort @ 2026-06-17 13:26 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Steven Rostedt, Masami Hiramatsu, Nathan Chancellor,
	Arnd Bergmann, Mathieu Desnoyers, Nick Desaulniers, Bill Wendling,
	Justin Stitt, Marc Zyngier, Thomas Weißschuh, Paolo Bonzini,
	linux-kernel, linux-trace-kernel, llvm
In-Reply-To: <20260616164211.3733326-1-arnd@kernel.org>

On Tue, Jun 16, 2026 at 06:42:03PM +0200, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> 
> In randconfig build testing using clang-22, I came across two
> sets of extra symbols in the ring buffer code that may get
> inserted by the compiler:
> 
> Unexpected symbols in kernel/trace/simple_ring_buffer.o:
>          U memset
> 
> Unexpected symbols in kernel/trace/simple_ring_buffer.o:
>                  U llvm_gcda_emit_arcs
>                  U llvm_gcda_emit_function
>                  U llvm_gcda_end_file
>                  U llvm_gcda_start_file
>                  U llvm_gcda_summary_info
>                  U llvm_gcov_init
> 
> Add all of these to the allowlist.
> 
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
>  kernel/trace/Makefile | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index f934ff586bd4..aa8564fb8ff4 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -146,6 +146,7 @@ KASAN_SANITIZE_undefsyms_base.o := y

Would "GCOV_PROFILE_undefsyms_base.o := y" work?

>  
>  UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \
>  		      __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \
> +		      memset llvm_gcda llvm_gcov \
>  		      $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
>  
>  quiet_cmd_check_undefined = NM      $<
> -- 
> 2.39.5
> 

^ permalink raw reply

* Re: [PATCH v5 3/7] bootconfig: render embedded bootconfig as a kernel cmdline at build time
From: Nicolas Schier @ 2026-06-17 13:30 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Masami Hiramatsu, Andrew Morton, Nathan Chancellor, paulmck,
	Nicolas Schier, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, kernel-team
In-Reply-To: <20260617-bootconfig_using_tools-v5-3-fd589a9cc5e3@debian.org>

On Wed, Jun 17, 2026 at 04:23:35AM -0700, Breno Leitao wrote:
> Add the build-time pipeline that renders the "kernel" subtree of
> CONFIG_BOOT_CONFIG_EMBED_FILE into a flat cmdline string and stashes
> it in .init.rodata as embedded_kernel_cmdline[]. A follow-up patch
> adds the runtime helper that prepends this string to boot_command_line
> during early architecture setup so parse_early_param() sees the values.
> 
> The build wires up:
>   tools/bootconfig -C kernel - userspace tool already shared with
>                                lib/bootconfig.c, used here in -C mode
>                                to render a bootconfig file to a cmdline
>   lib/embedded-cmdline.S     - .incbin's the rendered text plus a NUL
>                                (listed under the EXTRA BOOT CONFIG
>                                MAINTAINERS entry)
>   lib/Makefile rule          - runs tools/bootconfig at build time
>   Makefile prepare dep       - ensures tools/bootconfig is built first,
>                                same pattern as tools/objtool and
>                                tools/bpf/resolve_btfids
[...]
> 
> Drop the test target from tools/bootconfig/Makefile's default 'all'
> recipe so that hooking the binary into the kernel build does not run
> test-bootconfig.sh on every prepare. The tests stay available as
> 'make -C tools/bootconfig test', matching the convention of
> tools/objtool and tools/bpf/resolve_btfids whose 'all' targets only
> build the binary.
> 
> Require BOOT_CONFIG_EMBED_FILE to be non-empty before the new option
> can be enabled, otherwise tools/bootconfig -C runs against an empty
> file and prints a parse error on every kernel build.
> 
> The feature gates on CONFIG_ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG, a
> silent symbol arches select once they've wired the prepend call into
> setup_arch(). No arch selects it in this patch, so the user-visible
> CONFIG_BOOT_CONFIG_EMBED_CMDLINE is not yet enableable; when an arch
> later opts in, the runtime behavior is added by the follow-up patches.
> 
> tools/bootconfig also installs on target systems, so its own Makefile
> keeps $(CC) and stays cross-buildable as a standalone tool. The kernel
> build, which runs the tool on the build host during prepare, instead
> forces CC=$(HOSTCC) from a dedicated tools/bootconfig rule and clears
> CROSS_COMPILE= in the sub-make. Without that clear, an LLVM=1 cross
> build would inherit CROSS_COMPILE and tools/scripts/Makefile.include
> would inject --target=/--sysroot= flags into the host clang invocation,
> producing a target binary that fails to exec ("Exec format error").
> 
> embedded-cmdline.S places the rendered string in its own .init.rodata
> subsection (.init.rodata.embed_cmdline) with the "a" (allocatable,
> read-only) flag and %progbits. lib/bootconfig-data.S already places
> the embedded bootconfig blob in .init.rodata with the "aw" flag
> (xbc_init() rewrites separators in place, so that data must be
> writable). Using a distinct subsection name avoids the ld.lld section-
> type mismatch that would otherwise arise from mixing "a" and "aw"
> under the same name; the linker's "*(.init.rodata .init.rodata.*)"
> glob still folds both into the init image and frees them after boot.
> 
> A follow-up patch wires the build-time tools/bootconfig into the
> top-level clean target.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  MAINTAINERS               |  1 +
>  Makefile                  | 15 +++++++++++++++
>  init/Kconfig              | 35 +++++++++++++++++++++++++++++++++++
>  lib/Makefile              | 16 ++++++++++++++++
>  lib/embedded-cmdline.S    | 16 ++++++++++++++++
>  tools/bootconfig/Makefile |  2 +-
>  6 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 57656ec0e9d5d..953231df1911d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -9844,6 +9844,7 @@ F:	fs/proc/bootconfig.c
>  F:	include/linux/bootconfig.h
>  F:	lib/bootconfig-data.S
>  F:	lib/bootconfig.c
> +F:	lib/embedded-cmdline.S
>  F:	tools/bootconfig/*
>  F:	tools/bootconfig/scripts/*
>  
> diff --git a/Makefile b/Makefile
> index bf196c6df5b92..a7abb3f9a6264 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -1545,6 +1545,21 @@ prepare: tools/bpf/resolve_btfids
>  endif
>  endif
>  
> +# tools/bootconfig renders the embedded bootconfig into a cmdline at build time.
> +ifdef CONFIG_BOOT_CONFIG_EMBED_CMDLINE
> +prepare: tools/bootconfig
> +endif
> +
> +# tools/bootconfig is run on the build host during prepare, so force a host
> +# binary here; its own Makefile keeps $(CC) for standalone and cross builds.
> +# CROSS_COMPILE= is cleared so tools/scripts/Makefile.include does not inject
> +# the target's --target=/--sysroot= flags into the host clang invocation under
> +# LLVM=1 cross builds (which would produce a target binary that fails to exec).
> +tools/bootconfig: FORCE
> +	$(Q)mkdir -p $(objtree)/tools
> +	$(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ \
> +		bootconfig CC=$(HOSTCC) CROSS_COMPILE=

sashiko whines (priority: low) about the 'CC=$(HOSTCC)' as HOSTCC might 
contain spaces (e.g. "ccache gcc") [1].  Instead of adding quotes (as 
sashiko suggests), the CC could be redefined locally for the target, for 
example:


tools/bootconfig: export CC := $(HOSTCC)
tools/bootconfig: FORCE
	$(Q)mkdir -p $(objtree)/tools
	$(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ \
		bootconfig CROSS_COMPILE=


That way, make handles the variable definition as it should and there is 
no interference with shell escaping.

for Kbuild:

Reviewed-by: Nicolas Schier <n.schier@fritz.com>


Kind regards,
Nicolas


[1]: http://sashiko.dev/#/message/20260617113701.0405E1F000E9%40smtp.kernel.org


> +
>  # The tools build system is not a part of Kbuild and tends to introduce
>  # its own unique issues. If you need to integrate a new tool into Kbuild,
>  # please consider locating that tool outside the tools/ tree and using the
> diff --git a/init/Kconfig b/init/Kconfig
> index 5230d4879b1c8..d2b8613a6b927 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1566,6 +1566,41 @@ config BOOT_CONFIG_EMBED_FILE
>  	  This bootconfig will be used if there is no initrd or no other
>  	  bootconfig in the initrd.
>  
> +config ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG
> +	bool
> +	help
> +	  Silent symbol; no C code reads it directly. Architectures
> +	  select it once their setup_arch() calls
> +	  xbc_prepend_embedded_cmdline() before parse_early_param().
> +	  Its only role is to gate the user-visible
> +	  BOOT_CONFIG_EMBED_CMDLINE option per-arch, the same
> +	  ARCH_SUPPORTS_* idiom used by ARCH_SUPPORTS_CFI, etc.
> +
> +config BOOT_CONFIG_EMBED_CMDLINE
> +	bool "Render embedded bootconfig as kernel cmdline at build time"
> +	depends on BOOT_CONFIG_EMBED_FILE != ""
> +	depends on ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG
> +	default n
> +	help
> +	  Render the "kernel" subtree of the embedded bootconfig file into a
> +	  flat cmdline string at kernel build time and prepend it to
> +	  boot_command_line during early architecture setup. This makes
> +	  early_param() handlers (e.g. mem=, earlycon=, loglevel=) see the
> +	  values supplied via the embedded bootconfig.
> +
> +	  The runtime bootconfig parser is unaffected, so tree-structured
> +	  consumers such as ftrace boot-time tracing keep working.
> +
> +	  Note: when an initrd also carries a bootconfig, its "kernel"
> +	  subtree is still parsed at runtime, but the embedded "kernel"
> +	  keys remain in boot_command_line for parse_early_param() and
> +	  end up later than the initrd keys in saved_command_line, so
> +	  parse_args() last-wins favors the embedded values. If you need
> +	  initrd to override embedded kernel.* keys, leave this option
> +	  off.
> +
> +	  If unsure, say N.
> +
>  config CMDLINE_LOG_WRAP_IDEAL_LEN
>  	int "Length to try to wrap the cmdline when logged at boot"
>  	default 1021
> diff --git a/lib/Makefile b/lib/Makefile
> index 7f75cc6edf94a..4ace86a5cb6de 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -273,6 +273,22 @@ filechk_defbconf = cat $(or $(real-prereqs), /dev/null)
>  $(obj)/default.bconf: $(CONFIG_BOOT_CONFIG_EMBED_FILE) FORCE
>  	$(call filechk,defbconf)
>  
> +obj-$(CONFIG_BOOT_CONFIG_EMBED_CMDLINE) += embedded-cmdline.o
> +$(obj)/embedded-cmdline.o: $(obj)/embedded_cmdline.bin
> +
> +# Render the bootconfig "kernel" subtree to a flat cmdline string using
> +# the userspace tools/bootconfig parser (-C mode). The runtime prepend
> +# helper enforces COMMAND_LINE_SIZE at boot, so no build-time size
> +# check is performed here (COMMAND_LINE_SIZE is an arch header
> +# constant, not a Kconfig value).
> +quiet_cmd_render_cmdline = BCONF2C $@
> +      cmd_render_cmdline = \
> +	$(objtree)/tools/bootconfig/bootconfig -C $< > $@
> +
> +targets += embedded_cmdline.bin
> +$(obj)/embedded_cmdline.bin: $(obj)/default.bconf $(objtree)/tools/bootconfig/bootconfig FORCE
> +	$(call if_changed,render_cmdline)
> +
>  obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o
>  obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o
>  
> diff --git a/lib/embedded-cmdline.S b/lib/embedded-cmdline.S
> new file mode 100644
> index 0000000000000..bda81b4a42bea
> --- /dev/null
> +++ b/lib/embedded-cmdline.S
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Embed the build-time-rendered bootconfig "kernel" subtree as a flat
> + * cmdline string. setup_arch() prepends this to boot_command_line on
> + * architectures that select ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG.
> + *
> + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates
> + * Copyright (c) 2026 Breno Leitao <leitao@debian.org>
> + */
> +	.section .init.rodata.embed_cmdline, "a", %progbits
> +	.global embedded_kernel_cmdline
> +embedded_kernel_cmdline:
> +	.incbin "lib/embedded_cmdline.bin"
> +	.byte 0
> +	.global embedded_kernel_cmdline_end
> +embedded_kernel_cmdline_end:
> diff --git a/tools/bootconfig/Makefile b/tools/bootconfig/Makefile
> index 90eb47c9d8de6..4e82fd9553cde 100644
> --- a/tools/bootconfig/Makefile
> +++ b/tools/bootconfig/Makefile
> @@ -15,7 +15,7 @@ override CFLAGS += -Wall -g -I$(CURDIR)/include
>  ALL_TARGETS := bootconfig
>  ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
>  
> -all: $(ALL_PROGRAMS) test
> +all: $(ALL_PROGRAMS)
>  
>  $(OUTPUT)bootconfig: main.c include/linux/bootconfig.h $(LIBSRC)
>  	$(CC) $(filter %.c,$^) $(CFLAGS) $(LDFLAGS) -o $@
> 
> -- 
> 2.53.0-Meta
> 

^ permalink raw reply

* Re: [PATCH v5 4/7] bootconfig: clean build-time tools/bootconfig from make clean
From: Nicolas Schier @ 2026-06-17 13:45 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Masami Hiramatsu, Andrew Morton, Nathan Chancellor, paulmck,
	Nicolas Schier, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, kernel-team
In-Reply-To: <20260617-bootconfig_using_tools-v5-4-fd589a9cc5e3@debian.org>

On Wed, Jun 17, 2026 at 04:23:36AM -0700, Breno Leitao wrote:
> The previous patch builds tools/bootconfig during 'make prepare' to
> render the embedded bootconfig cmdline, but nothing removes it on
> 'make clean', leaving the compiled tool and its objects behind.
> 
> Wire a bootconfig_clean hook into the top-level clean target so the
> compiled tool and its objects are removed by make clean, matching the
> prepare-wired tools/objtool and tools/bpf/resolve_btfids.
> 
> The hook runs tools/bootconfig's Makefile via $(MAKE), which the kernel
> build invokes with -rR (MAKEFLAGS += -rR). -rR drops the built-in $(RM)
> variable, so the existing "$(RM) -f ..." clean recipe would expand to a
> bare "-f ..." and fail. Spell the recipe with a literal "rm -f" so it
> keeps working both standalone and when invoked from Kbuild.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  Makefile                  | 13 ++++++++++++-
>  tools/bootconfig/Makefile |  2 +-
>  2 files changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/Makefile b/Makefile
> index a7abb3f9a6264..a6e13fa1c1dc1 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -1586,6 +1586,17 @@ ifneq ($(wildcard $(objtool_O)),)
>  	$(Q)$(MAKE) -sC $(abs_srctree)/tools/objtool O=$(objtool_O) srctree=$(abs_srctree) $(patsubst objtool_%,%,$@)
>  endif
>  
> +PHONY += bootconfig_clean
> +
> +bootconfig_O = $(abspath $(objtree))/tools/bootconfig
> +
> +# tools/bootconfig is only built (via the prepare hook above) when
> +# CONFIG_BOOT_CONFIG_EMBED_CMDLINE is set; skip its clean otherwise.

The wildcard below matches for all in-source builds and also for all 
out-of-source builds that _once_ built bootconfig (as the directory will 
never be removed).  I'd like the comment to be removed; it's obvious 
enough what is happening here.

> +bootconfig_clean:
> +ifneq ($(wildcard $(bootconfig_O)),)
> +	$(Q)$(MAKE) -sC $(srctree)/tools/bootconfig O=$(bootconfig_O) clean
> +endif
> +

Some additional bike-shedding:  I'd rather keep it here as short and 
simple altogether:


PHONY += bootconfig_clean
bootconfig_clean: bootconfig_O = $(abs_output)/tools/bootconfig
	$(Q)$(MAKE) -sC $(srctree)/tools/bootconfig O=$(bootconfig_O) clean


Nevertheless, for kbuild:

Reviewed-by: Nicolas Schier <n.schier@fritz.com>



Kind regards,
Nicolas

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-06-17 14:03 UTC (permalink / raw)
  To: Balbir Singh
  Cc: David Hildenbrand (Arm), lsf-pc, linux-kernel, linux-cxl, cgroups,
	linux-mm, linux-trace-kernel, damon, kernel-team, gregkh, rafael,
	dakr, dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <ajIb4DJdLGPbMB4V@parvat>

On Wed, Jun 17, 2026 at 02:02:47PM +1000, Balbir Singh wrote:
> On Wed, Jun 10, 2026 at 12:37:34PM -0400, Gregory Price wrote:
> > On Wed, Jun 10, 2026 at 05:00:33PM +0200, David Hildenbrand (Arm) wrote:
> > > On 6/10/26 12:41, Gregory Price wrote:
> > > > On Wed, Jun 03, 2026 at 03:00:01PM +1000, Balbir Singh wrote:
> > > > 
> > 
> > For mm/slub.c we can choose to do one of thwo things
> > 
> >   1) 100% refuse slab allocations on private nodes, i.e.:
> > 
> >      kmalloc_node(..., private_nid, __GFP_THISNODE)
> > 
> >      And will fail (return NULL).
> > 
> 
> Doesn't this iterate through N_MEMORY only? N_MEMORY_PRIVATE should not
> be in the regular for_each(...) loops
> 

If a node is in neither FALLBACK nor NOFALLBACK - it is *completely*
unreachable in the current page allocator.

Next RFC I've reduced this to create a ZONELIST_PRIVATE separate from
the ZONELIST_FALLBACK and ZONELIST_NOFALLBACK, and an explicit folio
allocation interface that selects which fallback list to use.

The feedback in the past week has been helpful in homing in on a
solution that I think is generalizable.  Have just been taking the time
to test various behaviors to make sure I haven't been regressing any
userland API/ABIs (mbind, mempolicy, etc).

~Gregory

^ permalink raw reply

* Re: [PATCH v3] mm/lruvec: trace LRU add drains and drain-all requests
From: Shakeel Butt @ 2026-06-17 15:03 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: JP Kobryn, linux-mm, willy, usama.arif, akpm, vbabka, mhocko,
	rostedt, mhiramat, mathieu.desnoyers, kasong, qi.zheng, baohua,
	axelrasmussen, yuanchu, weixugc, chrisl, shikemeng, nphamcs,
	baoquan.he, youngjun.park, linux-kernel, linux-trace-kernel
In-Reply-To: <06122cae-e28b-4ded-a9dd-d380d31c5230@kernel.org>

On Wed, Jun 17, 2026 at 01:11:16PM +0200, David Hildenbrand (Arm) wrote:
> On 6/10/26 21:52, JP Kobryn wrote:
> > LRU add batches can be drained before they reach capacity. This can be a
> > source of LRU lock contention, but it is not currently possible to
> > attribute these drains to callers with existing tracepoints.
> > 
> > Add mm_lru_add_drain to report the CPU and lru_add batch count when an
> > lru_add batch is drained. This allows tracing to distinguish full drains
> > from partial drains and attribute them to the calling stack.
> > 
> > Add mm_lru_add_drain_all to capture callers of __lru_add_drain_all and
> > whether they set the force flag for all CPUs. The tracepoint resembles
> > the signature of the enclosing function, but is needed because of
> > potential inlining.
> > 
> > Signed-off-by: JP Kobryn <jp.kobryn@linux.dev>
> > ---
> >  include/trace/events/pagemap.h | 37 ++++++++++++++++++++++++++++++++++
> >  mm/swap.c                      |  7 ++++++-
> >  2 files changed, 43 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
> > index 171524d3526d..ff3da07ccb40 100644
> > --- a/include/trace/events/pagemap.h
> > +++ b/include/trace/events/pagemap.h
> > @@ -77,6 +77,43 @@ TRACE_EVENT(mm_lru_activate,
> >  	TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
> >  );
> >  
> > +TRACE_EVENT(mm_lru_add_drain,
> > +
> > +	TP_PROTO(int cpu, unsigned int nr),
> > +
> > +	TP_ARGS(cpu, nr),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(int,		cpu	)
> > +		__field(unsigned int,	nr	)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->cpu	= cpu;
> > +		__entry->nr	= nr;
> > +	),
> > +
> > +	TP_printk("cpu=%d nr=%u", __entry->cpu, __entry->nr)
> > +);
> > +
> > +TRACE_EVENT(mm_lru_add_drain_all,
> > +
> > +	TP_PROTO(bool force_all_cpus),
> > +
> > +	TP_ARGS(force_all_cpus),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(bool,	force_all_cpus	)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->force_all_cpus	= force_all_cpus;
> > +	),
> > +
> > +	TP_printk("force_all_cpus=%s",
> > +		__entry->force_all_cpus ? "true" : "false")
> > +);
> > +
> >  #endif /* _TRACE_PAGEMAP_H */
> >  
> >  /* This part must be outside protection */
> > diff --git a/mm/swap.c b/mm/swap.c
> > index 588f50d8f1a8..e14b7612f896 100644
> > --- a/mm/swap.c
> > +++ b/mm/swap.c
> > @@ -694,9 +694,12 @@ void lru_add_drain_cpu(int cpu)
> >  {
> >  	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
> >  	struct folio_batch *fbatch = &fbatches->lru_add;
> > +	unsigned int nr_folios_add = folio_batch_count(fbatch);
> >  
> > -	if (folio_batch_count(fbatch))
> > +	if (nr_folios_add) {
> >  		folio_batch_move_lru(fbatch, lru_add);
> > +		trace_mm_lru_add_drain(cpu, nr_folios_add);
> > +	}
> >  
> >  	fbatch = &fbatches->lru_move_tail;
> >  	/* Disabling interrupts below acts as a compiler barrier. */
> > @@ -869,6 +872,8 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
> >  	if (WARN_ON(!mm_percpu_wq))
> >  		return;
> >  
> > +	trace_mm_lru_add_drain_all(force_all_cpus);
> > +
> >  	/*
> >  	 * Guarantee folio_batch counter stores visible by this CPU
> >  	 * are visible to other CPUs before loading the current drain
> 
> Given that trace events can quickly become stable ABI [1], are we really sure we
> want to add this?

Yes, I think so as this is useful to get insights into lru cache draining.
Trace events being stable or not is secondary IMHO. If in future we rearchitect
the lru page handling where there is no cache draining anymore, we can make
these no-ops.

> 
> [1] https://lore.kernel.org/r/20260603130006.7d2c4a62@gandalf.local.home
> 
> -- 
> Cheers,
> 
> David

^ permalink raw reply

* Re: [PATCH v3 9/9] selftests/verification: add tlob selftests
From: Gabriele Monaco @ 2026-06-17 15:09 UTC (permalink / raw)
  To: wen.yang; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <4aeb668c8446a9f6366d92e218df386bef7bc965.1780847473.git.wen.yang@linux.dev>

On Mon, 2026-06-08 at 00:13 +0800, wen.yang@linux.dev wrote:
> From: Wen Yang <wen.yang@linux.dev>
> 
> Add selftest coverage for the tlob uprobe monitoring interface under
> tools/testing/selftests/verification/.
> 
> test.d/tlob/ contains both the helper sources (tlob_target, tlob_sym)
> and the seven test scripts so the test suite is self-contained.
> tlob_target provides busy-spin, sleep, and preempt workloads;
> tlob_sym
> resolves ELF symbol offsets for uprobe registration.
> 
> Seven test scripts exercise uprobe binding management, budget
> violation
> detection, and per-state time accounting (running_ns, waiting_ns,
> sleeping_ns).
> 
> Signed-off-by: Wen Yang <wen.yang@linux.dev>

Tests look fine and coverage is good, thanks!

Minor comments follow.

> ---
>  .../testing/selftests/verification/.gitignore |   2 +
>  tools/testing/selftests/verification/Makefile |  19 +-
>  .../verification/test.d/tlob/Makefile         |  20 ++
>  .../verification/test.d/tlob/test.d/functions |   1 +
>  .../verification/test.d/tlob/tlob_sym.c       | 189
> ++++++++++++++++++
>  .../verification/test.d/tlob/tlob_target.c    | 138 +++++++++++++
>  .../verification/test.d/tlob/uprobe_bind.tc   |  37 ++++

>  .../test.d/tlob/uprobe_detail_running.tc      |  51 +++++
>  .../test.d/tlob/uprobe_detail_sleeping.tc     |  50 +++++
>  .../test.d/tlob/uprobe_detail_waiting.tc      |  66 ++++++

Not sure if this would work, but just to lower the maintenance burden,
couldn't we put these 3 in the same test case? You could define a bash
function and pass "running", "sleeping" or "waiting" and whether to launch the
hog to that.

Only waiting uses a taskset and a slightly different ordering, but wouldn't
they all work fine like that?

...

> a/tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
> b/tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
> new file mode 100644
> index 000000000000..1b7ba1c6d95b
> --- /dev/null
> +++ b/tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
> @@ -0,0 +1,189 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * tlob_sym.c - ELF symbol-to-file-offset utility for tlob selftests
> + *
> + * Usage: tlob_sym sym_offset <binary> <symbol>
> + *
> + *   Prints the ELF file offset of <symbol> in <binary> to stdout.
> + *
> + * Exit: 0 = found, 1 = error / not found.
> + */

I wonder if instead of maintaining a pure C solution we couldn't live
with something like:

  sym_offset() { # target symbol
    readelf -W -S -s $1 | awk -v symbol="$2" '
      { gsub(/\[ /, "[") }  # normalise section markers
      $1 ~ /^\[[0-9]+\]$/ { sections[$1]="0x"$4; offsets[$1]="0x"$5 }
      $1 ~ /^[0-9]+:$/ && $NF == symbol { addr="0x"$2; sec="["$7"]" }
      END { printf "printf \"0x%%x\\n\" $((%s - %s + %s))\n", addr, sections[sec], offsets[sec] }
    ' | sh
  }

...

> diff --git
> a/tools/testing/selftests/verification/test.d/tlob/tlob_target.c
> b/tools/testing/selftests/verification/test.d/tlob/tlob_target.c
> new file mode 100644
> index 000000000000..0fdbc575d71d
> --- /dev/null
> +++ b/tools/testing/selftests/verification/test.d/tlob/tlob_target.c
> @@ -0,0 +1,138 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * tlob_target.c - uprobe target binary for tlob selftests.
> + *
> + * Provides three start/stop probe pairs, each designed to exercise
> a
> + * different dominant component of the detail_env_tlob ns breakdown:
> + *
> + *   tlob_busy_work    / tlob_busy_work_done    - busy-spin:
> running_ns dominates
> + *   tlob_sleep_work   / tlob_sleep_work_done   - nanosleep:
> sleeping_ns dominates
> + *   tlob_preempt_work / tlob_preempt_work_done - busy-spin:
> waiting_ns dominates
> + *                                                (needs an RT

In short, tlob_preempt_work is the same as tlob_busy_work, isn't it? Do we need
them both? Can't you just have a hog in the test and keep using the same
function?

...

> +
> +	do {
> +		if (strcmp(mode, "sleep") == 0)
> +			tlob_sleep_work(200);
> +		else if (strcmp(mode, "preempt") == 0)
> +			tlob_preempt_work(200);
> +		else
> +			tlob_busy_work(200 * 1000000UL);

The only difference I see is that you multiply by 1000000UL here for busy and
in the function for preempt.
Can't we make them all consistent (call with 200 and do the math inside)?


> diff --git
> a/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
> b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
> new file mode 100644
> index 000000000000..1ac3db6ca7bb
> --- /dev/null
> +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
> @@ -0,0 +1,37 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# description: Test tlob monitor uprobe binding (visible in monitor
> file, removable, duplicate rejected)
> +# requires: tlob:monitor
> +
> +RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
> +UPROBE_TARGET="${RV_BINDIR}/tlob_target"
> +TLOB_SYM="${RV_BINDIR}/tlob_sym"
> +[ -x "$UPROBE_TARGET" ] || exit_unsupported
> +[ -x "$TLOB_SYM" ]      || exit_unsupported

If those aren't ready, the build system didn't work; I don't think we need to
check here, as it's just a clear error.

> +TLOB_MONITOR=monitors/tlob/monitor
> +
> +busy_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work
> 2>/dev/null)
> +stop_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET"
> tlob_busy_work_done 2>/dev/null)
> +[ -n "$busy_offset" ] || exit_unsupported
> +[ -n "$stop_offset" ] || exit_unsupported

Kind of the same here: the rest of the test should probably fail (EINVAL in the
monitor or whatever). The script will print everything with set -x, so it
should be clear what was missing.

> +command -v chrt    > /dev/null || exit_unsupported
> +command -v taskset > /dev/null || exit_unsupported

Not sure how common it is not to have those, but this is exactly what the
:program under requires: is for (see rv_wwnr_printk with stress-ng).

Thanks,
Gabriele


^ permalink raw reply

* [GIT PULL v2] RTLA additional fixes for v7.2
From: Tomas Glozar @ 2026-06-17 15:30 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: LKML, linux-trace-kernel, Tomas Glozar

Steven,

The following changes since commit 6b5a2b7d9bc156e505f09e698d85d6a1547c1206:

  Merge tag 'trace-tools-v7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace (2026-06-16 17:50:34 +0530)

are available in the Git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/tglozar/linux.git tags/rtla-v7.2-fixups-v2

for you to fetch changes up to c35eb77a67515d4201bc91294f40761591f43bbd:

  rtla/tests: Fix pgrep filter in get_workload_pids.sh (2026-06-17 16:26:44 +0200)

----------------------------------------------------------------
RTLA additional fixes for v7.2

- Fix and clean up .gitignore

Narrow match range of entries in .gitignore to only what is needed,
fixing "lib/" matching tools/tracing/rtla/tests/scripts/lib/*.

- Fix pgrep filter in runtime tests

Make the pgrep filter used by runtime tests to get workload PIDs work
on both older and newer versions of pgrep, regardless of whether
square brackets are counted as part of kthread comm or not.

Build, runtime tests, unit tests pass.

v2:
- Rebase onto 6b5a2b7d9bc156e505f09e698d85d6a1547c1206 to avoid merge
  conflicts.

Signed-off-by: Tomas Glozar <tglozar@redhat.com>

----------------------------------------------------------------
Tomas Glozar (2):
      rtla: Fix and clean up .gitignore
      rtla/tests: Fix pgrep filter in get_workload_pids.sh

 tools/tracing/rtla/.gitignore                             | 13 ++++---------
 tools/tracing/rtla/tests/scripts/lib/get_workload_pids.sh |  2 +-
 2 files changed, 5 insertions(+), 10 deletions(-)


^ permalink raw reply

* Re: [PATCH 0/3] rv/reactors: fix lockdep warning and add KUnit tests
From: Gabriele Monaco @ 2026-06-17 15:41 UTC (permalink / raw)
  To: wen.yang; +Cc: Nam Cao, linux-trace-kernel, linux-kernel
In-Reply-To: <cover.1781541556.git.wen.yang@linux.dev>

On Tue, 2026-06-16 at 00:44 +0800, wen.yang@linux.dev wrote:
> From: Wen Yang <wen.yang@linux.dev>
> 
> We occasionally hit a lockdep "Invalid wait context" warning in
> production
> environments when rv_react() callbacks are interrupted.
> 
> The bug is intermittent in production. KUnit tests with busy-wait
> callbacks
> can reproduce it by holding the CPU long enough for a timer interrupt
> to fire
> during rv_react(), exposing the lockdep constraint violation:
> 
> [   44.820913] =============================
> [   44.820923] [ BUG: Invalid wait context ]
> [   44.821137] 7.1.0-rc7-next-20260612-virtme #6 Tainted:
> G                 N
> [   44.821203] -----------------------------

It's nice to have reactors kunit coverage, I need to go through them
more carefully but I like the idea.

Are those tests supposed to trigger this issue though? Under what
configuration?

I reverted the lockdep fix and ran the tests in vng on both x86_64 and
arm64, both with preempt_rt and without, but I saw no splat.
Repeating the tests multiple times from debugfs also didn't seem to
help. Both machines were relatively large (128 and 48 CPUs).

The config was the bare vng one with kunit built-in, lockdep and the
reactors tests.

What am I missing?

Thanks,
Gabriele

> [   44.821211] kunit_try_catch/209 is trying to lock:
> [   44.821244] ffff8a743ed3e8a0 (&rq->__lock){-...}-{2:2}, at:
> __schedule+0x102/0x13d0
> [   44.821688] other info that might help us debug this:
> [   44.821708] context-{5:5}
> [   44.821730] 1 lock held by kunit_try_catch/209:
> [   44.821745]  #0: ffffffffb6ba62c0 (rv_react_map-wait-type-
> override){+.+.}-{1:1}, at: rv_react+0x9d/0xf0
> [   44.821803] stack backtrace:
> [   44.822110] CPU: 10 UID: 0 PID: 209 Comm: kunit_try_catch Tainted:
> G                 N  7.1.0-rc7-next-20260612-virtme #6
> PREEMPT_{RT,(full)}
> [   44.822197] Tainted: [N]=TEST
> [   44.822210] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX,
> arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [   44.822328] Call Trace:
> [   44.822377]  <TASK>
> [   44.822806]  dump_stack_lvl+0x78/0xe0
> [   44.822860]  __lock_acquire+0x926/0x1c90
> [   44.822888]  lock_acquire+0xd3/0x310
> [   44.822901]  ? __schedule+0x102/0x13d0
> [   44.822919]  ? rcu_qs+0x2d/0x1a0
> [   44.822954]  _raw_spin_lock_nested+0x36/0x50
> [   44.822966]  ? __schedule+0x102/0x13d0
> [   44.822979]  __schedule+0x102/0x13d0
> [   44.822993]  ? mark_held_locks+0x40/0x70
> [   44.823009]  preempt_schedule_irq+0x37/0x70
> [   44.823018]  irqentry_exit+0x1da/0x8c0
> [   44.823032]  asm_sysvec_apic_timer_interrupt+0x1a/0x20
> [   44.823093] RIP: 0010:mock_printk_react+0x2a/0x50
> [   44.823250] Code: f3 0f 1e fa 0f 1f 44 00 00 41 54 49 89 f4 55 48
> 89 fd 53 e8 18 8b db ff 4c 89 e6 48 89 ef 48 89 c3 e8 fa 8e ed ff eb
> 02 f3 90 <e8> 01 8b db ff 48 29 d8 48 3d 3f 4b 4c 00 76 ee 5b 5d 41
> 5c c3 cc
> [   44.823303] RSP: 0018:ffffd1c3c0733d38 EFLAGS: 00000297
> [   44.823332] RAX: 00000000000119f3 RBX: 0000000a74e60d1c RCX:
> 000000000000001f
> [   44.823342] RDX: 0000000000000000 RSI: 000000003348c8a2 RDI:
> ffffffffc1abbfd9
> [   44.823351] RBP: ffffffffb671b613 R08: 0000000000000002 R09:
> 0000000000000000
> [   44.823359] R10: 0000000000000001 R11: 0000000000000000 R12:
> ffffd1c3c0733d60
> [   44.823367] R13: ffffffffb575a5fd R14: ffffd1c3c0017be8 R15:
> ffffd1c3c00179f8
> [   44.823397]  ? rv_react+0x9d/0xf0
> [   44.823437]  ? mock_printk_react+0x2f/0x50
> [   44.823448]  rv_react+0xb4/0xf0
> [   44.823455]  ? rv_react+0x9d/0xf0
> [   44.823476]  test_printk_react_called+0x83/0xb0
> [   44.823486]  ? __pfx_mock_printk_react+0x10/0x10
> [   44.823502]  ? __pfx_mock_printk_react+0x10/0x10
> [   44.823513]  kunit_try_run_case+0x97/0x190
> [   44.823534]  ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10
> [   44.823544]  kunit_generic_run_threadfn_adapter+0x21/0x40
> [   44.823551]  kthread+0x124/0x160
> [   44.823562]  ? __pfx_kthread+0x10/0x10
> [   44.823574]  ret_from_fork+0x291/0x3b0
> [   44.823585]  ? __pfx_kthread+0x10/0x10
> [   44.823595]  ret_from_fork_asm+0x1a/0x30
> [   44.823641]  </TASK>
> 
> 
> Patch 1 fixes the lockdep bug by correcting rv_react()'s
> wait_type_inner
> from LD_WAIT_CONFIG (which inherits the outer context) to
> LD_WAIT_SPIN
> (the tightest constraint callbacks must satisfy).
> 
> Patch 2 adds KUnit tests for reactor_printk. The busy-wait in the
> mock
> callback reproduces the timer interrupt scenario that exposes the
> bug.
> 
> Patch 3 adds KUnit tests for reactor_panic, exercising the panic
> notifier
> chain without halting the system.
> 
> Tested with CONFIG_PROVE_LOCKING=y and CONFIG_KUNIT=y.
> 
> 
> Wen Yang (3):
>   rv/reactors: fix lockdep "Invalid wait context" in rv_react()
>   rv/reactors: add KUnit tests for reactor_printk
>   rv/reactors: add KUnit tests for reactor_panic
> 
>  kernel/trace/rv/Kconfig                |  20 ++++
>  kernel/trace/rv/Makefile               |   2 +
>  kernel/trace/rv/reactor_panic_kunit.c  | 106 +++++++++++++++++++++
>  kernel/trace/rv/reactor_printk_kunit.c | 123
> +++++++++++++++++++++++++
>  kernel/trace/rv/rv_reactors.c          |   8 +-
>  5 files changed, 258 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/trace/rv/reactor_panic_kunit.c
>  create mode 100644 kernel/trace/rv/reactor_printk_kunit.c


^ permalink raw reply

* Re: [PATCH 0/3] rv/reactors: fix lockdep warning and add KUnit tests
From: Nam Cao @ 2026-06-17 15:52 UTC (permalink / raw)
  To: Gabriele Monaco, wen.yang; +Cc: linux-trace-kernel, linux-kernel
In-Reply-To: <2bcfa0bda551c0e1ba137b728dbe7886ff5c2579.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> Are those tests supposed to trigger this issue though? Under what
> configuration?
>
> I reverted the lockdep fix and run the tests in vng on both x86_64 and
> arm64, both preempt_rt and not but I see no splat.
> Repeating the tests multiple times from debugfs also didn't seem to
> help. Both machines were relatively large (128 and 48 CPUs).
>
> The config was the bare vng one with kunit built-in, lockdep and the
> reactors tests.
>
> What am I missing?

I haven't tried to reproduce it, but it seems quite rare. From the look of
it, adding some delay into the reactor function should make the issue
more easily reproducible.

Nam

^ permalink raw reply

* Re: [PATCH 1/3] rv/reactors: fix lockdep "Invalid wait context" in rv_react()
From: Nam Cao @ 2026-06-17 15:58 UTC (permalink / raw)
  To: wen.yang, Gabriele Monaco
  Cc: linux-trace-kernel, linux-kernel, Wen Yang, Thomas Weißschuh
In-Reply-To: <bc01343ae74acf6bdf142434aeaa4e6b40aa72a9.1781541556.git.wen.yang@linux.dev>

wen.yang@linux.dev writes:
>  void rv_react(struct rv_monitor *monitor, const char *msg, ...)
>  {
> -	static DEFINE_WAIT_OVERRIDE_MAP(rv_react_map, LD_WAIT_FREE);
> +#ifdef CONFIG_LOCKDEP
> +	static struct lockdep_map rv_react_map = {
> +		.name = "rv_react",
> +		.wait_type_outer = LD_WAIT_FREE,
> +		.wait_type_inner = LD_WAIT_SPIN,
> +	};
> +#endif
>  	va_list args;
>  
>  	if (!rv_reacting_on() || !monitor->react)

From my limited understanding of lockdep, this looks fine to me. It will now
not warn us if a reactor takes a raw_spin_lock, but I think that's fine.

But I would wait for Thomas's thoughts on this. He will be back next
week.

Nam

^ permalink raw reply

* Re: [PATCH 0/3] rv/reactors: fix lockdep warning and add KUnit tests
From: Gabriele Monaco @ 2026-06-17 16:14 UTC (permalink / raw)
  To: Nam Cao, wen.yang; +Cc: linux-trace-kernel, linux-kernel
In-Reply-To: <874ij16u6i.fsf@yellow.woof>

On Wed, 2026-06-17 at 17:52 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> > Are those tests supposed to trigger this issue though? Under what
> > configuration?
> > 
> > I reverted the lockdep fix and run the tests in vng on both x86_64
> > and arm64, both preempt_rt and not but I see no splat.
> > Repeating the tests multiple times from debugfs also didn't seem to
> > help. Both machines were relatively large (128 and 48 CPUs).
> > 
> > The config was the bare vng one with kunit built-in, lockdep and
> > the reactors tests.
> > 
> > What am I missing?
> 
> I haven't tried to reproduce it, but seems quite rare. From the look
> of it, adding some delay into the reactor function should make the
> issue more easily reproducible.

Yeah, the tests should be doing that, but even increasing the delay
didn't help. I should probably try on physical machines, where interrupts
are more likely, but at least the tick should be running.


^ permalink raw reply

* Re: [PATCH 0/3] rv/reactors: fix lockdep warning and add KUnit tests
From: Wen Yang @ 2026-06-17 17:11 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Nam Cao, linux-trace-kernel, linux-kernel
In-Reply-To: <2bcfa0bda551c0e1ba137b728dbe7886ff5c2579.camel@redhat.com>



On 6/17/26 23:41, Gabriele Monaco wrote:
> On Tue, 2026-06-16 at 00:44 +0800, wen.yang@linux.dev wrote:
>> From: Wen Yang <wen.yang@linux.dev>
>>
>> We occasionally hit a lockdep "Invalid wait context" warning in
>> production
>> environments when rv_react() callbacks are interrupted.
>>
>> The bug is intermittent in production. KUnit tests with busy-wait
>> callbacks
>> can reproduce it by holding the CPU long enough for a timer interrupt
>> to fire
>> during rv_react(), exposing the lockdep constraint violation:
>>
>> [   44.820913] =============================
>> [   44.820923] [ BUG: Invalid wait context ]
>> [   44.821137] 7.1.0-rc7-next-20260612-virtme #6 Tainted:
>> G                 N
>> [   44.821203] -----------------------------
> 
> It's nice to have reactors kunit coverage, I need to go through them
> more carefully but I like the idea.
> 
> Are those tests supposed to trigger this issue though? Under what
> configuration?
> 
> I reverted the lockdep fix and run the tests in vng on both x86_64 and
> arm64, both preempt_rt and not but I see no splat.
> Repeating the tests multiple times from debugfs also didn't seem to
> help. Both machines were relatively large (128 and 48 CPUs).
> 
> The config was the bare vng one with kunit built-in, lockdep and the
> reactors tests.
> 
> What am I missing?
> 

Thank you for your feedback.
I am using a WSL dev environment with 12 cores and 16GB. The config of 
the tested kernel code is as follows:


$ make savedefconfig

$ cat defconfig
CONFIG_WERROR=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_PREEMPT=y
CONFIG_PREEMPT_RT=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
CONFIG_BLK_CGROUP=y
CONFIG_CGROUP_SCHED=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_BPF=y
CONFIG_CGROUP_MISC=y
CONFIG_CGROUP_DEBUG=y
CONFIG_NAMESPACES=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
CONFIG_PROFILING=y
CONFIG_KEXEC=y
CONFIG_SMP=y
CONFIG_IOSF_MBI=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
# CONFIG_MTRR_SANITIZER is not set
CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_EFI_MIXED=y
CONFIG_HZ_1000=y
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
CONFIG_ACPI_VIDEO=y
CONFIG_ACPI_DOCK=y
CONFIG_ACPI_BGRT=y
CONFIG_IA32_EMULATION=y
CONFIG_KVM=y
CONFIG_KVM_INTEL=y
CONFIG_KVM_AMD=y
# CONFIG_SCHED_MC is not set
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_IOCOST=y
CONFIG_BLK_CGROUP_IOPRIO=y
CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_ZONE_DEVICE=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
CONFIG_IP_MROUTE=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
CONFIG_SYN_COOKIES=y
# CONFIG_INET_DIAG is not set
CONFIG_TCP_CONG_ADVANCED=y
# CONFIG_TCP_CONG_BIC is not set
# CONFIG_TCP_CONG_WESTWOOD is not set
# CONFIG_TCP_CONG_HTCP is not set
# CONFIG_IPV6 is not set
CONFIG_NETWORK_SECMARK=y
CONFIG_NET_SCHED=y
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_EMATCH=y
CONFIG_NET_CLS_ACT=y
CONFIG_DNS_RESOLVER=y
CONFIG_CGROUP_NET_PRIO=y
# CONFIG_WIRELESS is not set
CONFIG_NET_9P=y
CONFIG_NET_9P_VIRTIO=y
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI=y
CONFIG_PCCARD=y
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
CONFIG_CONNECTOR=y
CONFIG_FW_CFG_SYSFS=y
CONFIG_FW_CFG_SYSFS_CMDLINE=y
# CONFIG_EFI_DISABLE_RUNTIME is not set
CONFIG_BLK_DEV_LOOP=y
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
CONFIG_CHR_DEV_SG=y
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_SPI_ATTRS=y
CONFIG_SCSI_VIRTIO=y
CONFIG_ATA=y
CONFIG_SATA_AHCI=y
CONFIG_ATA_PIIX=y
CONFIG_PATA_AMD=y
CONFIG_PATA_OLDPIIX=y
CONFIG_PATA_SCH=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_BLK_DEV_DM=y
CONFIG_DM_MIRROR=y
CONFIG_DM_ZERO=y
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
CONFIG_NETCONSOLE=y
CONFIG_VIRTIO_NET=y
# CONFIG_ETHERNET is not set
CONFIG_PHYLIB=y
CONFIG_REALTEK_PHY=y
# CONFIG_WLAN is not set
CONFIG_INPUT_FF_MEMLESS=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
CONFIG_INPUT_TOUCHSCREEN=y
CONFIG_INPUT_MISC=y
# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_NR_UARTS=32
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_NONSTANDARD=y
CONFIG_VIRTIO_CONSOLE=y
CONFIG_HW_RANDOM=y
# CONFIG_HW_RANDOM_INTEL is not set
# CONFIG_HW_RANDOM_AMD is not set
CONFIG_NVRAM=y
CONFIG_HPET=y
# CONFIG_HPET_MMAP is not set
CONFIG_I2C_I801=y
CONFIG_PTP_1588_CLOCK=y
CONFIG_WATCHDOG=y
CONFIG_I6300ESB_WDT=y
CONFIG_AGP=y
CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
CONFIG_DRM=y
# CONFIG_DRM_FBDEV_EMULATION is not set
CONFIG_DRM_BOCHS=y
CONFIG_DRM_VIRTIO_GPU=y
CONFIG_FB=y
CONFIG_FB_VESA=y
CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_SOUND=y
CONFIG_SND=y
CONFIG_SND_HRTIMER=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
# CONFIG_SND_DRIVERS is not set
CONFIG_SND_INTEL8X0=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_CODEC_REALTEK=y
# CONFIG_SND_PCMCIA is not set
# CONFIG_SND_X86 is not set
# CONFIG_HID is not set
CONFIG_RTC_CLASS=y
CONFIG_DMADEVICES=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_BALLOON=y
CONFIG_VIRTIO_INPUT=y
CONFIG_VIRTIO_MMIO=y
CONFIG_EEEPC_LAPTOP=y
CONFIG_ACPI_WMI=y
CONFIG_MAILBOX=y
CONFIG_PCC=y
CONFIG_AMD_IOMMU=y
CONFIG_INTEL_IOMMU=y
# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
CONFIG_IRQ_REMAP=y
CONFIG_VIRTIO_IOMMU=y
CONFIG_FS_DAX=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_QFMT_V2=y
CONFIG_FUSE_FS=y
CONFIG_VIRTIO_FS=y
CONFIG_OVERLAY_FS=y
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
CONFIG_SQUASHFS=y
CONFIG_SQUASHFS_XZ=y
CONFIG_SQUASHFS_ZSTD=y
CONFIG_9P_FS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_KEYS=y
CONFIG_SECURITYFS=y
CONFIG_CRYPTO_AUTHENC=y
CONFIG_CRYPTO_RSA=y
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_CBC=y
CONFIG_CRYPTO_CCM=y
CONFIG_CRYPTO_GCM=y
CONFIG_CRYPTO_SEQIV=y
CONFIG_CRYPTO_ECHAINIV=y
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_SHA256=y
CONFIG_ASYMMETRIC_KEY_TYPE=y
CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
CONFIG_X509_CERTIFICATE_PARSER=y
CONFIG_PKCS7_MESSAGE_PARSER=y
CONFIG_SYSTEM_TRUSTED_KEYRING=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_SCHEDSTATS=y
CONFIG_DEBUG_PREEMPT=y
CONFIG_DEBUG_ATOMIC=y
CONFIG_PROVE_LOCKING=y
CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_CSD_LOCK_WAIT_DEBUG=y
CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT=y
CONFIG_DEBUG_KOBJECT=y
CONFIG_FUNCTION_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_RV=y
CONFIG_RV_MON_WWNR=y
CONFIG_RV_MON_RTAPP=y
CONFIG_RV_MON_STALL=y
CONFIG_RV_MON_DEADLINE=y
CONFIG_RV_REACT_PRINTK_KUNIT=y
CONFIG_RV_REACT_PANIC_KUNIT=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_DEBUG_ENTRY=y
CONFIG_KUNIT=y
# CONFIG_KUNIT_DEBUGFS is not set


Then, building with vng and running the kselftests (since kunit is already
built-in) reproduces this issue:

$ vng --build

$ vng -v --run arch/x86/boot/bzImage --user root -- tools/testing/selftests/verification/verificationtest-ktap


--
Best wishes,
Wen


> Thanks,
> Gabriele
> 
>> [   44.821211] kunit_try_catch/209 is trying to lock:
>> [   44.821244] ffff8a743ed3e8a0 (&rq->__lock){-...}-{2:2}, at:
>> __schedule+0x102/0x13d0
>> [   44.821688] other info that might help us debug this:
>> [   44.821708] context-{5:5}
>> [   44.821730] 1 lock held by kunit_try_catch/209:
>> [   44.821745]  #0: ffffffffb6ba62c0 (rv_react_map-wait-type-
>> override){+.+.}-{1:1}, at: rv_react+0x9d/0xf0
>> [   44.821803] stack backtrace:
>> [   44.822110] CPU: 10 UID: 0 PID: 209 Comm: kunit_try_catch Tainted:
>> G                 N  7.1.0-rc7-next-20260612-virtme #6
>> PREEMPT_{RT,(full)}
>> [   44.822197] Tainted: [N]=TEST
>> [   44.822210] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX,
>> arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
>> [   44.822328] Call Trace:
>> [   44.822377]  <TASK>
>> [   44.822806]  dump_stack_lvl+0x78/0xe0
>> [   44.822860]  __lock_acquire+0x926/0x1c90
>> [   44.822888]  lock_acquire+0xd3/0x310
>> [   44.822901]  ? __schedule+0x102/0x13d0
>> [   44.822919]  ? rcu_qs+0x2d/0x1a0
>> [   44.822954]  _raw_spin_lock_nested+0x36/0x50
>> [   44.822966]  ? __schedule+0x102/0x13d0
>> [   44.822979]  __schedule+0x102/0x13d0
>> [   44.822993]  ? mark_held_locks+0x40/0x70
>> [   44.823009]  preempt_schedule_irq+0x37/0x70
>> [   44.823018]  irqentry_exit+0x1da/0x8c0
>> [   44.823032]  asm_sysvec_apic_timer_interrupt+0x1a/0x20
>> [   44.823093] RIP: 0010:mock_printk_react+0x2a/0x50
>> [   44.823250] Code: f3 0f 1e fa 0f 1f 44 00 00 41 54 49 89 f4 55 48
>> 89 fd 53 e8 18 8b db ff 4c 89 e6 48 89 ef 48 89 c3 e8 fa 8e ed ff eb
>> 02 f3 90 <e8> 01 8b db ff 48 29 d8 48 3d 3f 4b 4c 00 76 ee 5b 5d 41
>> 5c c3 cc
>> [   44.823303] RSP: 0018:ffffd1c3c0733d38 EFLAGS: 00000297
>> [   44.823332] RAX: 00000000000119f3 RBX: 0000000a74e60d1c RCX:
>> 000000000000001f
>> [   44.823342] RDX: 0000000000000000 RSI: 000000003348c8a2 RDI:
>> ffffffffc1abbfd9
>> [   44.823351] RBP: ffffffffb671b613 R08: 0000000000000002 R09:
>> 0000000000000000
>> [   44.823359] R10: 0000000000000001 R11: 0000000000000000 R12:
>> ffffd1c3c0733d60
>> [   44.823367] R13: ffffffffb575a5fd R14: ffffd1c3c0017be8 R15:
>> ffffd1c3c00179f8
>> [   44.823397]  ? rv_react+0x9d/0xf0
>> [   44.823437]  ? mock_printk_react+0x2f/0x50
>> [   44.823448]  rv_react+0xb4/0xf0
>> [   44.823455]  ? rv_react+0x9d/0xf0
>> [   44.823476]  test_printk_react_called+0x83/0xb0
>> [   44.823486]  ? __pfx_mock_printk_react+0x10/0x10
>> [   44.823502]  ? __pfx_mock_printk_react+0x10/0x10
>> [   44.823513]  kunit_try_run_case+0x97/0x190
>> [   44.823534]  ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10
>> [   44.823544]  kunit_generic_run_threadfn_adapter+0x21/0x40
>> [   44.823551]  kthread+0x124/0x160
>> [   44.823562]  ? __pfx_kthread+0x10/0x10
>> [   44.823574]  ret_from_fork+0x291/0x3b0
>> [   44.823585]  ? __pfx_kthread+0x10/0x10
>> [   44.823595]  ret_from_fork_asm+0x1a/0x30
>> [   44.823641]  </TASK>
>>
>>
>> Patch 1 fixes the lockdep bug by correcting rv_react()'s
>> wait_type_inner
>> from LD_WAIT_CONFIG (which inherits the outer context) to
>> LD_WAIT_SPIN
>> (the tightest constraint callbacks must satisfy).
>>
>> Patch 2 adds KUnit tests for reactor_printk. The busy-wait in the
>> mock
>> callback reproduces the timer interrupt scenario that exposes the
>> bug.
>>
>> Patch 3 adds KUnit tests for reactor_panic, exercising the panic
>> notifier
>> chain without halting the system.
>>
>> Tested with CONFIG_PROVE_LOCKING=y and CONFIG_KUNIT=y.
>>
>>
>> Wen Yang (3):
>>    rv/reactors: fix lockdep "Invalid wait context" in rv_react()
>>    rv/reactors: add KUnit tests for reactor_printk
>>    rv/reactors: add KUnit tests for reactor_panic
>>
>>   kernel/trace/rv/Kconfig                |  20 ++++
>>   kernel/trace/rv/Makefile               |   2 +
>>   kernel/trace/rv/reactor_panic_kunit.c  | 106 +++++++++++++++++++++
>>   kernel/trace/rv/reactor_printk_kunit.c | 123
>> +++++++++++++++++++++++++
>>   kernel/trace/rv/rv_reactors.c          |   8 +-
>>   5 files changed, 258 insertions(+), 1 deletion(-)
>>   create mode 100644 kernel/trace/rv/reactor_panic_kunit.c
>>   create mode 100644 kernel/trace/rv/reactor_printk_kunit.c
> 

^ permalink raw reply

* Re: [PATCH v3] mm/lruvec: trace LRU add drains and drain-all requests
From: Vlastimil Babka (SUSE) @ 2026-06-17 18:18 UTC (permalink / raw)
  To: Shakeel Butt, David Hildenbrand (Arm)
  Cc: JP Kobryn, linux-mm, willy, usama.arif, akpm, mhocko, rostedt,
	mhiramat, mathieu.desnoyers, kasong, qi.zheng, baohua,
	axelrasmussen, yuanchu, weixugc, chrisl, shikemeng, nphamcs,
	baoquan.he, youngjun.park, linux-kernel, linux-trace-kernel
In-Reply-To: <ajK1YsIJmD2ImbAk@linux.dev>

On 6/17/26 17:03, Shakeel Butt wrote:
> On Wed, Jun 17, 2026 at 01:11:16PM +0200, David Hildenbrand (Arm) wrote:
>> On 6/10/26 21:52, JP Kobryn wrote:
>> > LRU add batches can be drained before they reach capacity. This can be a
>> > source of LRU lock contention, but it is not currently possible to
>> > attribute these drains to callers with existing tracepoints.
>> > 
>> > Add mm_lru_add_drain to report the CPU and lru_add batch count when an
>> > lru_add batch is drained. This allows tracing to distinguish full drains
>> > from partial drains and attribute them to the calling stack.
>> > 
>> > Add mm_lru_add_drain_all to capture callers of __lru_add_drain_all and
>> > whether they set the force flag for all CPUs. The tracepoint resembles
>> > the signature of the enclosing function, but is needed because of
>> > potential inlining.
>> > 
>> > Signed-off-by: JP Kobryn <jp.kobryn@linux.dev>
>> > ---
>> >  include/trace/events/pagemap.h | 37 ++++++++++++++++++++++++++++++++++
>> >  mm/swap.c                      |  7 ++++++-
>> >  2 files changed, 43 insertions(+), 1 deletion(-)
>> > 
>> > diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
>> > index 171524d3526d..ff3da07ccb40 100644
>> > --- a/include/trace/events/pagemap.h
>> > +++ b/include/trace/events/pagemap.h
>> > @@ -77,6 +77,43 @@ TRACE_EVENT(mm_lru_activate,
>> >  	TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
>> >  );
>> >  
>> > +TRACE_EVENT(mm_lru_add_drain,
>> > +
>> > +	TP_PROTO(int cpu, unsigned int nr),
>> > +
>> > +	TP_ARGS(cpu, nr),
>> > +
>> > +	TP_STRUCT__entry(
>> > +		__field(int,		cpu	)
>> > +		__field(unsigned int,	nr	)
>> > +	),
>> > +
>> > +	TP_fast_assign(
>> > +		__entry->cpu	= cpu;
>> > +		__entry->nr	= nr;
>> > +	),
>> > +
>> > +	TP_printk("cpu=%d nr=%u", __entry->cpu, __entry->nr)
>> > +);
>> > +
>> > +TRACE_EVENT(mm_lru_add_drain_all,
>> > +
>> > +	TP_PROTO(bool force_all_cpus),
>> > +
>> > +	TP_ARGS(force_all_cpus),
>> > +
>> > +	TP_STRUCT__entry(
>> > +		__field(bool,	force_all_cpus	)
>> > +	),
>> > +
>> > +	TP_fast_assign(
>> > +		__entry->force_all_cpus	= force_all_cpus;
>> > +	),
>> > +
>> > +	TP_printk("force_all_cpus=%s",
>> > +		__entry->force_all_cpus ? "true" : "false")
>> > +);
>> > +
>> >  #endif /* _TRACE_PAGEMAP_H */
>> >  
>> >  /* This part must be outside protection */
>> > diff --git a/mm/swap.c b/mm/swap.c
>> > index 588f50d8f1a8..e14b7612f896 100644
>> > --- a/mm/swap.c
>> > +++ b/mm/swap.c
>> > @@ -694,9 +694,12 @@ void lru_add_drain_cpu(int cpu)
>> >  {
>> >  	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
>> >  	struct folio_batch *fbatch = &fbatches->lru_add;
>> > +	unsigned int nr_folios_add = folio_batch_count(fbatch);
>> >  
>> > -	if (folio_batch_count(fbatch))
>> > +	if (nr_folios_add) {
>> >  		folio_batch_move_lru(fbatch, lru_add);
>> > +		trace_mm_lru_add_drain(cpu, nr_folios_add);
>> > +	}
>> >  
>> >  	fbatch = &fbatches->lru_move_tail;
>> >  	/* Disabling interrupts below acts as a compiler barrier. */
>> > @@ -869,6 +872,8 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
>> >  	if (WARN_ON(!mm_percpu_wq))
>> >  		return;
>> >  
>> > +	trace_mm_lru_add_drain_all(force_all_cpus);
>> > +
>> >  	/*
>> >  	 * Guarantee folio_batch counter stores visible by this CPU
>> >  	 * are visible to other CPUs before loading the current drain
>> 
>> Given that trace events can quickly become stable ABI [1], are we really sure we
>> want to add this?
> 
> Yes, I think so as this is useful to get insights into lru cache draining.
> Trace events being stable or not is secondary IMHO. If in future we rearchitect
> the lru page handling where there is no cache draining anymore, we can make
> these a noops.

Yeah, and I don't recall a change to an mm tracepoint ever breaking someone
who would then complain, forcing us to revert it. These are niche enough, so
I think the risk is low.

>> 
>> [1] https://lore.kernel.org/r/20260603130006.7d2c4a62@gandalf.local.home
>> 
>> -- 
>> Cheers,
>> 
>> David


^ permalink raw reply

* Re: [RFC PATCH 1/3] mm/compaction: skip isolate mlocked folios when compact_unevictable_allowed=0
From: Vlastimil Babka (SUSE) @ 2026-06-17 18:52 UTC (permalink / raw)
  To: Wandun Chen, linux-mm, linux-kernel, linux-trace-kernel,
	linux-rt-devel
  Cc: akpm, surenb, mhocko, jackmanb, hannes, ziy, rostedt, mhiramat,
	mathieu.desnoyers, david, ljs, liam, rppt, bigeasy, clrkwllms,
	Alexander.Krabler, Hugh Dickins
In-Reply-To: <20260604023812.3700316-2-chenwandun1@gmail.com>

On 6/4/26 04:38, Wandun Chen wrote:
> From: Wandun Chen <chenwandun@lixiang.com>
> 
> compact_unevictable_allowed is default 0 under PREEMPT_RT,
> isolate_migratepages_block() skips folios with PG_unevictable set.
> However, mlock_folio() sets PG_mlocked immediately but defers
> PG_unevictable to mlock_folio_batch(), resulting in a folio with
> PG_mlocked=1 but PG_unevictable=0. Compaction will isolate such a
> folio.
> 
> Fix by checking folio_test_mlocked() together with the existing
> folio_test_unevictable() check.
> 
> A similar issue has been reported by Alexander Krabler on a 6.12-rt
> aarch64 system. Vlastimil suggested to check the mlocked flag [1].
> 
> Reported-by: Alexander Krabler <Alexander.Krabler@kuka.com>
> Closes: https://lore.kernel.org/all/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Wandun Chen <chenwandun@lixiang.com>
> Link: https://lore.kernel.org/all/33275585-f2db-4779-89f0-3ae24b455a67@suse.cz/ [1]

Well, in that thread, Hugh doubted my suggestion and then it seems we didn't
conclude anything. Did you actually observe in practice the issue that
Alexander had, and confirm that this patch fixed it, or is that theoretical?

> ---
>  mm/compaction.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index b776f35ad020..7e07b792bcb5 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1116,7 +1116,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
>  		is_unevictable = folio_test_unevictable(folio);
>  
>  		/* Compaction might skip unevictable pages but CMA takes them */
> -		if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
> +		if (!(mode & ISOLATE_UNEVICTABLE) &&
> +		    (is_unevictable || folio_test_mlocked(folio)))
>  			goto isolate_fail_put;
>  
>  		/*


^ permalink raw reply

* Re: [RFC PATCH 3/3] mm/compaction: respect compact_unevictable_allowed in alloc_contig path
From: Vlastimil Babka (SUSE) @ 2026-06-17 18:57 UTC (permalink / raw)
  To: Wandun Chen, linux-mm, linux-kernel, linux-trace-kernel,
	linux-rt-devel
  Cc: akpm, surenb, mhocko, jackmanb, hannes, ziy, rostedt, mhiramat,
	mathieu.desnoyers, david, ljs, liam, rppt, bigeasy, clrkwllms,
	Alexander.Krabler
In-Reply-To: <20260604023812.3700316-4-chenwandun1@gmail.com>

On 6/4/26 04:38, Wandun Chen wrote:
> From: Wandun Chen <chenwandun@lixiang.com>
> 
> vm.compact_unevictable_allowed=0 is used to prevent compacting
> unevictable pages. However, isolate_migratepages_range() passes
> ISOLATE_UNEVICTABLE regardless of this sysctl, so the setting
> has no effect in the alloc_contig path.
> 
> Fix it by:
>   - Keep ISOLATE_UNEVICTABLE for CMA allocation, discussed in [1].
>   - Honour sysctl_compact_unevictable_allowed for non-CMA allocation.
> 
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Wandun Chen <chenwandun@lixiang.com>
> Link: https://lore.kernel.org/all/25ba0d77-eb61-4efc-b2fc-73878cbd85c1@suse.cz/ [1]

There was also the "Ideally by not having mlock'd pages in CMA areas at
all." part. Is it the case? It was more elaborated here:
https://lore.kernel.org/all/CAPTztWZpnX1j8-7yeppVUsxE=O9hbVeqricDjZt8_pnN7a-kBQ@mail.gmail.com/

> ---
>  include/linux/compaction.h | 6 ++++++
>  mm/compaction.c            | 9 +++++++--
>  mm/internal.h              | 1 +
>  mm/page_alloc.c            | 2 ++
>  4 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index f29ef0653546..04e60f65b976 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -106,6 +106,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
>  extern void __meminit kcompactd_run(int nid);
>  extern void __meminit kcompactd_stop(int nid);
>  extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
> +extern bool compaction_allow_unevictable(void);
>  
>  #else
>  static inline void reset_isolation_suitable(pg_data_t *pgdat)
> @@ -131,6 +132,11 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat,
>  {
>  }
>  
> +static inline bool compaction_allow_unevictable(void)
> +{
> +	return true;
> +}
> +
>  #endif /* CONFIG_COMPACTION */
>  
>  struct node;
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 007d5e00a8ae..a10acb273454 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1341,6 +1341,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
>  							unsigned long end_pfn)
>  {
>  	unsigned long pfn, block_start_pfn, block_end_pfn;
> +	isolate_mode_t mode = cc->allow_unevictable ? ISOLATE_UNEVICTABLE : 0;
>  	int ret = 0;
>  
>  	/* Scan block by block. First and last block may be incomplete */
> @@ -1360,8 +1361,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
>  					block_end_pfn, cc->zone))
>  			continue;
>  
> -		ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
> -						 ISOLATE_UNEVICTABLE);
> +		ret = isolate_migratepages_block(cc, pfn, block_end_pfn, mode);
>  
>  		if (ret)
>  			break;
> @@ -1902,6 +1902,11 @@ typedef enum {
>   * compactable pages.
>   */
>  static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
> +
> +bool compaction_allow_unevictable(void)
> +{
> +	return sysctl_compact_unevictable_allowed;
> +}
>  /*
>   * Tunable for proactive compaction. It determines how
>   * aggressively the kernel should compact memory in the
> diff --git a/mm/internal.h b/mm/internal.h
> index 181e79f1d6a2..163f9d6b37f3 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1052,6 +1052,7 @@ struct compact_control {
>  					 * ensure forward progress.
>  					 */
>  	bool alloc_contig;		/* alloc_contig_range allocation */
> +	bool allow_unevictable;		/* Allow isolation of unevictable folios */
>  };
>  
>  /*
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 81a9d4d1e6c0..1cf9d4a3b14c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -7118,6 +7118,8 @@ int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end,
>  		.ignore_skip_hint = true,
>  		.no_set_skip_hint = true,
>  		.alloc_contig = true,
> +		.allow_unevictable = !!(alloc_flags & ACR_FLAGS_CMA) ||
> +					     compaction_allow_unevictable(),
>  	};
>  	INIT_LIST_HEAD(&cc.migratepages);
>  	enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?


^ permalink raw reply

* [PATCH] usb: typec: add trace point for typec_set_mode
From: Ahmad Fatoum @ 2026-06-17 20:03 UTC (permalink / raw)
  To: Heikki Krogerus, Greg Kroah-Hartman, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-usb, linux-trace-kernel, kernel, Ahmad Fatoum

Some Type-C controllers toggle muxes themselves. Other controllers like
the TUSB320 report the mode to the host, so it can control the muxes.

To improve debuggability of both kinds of drivers, add a trace point that
can be used to keep track of the mode being set inside the Type-C
framework:

  echo 1 > /sys/kernel/debug/tracing/events/typec/typec_mode/enable

Signed-off-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
---
 MAINTAINERS                  |  1 +
 drivers/usb/typec/class.c    |  9 ++++++++-
 include/trace/events/typec.h | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index c8d4b913f26c..ddd59e5e6eaf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27753,6 +27753,7 @@ F:	Documentation/ABI/testing/sysfs-class-typec
 F:	Documentation/driver-api/usb/typec.rst
 F:	drivers/usb/typec/
 F:	include/linux/usb/typec.h
+F:	include/trace/events/typec*.h
 
 USB TYPEC INTEL PMC MUX DRIVER
 M:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 0977581ad1b6..9316d067f19a 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -20,6 +20,9 @@
 #include "class.h"
 #include "pd.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/typec.h>
+
 static DEFINE_IDA(typec_index_ida);
 
 const struct class typec_class = {
@@ -2427,10 +2430,14 @@ EXPORT_SYMBOL_GPL(typec_get_orientation);
 int typec_set_mode(struct typec_port *port, int mode)
 {
 	struct typec_mux_state state = { };
+	int ret;
 
 	state.mode = mode;
 
-	return typec_mux_set(port->mux, &state);
+	ret = typec_mux_set(port->mux, &state);
+	trace_typec_mode(port, mode, ret);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(typec_set_mode);
 
diff --git a/include/trace/events/typec.h b/include/trace/events/typec.h
new file mode 100644
index 000000000000..a7dcb9f3fd49
--- /dev/null
+++ b/include/trace/events/typec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM typec
+
+#if !defined(_TRACE_TYPEC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TYPEC_H
+
+#include <linux/usb/typec.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(typec_mode,
+
+	TP_PROTO(struct typec_port *port, int mode, int err),
+
+	TP_ARGS(port, mode, err),
+
+	TP_STRUCT__entry(
+		__string(device, dev_name(&port->dev))
+		__field(int, mode)
+		__field(int, err)
+	),
+
+	TP_fast_assign(
+		__assign_str(device);
+		__entry->mode = mode;
+		__entry->err = err;
+	),
+
+	TP_printk("%s mode=%d (%d)",
+		  __get_str(device), __entry->mode, __entry->err)
+);
+
+#endif /* if !defined(_TRACE_TYPEC_H) || defined(TRACE_HEADER_MULTI_READ) */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

---
base-commit: 8cd9520d35a6c38db6567e97dd93b1f11f185dc6
change-id: 20260617-typec_set_mode-tracepoint-011fc43feaca

Best regards,
--  
Ahmad Fatoum <a.fatoum@pengutronix.de>


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox