Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v11 3/5] ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-03-19  9:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177391152793.193994.8986943289250629418.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Skip invalid sub-buffers when validating the persistent ring buffer
instead of discarding the entire ring buffer. Only skipped buffers
are invalidated (cleared).

If the cache data in memory fails to be synchronized during a reboot,
the persistent ring buffer may become partially corrupted, but other
sub-buffers may still contain readable event data. Only discard the
subbuffers that are found to be corrupted.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
  Changes in v11:
  - Fix a typo.
  Changes in v9:
  - Add meta->subbuf_size check.
  - Fix a typo.
  - Handle invalid reader_page case.
  Changes in v8:
  - Add comment in rb_validate_buffer()
  - Clear the RB_MISSED_* flags in rb_validate_buffer() instead of
    skipping subbuf.
  - Remove unused subbuf local variable from rb_cpu_meta_valid().
  Changes in v7:
  - Combined with Handling RB_MISSED_* flags patch, focus on validation at boot.
  - Remove checking subbuffer data when validating metadata, because it should be done
    later.
  - Do not mark the discarded sub buffer page but just reset it.
  Changes in v6:
  - Show invalid page detection message once per CPU.
  Changes in v5:
  - Instead of showing errors for each page, just show the number
    of discarded pages at the end.
  Changes in v3:
  - Record missed data event on commit.
---
 0 files changed

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3d2acaf75e79..67826021867b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,12 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 	return local_read(&bpage->page->commit);
 }
 
+/* Size is determined by what has been committed */
+static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
+{
+	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	/* Range pages are not to be freed */
@@ -1762,7 +1768,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			      unsigned long *subbuf_mask)
 {
 	int subbuf_size = PAGE_SIZE;
-	struct buffer_data_page *subbuf;
 	unsigned long buffers_start;
 	unsigned long buffers_end;
 	int i;
@@ -1770,6 +1775,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 	if (!subbuf_mask)
 		return false;
 
+	if (meta->subbuf_size != PAGE_SIZE) {
+		pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu);
+		return false;
+	}
+
 	buffers_start = meta->first_buffer;
 	buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
 
@@ -1786,11 +1796,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 		return false;
 	}
 
-	subbuf = rb_subbufs_from_meta(meta);
-
 	bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
 
-	/* Is the meta buffers and the subbufs themselves have correct data? */
+	/*
+	 * Ensure the meta::buffers array has correct data. The data in each subbufs
+	 * are checked later in rb_meta_validate_events().
+	 */
 	for (i = 0; i < meta->nr_subbufs; i++) {
 		if (meta->buffers[i] < 0 ||
 		    meta->buffers[i] >= meta->nr_subbufs) {
@@ -1798,18 +1809,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			return false;
 		}
 
-		if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
-			pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
-			return false;
-		}
-
 		if (test_bit(meta->buffers[i], subbuf_mask)) {
 			pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
 			return false;
 		}
 
 		set_bit(meta->buffers[i], subbuf_mask);
-		subbuf = (void *)subbuf + subbuf_size;
 	}
 
 	return true;
@@ -1873,13 +1878,22 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+			      struct ring_buffer_cpu_meta *meta)
 {
 	unsigned long long ts;
+	unsigned long tail;
 	u64 delta;
-	int tail;
 
-	tail = local_read(&dpage->commit);
+	/*
+	 * When a sub-buffer is recovered from a read, the commit value may
+	 * have RB_MISSED_* bits set, as these bits are reset on reuse.
+	 * Even after clearing these bits, a commit value greater than the
+	 * subbuf_size is considered invalid.
+	 */
+	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
+	if (tail > meta->subbuf_size)
+		return -1;
 	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
 }
 
@@ -1890,6 +1904,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	struct buffer_page *head_page, *orig_head;
 	unsigned long entry_bytes = 0;
 	unsigned long entries = 0;
+	int discarded = 0;
 	int ret;
 	u64 ts;
 	int i;
@@ -1900,14 +1915,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_head = head_page = cpu_buffer->head_page;
 
 	/* Do the reader page first */
-	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu, meta);
 	if (ret < 0) {
-		pr_info("Ring buffer reader page is invalid\n");
-		goto invalid;
+		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
+			cpu_buffer->cpu);
+		discarded++;
+		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+		local_set(&cpu_buffer->reader_page->entries, 0);
+		local_set(&cpu_buffer->reader_page->page->commit, 0);
+	} else {
+		entries += ret;
+		entry_bytes += rb_page_size(cpu_buffer->reader_page);
+		local_set(&cpu_buffer->reader_page->entries, ret);
 	}
-	entries += ret;
-	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
-	local_set(&cpu_buffer->reader_page->entries, ret);
 
 	ts = head_page->page->time_stamp;
 
@@ -1935,7 +1955,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 			break;
 
 		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0)
 			break;
 
@@ -2014,21 +2034,24 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->reader_page)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0) {
-			pr_info("Ring buffer meta [%d] invalid buffer page\n",
-				cpu_buffer->cpu);
-			goto invalid;
-		}
-
-		/* If the buffer has content, update pages_touched */
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-
-		entries += ret;
-		entry_bytes += local_read(&head_page->page->commit);
-		local_set(&head_page->entries, ret);
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+			local_set(&head_page->entries, 0);
+			local_set(&head_page->page->commit, 0);
+		} else {
+			/* If the buffer has content, update pages_touched */
+			if (ret)
+				local_inc(&cpu_buffer->pages_touched);
 
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			local_set(&head_page->entries, ret);
+		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
 	}
@@ -2042,7 +2065,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->entries, entries);
 	local_set(&cpu_buffer->entries_bytes, entry_bytes);
 
-	pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+	pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
+		cpu_buffer->cpu, discarded);
 	return;
 
  invalid:
@@ -3329,12 +3353,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	return NULL;
 }
 
-/* Size is determined by what has been committed */
-static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
-{
-	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
-}
-
 static __always_inline unsigned
 rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 {


^ permalink raw reply related

* [PATCH v11 2/5] ring-buffer: Flush and stop persistent ring buffer on panic
From: Masami Hiramatsu (Google) @ 2026-03-19  9:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177391152793.193994.8986943289250629418.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

On real hardware, panic and machine reboot may not flush hardware cache
to memory. This means the persistent ring buffer, which relies on a
coherent state of memory, may not have its events written to the buffer
and they may be lost. Moreover, there may be inconsistency with the
counters which are used for validation of the integrity of the
persistent ring buffer which may cause all data to be discarded.

To avoid this issue, stop recording of the ring buffer on panic and
flush the cache of the ring buffer's memory.

Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v11:
   - Do nothing by default since flush_cache_vmap() does nothing on x86
     but it can cause deadlock on some architectures via on_each_cpu()
     because other CPUs will be stopped when the panic notifier is called.
 Changes in v9:
   - Fix typo of & to &&.
   - Fix typo of "Generic"
 Changes in v6:
   - Introduce asm/ring_buffer.h for arch_ring_buffer_flush_range().
   - Use flush_cache_vmap() instead of flush_cache_all().
 Changes in v5:
   - Use ring_buffer_record_off() instead of ring_buffer_record_disable().
   - Use flush_cache_all() to ensure flush all cache.
 Changes in v3:
   - update patch description.
---
 0 files changed

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 483965c5a4de..b154b4e3dfa8 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += agp.h
 generic-y += asm-offsets.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 4c69522e0328..483caacc6988 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -5,5 +5,6 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 03657ff8fbe3..decad5f2c826 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += early_ioremap.h
 generic-y += extable.h
 generic-y += flat.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 
 generated-y += mach-types.h
 generated-y += unistd-nr.h
diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h
new file mode 100644
index 000000000000..62316c406888
--- /dev/null
+++ b/arch/arm64/include/asm/ring_buffer.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_ARM64_RING_BUFFER_H
+#define _ASM_ARM64_RING_BUFFER_H
+
+#include <asm/cacheflush.h>
+
+/* Flush D-cache on persistent ring buffer */
+#define arch_ring_buffer_flush_range(start, end)	dcache_clean_pop(start, end)
+
+#endif /* _ASM_ARM64_RING_BUFFER_H */
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 3a5c7f6e5aac..7dca0c6cdc84 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += qrwlock.h
 generic-y += qrwlock_types.h
 generic-y += qspinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
 generic-y += text-patching.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 1efa1e993d4b..0f887d4238ed 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += extable.h
 generic-y += iomap.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 9034b583a88a..7e92957baf6a 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -10,5 +10,6 @@ generic-y += qrwlock.h
 generic-y += user.h
 generic-y += ioctl.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
 generic-y += statfs.h
 generic-y += text-patching.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index b282e0dd8dc1..62543bf305ff 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -3,5 +3,6 @@ generated-y += syscall_table.h
 generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += spinlock.h
 generic-y += text-patching.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 7178f990e8b3..0030309b47ad 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += syscalls.h
 generic-y += tlb.h
 generic-y += user.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 684569b2ecd6..9771c3d85074 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 28004301c236..0a2530964413 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += cmpxchg.h
 generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += spinlock.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index cef49d60d74c..8aa34621702d 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += spinlock_types.h
 generic-y += spinlock.h
 generic-y += qrwlock_types.h
 generic-y += qrwlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 4fb596d94c89..d48d158f7241 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2e23533b67e3..805b5aeebb6f 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h
 generic-y += agp.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
+generic-y += ring_buffer.h
 generic-y += early_ioremap.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index bd5fc9403295..7721b63642f4 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h
 generic-y += qrwlock.h
 generic-y += qrwlock_types.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 80bad7de7a04..0c1fc47c3ba0 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,3 +7,4 @@ generated-y += unistd_nr.h
 generic-y += asm-offsets.h
 generic-y += mcs_spinlock.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 4d3f10ed8275..f0403d3ee8ab 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -3,4 +3,5 @@ generated-y += syscall_table.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 17ee8a273aa6..49c6bb326b75 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
 generic-y += text-patching.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 1b9b82bbe322..2a1629ba8140 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += module.lds.h
 generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += ring_buffer.h
 generic-y += runtime-const.h
 generic-y += softirq_stack.h
 generic-y += switch_to.h
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4566000e15c4..078fd2c0d69d 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -14,3 +14,4 @@ generic-y += early_ioremap.h
 generic-y += fprobe.h
 generic-y += mcs_spinlock.h
 generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 13fe45dea296..e57af619263a 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
+generic-y += ring_buffer.h
 generic-y += user.h
 generic-y += text-patching.h
diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h
new file mode 100644
index 000000000000..201d2aee1005
--- /dev/null
+++ b/include/asm-generic/ring_buffer.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic arch dependent ring_buffer macros.
+ */
+#ifndef __ASM_GENERIC_RING_BUFFER_H__
+#define __ASM_GENERIC_RING_BUFFER_H__
+
+#include <linux/cacheflush.h>
+
+/* Flush cache on ring buffer range if needed. Do nothing by default. */
+#define arch_ring_buffer_flush_range(start, end)	do { } while (0)
+
+#endif /* __ASM_GENERIC_RING_BUFFER_H__ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d6bebb782efc..3d2acaf75e79 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
 #include <linux/ring_buffer_types.h>
 #include <linux/sched/isolation.h>
 #include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
 #include <linux/trace_events.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
 #include <linux/oom.h>
 #include <linux/mm.h>
 
+#include <asm/ring_buffer.h>
 #include <asm/local64.h>
 #include <asm/local.h>
 #include <asm/setup.h>
@@ -559,6 +561,7 @@ struct trace_buffer {
 
 	unsigned long			range_addr_start;
 	unsigned long			range_addr_end;
+	struct notifier_block		flush_nb;
 
 	struct ring_buffer_meta		*meta;
 
@@ -2520,6 +2523,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+	ring_buffer_record_off(buffer);
+	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+	return NOTIFY_DONE;
+}
+
 static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 					 int order, unsigned long start,
 					 unsigned long end,
@@ -2650,6 +2663,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 
 	mutex_init(&buffer->mutex);
 
+	/* Persistent ring buffer needs to flush cache before reboot. */
+	if (start && end) {
+		buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+		atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+	}
+
 	return_ptr(buffer);
 
  fail_free_buffers:
@@ -2748,6 +2767,9 @@ ring_buffer_free(struct trace_buffer *buffer)
 {
 	int cpu;
 
+	if (buffer->range_addr_start && buffer->range_addr_end)
+		atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
 	cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
 
 	irq_work_sync(&buffer->irq_work.work);


^ permalink raw reply related

* [PATCH v11 1/5] ring-buffer: Fix to update per-subbuf entries of persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-03-19  9:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers
In-Reply-To: <177391152793.193994.8986943289250629418.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since the validation loop in rb_meta_validate_events() updates
the same cpu_buffer->head_page->entries, the other subbuf entries
are not updated.
Fix to use head_page to update the entries field, since it is the
cursor in this loop.

Fixes: 5f3b6e839f3c ("ring-buffer: Validate boot range memory events")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 0 files changed

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 96e0d80d492b..d6bebb782efc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2024,7 +2024,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 		entries += ret;
 		entry_bytes += local_read(&head_page->page->commit);
-		local_set(&cpu_buffer->head_page->entries, ret);
+		local_set(&head_page->entries, ret);
 
 		if (head_page == cpu_buffer->commit_page)
 			break;


^ permalink raw reply related

* [PATCH v11 0/5] ring-buffer: Making persistent ring buffers robust
From: Masami Hiramatsu (Google) @ 2026-03-19  9:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers

Hi,

Here is the 11th version of improvement patches for making persistent
ring buffers robust to failures.
The previous version is here:

https://lore.kernel.org/linux-trace-kernel/177374017536.2358053.12341235939816794384.stgit@mhiramat.tok.corp.google.com/

In this version, I updated [2/5] to do nothing by default since
flush_cache_vmap() does nothing on x86 but it can cause deadlock on
some architectures via on_each_cpu(), because other CPUs will be
stopped when the panic notifier is called.
Also fix a typo in [3/5], reset the timestamp when invalidating the
whole ring buffer, and skip pages which have an invalid "timestamp"
instead of invalidating all ring buffers.

Thank you,

---

Masami Hiramatsu (Google) (5):
      ring-buffer: Fix to update per-subbuf entries of persistent ring buffer
      ring-buffer: Flush and stop persistent ring buffer on panic
      ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
      ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
      ring-buffer: Add persistent ring buffer selftest

 0 files changed

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v2] blk-mq: add tracepoint block_rq_tag_wait
From: Johannes Thumshirn @ 2026-03-19  7:32 UTC (permalink / raw)
  To: Aaron Tomlin, axboe@kernel.dk, rostedt@goodmis.org,
	mhiramat@kernel.org, mathieu.desnoyers@efficios.com
  Cc: kch@nvidia.com, bvanassche@acm.org, dlemoal@kernel.org,
	ritesh.list@gmail.com, neelx@suse.com, sean@ashe.io,
	mproche@gmail.com, chjohnst@gmail.com,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-trace-kernel@vger.kernel.org
In-Reply-To: <20260319015300.287653-1-atomlin@atomlin.com>

Looks good,

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>


^ permalink raw reply

* Re: [PATCH v9 2/4] ring-buffer: Flush and stop persistent ring buffer on panic
From: Masami Hiramatsu @ 2026-03-19  3:36 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel, Ian Rogers
In-Reply-To: <831523cf-110c-419d-9b22-e54f93a3bdb5@efficios.com>

On Wed, 18 Mar 2026 11:51:28 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> On 2026-03-18 11:29, Masami Hiramatsu (Google) wrote:
> >>
> >> - AFAIU, you are not trying to evict cache lines after creation
> >>     of a new virtual mapping (which is the documented intent of
> >>     flush_cache_vmap).
> > 
> > Ah, OK. That's a good point!
> > (anyway I will replace it with do { } while (0) in the next version.)
> > 
> >>     
> >> - AFAIU flush_cache_vmap maps to no-code on arm64 (asm-generic), what am
> >>     I missing ? It makes sense to be a no-op because AFAIR arm64 does not
> >>     have to deal with virtually aliasing caches.
> > 
> > Yeah, so my patch also introduces arm64 specific implementation.
> 
> Just make sure to call this something else than "flush_cache_vmap",
> because you don't want to slow down vmap on arm64 which does not
> require to evict and certainly not write back cache lines after
> creation of a new virtual mapping.

OK, I will just leave it as an empty do-while in asm-generic instead of
flush_cache_vmap(). If any architecture finds persistent ring buffer
needs to write back caches, it can add its own flush implementation.

BTW, do we need dmb(osh)? This runs dcache_clean_pop() after atomic
operation in ring_buffer_record_off().

	ring_buffer_record_off(buffer);
	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);

Thank you,

> 
> Thanks,
> 
> Mathieu
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v2] blk-mq: add tracepoint block_rq_tag_wait
From: Damien Le Moal @ 2026-03-19  3:31 UTC (permalink / raw)
  To: Aaron Tomlin, axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: johannes.thumshirn, kch, bvanassche, ritesh.list, neelx, sean,
	mproche, chjohnst, linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <20260319015300.287653-1-atomlin@atomlin.com>

On 3/19/26 10:53, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
> latency spikes can occur when fast devices (SSDs) are starved of hardware
> tags when sharing the same blk_mq_tag_set.
> 
> Currently, diagnosing this specific hardware queue contention is
> difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
> forces the current thread to block uninterruptible via io_schedule().
> While this can be inferred via sched:sched_switch or dynamically
> traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
> dedicated, out-of-the-box observability for this event.
> 
> This patch introduces the block_rq_tag_wait static trace point in the
> tag allocation slow-path. It triggers immediately before the thread
> yields the CPU, exposing the exact hardware context (hctx) that is
> starved, the specific pool experiencing starvation (hardware or software
> scheduler), and the total pool depth.
> 
> This provides storage engineers and performance monitoring agents
> with a zero-configuration, low-overhead mechanism to definitively
> identify shared-tag bottlenecks and tune I/O schedulers or cgroup
> throttling accordingly.
> 
> Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
> ---
> Changes in v1 [1]:
>  - Improved the description of the trace point (Damien Le Moal)
>  - Removed the redundant "active requests" (Laurence Oberman)
>  - Introduced pool-specific starvation tracking
> 
> [1]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/
> 
>  block/blk-mq-tag.c           |  4 ++++
>  include/trace/events/block.h | 43 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 47 insertions(+)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 33946cdb5716..a6691a4fe7a7 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -13,6 +13,7 @@
>  #include <linux/kmemleak.h>
>  
>  #include <linux/delay.h>
> +#include <trace/events/block.h>
>  #include "blk.h"
>  #include "blk-mq.h"
>  #include "blk-mq-sched.h"
> @@ -187,6 +188,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  		if (tag != BLK_MQ_NO_TAG)
>  			break;
>  
> +		trace_block_rq_tag_wait(data->q, data->hctx,
> +					!!(data->rq_flags & RQF_SCHED_TAGS));

I do not think that the "!!" is needed here.

Other than this, this looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>

> +
>  		bt_prev = bt;
>  		io_schedule();
>  
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index 6aa79e2d799c..f7708d0d7a0c 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -226,6 +226,49 @@ DECLARE_EVENT_CLASS(block_rq,
>  		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
>  );
>  
> +/**
> + * block_rq_tag_wait - triggered when a request is starved of a tag
> + * @q: request queue of the target device
> + * @hctx: hardware context of the request experiencing starvation
> + * @is_sched_tag: indicates whether the starved pool is the software scheduler
> + *
> + * Called immediately before the submitting context is forced to block due
> + * to the exhaustion of available tags (i.e., physical hardware driver tags
> + * or software scheduler tags). This trace point indicates that the context
> + * will be placed into an uninterruptible state via io_schedule() until an
> + * active request completes and relinquishes its assigned tag.
> + */
> +TRACE_EVENT(block_rq_tag_wait,
> +
> +	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
> +
> +	TP_ARGS(q, hctx, is_sched_tag),
> +
> +	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
> +		__field( u32,		hctx_id			)
> +		__field( u32,		nr_tags			)
> +		__field( bool,		is_sched_tag		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= disk_devt(q->disk);
> +		__entry->hctx_id	= hctx->queue_num;
> +		__entry->is_sched_tag	= is_sched_tag;
> +
> +		if (__entry->is_sched_tag)
> +			__entry->nr_tags = hctx->sched_tags->nr_tags;
> +		else
> +			__entry->nr_tags = hctx->tags->nr_tags;
> +	),
> +
> +	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->hctx_id,
> +		  __entry->is_sched_tag ? "scheduler" : "hardware",
> +		  __entry->nr_tags)
> +);
> +
>  /**
>   * block_rq_insert - insert block operation request into queue
>   * @rq: block IO operation request


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply

* Re: [PATCH v2] blk-mq: add tracepoint block_rq_tag_wait
From: Chaitanya Kulkarni @ 2026-03-19  3:18 UTC (permalink / raw)
  To: Aaron Tomlin, axboe@kernel.dk, rostedt@goodmis.org,
	mhiramat@kernel.org, mathieu.desnoyers@efficios.com
  Cc: johannes.thumshirn@wdc.com, Chaitanya Kulkarni,
	bvanassche@acm.org, dlemoal@kernel.org, ritesh.list@gmail.com,
	neelx@suse.com, sean@ashe.io, mproche@gmail.com,
	chjohnst@gmail.com, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
In-Reply-To: <20260319015300.287653-1-atomlin@atomlin.com>

On 3/18/26 18:53, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
> latency spikes can occur when fast devices (SSDs) are starved of hardware
> tags when sharing the same blk_mq_tag_set.
>
> Currently, diagnosing this specific hardware queue contention is
> difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
> forces the current thread to block uninterruptible via io_schedule().
> While this can be inferred via sched:sched_switch or dynamically
> traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
> dedicated, out-of-the-box observability for this event.
>
> This patch introduces the block_rq_tag_wait static trace point in the
> tag allocation slow-path. It triggers immediately before the thread
> yields the CPU, exposing the exact hardware context (hctx) that is
> starved, the specific pool experiencing starvation (hardware or software
> scheduler), and the total pool depth.
>
> This provides storage engineers and performance monitoring agents
> with a zero-configuration, low-overhead mechanism to definitively
> identify shared-tag bottlenecks and tune I/O schedulers or cgroup
> throttling accordingly.
>
> Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
> ---
> Changes in v1 [1]:
>   - Improved the description of the trace point (Damien Le Moal)
>   - Removed the redundant "active requests" (Laurence Oberman)
>   - Introduced pool-specific starvation tracking
>
> [1]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/


LGTM.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply

* [PATCH v2] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-03-19  1:53 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: johannes.thumshirn, kch, bvanassche, dlemoal, ritesh.list, neelx,
	sean, mproche, chjohnst, linux-block, linux-kernel,
	linux-trace-kernel

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait static trace point in the
tag allocation slow-path. It triggers immediately before the thread
yields the CPU, exposing the exact hardware context (hctx) that is
starved, the specific pool experiencing starvation (hardware or software
scheduler), and the total pool depth.

This provides storage engineers and performance monitoring agents
with a zero-configuration, low-overhead mechanism to definitively
identify shared-tag bottlenecks and tune I/O schedulers or cgroup
throttling accordingly.

Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
Changes in v1 [1]:
 - Improved the description of the trace point (Damien Le Moal)
 - Removed the redundant "active requests" (Laurence Oberman)
 - Introduced pool-specific starvation tracking

[1]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/

 block/blk-mq-tag.c           |  4 ++++
 include/trace/events/block.h | 43 ++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..a6691a4fe7a7 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -187,6 +188,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					!!(data->rq_flags & RQF_SCHED_TAGS));
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..f7708d0d7a0c 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,49 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver tags
+ * or software scheduler tags). This trace point indicates that the context
+ * will be placed into an uninterruptible state via io_schedule() until an
+ * active request completes and relinquishes its assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
+
+	TP_ARGS(q, hctx, is_sched_tag),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( bool,		is_sched_tag		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= disk_devt(q->disk);
+		__entry->hctx_id	= hctx->queue_num;
+		__entry->is_sched_tag	= is_sched_tag;
+
+		if (__entry->is_sched_tag)
+			__entry->nr_tags = hctx->sched_tags->nr_tags;
+		else
+			__entry->nr_tags = hctx->tags->nr_tags;
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-03-19  0:22 UTC (permalink / raw)
  To: Damien Le Moal, loberman
  Cc: axboe, rostedt, mhiramat, mathieu.desnoyers, johannes.thumshirn,
	kch, bvanassche, ritesh.list, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <lrnjp7wcrrfita36onlxqihep44sgr4il57ccy4irf2mortdqi@46w7sl52whmr>

[-- Attachment #1: Type: text/plain, Size: 1710 bytes --]

On Wed, Mar 18, 2026 at 09:21:23AM -0400, Aaron Tomlin wrote:
> On Wed, Mar 18, 2026 at 08:38:20AM +0900, Damien Le Moal wrote:
> > Looks OK to me, but I have some suggestions below.

Hi Damien, Laurence,

Upon reviewing the source code once more, it becomes apparent that tracking
"active requests" within this specific trace point is essentially redundant.
If a thread is compelled to invoke io_schedule(), it is mathematically
certain that the number of active requests perfectly equals the total
number of tags.

Now, it would almost always print active=0 in the following scenarios:

    1.  "mq-deadline" Scheduler Starvation: The thread sleeps waiting for a
        scheduler tag. Because the request has not been dispatched to
        hardware yet, blk_mq_inc_active_requests() was never called.
        hctx->nr_active is 0.

    2.  NVMe Hardware Starvation, "none" scheduler: The thread sleeps
        waiting for a hardware tag. Because NVMe drives do not share tags,
        blk_mq_inc_active_requests() instantly aborts to save CPU-cycles.
        hctx->nr_active remains 0.

    3.  RAID Hardware Starvation, "none" scheduler: The thread sleeps
        waiting for a shared hardware tag. Because it is HCTX_SHARED, the
        kernel tracks the active requests in
        hctx->queue->nr_active_requests_shared_tags. The local
        hctx->nr_active counter is completely bypassed and remains 0.

Rather than attempting to print the active count, the trace point should be
modified to indicate exactly which pool experienced starvation: the
hardware pool or the software scheduler pool.

I will submit a follow-up patch.


Kind regards,
-- 
Aaron Tomlin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH v2 1/2] kthread: remove kthread_exit()
From: Steven Rostedt @ 2026-03-18 23:12 UTC (permalink / raw)
  To: David Laight
  Cc: Christian Brauner, Linus Torvalds, linux-kernel, linux-modules,
	linux-nfs, bpf, kunit-dev, linux-doc, linux-trace-kernel, netfs,
	io-uring, audit, rcu, kvm, virtualization, netdev, linux-mm,
	linux-security-module, Christian Loehle, linux-fsdevel
In-Reply-To: <20260311104736.51b53405@pumpkin>

On Wed, 11 Mar 2026 10:47:36 +0000
David Laight <david.laight.linux@gmail.com> wrote:

> > -#define module_put_and_kthread_exit(code) kthread_exit(code)
> > +#define module_put_and_kthread_exit(code) do_exit(code)  
> 
> I'm intrigued...
> How does that actually know to do the module_put()?
> (I know it does one - otherwise my driver wouldn't unload.)

It's in the !CONFIG_MODULES section. No module_put() necessary. Only the
kthread_exit (do_exit) is needed.

-- Steve

^ permalink raw reply

* [PATCH] tracing: Fix trace_marker copy link list updates
From: Steven Rostedt @ 2026-03-18 22:55 UTC (permalink / raw)
  To: LKML, Linux Trace Kernel; +Cc: Masami Hiramatsu, Mathieu Desnoyers, Sasha Levin

From: Steven Rostedt <rostedt@goodmis.org>

When the "copy_trace_marker" option is enabled for an instance, anything
written into /sys/kernel/tracing/trace_marker is also copied into that
instance's buffer. When the option is set, that instance's trace_array
descriptor is added to the marker_copies link list. This list is protected
by RCU, as all iterations use an RCU protected list traversal.

When the instance is deleted, all the flags that were enabled are cleared.
This also clears the copy_trace_marker flag and removes the trace_array
descriptor from the list.

The issue is that after the flags are cleared, a direct call to
update_marker_trace() is performed to clear the flag. This function
returns true if the state of the flag changed and false otherwise. If it
returns true here, synchronize_rcu() is called to make sure all readers
see that it's removed from the list.

But since the flag was already cleared, the state does not change and the
synchronization is never called, leaving a possible UAF bug.

Move the clearing of all flags below the updating of the copy_trace_marker
option which then makes sure the synchronization is performed.

Also use the flag for checking the state in update_marker_trace() instead
of looking at if the list is empty.

Cc: stable@vger.kernel.org
Fixes: 7b382efd5e8a ("tracing: Allow the top level trace_marker to write into another instances")
Reported-by: Sasha Levin <sashal@kernel.org>
Closes: https://lore.kernel.org/all/20260225133122.237275-1-sashal@kernel.org/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb4a62f4b953..a626211ceb9a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -555,7 +555,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled)
 	lockdep_assert_held(&event_mutex);

 	if (enabled) {
-		if (!list_empty(&tr->marker_list))
+		if (tr->trace_flags & TRACE_ITER(COPY_MARKER))
 			return false;

 		list_add_rcu(&tr->marker_list, &marker_copies);
@@ -563,10 +563,10 @@ static bool update_marker_trace(struct trace_array *tr, int enabled)
 		return true;
 	}

-	if (list_empty(&tr->marker_list))
+	if (!(tr->trace_flags & TRACE_ITER(COPY_MARKER)))
 		return false;

-	list_del_init(&tr->marker_list);
+	list_del_rcu(&tr->marker_list);
 	tr->trace_flags &= ~TRACE_ITER(COPY_MARKER);
 	return true;
 }
@@ -9761,18 +9761,19 @@ static int __remove_instance(struct trace_array *tr)

 	list_del(&tr->list);

-	/* Disable all the flags that were enabled coming in */
-	for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
-		if ((1ULL << i) & ZEROED_TRACE_FLAGS)
-			set_tracer_flag(tr, 1ULL << i, 0);
-	}
-
 	if (printk_trace == tr)
 		update_printk_trace(&global_trace);

+	/* Must be done before disabling all the flags */
 	if (update_marker_trace(tr, 0))
 		synchronize_rcu();

+	/* Disable all the flags that were enabled coming in */
+	for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
+		if ((1ULL << i) & ZEROED_TRACE_FLAGS)
+			set_tracer_flag(tr, 1ULL << i, 0);
+	}
+
 	tracing_set_nop(tr);
 	clear_ftrace_function_probes(tr);
 	event_trace_del_tracer(tr);
-- 
2.51.0

^ permalink raw reply related

* [PATCH 9/8] memblock tests: add stubs required for free_reserved_area()
From: Mike Rapoport @ 2026-03-18 20:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
	Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
	Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
	David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
	Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
	Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
	Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
	Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
	Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
	Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
	iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
	linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
	sparclinux, x86
In-Reply-To: <20260318105827.1358927-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

After moving the free_reserved_area() function to mm/memblock.c, memblock
tests lack stubs for several functions and macros this function calls.

Add them.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 tools/include/linux/mm.h          |  1 +
 tools/testing/memblock/internal.h | 28 +++++++++++++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h
index 028f3faf46e7..4407d8396108 100644
--- a/tools/include/linux/mm.h
+++ b/tools/include/linux/mm.h
@@ -17,6 +17,7 @@
 
 #define __va(x) ((void *)((unsigned long)(x)))
 #define __pa(x) ((unsigned long)(x))
+#define __pa_symbol(x) ((unsigned long)(x))
 
 #define pfn_to_page(pfn) ((void *)((pfn) * PAGE_SIZE))
 
diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h
index 009b97bbdd22..7ff61172ab24 100644
--- a/tools/testing/memblock/internal.h
+++ b/tools/testing/memblock/internal.h
@@ -11,9 +11,16 @@ static int memblock_debug = 1;
 
 #define pr_warn_ratelimited(fmt, ...)    printf(fmt, ##__VA_ARGS__)
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
 bool mirrored_kernelcore = false;
 
 struct page {};
+static inline void *page_address(struct page *page)
+{
+	BUG();
+	return page;
+}
 
 void memblock_free_pages(unsigned long pfn, unsigned int order)
 {
@@ -23,10 +30,25 @@ static inline void accept_memory(phys_addr_t start, unsigned long size)
 {
 }
 
-static inline unsigned long free_reserved_area(void *start, void *end,
-					       int poison, const char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s);
+void free_reserved_page(struct page *page);
+
+static inline bool deferred_pages_enabled(void)
+{
+	return false;
+}
+
+#define for_each_valid_pfn(pfn, start_pfn, end_pfn)			 \
+	for ((pfn) = (start_pfn); (pfn) < (end_pfn); (pfn)++)
+
+static inline void *kasan_reset_tag(const void *addr)
+{
+	return (void *)addr;
+}
+
+static inline bool __is_kernel(unsigned long addr)
 {
-	return 0;
+	return false;
 }
 
 #endif
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v8 05/13] lib/bootconfig: drop redundant memset of xbc_nodes
From: Markus Elfring @ 2026-03-18 20:22 UTC (permalink / raw)
  To: Josh Law, linux-trace-kernel, Andrew Morton, Masami Hiramatsu
  Cc: LKML, Steven Rostedt
In-Reply-To: <20260318155919.78168-6-objecting@objecting.org>

> memblock_alloc() already returns zeroed memory,

Interesting …


>                                                 so the explicit memset
> in xbc_init() is redundant. …

Would you like to reconsider this conclusion for the mentioned function implementation
in more detail?
https://elixir.bootlin.com/linux/v7.0-rc4/source/lib/bootconfig.c#L932-L998

Regards,
Markus

^ permalink raw reply

* Re: [PATCH v8 2/2] tools/bootconfig: fix fd leak in load_xbc_file() on fstat failure
From: Markus Elfring @ 2026-03-18 20:02 UTC (permalink / raw)
  To: Josh Law, linux-trace-kernel, Andrew Morton, Masami Hiramatsu
  Cc: LKML, Steven Rostedt
In-Reply-To: <20260318155847.78065-3-objecting@objecting.org>

> If fstat() fails after open() succeeds, the function returns without
> closing the file descriptor. Also preserve errno across close(), since
> close() may overwrite it before the error is returned.

I find such a change description improvable.

Did anything prevent the use of a corresponding goto chain?
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/coding-style.rst?h=v7.0-rc4#n526
https://elixir.bootlin.com/linux/v7.0-rc4/source/tools/bootconfig/main.c#L155-L173

Regards,
Markus

^ permalink raw reply

* Re: [PATCH] Remove unused headers in x86/tools, scripts, pps, input
From: Nicolas Schier @ 2026-03-18 19:56 UTC (permalink / raw)
  To: Oli
  Cc: Thomas Gleixner, Ingo Molnar, Steven Rostedt, Mathieu Desnoyers,
	Masami Hiramatsu, Rodolfo Giometti, Henrik Rydberg,
	Dmitry Torokhov, Nathan Chancellor, linux-kernel,
	linux-trace-kernel, linux-kbuild, linux-input, x86
In-Reply-To: <CAOW84UxjnSDKSjsaaS9=DBquCk3SDfb74=OmkHTLUyq5qriYsA@mail.gmail.com>

Hi Oli,

thanks for your contribution.  Some comments below:

On Tue, Mar 10, 2026 at 10:01:41PM -0500, Oli wrote:
> From c78a0572f5ec2b927f9b723af687e6ef913561a4 Mon Sep 17 00:00:00 2001
> From: Eddie Hudgins <Oochiolio@gmail.com>
> Date: Tue, 10 Mar 2026 21:53:07 -0500
> Subject: [PATCH] Signed-off-by: Eddie Hudgins <Oochiolio@gmail.com>
>  arch/x86/tools: Removed headers in relocs_32.c scripts/basic: Removed
> headers
>  in fixdep.c drivers/pps: Removed headers in pps.c drivers/input: Removed
>  headers in input-mt.c

Usually, patch mails do not contain mail headers within their body; the
only possible exception is 'From:' if the sender is not the patch
author.  These additional headers prevent the usual patch application
(e.g. 'git am <mail').

> 
> These changes compile for x86, x86_64, and powerpc (Those were the only
> ones fairly tested) under defconfig. This aims to clean up code and
> simplify the files for developers. This will also contribute to start of
> decluttering the environment.

A commit subject should start with a subsystem identifier.  A commit
message should tell about the what and why of the patch, followed by a
'Signed-off-by'.  E.g.:

   kbuild: fixdep: Remove unused includes

   Remove unused #include statements for clean up.

   Signed-off-by: Your Name <your.e.mail@addre.ss>

(More complex changes require more detailed commit messages).

Please check Documentation/process/submitting-patches.rst.

[...]
> diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c
> index cdd5da7e009b..feb9e7d8984d 100644
> --- a/scripts/basic/fixdep.c
> +++ b/scripts/basic/fixdep.c
> @@ -89,7 +89,6 @@
>   *  but I don't think the added complexity is worth it)
>   */
> 
> -#include <sys/types.h>
>  #include <sys/stat.h>
>  #include <unistd.h>
>  #include <fcntl.h>
> --
> 2.43.0

The change in scripts/basic/fixdep.c looks good to me.  Do you want to
prepare a new kbuild-only patch and want me to take it for kbuild?

Kind regards,
Nicolas

^ permalink raw reply

* Re: [PATCH mm-unstable v15 13/13] Documentation: mm: update the admin guide for mTHP collapse
From: David Hildenbrand (Arm) @ 2026-03-18 19:49 UTC (permalink / raw)
  To: Nico Pache, Lorenzo Stoakes (Oracle)
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, Bagas Sanjaya
In-Reply-To: <1adffe75-cc91-4c55-bde7-9406bf656c72@redhat.com>

On 3/18/26 20:08, Nico Pache wrote:
> 
> 
> On 3/17/26 5:02 AM, Lorenzo Stoakes (Oracle) wrote:
>> On Wed, Feb 25, 2026 at 08:27:06PM -0700, Nico Pache wrote:
>>> Now that we can collapse to mTHPs lets update the admin guide to
>>> reflect these changes and provide proper guidance on how to utilize it.
>>>
>>> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>
>> LGTM, but maybe we should mention somewhere about mTHP's max_ptes_none
>> behaviour?
> 
> IIRC we decided to strictly leave that out of the manual. I used to have it in
> here. @david?

I think we argued in the past that we didn't want to document the weird
scaling part.

Documenting that only two values are currently supported (no scaling)
makes sense to me.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v15 11/13] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: David Hildenbrand (Arm) @ 2026-03-18 19:48 UTC (permalink / raw)
  To: Nico Pache, Lorenzo Stoakes (Oracle)
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <a9ddfa7b-5ee4-4c68-bac0-8a0c6c355bf7@redhat.com>

On 3/18/26 19:59, Nico Pache wrote:
> 
> 
> On 3/17/26 4:35 AM, Lorenzo Stoakes (Oracle) wrote:
>> On Wed, Feb 25, 2026 at 08:26:31PM -0700, Nico Pache wrote:
>>> There are cases where, if an attempted collapse fails, all subsequent
>>> orders are guaranteed to also fail. Avoid these collapse attempts by
>>> bailing out early.
>>>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>
>> With David's concern addressed:
>>
>> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
>>
>>> ---
>>>  mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 34 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 1c3711ed4513..388d3f2537e2 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -1492,9 +1492,42 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>>>  			ret = collapse_huge_page(mm, collapse_address, referenced,
>>>  						 unmapped, cc, mmap_locked,
>>>  						 order);
>>> -			if (ret == SCAN_SUCCEED) {
>>> +
>>> +			switch (ret) {
>>> +			/* Cases were we continue to next collapse candidate */
>>> +			case SCAN_SUCCEED:
>>>  				collapsed += nr_pte_entries;
>>> +				fallthrough;
>>> +			case SCAN_PTE_MAPPED_HUGEPAGE:
>>>  				continue;
>>> +			/* Cases were lower orders might still succeed */
>>> +			case SCAN_LACK_REFERENCED_PAGE:
>>> +			case SCAN_EXCEED_NONE_PTE:
>>> +			case SCAN_EXCEED_SWAP_PTE:
>>> +			case SCAN_EXCEED_SHARED_PTE:
>>> +			case SCAN_PAGE_LOCK:
>>> +			case SCAN_PAGE_COUNT:
>>> +			case SCAN_PAGE_LRU:
>>> +			case SCAN_PAGE_NULL:
>>> +			case SCAN_DEL_PAGE_LRU:
>>> +			case SCAN_PTE_NON_PRESENT:
>>> +			case SCAN_PTE_UFFD_WP:
>>> +			case SCAN_ALLOC_HUGE_PAGE_FAIL:
>>> +				goto next_order;
>>> +			/* Cases were no further collapse is possible */
>>> +			case SCAN_CGROUP_CHARGE_FAIL:
>>> +			case SCAN_COPY_MC:
>>> +			case SCAN_ADDRESS_RANGE:
>>> +			case SCAN_NO_PTE_TABLE:
>>> +			case SCAN_ANY_PROCESS:
>>> +			case SCAN_VMA_NULL:
>>> +			case SCAN_VMA_CHECK:
>>> +			case SCAN_SCAN_ABORT:
>>> +			case SCAN_PAGE_ANON:
>>> +			case SCAN_PMD_MAPPED:
>>> +			case SCAN_FAIL:
>>> +			default:
>>
>> Agree with david, let's spell them out please :)
> 
> I believe David is arguing for the opposite. To drop all these spelt out cases
> and just leave the default case.
> 
> @david is that correct or did I misunderstand that?

Either spell all out (no default) OR add a default.

I prefer to just ... use the default :)

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v15 13/13] Documentation: mm: update the admin guide for mTHP collapse
From: Nico Pache @ 2026-03-18 19:08 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), david
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, Bagas Sanjaya
In-Reply-To: <638caee3-af71-47c7-bdc8-a905d3143387@lucifer.local>



On 3/17/26 5:02 AM, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Feb 25, 2026 at 08:27:06PM -0700, Nico Pache wrote:
>> Now that we can collapse to mTHPs lets update the admin guide to
>> reflect these changes and provide proper guidance on how to utilize it.
>>
>> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
>> Signed-off-by: Nico Pache <npache@redhat.com>
> 
> LGTM, but maybe we should mention somewhere about mTHP's max_ptes_none
> behaviour?

IIRC we decided to strictly leave that out of the manual. I used to have it in
here. @david?

> 
> Anyway with that addressed:
> 
> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> 
>> ---
>>  Documentation/admin-guide/mm/transhuge.rst | 48 +++++++++++++---------
>>  1 file changed, 28 insertions(+), 20 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index eebb1f6bbc6c..67836c683e8d 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -63,7 +63,8 @@ often.
>>  THP can be enabled system wide or restricted to certain tasks or even
>>  memory ranges inside task's address space. Unless THP is completely
>>  disabled, there is ``khugepaged`` daemon that scans memory and
>> -collapses sequences of basic pages into PMD-sized huge pages.
>> +collapses sequences of basic pages into huge pages of either PMD size
>> +or mTHP sizes, if the system is configured to do so.
>>
>>  The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
>>  interface and using madvise(2) and prctl(2) system calls.
>> @@ -219,10 +220,10 @@ this behaviour by writing 0 to shrink_underused, and enable it by writing
>>  	echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
>>  	echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
>>
>> -khugepaged will be automatically started when PMD-sized THP is enabled
>> +khugepaged will be automatically started when any THP size is enabled
>>  (either of the per-size anon control or the top-level control are set
>>  to "always" or "madvise"), and it'll be automatically shutdown when
>> -PMD-sized THP is disabled (when both the per-size anon control and the
>> +all THP sizes are disabled (when both the per-size anon control and the
>>  top-level control are "never")
>>
>>  process THP controls
>> @@ -264,11 +265,6 @@ support the following arguments::
>>  Khugepaged controls
>>  -------------------
>>
>> -.. note::
>> -   khugepaged currently only searches for opportunities to collapse to
>> -   PMD-sized THP and no attempt is made to collapse to other THP
>> -   sizes.
>> -
>>  khugepaged runs usually at low frequency so while one may not want to
>>  invoke defrag algorithms synchronously during the page faults, it
>>  should be worth invoking defrag at least in khugepaged. However it's
>> @@ -296,11 +292,11 @@ allocation failure to throttle the next allocation attempt::
>>  The khugepaged progress can be seen in the number of pages collapsed (note
>>  that this counter may not be an exact count of the number of pages
>>  collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
>> -being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
>> -one 2M hugepage. Each may happen independently, or together, depending on
>> -the type of memory and the failures that occur. As such, this value should
>> -be interpreted roughly as a sign of progress, and counters in /proc/vmstat
>> -consulted for more accurate accounting)::
>> +being replaced by a PMD mapping, or (2) physical pages replaced by one
>> +hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
>> +or together, depending on the type of memory and the failures that occur.
>> +As such, this value should be interpreted roughly as a sign of progress,
>> +and counters in /proc/vmstat consulted for more accurate accounting)::
>>
>>  	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
>>
>> @@ -308,16 +304,19 @@ for each pass::
>>
>>  	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
>>
>> -``max_ptes_none`` specifies how many extra small pages (that are
>> -not already mapped) can be allocated when collapsing a group
>> -of small pages into one large page::
>> +``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
>> +when collapsing a group of small pages into one large page::
>>
>>  	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
>>
>> -A higher value leads to use additional memory for programs.
>> -A lower value leads to gain less thp performance. Value of
>> -max_ptes_none can waste cpu time very little, you can
>> -ignore it.
>> +For PMD-sized THP collapse, this directly limits the number of empty pages
>> +allowed in the 2MB region. For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1)
>> +are supported. Any other value will emit a warning and no mTHP collapse
>> +will be attempted.
>> +
>> +A higher value allows more empty pages, potentially leading to more memory
>> +usage but better THP performance. A lower value is more conservative and
>> +may result in fewer THP collapses.
>>
>>  ``max_ptes_swap`` specifies how many pages can be brought in from
>>  swap when collapsing a group of pages into a transparent huge page::
>> @@ -337,6 +336,15 @@ that THP is shared. Exceeding the number would block the collapse::
>>
>>  A higher value may increase memory footprint for some workloads.
>>
>> +.. note::
>> +   For mTHP collapse, khugepaged does not support collapsing regions that
>> +   contain shared or swapped out pages, as this could lead to continuous
>> +   promotion to higher orders. The collapse will fail if any shared or
>> +   swapped PTEs are encountered during the scan.
>> +
>> +   Currently, madvise_collapse only supports collapsing to PMD-sized THPs
>> +   and does not attempt mTHP collapses.
>> +
>>  Boot parameters
>>  ===============
>>
>> --
>> 2.53.0
>>
> 


^ permalink raw reply

* Re: [PATCH mm-unstable v15 12/13] mm/khugepaged: run khugepaged for all orders
From: Nico Pache @ 2026-03-18 19:07 UTC (permalink / raw)
  To: Lance Yang, baolin.wang
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260317113611.94006-1-lance.yang@linux.dev>



On 3/17/26 5:36 AM, Lance Yang wrote:
> 
> On Wed, Feb 25, 2026 at 08:26:50PM -0700, Nico Pache wrote:
>> From: Baolin Wang <baolin.wang@linux.alibaba.com>
>>
>> If any order (m)THP is enabled we should allow running khugepaged to
>> attempt scanning and collapsing mTHPs. In order for khugepaged to operate
>> when only mTHP sizes are specified in sysfs, we must modify the predicate
>> function that determines whether it ought to run to do so.
>>
>> This function is currently called hugepage_pmd_enabled(), this patch
>> renames it to hugepage_enabled() and updates the logic to check to
>> determine whether any valid orders may exist which would justify
>> khugepaged running.
>>
>> We must also update collapse_allowable_orders() to check all orders if
>> the vma is anonymous and the collapse is khugepaged.
>>
>> After this patch khugepaged mTHP collapse is fully enabled.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Signed-off-by: Nico Pache <npache@redhat.com>
>> ---
>> mm/khugepaged.c | 30 ++++++++++++++++++------------
>> 1 file changed, 18 insertions(+), 12 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 388d3f2537e2..e8bfcc1d0c9a 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -434,23 +434,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
>> 		mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
>> }
>>
>> -static bool hugepage_pmd_enabled(void)
>> +static bool hugepage_enabled(void)
>> {
>> 	/*
>> 	 * We cover the anon, shmem and the file-backed case here; file-backed
>> 	 * hugepages, when configured in, are determined by the global control.
>> -	 * Anon pmd-sized hugepages are determined by the pmd-size control.
>> +	 * Anon hugepages are determined by its per-size mTHP control.
>> 	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
>> 	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
>> 	 */
>> 	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
>> 	    hugepage_global_enabled())
>> 		return true;
>> -	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
>> +	if (READ_ONCE(huge_anon_orders_always))
>> 		return true;
>> -	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
>> +	if (READ_ONCE(huge_anon_orders_madvise))
>> 		return true;
>> -	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
>> +	if (READ_ONCE(huge_anon_orders_inherit) &&
>> 	    hugepage_global_enabled())
>> 		return true;
>> 	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
>> @@ -521,8 +521,14 @@ static unsigned int collapse_max_ptes_none(unsigned int order)
>> static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
>> 			vm_flags_t vm_flags, bool is_khugepaged)
>> {
>> +	unsigned long orders;
>> 	enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>> -	unsigned long orders = BIT(HPAGE_PMD_ORDER);
>> +
>> +	/* If khugepaged is scanning an anonymous vma, allow mTHP collapse */
>> +	if (is_khugepaged && vma_is_anonymous(vma))
>> +		orders = THP_ORDERS_ALL_ANON;
>> +	else
>> +		orders = BIT(HPAGE_PMD_ORDER);
>>
>> 	return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>> }
> 
> IIUC, an anonymous VMA can pass collapse_allowable_orders() even if it
> is smaller than 2MB ...
> 
> But collapse_scan_mm_slot() still scans only full PMD-sized windows:
> 
> 		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
> 		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
> 		if (khugepaged_scan.address > hend) {
> 			cc->progress++;
> 			continue;
> 		}
> 
> and hugepage_vma_revalidate() still requires PMD suitability:
> 
> 	/* Always check the PMD order to ensure its not shared by another VMA */
> 	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
> 		return SCAN_ADDRESS_RANGE;
> 
> 
>> @@ -531,7 +537,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
>> 			  vm_flags_t vm_flags)
>> {
>> 	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
>> -	    hugepage_pmd_enabled()) {
>> +	    hugepage_enabled()) {
>> 		if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true))
>> 			__khugepaged_enter(vma->vm_mm);
> 
> I wonder if we should also require at least one PMD-sized scan window
> here? Not a big deal, just might be good to tighten the gate a bit :)

IIUC, you are worried that we are operating on VMAs smaller than a PMD?
thp_vma_allowable_orders should guard against that via thp_vma_suitable. The
revalidation also checks this in hugepage_vma_revalidate(), and that is the
reason we must leave the suitable_order check in revalidate() checking the
PMD_ORDER rather than the attempted collapse order.

lmk if that clears things up!

Thanks
-- Nico

> 
> Apart from that, LGTM!
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> 


^ permalink raw reply

* Re: [PATCH mm-unstable v15 12/13] mm/khugepaged: run khugepaged for all orders
From: Nico Pache @ 2026-03-18 19:02 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle)
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <a74178af-54f3-44ba-a007-4eaf49e40ab3@lucifer.local>



On 3/17/26 4:58 AM, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Feb 25, 2026 at 08:26:50PM -0700, Nico Pache wrote:
>> From: Baolin Wang <baolin.wang@linux.alibaba.com>
>>
>> If any order (m)THP is enabled we should allow running khugepaged to
>> attempt scanning and collapsing mTHPs. In order for khugepaged to operate
>> when only mTHP sizes are specified in sysfs, we must modify the predicate
>> function that determines whether it ought to run to do so.
>>
>> This function is currently called hugepage_pmd_enabled(), this patch
>> renames it to hugepage_enabled() and updates the logic to check to
>> determine whether any valid orders may exist which would justify
>> khugepaged running.
>>
>> We must also update collapse_allowable_orders() to check all orders if
>> the vma is anonymous and the collapse is khugepaged.
>>
>> After this patch khugepaged mTHP collapse is fully enabled.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Signed-off-by: Nico Pache <npache@redhat.com>
> 
> This looks good to me, so:
> 
> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Thanks!

> 
>> ---
>>  mm/khugepaged.c | 30 ++++++++++++++++++------------
>>  1 file changed, 18 insertions(+), 12 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 388d3f2537e2..e8bfcc1d0c9a 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -434,23 +434,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
>>  		mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
>>  }
>>
>> -static bool hugepage_pmd_enabled(void)
>> +static bool hugepage_enabled(void)
>>  {
>>  	/*
>>  	 * We cover the anon, shmem and the file-backed case here; file-backed
>>  	 * hugepages, when configured in, are determined by the global control.
>> -	 * Anon pmd-sized hugepages are determined by the pmd-size control.
>> +	 * Anon hugepages are determined by its per-size mTHP control.
> 
> Well also PMD right? I mean this terminology sucks because in a sense mTHP
> includes PMD... :)

Yeah, kinda hard with our verbiage being so broad and overlapping sometimes.

> 
>>  	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
>>  	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
>>  	 */
>>  	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
>>  	    hugepage_global_enabled())
>>  		return true;
>> -	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
>> +	if (READ_ONCE(huge_anon_orders_always))
>>  		return true;
>> -	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
>> +	if (READ_ONCE(huge_anon_orders_madvise))
>>  		return true;
>> -	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
>> +	if (READ_ONCE(huge_anon_orders_inherit) &&
>>  	    hugepage_global_enabled())
>>  		return true;
>>  	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
>> @@ -521,8 +521,14 @@ static unsigned int collapse_max_ptes_none(unsigned int order)
>>  static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
>>  			vm_flags_t vm_flags, bool is_khugepaged)
>>  {
>> +	unsigned long orders;
>>  	enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>> -	unsigned long orders = BIT(HPAGE_PMD_ORDER);
>> +
>> +	/* If khugepaged is scanning an anonymous vma, allow mTHP collapse */
>> +	if (is_khugepaged && vma_is_anonymous(vma))
>> +		orders = THP_ORDERS_ALL_ANON;
>> +	else
>> +		orders = BIT(HPAGE_PMD_ORDER);
>>
>>  	return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>>  }
>> @@ -531,7 +537,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
>>  			  vm_flags_t vm_flags)
>>  {
>>  	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
>> -	    hugepage_pmd_enabled()) {
>> +	    hugepage_enabled()) {
>>  		if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true))
>>  			__khugepaged_enter(vma->vm_mm);
>>  	}
>> @@ -2929,7 +2935,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
>>
>>  static int khugepaged_has_work(void)
>>  {
>> -	return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
>> +	return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
>>  }
>>
>>  static int khugepaged_wait_event(void)
>> @@ -3002,7 +3008,7 @@ static void khugepaged_wait_work(void)
>>  		return;
>>  	}
>>
>> -	if (hugepage_pmd_enabled())
>> +	if (hugepage_enabled())
>>  		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
>>  }
>>
>> @@ -3033,7 +3039,7 @@ static void set_recommended_min_free_kbytes(void)
>>  	int nr_zones = 0;
>>  	unsigned long recommended_min;
>>
>> -	if (!hugepage_pmd_enabled()) {
>> +	if (!hugepage_enabled()) {
>>  		calculate_min_free_kbytes();
>>  		goto update_wmarks;
>>  	}
>> @@ -3083,7 +3089,7 @@ int start_stop_khugepaged(void)
>>  	int err = 0;
>>
>>  	mutex_lock(&khugepaged_mutex);
>> -	if (hugepage_pmd_enabled()) {
>> +	if (hugepage_enabled()) {
>>  		if (!khugepaged_thread)
>>  			khugepaged_thread = kthread_run(khugepaged, NULL,
>>  							"khugepaged");
>> @@ -3109,7 +3115,7 @@ int start_stop_khugepaged(void)
>>  void khugepaged_min_free_kbytes_update(void)
>>  {
>>  	mutex_lock(&khugepaged_mutex);
>> -	if (hugepage_pmd_enabled() && khugepaged_thread)
>> +	if (hugepage_enabled() && khugepaged_thread)
>>  		set_recommended_min_free_kbytes();
>>  	mutex_unlock(&khugepaged_mutex);
>>  }
>> --
>> 2.53.0
>>
> 


^ permalink raw reply

* Re: [PATCH mm-unstable v15 11/13] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: Nico Pache @ 2026-03-18 18:59 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), david
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <d8028d32-4791-4993-aae3-66506bf6d1bd@lucifer.local>



On 3/17/26 4:35 AM, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Feb 25, 2026 at 08:26:31PM -0700, Nico Pache wrote:
>> There are cases where, if an attempted collapse fails, all subsequent
>> orders are guaranteed to also fail. Avoid these collapse attempts by
>> bailing out early.
>>
>> Signed-off-by: Nico Pache <npache@redhat.com>
> 
> With David's concern addressed:
> 
> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> 
>> ---
>>  mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>>  1 file changed, 34 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 1c3711ed4513..388d3f2537e2 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1492,9 +1492,42 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>>  			ret = collapse_huge_page(mm, collapse_address, referenced,
>>  						 unmapped, cc, mmap_locked,
>>  						 order);
>> -			if (ret == SCAN_SUCCEED) {
>> +
>> +			switch (ret) {
>> +			/* Cases were we continue to next collapse candidate */
>> +			case SCAN_SUCCEED:
>>  				collapsed += nr_pte_entries;
>> +				fallthrough;
>> +			case SCAN_PTE_MAPPED_HUGEPAGE:
>>  				continue;
>> +			/* Cases were lower orders might still succeed */
>> +			case SCAN_LACK_REFERENCED_PAGE:
>> +			case SCAN_EXCEED_NONE_PTE:
>> +			case SCAN_EXCEED_SWAP_PTE:
>> +			case SCAN_EXCEED_SHARED_PTE:
>> +			case SCAN_PAGE_LOCK:
>> +			case SCAN_PAGE_COUNT:
>> +			case SCAN_PAGE_LRU:
>> +			case SCAN_PAGE_NULL:
>> +			case SCAN_DEL_PAGE_LRU:
>> +			case SCAN_PTE_NON_PRESENT:
>> +			case SCAN_PTE_UFFD_WP:
>> +			case SCAN_ALLOC_HUGE_PAGE_FAIL:
>> +				goto next_order;
>> +			/* Cases were no further collapse is possible */
>> +			case SCAN_CGROUP_CHARGE_FAIL:
>> +			case SCAN_COPY_MC:
>> +			case SCAN_ADDRESS_RANGE:
>> +			case SCAN_NO_PTE_TABLE:
>> +			case SCAN_ANY_PROCESS:
>> +			case SCAN_VMA_NULL:
>> +			case SCAN_VMA_CHECK:
>> +			case SCAN_SCAN_ABORT:
>> +			case SCAN_PAGE_ANON:
>> +			case SCAN_PMD_MAPPED:
>> +			case SCAN_FAIL:
>> +			default:
> 
> Agree with david, let's spell them out please :)

I believe David is arguing for the opposite: to drop all these spelt-out cases
and just leave the default case.

@david, is that correct, or did I misunderstand?

-- Nico

> 
>> +				return collapsed;
>>  			}
>>  		}
>>
>> --
>> 2.53.0
>>
> 


^ permalink raw reply

* [RFC PATCH v3 4/4] locking: Add contended_release tracepoint to spinning locks
From: Dmitry Ilvokhin @ 2026-03-18 18:45 UTC (permalink / raw)
  To: Arnd Bergmann, Dennis Zhou, Tejun Heo, Christoph Lameter,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long
  Cc: linux-arch, linux-kernel, linux-mm, linux-trace-kernel,
	kernel-team, Dmitry Ilvokhin
In-Reply-To: <cover.1773858853.git.d@ilvokhin.com>

Extend the contended_release tracepoint to queued spinlocks and queued
rwlocks.

When the tracepoint is disabled, the only addition to the hot path is a
single NOP instruction (the static branch). When enabled, the contention
check, trace call, and unlock are combined in an out-of-line function to
minimize hot path impact, avoiding the compiler needing to preserve the
lock pointer in a callee-saved register across the trace call.

Binary size impact (x86_64, defconfig):
  uninlined unlock (common case): +983 bytes  (+0.00%)
  inlined unlock (worst case):    +71554 bytes (+0.30%)

The inlined unlock case could not be achieved through Kconfig options on
x86_64 as PREEMPT_BUILD unconditionally selects UNINLINE_SPIN_UNLOCK on
x86_64. The UNINLINE_SPIN_UNLOCK guards were manually inverted to force
inline the unlock path and estimate the worst case binary size increase.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 include/asm-generic/qrwlock.h   | 48 +++++++++++++++++++++++++++------
 include/asm-generic/qspinlock.h | 25 +++++++++++++++--
 kernel/locking/qrwlock.c        | 16 +++++++++++
 kernel/locking/qspinlock.c      |  8 ++++++
 4 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 75b8f4601b28..e24dc537fd66 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -14,6 +14,7 @@
 #define __ASM_GENERIC_QRWLOCK_H
 
 #include <linux/atomic.h>
+#include <linux/tracepoint-defs.h>
 #include <asm/barrier.h>
 #include <asm/processor.h>
 
@@ -35,6 +36,10 @@
  */
 extern void queued_read_lock_slowpath(struct qrwlock *lock);
 extern void queued_write_lock_slowpath(struct qrwlock *lock);
+extern void queued_read_unlock_traced(struct qrwlock *lock);
+extern void queued_write_unlock_traced(struct qrwlock *lock);
+
+DECLARE_TRACEPOINT(contended_release);
 
 /**
  * queued_read_trylock - try to acquire read lock of a queued rwlock
@@ -102,10 +107,16 @@ static inline void queued_write_lock(struct qrwlock *lock)
 }
 
 /**
- * queued_read_unlock - release read lock of a queued rwlock
+ * queued_rwlock_is_contended - check if the lock is contended
  * @lock : Pointer to queued rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
  */
-static inline void queued_read_unlock(struct qrwlock *lock)
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+	return arch_spin_is_locked(&lock->wait_lock);
+}
+
+static __always_inline void __queued_read_unlock(struct qrwlock *lock)
 {
 	/*
 	 * Atomically decrement the reader count
@@ -114,22 +125,43 @@ static inline void queued_read_unlock(struct qrwlock *lock)
 }
 
 /**
- * queued_write_unlock - release write lock of a queued rwlock
+ * queued_read_unlock - release read lock of a queued rwlock
  * @lock : Pointer to queued rwlock structure
  */
-static inline void queued_write_unlock(struct qrwlock *lock)
+static inline void queued_read_unlock(struct qrwlock *lock)
+{
+	/*
+	 * Trace and unlock are combined in the traced unlock variant so
+	 * the compiler does not need to preserve the lock pointer across
+	 * the function call, avoiding callee-saved register save/restore
+	 * on the hot path.
+	 */
+	if (tracepoint_enabled(contended_release)) {
+		queued_read_unlock_traced(lock);
+		return;
+	}
+
+	__queued_read_unlock(lock);
+}
+
+static __always_inline void __queued_write_unlock(struct qrwlock *lock)
 {
 	smp_store_release(&lock->wlocked, 0);
 }
 
 /**
- * queued_rwlock_is_contended - check if the lock is contended
+ * queued_write_unlock - release write lock of a queued rwlock
  * @lock : Pointer to queued rwlock structure
- * Return: 1 if lock contended, 0 otherwise
  */
-static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+static inline void queued_write_unlock(struct qrwlock *lock)
 {
-	return arch_spin_is_locked(&lock->wait_lock);
+	/* See comment in queued_read_unlock(). */
+	if (tracepoint_enabled(contended_release)) {
+		queued_write_unlock_traced(lock);
+		return;
+	}
+
+	__queued_write_unlock(lock);
 }
 
 /*
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index bf47cca2c375..8ba463a3b891 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -41,6 +41,7 @@
 
 #include <asm-generic/qspinlock_types.h>
 #include <linux/atomic.h>
+#include <linux/tracepoint-defs.h>
 
 #ifndef queued_spin_is_locked
 /**
@@ -116,6 +117,19 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 #endif
 
 #ifndef queued_spin_unlock
+
+DECLARE_TRACEPOINT(contended_release);
+
+extern void queued_spin_unlock_traced(struct qspinlock *lock);
+
+static __always_inline void __queued_spin_unlock(struct qspinlock *lock)
+{
+	/*
+	 * unlock() needs release semantics:
+	 */
+	smp_store_release(&lock->locked, 0);
+}
+
 /**
  * queued_spin_unlock - release a queued spinlock
  * @lock : Pointer to queued spinlock structure
@@ -123,9 +137,16 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 static __always_inline void queued_spin_unlock(struct qspinlock *lock)
 {
 	/*
-	 * unlock() needs release semantics:
+	 * Trace and unlock are combined in queued_spin_unlock_traced()
+	 * so the compiler does not need to preserve the lock pointer
+	 * across the function call, avoiding callee-saved register
+	 * save/restore on the hot path.
 	 */
-	smp_store_release(&lock->locked, 0);
+	if (tracepoint_enabled(contended_release)) {
+		queued_spin_unlock_traced(lock);
+		return;
+	}
+	__queued_spin_unlock(lock);
 }
 #endif
 
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index d2ef312a8611..5f7a0fc2b27a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -90,3 +90,19 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
 	trace_contention_end(lock, 0);
 }
 EXPORT_SYMBOL(queued_write_lock_slowpath);
+
+void __lockfunc queued_read_unlock_traced(struct qrwlock *lock)
+{
+	if (queued_rwlock_is_contended(lock))
+		trace_contended_release(lock);
+	__queued_read_unlock(lock);
+}
+EXPORT_SYMBOL(queued_read_unlock_traced);
+
+void __lockfunc queued_write_unlock_traced(struct qrwlock *lock)
+{
+	if (queued_rwlock_is_contended(lock))
+		trace_contended_release(lock);
+	__queued_write_unlock(lock);
+}
+EXPORT_SYMBOL(queued_write_unlock_traced);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index af8d122bb649..1544dcec65fa 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -104,6 +104,14 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
+void __lockfunc queued_spin_unlock_traced(struct qspinlock *lock)
+{
+	if (queued_spin_is_contended(lock))
+		trace_contended_release(lock);
+	__queued_spin_unlock(lock);
+}
+EXPORT_SYMBOL(queued_spin_unlock_traced);
+
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
 /**
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 3/4] locking: Add contended_release tracepoint to sleepable locks
From: Dmitry Ilvokhin @ 2026-03-18 18:45 UTC (permalink / raw)
  To: Arnd Bergmann, Dennis Zhou, Tejun Heo, Christoph Lameter,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long
  Cc: linux-arch, linux-kernel, linux-mm, linux-trace-kernel,
	kernel-team, Dmitry Ilvokhin
In-Reply-To: <cover.1773858853.git.d@ilvokhin.com>

Add the contended_release trace event. This tracepoint fires on the
holder side when a contended lock is released, complementing the
existing contention_begin/contention_end tracepoints which fire on the
waiter side.

This enables correlating lock hold time under contention with waiter
events by lock address.

Add trace_contended_release() calls to the slowpath unlock paths of
sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
RT-specific rwbase locks.

Where possible, trace_contended_release() fires before the lock is
released and before the waiter is woken. For rwsem, semaphore, and
rwbase_rt read unlock, the tracepoint fires after the release but before
the wake, as contention is determined by the return value of the atomic
release operation.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 include/trace/events/lock.h   | 17 +++++++++++++++++
 kernel/locking/mutex.c        |  3 +++
 kernel/locking/percpu-rwsem.c |  4 ++++
 kernel/locking/rtmutex.c      |  1 +
 kernel/locking/rwbase_rt.c    |  8 +++++++-
 kernel/locking/rwsem.c        |  9 +++++++--
 kernel/locking/semaphore.c    |  4 +++-
 7 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index da978f2afb45..1ded869cd619 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -137,6 +137,23 @@ TRACE_EVENT(contention_end,
 	TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
 );
 
+TRACE_EVENT(contended_release,
+
+	TP_PROTO(void *lock),
+
+	TP_ARGS(lock),
+
+	TP_STRUCT__entry(
+		__field(void *, lock_addr)
+	),
+
+	TP_fast_assign(
+		__entry->lock_addr = lock;
+	),
+
+	TP_printk("%p", __entry->lock_addr)
+);
+
 #endif /* _TRACE_LOCK_H */
 
 /* This part must be outside protection */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 427187ff02db..bb25741c2768 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -997,6 +997,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		wake_q_add(&wake_q, next);
 	}
 
+	if (trace_contended_release_enabled() && waiter)
+		trace_contended_release(lock);
+
 	if (owner & MUTEX_FLAG_HANDOFF)
 		__mutex_handoff(lock, next);
 
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f3ee7a0d6047..6bf2ccddd42d 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -263,6 +263,9 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, _RET_IP_);
 
+	if (trace_contended_release_enabled() && wq_has_sleeper(&sem->waiters))
+		trace_contended_release(sem);
+
 	/*
 	 * Signal the writer is done, no fast path yet.
 	 *
@@ -292,6 +295,7 @@ EXPORT_SYMBOL_GPL(percpu_up_write);
 void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	lockdep_assert_preemption_disabled();
+	trace_contended_release(sem);
 	/*
 	 * slowpath; reader will only ever wake a single blocked
 	 * writer.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ccaba6148b61..3db8a840b4e8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1466,6 +1466,7 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
 		raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	}
 
+	trace_contended_release(lock);
 	/*
 	 * The wakeup next waiter path does not suffer from the above
 	 * race. See the comments there.
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 82e078c0665a..081778934b13 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -162,8 +162,10 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
 	 * worst case which can happen is a spurious wakeup.
 	 */
 	owner = rt_mutex_owner(rtm);
-	if (owner)
+	if (owner) {
+		trace_contended_release(rwb);
 		rt_mutex_wake_q_add_task(&wqh, owner, state);
+	}
 
 	/* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
 	preempt_disable();
@@ -205,6 +207,8 @@ static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+	if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm))
+		trace_contended_release(rwb);
 	__rwbase_write_unlock(rwb, WRITER_BIAS, flags);
 }
 
@@ -214,6 +218,8 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+	if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm))
+		trace_contended_release(rwb);
 	/* Release it and account current as reader */
 	__rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
 }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bf647097369c..767a1a2b7d8c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1390,6 +1390,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
 		      RWSEM_FLAG_WAITERS)) {
 		clear_nonspinnable(sem);
+		trace_contended_release(sem);
 		rwsem_wake(sem);
 	}
 	preempt_enable();
@@ -1413,8 +1414,10 @@ static inline void __up_write(struct rw_semaphore *sem)
 	preempt_disable();
 	rwsem_clear_owner(sem);
 	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
-	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+	if (unlikely(tmp & RWSEM_FLAG_WAITERS)) {
+		trace_contended_release(sem);
 		rwsem_wake(sem);
+	}
 	preempt_enable();
 }
 
@@ -1437,8 +1440,10 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 	tmp = atomic_long_fetch_add_release(
 		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
 	rwsem_set_reader_owned(sem);
-	if (tmp & RWSEM_FLAG_WAITERS)
+	if (tmp & RWSEM_FLAG_WAITERS) {
+		trace_contended_release(sem);
 		rwsem_downgrade_wake(sem);
+	}
 	preempt_enable();
 }
 
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 74d41433ba13..d46415095dd6 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -231,8 +231,10 @@ void __sched up(struct semaphore *sem)
 	else
 		__up(sem, &wake_q);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
-	if (!wake_q_empty(&wake_q))
+	if (!wake_q_empty(&wake_q)) {
+		trace_contended_release(sem);
 		wake_up_q(&wake_q);
+	}
 }
 EXPORT_SYMBOL(up);
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 2/4] locking/percpu-rwsem: Extract __percpu_up_read()
From: Dmitry Ilvokhin @ 2026-03-18 18:45 UTC (permalink / raw)
  To: Arnd Bergmann, Dennis Zhou, Tejun Heo, Christoph Lameter,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long
  Cc: linux-arch, linux-kernel, linux-mm, linux-trace-kernel,
	kernel-team, Dmitry Ilvokhin, Usama Arif
In-Reply-To: <cover.1773858853.git.d@ilvokhin.com>

Move the percpu_up_read() slowpath out of the inline function into a new
__percpu_up_read() to avoid binary size increase from adding a
tracepoint to an inlined function.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/percpu-rwsem.h  | 15 +++------------
 kernel/locking/percpu-rwsem.c | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index c8cb010d655e..39d5bf8e6562 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -107,6 +107,8 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	return ret;
 }
 
+extern void __percpu_up_read(struct percpu_rw_semaphore *sem);
+
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, _RET_IP_);
@@ -118,18 +120,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 	if (likely(rcu_sync_is_idle(&sem->rss))) {
 		this_cpu_dec(*sem->read_count);
 	} else {
-		/*
-		 * slowpath; reader will only ever wake a single blocked
-		 * writer.
-		 */
-		smp_mb(); /* B matches C */
-		/*
-		 * In other words, if they see our decrement (presumably to
-		 * aggregate zero, as that is the only time it matters) they
-		 * will also see our critical section.
-		 */
-		this_cpu_dec(*sem->read_count);
-		rcuwait_wake_up(&sem->writer);
+		__percpu_up_read(sem);
 	}
 	preempt_enable();
 }
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index ef234469baac..f3ee7a0d6047 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -288,3 +288,21 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 	rcu_sync_exit(&sem->rss);
 }
 EXPORT_SYMBOL_GPL(percpu_up_write);
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	lockdep_assert_preemption_disabled();
+	/*
+	 * slowpath; reader will only ever wake a single blocked
+	 * writer.
+	 */
+	smp_mb(); /* B matches C */
+	/*
+	 * In other words, if they see our decrement (presumably to
+	 * aggregate zero, as that is the only time it matters) they
+	 * will also see our critical section.
+	 */
+	this_cpu_dec(*sem->read_count);
+	rcuwait_wake_up(&sem->writer);
+}
+EXPORT_SYMBOL_GPL(__percpu_up_read);
-- 
2.52.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox