Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v21 0/9] ring-buffer: Making persistent ring buffers robust
From: Steven Rostedt @ 2026-05-22 17:08 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers

This is to make the persistent ring buffer more robust when sub-buffers
are detected to be corrupted. Instead of invalidating the entire buffer,
just invalidate the individual sub-buffers.

I started with Masami's patches and modified some from Sashiko reviews.
I added a few patches to display the dropped events when the persistent
ring buffers validation checks found sub-buffers were dropped due to being
corrupted data.

Changes since v20: https://lore.kernel.org/all/20260520184938.749337513@kernel.org/

- squashed the fix for max_loops in rb_iter_peek()

- Still process reader page if head page fails validation (Sashiko)

- Removed left over printk() (Masami Hiramatsu)


Masami Hiramatsu (Google) (6):
      ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
      ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
      ring-buffer: Add persistent ring buffer invalid-page inject test
      ring-buffer: Show commit numbers in buffer_meta file
      ring-buffer: Cleanup persistent ring buffer validation
      ring-buffer: Cleanup buffer_data_page related code

Steven Rostedt (3):
      ring-buffer: Have dropped subbuffers be persistent across reboots
      ring-buffer: Show persistent buffer dropped events in trace file
      ring-buffer: Show persistent buffer dropped events in trace_pipe file

----
 include/linux/ring_buffer.h |   1 +
 kernel/trace/Kconfig        |  34 +++
 kernel/trace/ring_buffer.c  | 543 +++++++++++++++++++++++++++++---------------
 kernel/trace/trace.c        |   4 +
 4 files changed, 402 insertions(+), 180 deletions(-)

^ permalink raw reply

* [PATCH v21 4/9] ring-buffer: Show commit numbers in buffer_meta file
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

In addition to the index number, show the commit numbers of
each data page in the per_cpu buffer_meta file.
This is useful for understanding the current status of the
persistent ring buffer. (Note that this file is shown
only for persistent ring buffer and its backup instance)

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc603d9c9414..88e613e78632 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2231,6 +2231,7 @@ static int rbm_show(struct seq_file *m, void *v)
 	struct ring_buffer_per_cpu *cpu_buffer = m->private;
 	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
 	unsigned long val = (unsigned long)v;
+	struct buffer_data_page *dpage;
 
 	if (val == 1) {
 		seq_printf(m, "head_buffer:   %d\n",
@@ -2243,7 +2244,9 @@ static int rbm_show(struct seq_file *m, void *v)
 	}
 
 	val -= 2;
-	seq_printf(m, "buffer[%ld]:    %d\n", val, meta->buffers[val]);
+	dpage = rb_range_buffer(cpu_buffer, val);
+	seq_printf(m, "buffer[%ld]:    %d (commit: %ld)\n",
+		   val, meta->buffers[val], dpage ? local_read(&dpage->commit) : -1);
 
 	return 0;
 }
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 1/9] ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
From: Steven Rostedt @ 2026-05-22 17:08 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

Skip invalid sub-buffers when validating the persistent ring buffer
instead of discarding the entire ring buffer. Only skipped buffers
are invalidated (cleared).

If the cache data in memory fails to be synchronized during a reboot,
the persistent ring buffer may become partially corrupted, but other
sub-buffers may still contain readable event data. Only discard the
subbuffers that are found to be corrupted.

Link: https://lore.kernel.org/all/20260520185018.051228084@kernel.org/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
[SDR: Fixed max_loops in rb_iter_peek() as well ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v20: https://patch.msgid.link/20260520185017.044376275@kernel.org

- squashed the fix for max_loops in rb_iter_peek()

 kernel/trace/ring_buffer.c | 120 ++++++++++++++++++++++---------------
 1 file changed, 73 insertions(+), 47 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7b07d2004cc6..6afd8ea5081a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,12 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 	return local_read(&bpage->page->commit);
 }
 
+/* Size is determined by what has been committed */
+static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
+{
+	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	/* Range pages are not to be freed */
@@ -1762,7 +1768,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			      unsigned long *subbuf_mask)
 {
 	int subbuf_size = PAGE_SIZE;
-	struct buffer_data_page *subbuf;
 	unsigned long buffers_start;
 	unsigned long buffers_end;
 	int i;
@@ -1770,6 +1775,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 	if (!subbuf_mask)
 		return false;
 
+	if (meta->subbuf_size != PAGE_SIZE) {
+		pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu);
+		return false;
+	}
+
 	buffers_start = meta->first_buffer;
 	buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
 
@@ -1786,11 +1796,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 		return false;
 	}
 
-	subbuf = rb_subbufs_from_meta(meta);
-
 	bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
 
-	/* Is the meta buffers and the subbufs themselves have correct data? */
+	/*
+	 * Ensure the meta::buffers array has correct data. The data in each subbufs
+	 * are checked later in rb_meta_validate_events().
+	 */
 	for (i = 0; i < meta->nr_subbufs; i++) {
 		if (meta->buffers[i] < 0 ||
 		    meta->buffers[i] >= meta->nr_subbufs) {
@@ -1798,18 +1809,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
 			return false;
 		}
 
-		if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
-			pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
-			return false;
-		}
-
 		if (test_bit(meta->buffers[i], subbuf_mask)) {
 			pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
 			return false;
 		}
 
 		set_bit(meta->buffers[i], subbuf_mask);
-		subbuf = (void *)subbuf + subbuf_size;
 	}
 
 	return true;
@@ -1873,13 +1878,22 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+			      struct ring_buffer_cpu_meta *meta)
 {
 	unsigned long long ts;
+	unsigned long tail;
 	u64 delta;
-	int tail;
 
-	tail = local_read(&dpage->commit);
+	/*
+	 * When a sub-buffer is recovered from a read, the commit value may
+	 * have RB_MISSED_* bits set, as these bits are reset on reuse.
+	 * Even after clearing these bits, a commit value greater than the
+	 * subbuf_size is considered invalid.
+	 */
+	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
+	if (tail > meta->subbuf_size - BUF_PAGE_HDR_SIZE)
+		return -1;
 	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
 }
 
@@ -1890,6 +1904,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	struct buffer_page *head_page, *orig_head, *orig_reader;
 	unsigned long entry_bytes = 0;
 	unsigned long entries = 0;
+	int discarded = 0;
 	int ret;
 	u64 ts;
 	int i;
@@ -1901,14 +1916,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_reader = cpu_buffer->reader_page;
 
 	/* Do the reader page first */
-	ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
+	ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu, meta);
 	if (ret < 0) {
-		pr_info("Ring buffer reader page is invalid\n");
-		goto invalid;
+		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
+			cpu_buffer->cpu);
+		discarded++;
+		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+		local_set(&orig_reader->entries, 0);
+		local_set(&orig_reader->page->commit, 0);
+	} else {
+		entries += ret;
+		entry_bytes += rb_page_size(orig_reader);
+		local_set(&orig_reader->entries, ret);
 	}
-	entries += ret;
-	entry_bytes += local_read(&orig_reader->page->commit);
-	local_set(&orig_reader->entries, ret);
 
 	ts = head_page->page->time_stamp;
 
@@ -1936,7 +1956,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 			break;
 
 		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0)
 			break;
 
@@ -1945,7 +1965,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (ret)
 			local_inc(&cpu_buffer->pages_touched);
 		entries += ret;
-		entry_bytes += rb_page_commit(head_page);
+		entry_bytes += rb_page_size(head_page);
 	}
 	if (i)
 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -2015,21 +2035,24 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == orig_reader)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
 		if (ret < 0) {
-			pr_info("Ring buffer meta [%d] invalid buffer page\n",
-				cpu_buffer->cpu);
-			goto invalid;
-		}
-
-		/* If the buffer has content, update pages_touched */
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-
-		entries += ret;
-		entry_bytes += local_read(&head_page->page->commit);
-		local_set(&head_page->entries, ret);
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
+			local_set(&head_page->entries, 0);
+			local_set(&head_page->page->commit, 0);
+		} else {
+			/* If the buffer has content, update pages_touched */
+			if (ret)
+				local_inc(&cpu_buffer->pages_touched);
 
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			local_set(&head_page->entries, ret);
+		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
 	}
@@ -2043,7 +2066,10 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->entries, entries);
 	local_set(&cpu_buffer->entries_bytes, entry_bytes);
 
-	pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+	pr_info("Ring buffer meta [%d] is from previous boot!", cpu_buffer->cpu);
+	if (discarded)
+		pr_cont(" (%d pages discarded)", discarded);
+	pr_cont("\n");
 	return;
 
  invalid:
@@ -3330,12 +3356,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	return NULL;
 }
 
-/* Size is determined by what has been committed */
-static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
-{
-	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
-}
-
 static __always_inline unsigned
 rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -5636,8 +5656,9 @@ __rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
 static struct buffer_page *
 __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	struct buffer_page *reader = NULL;
+	int max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3;
 	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
+	struct buffer_page *reader = NULL;
 	unsigned long overwrite;
 	unsigned long flags;
 	int nr_loops = 0;
@@ -5649,11 +5670,14 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
  again:
 	/*
 	 * This should normally only loop twice. But because the
-	 * start of the reader inserts an empty page, it causes
-	 * a case where we will loop three times. There should be no
-	 * reason to loop four times (that I know of).
+	 * start of the reader inserts an empty page, it causes a
+	 * case where we will loop three times. There should be no
+	 * reason to loop four times unless the ring buffer is a
+	 * recovered persistent ring buffer. For persistent ring buffers,
+	 * invalid pages are reset during recovery, so there may be more
+	 * than 3 contiguous pages can be empty, but less than nr_pages.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > max_loops)) {
 		reader = NULL;
 		goto out;
 	}
@@ -5950,12 +5974,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
 	int nr_loops = 0;
+	int max_loops;
 
 	if (ts)
 		*ts = 0;
 
 	cpu_buffer = iter->cpu_buffer;
 	buffer = cpu_buffer->buffer;
+	max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3;
 
 	/*
 	 * Check if someone performed a consuming read to the buffer
@@ -5978,7 +6004,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	 * the ring buffer with an active write as the consumer is.
 	 * Do not warn if the three failures is reached.
 	 */
-	if (++nr_loops > 3)
+	if (++nr_loops > max_loops)
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 3/9] ring-buffer: Add persistent ring buffer invalid-page inject test
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

Add a self-corrupting test for the persistent ring buffer.

This will inject an erroneous value to some sub-buffer pages (where
the index is even or multiples of 5) in the persistent ring buffer
when the kernel panics, and checks whether the number of detected
invalid pages and the total entry_bytes are the same as the recorded
values after reboot.

This ensures that the kernel can correctly recover a partially
corrupted persistent ring buffer after a reboot or panic.

The test only runs on the persistent ring buffer whose name is
"ptracingtest". The user has to fill it with events before a
kernel panic.

To run the test, enable CONFIG_RING_BUFFER_PERSISTENT_INJECT
and add the following kernel cmdline:

 reserve_mem=20M:2M:trace trace_instance=ptracingtest^traceoff@trace
 panic=1

Run the following commands after the 1st boot:

 cd /sys/kernel/tracing/instances/ptracingtest
 echo 1 > tracing_on
 echo 1 > events/enable
 sleep 3
 echo c > /proc/sysrq-trigger

After panic message, the kernel will reboot and run the verification
on the persistent ring buffer, e.g.

 Ring buffer meta [2] invalid buffer page detected
 Ring buffer meta [2] is from previous boot! (318 pages discarded)
 Ring buffer testing [2] invalid pages: PASSED (318/318)
 Ring buffer testing [2] entry_bytes: PASSED (1300476/1300476)

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/Kconfig        | 34 ++++++++++++++++
 kernel/trace/ring_buffer.c  | 79 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        |  4 ++
 4 files changed, 118 insertions(+)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 994f52b34344..0670742b2d60 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -238,6 +238,7 @@ int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
+	RB_FL_TESTING		= 1 << 1,
 };
 
 #ifdef CONFIG_RING_BUFFER
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..084f34dc6c9f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1202,6 +1202,40 @@ config RING_BUFFER_VALIDATE_TIME_DELTAS
 	  Only say Y if you understand what this does, and you
 	  still want it enabled. Otherwise say N
 
+config RING_BUFFER_PERSISTENT_INJECT
+	bool "Enable persistent ring buffer error injection test"
+	depends on RING_BUFFER
+	help
+	  This option will have the kernel check if the persistent ring
+	  buffer is named "ptracingtest". and if so, it will corrupt some
+	  of its pages on a kernel panic. This is used to test if the
+	  persistent ring buffer can recover from some of its sub-buffers
+	  being corrupted.
+	  To use this, boot a kernel with a "ptracingtest" persistent
+	  ring buffer, e.g.
+
+	   reserve_mem=20M:2M:trace trace_instance=ptracingtest@trace panic=1
+
+	  And after the 1st boot, run the following commands:
+
+	   cd /sys/kernel/tracing/instances/ptracingtest
+	   echo 1 > events/enable
+	   echo 1 > tracing_on
+	   sleep 3
+	   echo c > /proc/sysrq-trigger
+
+	  After the panic message, the kernel will reboot and will show
+	  the test results in the console output.
+
+	  Note that events for the test ring buffer needs to be enabled
+	  prior to crashing the kernel so that the ring buffer has content
+	  that the test will corrupt.
+	  As the test will corrupt events in the "ptracingtest" persistent
+	  ring buffer, it should not be used for any other purpose other
+	  than this test.
+
+	  If unsure, say N
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c10cf4ba91d6..dc603d9c9414 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -64,6 +64,10 @@ struct ring_buffer_cpu_meta {
 	unsigned long	commit_buffer;
 	__u32		subbuf_size;
 	__u32		nr_subbufs;
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
+	__u32		nr_invalid;
+	__u32		entry_bytes;
+#endif
 	int		buffers[];
 };
 
@@ -2101,6 +2105,21 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	if (discarded)
 		pr_cont(" (%d pages discarded)", discarded);
 	pr_cont("\n");
+
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
+	if (meta->nr_invalid)
+		pr_warn("Ring buffer testing [%d] invalid pages: %s (%d/%d)\n",
+			cpu_buffer->cpu,
+			(discarded == meta->nr_invalid) ? "PASSED" : "FAILED",
+			discarded, meta->nr_invalid);
+	if (meta->entry_bytes)
+		pr_warn("Ring buffer testing [%d] entry_bytes: %s (%ld/%ld)\n",
+			cpu_buffer->cpu,
+			(entry_bytes == meta->entry_bytes) ? "PASSED" : "FAILED",
+			(long)entry_bytes, (long)meta->entry_bytes);
+	meta->nr_invalid = 0;
+	meta->entry_bytes = 0;
+#endif
 	return;
 
  invalid:
@@ -2581,12 +2600,72 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
+#ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
+static void rb_test_inject_invalid_pages(struct trace_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_cpu_meta *meta;
+	struct buffer_data_page *dpage;
+	unsigned long entry_bytes = 0;
+	unsigned long ptr;
+	int subbuf_size;
+	int invalid = 0;
+	int cpu;
+	int i;
+
+	if (!(buffer->flags & RB_FL_TESTING))
+		return;
+
+	guard(preempt)();
+	cpu = smp_processor_id();
+
+	cpu_buffer = buffer->buffers[cpu];
+	if (!cpu_buffer)
+		return;
+	meta = cpu_buffer->ring_meta;
+	if (!meta)
+		return;
+
+	ptr = (unsigned long)rb_subbufs_from_meta(meta);
+	subbuf_size = meta->subbuf_size;
+
+	for (i = 0; i < meta->nr_subbufs; i++) {
+		unsigned long idx = meta->buffers[i];
+
+		dpage = (void *)(ptr + idx * subbuf_size);
+		/* Skip unused pages */
+		if (!local_read(&dpage->commit))
+			continue;
+
+		/*
+		 * Invalidate even pages or multiples of 5. This will cause 3
+		 * contiguous invalidated(empty) pages.
+		 */
+		if (!(i & 0x1) || !(i % 5)) {
+			local_add(subbuf_size + 1, &dpage->commit);
+			invalid++;
+		} else {
+			/* Count total commit bytes. */
+			entry_bytes += local_read(&dpage->commit) & ~RB_MISSED_MASK;
+		}
+	}
+
+	pr_info("Inject invalidated %d pages on CPU%d, total size: %ld\n",
+		invalid, cpu, (long)entry_bytes);
+	meta->nr_invalid = invalid;
+	meta->entry_bytes = entry_bytes;
+}
+#else /* !CONFIG_RING_BUFFER_PERSISTENT_INJECT */
+#define rb_test_inject_invalid_pages(buffer)	do { } while (0)
+#endif
+
 /* Stop recording on a persistent buffer and flush cache if needed. */
 static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
 {
 	struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
 
 	ring_buffer_record_off(buffer);
+	rb_test_inject_invalid_pages(buffer);
 	arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
 	return NOTIFY_DONE;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..4573f65d68ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8383,6 +8383,8 @@ static void setup_trace_scratch(struct trace_array *tr,
 	memset(tscratch, 0, size);
 }
 
+#define TRACE_TEST_PTRACING_NAME	"ptracingtest"
+
 int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
 {
 	enum ring_buffer_flags rb_flags;
@@ -8394,6 +8396,8 @@ int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int
 	buf->tr = tr;
 
 	if (tr->range_addr_start && tr->range_addr_size) {
+		if (tr->name && !strcmp(tr->name, TRACE_TEST_PTRACING_NAME))
+			rb_flags |= RB_FL_TESTING;
 		/* Add scratch buffer to handle 128 modules */
 		buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
 						      tr->range_addr_start,
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 2/9] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
From: Steven Rostedt @ 2026-05-22 17:08 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

Skip invalid sub-buffers when rewinding the persistent ring buffer
instead of stopping the rewinding the ring buffer. The skipped
buffers are cleared.

To ensure the rewinding stops at the unused page, this also clears
buffer_data_page::time_stamp when tracing resets the buffer. This
allows us to identify unused pages and empty pages.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
[ SDR: Have reader_page still get evaluated if header_page fails ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v20: https://patch.msgid.link/20260520185017.219342323@kernel.org

- Still process reader page if head page fails validation (Sashiko)

 kernel/trace/ring_buffer.c | 107 ++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 38 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6afd8ea5081a..c10cf4ba91d6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -363,6 +363,7 @@ struct buffer_page {
 static void rb_init_page(struct buffer_data_page *bpage)
 {
 	local_set(&bpage->commit, 0);
+	bpage->time_stamp = 0;
 }
 
 static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
@@ -1878,12 +1879,14 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
-			      struct ring_buffer_cpu_meta *meta)
+static int rb_validate_buffer(struct buffer_page *bpage, int cpu,
+			      struct ring_buffer_cpu_meta *meta, u64 prev_ts, u64 next_ts)
 {
+	struct buffer_data_page *dpage = bpage->page;
 	unsigned long long ts;
 	unsigned long tail;
 	u64 delta;
+	int ret;
 
 	/*
 	 * When a sub-buffer is recovered from a read, the commit value may
@@ -1892,9 +1895,27 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
 	 * subbuf_size is considered invalid.
 	 */
 	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
-	if (tail > meta->subbuf_size - BUF_PAGE_HDR_SIZE)
-		return -1;
-	return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+	if (tail <= meta->subbuf_size - BUF_PAGE_HDR_SIZE)
+		ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+	else
+		ret = -1;
+
+	/*
+	 * The timestamp must be greater than @prev_ts and smaller than @next_ts.
+	 * Since this function works in both forward (verify) and reverse (unwind)
+	 * loop, we don't know both @prev_ts and @next_ts at the same time.
+	 * So use the known boundary as the boundary.
+	 */
+	if (ret < 0 || (prev_ts && prev_ts > ts) || (next_ts && ts > next_ts)) {
+		local_set(&bpage->entries, 0);
+		local_set(&dpage->commit, 0);
+		dpage->time_stamp = prev_ts ? prev_ts : next_ts;
+		ret = -1;
+	} else {
+		local_set(&bpage->entries, ret);
+	}
+
+	return ret;
 }
 
 /* If the meta data has been validated, now validate the events */
@@ -1905,6 +1926,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	unsigned long entry_bytes = 0;
 	unsigned long entries = 0;
 	int discarded = 0;
+	bool skip = false;
 	int ret;
 	u64 ts;
 	int i;
@@ -1915,25 +1937,35 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_head = head_page = cpu_buffer->head_page;
 	orig_reader = cpu_buffer->reader_page;
 
-	/* Do the reader page first */
-	ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu, meta);
+	/* Do the head page first */
+	ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, 0);
+	if (ret < 0) {
+		pr_info("Ring buffer meta [%d] invalid head page detected\n",
+			cpu_buffer->cpu);
+		/* Don't bother rewinding */
+		skip = true;
+		ts = 0;
+	} else {
+		ts = head_page->page->time_stamp;
+	}
+
+	/* Do the reader page - reader must be previous to head. */
+	ret = rb_validate_buffer(orig_reader, cpu_buffer->cpu, meta, 0, ts);
 	if (ret < 0) {
 		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
 			cpu_buffer->cpu);
 		discarded++;
-		/* Instead of discard whole ring buffer, discard only this sub-buffer. */
-		local_set(&orig_reader->entries, 0);
-		local_set(&orig_reader->page->commit, 0);
 	} else {
 		entries += ret;
 		entry_bytes += rb_page_size(orig_reader);
-		local_set(&orig_reader->entries, ret);
+		ts = orig_reader->page->time_stamp;
 	}
 
-	ts = head_page->page->time_stamp;
+	if (skip)
+		goto skip_rewind;
 
 	/*
-	 * Try to rewind the head so that we can read the pages which already
+	 * Try to rewind the head so that we can read the pages which are already
 	 * read in the previous boot.
 	 */
 	if (head_page == cpu_buffer->tail_page)
@@ -1946,26 +1978,27 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == cpu_buffer->tail_page)
 			break;
 
-		/* Ensure the page has older data than head. */
-		if (ts < head_page->page->time_stamp)
+		/* Rewind until unused page (no timestamp, no commit). */
+		if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0)
 			break;
 
-		ts = head_page->page->time_stamp;
-		/* Ensure the page has correct timestamp and some data. */
-		if (!ts || rb_page_commit(head_page) == 0)
-			break;
-
-		/* Stop rewind if the page is invalid. */
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
-		if (ret < 0)
-			break;
-
-		/* Recover the number of entries and update stats. */
-		local_set(&head_page->entries, ret);
-		if (ret)
-			local_inc(&cpu_buffer->pages_touched);
-		entries += ret;
-		entry_bytes += rb_page_size(head_page);
+		/*
+		 * Skip if the page is invalid, or its timestamp is newer than the
+		 * previous valid page.
+		 */
+		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, ts);
+		if (ret < 0) {
+			if (!discarded)
+				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+					cpu_buffer->cpu);
+			discarded++;
+		} else {
+			entries += ret;
+			entry_bytes += rb_page_size(head_page);
+			if (ret > 0)
+				local_inc(&cpu_buffer->pages_touched);
+			ts = head_page->page->time_stamp;
+		}
 	}
 	if (i)
 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -2027,6 +2060,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		/* Nothing more to do, the only page is the reader page */
 		goto done;
 	}
+	ts = head_page->page->time_stamp;
 
 	/* Iterate until finding the commit page */
 	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
@@ -2035,15 +2069,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == orig_reader)
 			continue;
 
-		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
+		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta, ts, 0);
 		if (ret < 0) {
 			if (!discarded)
 				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
 					cpu_buffer->cpu);
 			discarded++;
-			/* Instead of discard whole ring buffer, discard only this sub-buffer. */
-			local_set(&head_page->entries, 0);
-			local_set(&head_page->page->commit, 0);
 		} else {
 			/* If the buffer has content, update pages_touched */
 			if (ret)
@@ -2051,7 +2082,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 			entries += ret;
 			entry_bytes += rb_page_size(head_page);
-			local_set(&head_page->entries, ret);
+			ts = head_page->page->time_stamp;
 		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
@@ -2079,12 +2110,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 	/* Reset the reader page */
 	local_set(&cpu_buffer->reader_page->entries, 0);
-	local_set(&cpu_buffer->reader_page->page->commit, 0);
+	rb_init_page(cpu_buffer->reader_page->page);
 
 	/* Reset all the subbuffers */
 	for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
 		local_set(&head_page->entries, 0);
-		local_set(&head_page->page->commit, 0);
+		rb_init_page(head_page->page);
 	}
 }
 
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 8/9] ring-buffer: Show persistent buffer dropped events in trace file
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

When the persistent ring buffer is validated on boot up, if a subbuffer is
deemed invalid, it resets the buffer and continues. Currently, these lost
events are not shown in the trace file output.

Have the trace iterator look for subbuffers that have the RB_MISSED_EVENTS
set and set the iter->missed_events flag when it is detected. This will
then have the trace file shows "LOST EVENTS" when it reads across a
subbuffer that was corrupted and invalidated.

For example:

 <...>-1016    [005] ...1.  6230.660403: preempt_disable: caller=__mod_memcg_state+0x1c8/0x200 parent=__mod_memcg_state+0x1c8/0x200
CPU:5 [LOST EVENTS]
 <...>-1016    [005] .....  6230.660673: kmem_cache_alloc: call_site=__anon_vma_prepare+0x1ad/0x1e0 ptr=000000006e40294c name=anon_vma bytes_req=200 bytes_alloc=208 gfp_flags=GFP_KERNEL node=-1 accounted=true

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index db72969aefa9..988915f035c7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3525,6 +3525,9 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 	else
 		rb_inc_page(&iter->head_page);
 
+	if (rb_page_commit(iter->head_page) & RB_MISSED_EVENTS)
+		iter->missed_events = -1;
+
 	iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
 	iter->head = 0;
 	iter->next_event = 0;
@@ -7061,7 +7064,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct buffer_data_page *dpage;
 	struct buffer_page *reader;
-	unsigned long missed_events;
+	long missed_events;
 	unsigned int commit;
 	unsigned int read;
 	u64 save_timestamp;
@@ -7187,6 +7190,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		local_set(&reader->entries, 0);
 		reader->read = 0;
 		data_page->data = dpage;
+		if (!missed_events && rb_data_page_commit(dpage) & RB_MISSED_EVENTS)
+			missed_events = -1;
 
 		/*
 		 * Use the real_end for the data size,
@@ -7204,10 +7209,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	 * Set a flag in the commit field if we lost events
 	 */
 	if (missed_events) {
-		/* If there is room at the end of the page to save the
+		/*
+		 * If there is room at the end of the page to save the
 		 * missed events, then record it there.
 		 */
-		if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
+		if (missed_events > 0 &&
+		    buffer->subbuf_size - commit >= sizeof(missed_events)) {
 			memcpy(&dpage->data[commit], &missed_events,
 			       sizeof(missed_events));
 			local_add(RB_MISSED_STORED, &dpage->commit);
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 7/9] ring-buffer: Have dropped subbuffers be persistent across reboots
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

When the persistent ring buffer detects a corrupted subbuffer, it will
zero its size and report dropped pages in the dmesg, then it continues
normally.

But if a reboot happens without clearing or restarting tracing on the
persistent ring buffer, the next boot will show no pages are dropped.

If the persistent ring buffer is still the same, then it should still
report dropped pages so the user knows that the buffer has missing events.

Add the RB_MISSED_EVENTS flag to the commit value of the subbuffer so that
the next boot will still show that pages were dropped.

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fd4a8dc5484e..db72969aefa9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1915,7 +1915,7 @@ static int __rb_validate_buffer(struct buffer_page *bpage, int cpu,
 	 * Even after clearing these bits, a commit value greater than the
 	 * subbuf_size is considered invalid.
 	 */
-	tail = rb_data_page_size(dpage);
+	tail = rb_data_page_commit(dpage);
 	if (tail <= meta->subbuf_size - BUF_PAGE_HDR_SIZE)
 		ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
 	else
@@ -1929,7 +1929,7 @@ static int __rb_validate_buffer(struct buffer_page *bpage, int cpu,
 	 */
 	if (ret < 0 || (prev_ts && prev_ts > ts) || (next_ts && ts > next_ts)) {
 		local_set(&bpage->entries, 0);
-		local_set(&dpage->commit, 0);
+		local_set(&dpage->commit, RB_MISSED_EVENTS);
 		dpage->time_stamp = prev_ts ? prev_ts : next_ts;
 		ret = -1;
 	} else {
@@ -3451,7 +3451,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 * is a mb(), which will synchronize with the rmb here.
 	 * (see rb_tail_page_update() and __rb_reserve_next())
 	 */
-	commit = rb_page_commit(iter_head_page);
+	commit = rb_page_size(iter_head_page);
 	smp_rmb();
 
 	/* An event needs to be at least 8 bytes in size */
@@ -3480,7 +3480,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 
 	/* Make sure the page didn't change since we read this */
 	if (iter->page_stamp != iter_head_page->page->time_stamp ||
-	    commit > rb_page_commit(iter_head_page))
+	    commit > rb_page_size(iter_head_page))
 		goto reset;
 
 	iter->next_event = iter->head + length;
@@ -5651,7 +5651,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 	 * (see rb_tail_page_update())
 	 */
 	smp_rmb();
-	commit = rb_page_commit(commit_page);
+	commit = rb_page_size(commit_page);
 	/* We want to make sure that the commit page doesn't change */
 	smp_rmb();
 
@@ -5844,6 +5844,7 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 */
 	local_set(&cpu_buffer->reader_page->write, 0);
 	local_set(&cpu_buffer->reader_page->entries, 0);
+	rb_init_data_page(cpu_buffer->reader_page->page);
 	cpu_buffer->reader_page->real_end = 0;
 
  spin:
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 5/9] ring-buffer: Cleanup persistent ring buffer validation
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

Cleanup rb_meta_validate_events() function to make it easier to read.
This includes the following cleanups:
 - Introduce rb_validatation_state to hold working variables in
   validation.
 - Move repleated validation state updates into rb_validate_buffer().
 - Move reader_page injection code outside of rb_meta_validate_events().

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 200 ++++++++++++++++++++-----------------
 1 file changed, 108 insertions(+), 92 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 88e613e78632..73f453ba12ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1883,8 +1883,16 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
 	return events;
 }
 
-static int rb_validate_buffer(struct buffer_page *bpage, int cpu,
-			      struct ring_buffer_cpu_meta *meta, u64 prev_ts, u64 next_ts)
+struct rb_validation_state {
+	unsigned long entries;
+	unsigned long entry_bytes;
+	int discarded;
+	u64 ts;
+};
+
+static int __rb_validate_buffer(struct buffer_page *bpage, int cpu,
+				struct ring_buffer_cpu_meta *meta,
+				u64 prev_ts, u64 next_ts)
 {
 	struct buffer_data_page *dpage = bpage->page;
 	unsigned long long ts;
@@ -1922,17 +1930,95 @@ static int rb_validate_buffer(struct buffer_page *bpage, int cpu,
 	return ret;
 }
 
+/**
+ * rb_validate_buffer - validates a single buffer page and updates the state.
+ * @bpage: buffer page to validate
+ * @cpu_buffer: cpu_buffer this page belongs to
+ * @meta: meta of the cpu_buffer
+ * @state: validation state
+ * @prev_ts: previous buffer's timestamp (optional)
+ * @next_ts: next buffer's timestamp (optional)
+ *
+ * If the page is invalid (wrong event length or timestamp), it increments the
+ * discarded counter and warns it. Otherwise, it updates the validation state.
+ */
+static void rb_validate_buffer(struct buffer_page *bpage,
+			       struct ring_buffer_per_cpu *cpu_buffer,
+			       struct ring_buffer_cpu_meta *meta,
+			       struct rb_validation_state *state,
+			       u64 prev_ts, u64 next_ts)
+{
+	int ret;
+
+	ret = __rb_validate_buffer(bpage, cpu_buffer->cpu, meta, prev_ts, next_ts);
+	if (ret < 0) {
+		if (!state->discarded)
+			pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+				cpu_buffer->cpu);
+		state->discarded++;
+	} else {
+		/* If the buffer has content, update pages_touched */
+		if (ret)
+			local_inc(&cpu_buffer->pages_touched);
+
+		state->entries += ret;
+		state->entry_bytes += rb_page_size(bpage);
+		state->ts = bpage->page->time_stamp;
+	}
+}
+
+static void rb_meta_inject_reader_page(struct ring_buffer_per_cpu *cpu_buffer,
+				       struct ring_buffer_cpu_meta *meta,
+				       struct buffer_page *orig_head,
+				       struct buffer_page *head_page)
+{
+	struct buffer_page *bpage = orig_head;
+	int i;
+
+	rb_dec_page(&bpage);
+	/*
+	 * Insert the reader_page before the original head page.
+	 * Since the list encode RB_PAGE flags, general list
+	 * operations should be avoided.
+	 */
+	cpu_buffer->reader_page->list.next = &orig_head->list;
+	cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+	orig_head->list.prev = &cpu_buffer->reader_page->list;
+	bpage->list.next = &cpu_buffer->reader_page->list;
+
+	/* Make the head_page the reader page */
+	cpu_buffer->reader_page = head_page;
+	bpage = head_page;
+	rb_inc_page(&head_page);
+	head_page->list.prev = bpage->list.prev;
+	rb_dec_page(&bpage);
+	bpage->list.next = &head_page->list;
+	rb_set_list_to_head(&bpage->list);
+	cpu_buffer->pages = &head_page->list;
+
+	cpu_buffer->head_page = head_page;
+	meta->head_buffer = (unsigned long)head_page->page;
+
+	/* Reset all the indexes */
+	bpage = cpu_buffer->reader_page;
+	meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+	bpage->id = 0;
+
+	for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+	     i++, rb_inc_page(&bpage)) {
+		meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+		bpage->id = i;
+	}
+}
+
 /* If the meta data has been validated, now validate the events */
 static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
 	struct buffer_page *head_page, *orig_head, *orig_reader;
-	unsigned long entry_bytes = 0;
-	unsigned long entries = 0;
-	int discarded = 0;
+	struct rb_validation_state state = { 0 };
 	bool skip = false;
 	int ret;
-	u64 ts;
 	int i;
 
 	if (!meta || !meta->head_buffer)
@@ -1942,28 +2028,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	orig_reader = cpu_buffer->reader_page;
 
 	/* Do the head page first */
-	ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, 0);
+	ret = __rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, 0);
 	if (ret < 0) {
 		pr_info("Ring buffer meta [%d] invalid head page detected\n",
 			cpu_buffer->cpu);
 		/* Don't bother rewinding */
 		skip = true;
-		ts = 0;
+		state.ts = 0;
 	} else {
-		ts = head_page->page->time_stamp;
+		state.ts = head_page->page->time_stamp;
 	}
 
 	/* Do the reader page - reader must be previous to head. */
-	ret = rb_validate_buffer(orig_reader, cpu_buffer->cpu, meta, 0, ts);
-	if (ret < 0) {
-		pr_info("Ring buffer meta [%d] invalid reader page detected\n",
-			cpu_buffer->cpu);
-		discarded++;
-	} else {
-		entries += ret;
-		entry_bytes += rb_page_size(orig_reader);
-		ts = orig_reader->page->time_stamp;
-	}
+	rb_validate_buffer(orig_reader, cpu_buffer, meta, &state, 0, state.ts);
 
 	if (skip)
 		goto skip_rewind;
@@ -1990,19 +2067,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		 * Skip if the page is invalid, or its timestamp is newer than the
 		 * previous valid page.
 		 */
-		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta, 0, ts);
-		if (ret < 0) {
-			if (!discarded)
-				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
-					cpu_buffer->cpu);
-			discarded++;
-		} else {
-			entries += ret;
-			entry_bytes += rb_page_size(head_page);
-			if (ret > 0)
-				local_inc(&cpu_buffer->pages_touched);
-			ts = head_page->page->time_stamp;
-		}
+		rb_validate_buffer(head_page, cpu_buffer, meta, &state, 0, state.ts);
 	}
 	if (i)
 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -2016,43 +2081,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	 * into the location just before the original head page.
 	 */
 	if (head_page != orig_head) {
-		struct buffer_page *bpage = orig_head;
-
-		rb_dec_page(&bpage);
-		/*
-		 * Insert the reader_page before the original head page.
-		 * Since the list encode RB_PAGE flags, general list
-		 * operations should be avoided.
-		 */
-		cpu_buffer->reader_page->list.next = &orig_head->list;
-		cpu_buffer->reader_page->list.prev = orig_head->list.prev;
-		orig_head->list.prev = &cpu_buffer->reader_page->list;
-		bpage->list.next = &cpu_buffer->reader_page->list;
-
-		/* Make the head_page the reader page */
-		cpu_buffer->reader_page = head_page;
-		bpage = head_page;
-		rb_inc_page(&head_page);
-		head_page->list.prev = bpage->list.prev;
-		rb_dec_page(&bpage);
-		bpage->list.next = &head_page->list;
-		rb_set_list_to_head(&bpage->list);
-		cpu_buffer->pages = &head_page->list;
-
-		cpu_buffer->head_page = head_page;
-		meta->head_buffer = (unsigned long)head_page->page;
-
-		/* Reset all the indexes */
-		bpage = cpu_buffer->reader_page;
-		meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
-		bpage->id = 0;
-
-		for (i = 1, bpage = head_page; i < meta->nr_subbufs;
-		     i++, rb_inc_page(&bpage)) {
-			meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
-			bpage->id = i;
-		}
-
+		rb_meta_inject_reader_page(cpu_buffer, meta, orig_head, head_page);
 		/* We'll restart verifying from orig_head */
 		head_page = orig_head;
 	}
@@ -2064,7 +2093,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		/* Nothing more to do, the only page is the reader page */
 		goto done;
 	}
-	ts = head_page->page->time_stamp;
+	state.ts = head_page->page->time_stamp;
 
 	/* Iterate until finding the commit page */
 	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
@@ -2073,21 +2102,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		if (head_page == orig_reader)
 			continue;
 
-		ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta, ts, 0);
-		if (ret < 0) {
-			if (!discarded)
-				pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
-					cpu_buffer->cpu);
-			discarded++;
-		} else {
-			/* If the buffer has content, update pages_touched */
-			if (ret)
-				local_inc(&cpu_buffer->pages_touched);
+		rb_validate_buffer(head_page, cpu_buffer, meta, &state, state.ts, 0);
 
-			entries += ret;
-			entry_bytes += rb_page_size(head_page);
-			ts = head_page->page->time_stamp;
-		}
 		if (head_page == cpu_buffer->commit_page)
 			break;
 	}
@@ -2098,25 +2114,25 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 		goto invalid;
 	}
  done:
-	local_set(&cpu_buffer->entries, entries);
-	local_set(&cpu_buffer->entries_bytes, entry_bytes);
+	local_set(&cpu_buffer->entries, state.entries);
+	local_set(&cpu_buffer->entries_bytes, state.entry_bytes);
 
 	pr_info("Ring buffer meta [%d] is from previous boot!", cpu_buffer->cpu);
-	if (discarded)
-		pr_cont(" (%d pages discarded)", discarded);
+	if (state.discarded)
+		pr_cont(" (%d pages discarded)", state.discarded);
 	pr_cont("\n");
 
 #ifdef CONFIG_RING_BUFFER_PERSISTENT_INJECT
 	if (meta->nr_invalid)
 		pr_warn("Ring buffer testing [%d] invalid pages: %s (%d/%d)\n",
 			cpu_buffer->cpu,
-			(discarded == meta->nr_invalid) ? "PASSED" : "FAILED",
-			discarded, meta->nr_invalid);
+			(state.discarded == meta->nr_invalid) ? "PASSED" : "FAILED",
+			state.discarded, meta->nr_invalid);
 	if (meta->entry_bytes)
 		pr_warn("Ring buffer testing [%d] entry_bytes: %s (%ld/%ld)\n",
 			cpu_buffer->cpu,
-			(entry_bytes == meta->entry_bytes) ? "PASSED" : "FAILED",
-			(long)entry_bytes, (long)meta->entry_bytes);
+			(state.entry_bytes == meta->entry_bytes) ? "PASSED" : "FAILED",
+			(long)state.entry_bytes, (long)meta->entry_bytes);
 	meta->nr_invalid = 0;
 	meta->entry_bytes = 0;
 #endif
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 6/9] ring-buffer: Cleanup buffer_data_page related code
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

Code cleanup related to buffer_data_page for readability,
which includes:
- Introduce rb_data_page_commit() and rb_data_page_size()
- Use 'dpage' for buffer_data_page, instead of 'bpage' because
  'bpage' is used for buffer_page.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 112 ++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 52 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 73f453ba12ce..fd4a8dc5484e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -364,21 +364,30 @@ struct buffer_page {
 #define RB_WRITE_MASK		0xfffff
 #define RB_WRITE_INTCNT		(1 << 20)
 
-static void rb_init_page(struct buffer_data_page *bpage)
+static void rb_init_data_page(struct buffer_data_page *bpage)
 {
 	local_set(&bpage->commit, 0);
 	bpage->time_stamp = 0;
 }
 
+static __always_inline long rb_data_page_commit(struct buffer_data_page *dpage)
+{
+	return local_read(&dpage->commit);
+}
+
+static __always_inline long rb_data_page_size(struct buffer_data_page *dpage)
+{
+	return rb_data_page_commit(dpage) & ~RB_MISSED_MASK;
+}
+
 static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 {
-	return local_read(&bpage->page->commit);
+	return rb_data_page_commit(bpage->page);
 }
 
-/* Size is determined by what has been committed */
 static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
 {
-	return rb_page_commit(bpage) & ~RB_MISSED_MASK;
+	return rb_data_page_size(bpage->page);
 }
 
 static void free_buffer_page(struct buffer_page *bpage)
@@ -419,7 +428,7 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
 		return NULL;
 
 	dpage = page_address(page);
-	rb_init_page(dpage);
+	rb_init_data_page(dpage);
 
 	return dpage;
 }
@@ -659,7 +668,7 @@ static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
 	do {
 		if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
 			done = true;
-		commit = local_read(&page->page->commit);
+		commit = rb_page_commit(page);
 		write = local_read(&page->write);
 		if (addr >= (unsigned long)&page->page->data[commit] &&
 		    addr < (unsigned long)&page->page->data[write])
@@ -1906,7 +1915,7 @@ static int __rb_validate_buffer(struct buffer_page *bpage, int cpu,
 	 * Even after clearing these bits, a commit value greater than the
 	 * subbuf_size is considered invalid.
 	 */
-	tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
+	tail = rb_data_page_size(dpage);
 	if (tail <= meta->subbuf_size - BUF_PAGE_HDR_SIZE)
 		ret = rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
 	else
@@ -2145,12 +2154,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 
 	/* Reset the reader page */
 	local_set(&cpu_buffer->reader_page->entries, 0);
-	rb_init_page(cpu_buffer->reader_page->page);
+	rb_init_data_page(cpu_buffer->reader_page->page);
 
 	/* Reset all the subbuffers */
 	for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
 		local_set(&head_page->entries, 0);
-		rb_init_page(head_page->page);
+		rb_init_data_page(head_page->page);
 	}
 }
 
@@ -2210,7 +2219,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc
 		 */
 		for (i = 0; i < meta->nr_subbufs; i++) {
 			meta->buffers[i] = i;
-			rb_init_page(subbuf);
+			rb_init_data_page(subbuf);
 			subbuf += meta->subbuf_size;
 		}
 	}
@@ -2262,7 +2271,7 @@ static int rbm_show(struct seq_file *m, void *v)
 	val -= 2;
 	dpage = rb_range_buffer(cpu_buffer, val);
 	seq_printf(m, "buffer[%ld]:    %d (commit: %ld)\n",
-		   val, meta->buffers[val], dpage ? local_read(&dpage->commit) : -1);
+		   val, meta->buffers[val], dpage ? rb_data_page_commit(dpage) : -1);
 
 	return 0;
 }
@@ -2653,7 +2662,7 @@ static void rb_test_inject_invalid_pages(struct trace_buffer *buffer)
 
 		dpage = (void *)(ptr + idx * subbuf_size);
 		/* Skip unused pages */
-		if (!local_read(&dpage->commit))
+		if (!rb_data_page_commit(dpage))
 			continue;
 
 		/*
@@ -2665,7 +2674,7 @@ static void rb_test_inject_invalid_pages(struct trace_buffer *buffer)
 			invalid++;
 		} else {
 			/* Count total commit bytes. */
-			entry_bytes += local_read(&dpage->commit) & ~RB_MISSED_MASK;
+			entry_bytes += rb_data_page_size(dpage);
 		}
 	}
 
@@ -4194,8 +4203,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		RB_WARN_ON(cpu_buffer,
-			   local_read(&cpu_buffer->commit_page->page->commit) &
-			   ~RB_WRITE_MASK);
+			   rb_page_commit(cpu_buffer->commit_page) & ~RB_WRITE_MASK);
 		barrier();
 	}
 
@@ -4567,7 +4575,7 @@ static const char *show_interrupt_level(void)
 	return show_irq_str(level);
 }
 
-static void dump_buffer_page(struct buffer_data_page *bpage,
+static void dump_buffer_page(struct buffer_data_page *dpage,
 			     struct rb_event_info *info,
 			     unsigned long tail)
 {
@@ -4575,12 +4583,12 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
 	u64 ts, delta;
 	int e;
 
-	ts = bpage->time_stamp;
+	ts = dpage->time_stamp;
 	pr_warn("  [%lld] PAGE TIME STAMP\n", ts);
 
 	for (e = 0; e < tail; e += rb_event_length(event)) {
 
-		event = (struct ring_buffer_event *)(bpage->data + e);
+		event = (struct ring_buffer_event *)(dpage->data + e);
 
 		switch (event->type_len) {
 
@@ -4630,7 +4638,7 @@ static atomic_t ts_dump;
 		}							\
 		atomic_inc(&cpu_buffer->record_disabled);		\
 		pr_warn(fmt, ##__VA_ARGS__);				\
-		dump_buffer_page(bpage, info, tail);			\
+		dump_buffer_page(dpage, info, tail);			\
 		atomic_dec(&ts_dump);					\
 		/* There's some cases in boot up that this can happen */ \
 		if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING))	\
@@ -4646,16 +4654,16 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 			 struct rb_event_info *info,
 			 unsigned long tail)
 {
-	struct buffer_data_page *bpage;
+	struct buffer_data_page *dpage;
 	u64 ts, delta;
 	bool full = false;
 	int ret;
 
-	bpage = info->tail_page->page;
+	dpage = info->tail_page->page;
 
 	if (tail == CHECK_FULL_PAGE) {
 		full = true;
-		tail = local_read(&bpage->commit);
+		tail = rb_data_page_commit(dpage);
 	} else if (info->add_timestamp &
 		   (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
 		/* Ignore events with absolute time stamps */
@@ -4666,7 +4674,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 	 * Do not check the first event (skip possible extends too).
 	 * Also do not check if previous events have not been committed.
 	 */
-	if (tail <= 8 || tail > local_read(&bpage->commit))
+	if (tail <= 8 || tail > rb_data_page_commit(dpage))
 		return;
 
 	/*
@@ -4675,7 +4683,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
 
-	ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+	ret = rb_read_data_buffer(dpage, tail, cpu_buffer->cpu, &ts, &delta);
 	if (ret < 0) {
 		if (delta < ts) {
 			buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
@@ -6466,7 +6474,7 @@ static void rb_clear_buffer_page(struct buffer_page *page)
 {
 	local_set(&page->write, 0);
 	local_set(&page->entries, 0);
-	rb_init_page(page->page);
+	rb_init_data_page(page->page);
 	page->read = 0;
 }
 
@@ -6951,7 +6959,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 	local_irq_restore(flags);
 
 	if (bpage->data) {
-		rb_init_page(bpage->data);
+		rb_init_data_page(bpage->data);
 	} else {
 		bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order);
 		if (!bpage->data) {
@@ -6976,8 +6984,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
 				struct buffer_data_read_page *data_page)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct buffer_data_page *bpage = data_page->data;
-	struct page *page = virt_to_page(bpage);
+	struct buffer_data_page *dpage = data_page->data;
+	struct page *page = virt_to_page(dpage);
 	unsigned long flags;
 
 	if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
@@ -6997,15 +7005,15 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
 	arch_spin_lock(&cpu_buffer->lock);
 
 	if (!cpu_buffer->free_page) {
-		cpu_buffer->free_page = bpage;
-		bpage = NULL;
+		cpu_buffer->free_page = dpage;
+		dpage = NULL;
 	}
 
 	arch_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
  out:
-	free_pages((unsigned long)bpage, data_page->order);
+	free_pages((unsigned long)dpage, data_page->order);
 	kfree(data_page);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -7050,7 +7058,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
-	struct buffer_data_page *bpage;
+	struct buffer_data_page *dpage;
 	struct buffer_page *reader;
 	unsigned long missed_events;
 	unsigned int commit;
@@ -7076,8 +7084,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	if (data_page->order != buffer->subbuf_order)
 		return -1;
 
-	bpage = data_page->data;
-	if (!bpage)
+	dpage = data_page->data;
+	if (!dpage)
 		return -1;
 
 	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
@@ -7143,7 +7151,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 			 * We have already ensured there's enough space if this
 			 * is a time extend. */
 			size = rb_event_length(event);
-			memcpy(bpage->data + pos, rpage->data + rpos, size);
+			memcpy(dpage->data + pos, rpage->data + rpos, size);
 
 			len -= size;
 
@@ -7159,9 +7167,9 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 			size = rb_event_ts_length(event);
 		} while (len >= size);
 
-		/* update bpage */
-		local_set(&bpage->commit, pos);
-		bpage->time_stamp = save_timestamp;
+		/* update dpage */
+		local_set(&dpage->commit, pos);
+		dpage->time_stamp = save_timestamp;
 
 		/* we copied everything to the beginning */
 		read = 0;
@@ -7171,13 +7179,13 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		cpu_buffer->read_bytes += rb_page_size(reader);
 
 		/* swap the pages */
-		rb_init_page(bpage);
-		bpage = reader->page;
+		rb_init_data_page(dpage);
+		dpage = reader->page;
 		reader->page = data_page->data;
 		local_set(&reader->write, 0);
 		local_set(&reader->entries, 0);
 		reader->read = 0;
-		data_page->data = bpage;
+		data_page->data = dpage;
 
 		/*
 		 * Use the real_end for the data size,
@@ -7185,12 +7193,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		 * on the page.
 		 */
 		if (reader->real_end)
-			local_set(&bpage->commit, reader->real_end);
+			local_set(&dpage->commit, reader->real_end);
 	}
 
 	cpu_buffer->lost_events = 0;
 
-	commit = local_read(&bpage->commit);
+	commit = rb_data_page_commit(dpage);
 	/*
 	 * Set a flag in the commit field if we lost events
 	 */
@@ -7199,19 +7207,19 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		 * missed events, then record it there.
 		 */
 		if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
-			memcpy(&bpage->data[commit], &missed_events,
+			memcpy(&dpage->data[commit], &missed_events,
 			       sizeof(missed_events));
-			local_add(RB_MISSED_STORED, &bpage->commit);
+			local_add(RB_MISSED_STORED, &dpage->commit);
 			commit += sizeof(missed_events);
 		}
-		local_add(RB_MISSED_EVENTS, &bpage->commit);
+		local_add(RB_MISSED_EVENTS, &dpage->commit);
 	}
 
 	/*
 	 * This page may be off to user land. Zero it out here.
 	 */
 	if (commit < buffer->subbuf_size)
-		memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
+		memset(&dpage->data[commit], 0, buffer->subbuf_size - commit);
 
 	return read;
 }
@@ -7842,7 +7850,7 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 
 	if (missed_events) {
 		if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
-			struct buffer_data_page *bpage = reader->page;
+			struct buffer_data_page *dpage = reader->page;
 			unsigned int commit;
 			/*
 			 * Use the real_end for the data size,
@@ -7850,18 +7858,18 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 			 * on the page.
 			 */
 			if (reader->real_end)
-				local_set(&bpage->commit, reader->real_end);
+				local_set(&dpage->commit, reader->real_end);
 			/*
 			 * If there is room at the end of the page to save the
 			 * missed events, then record it there.
 			 */
 			commit = rb_page_size(reader);
 			if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
-				memcpy(&bpage->data[commit], &missed_events,
+				memcpy(&dpage->data[commit], &missed_events,
 				       sizeof(missed_events));
-				local_add(RB_MISSED_STORED, &bpage->commit);
+				local_add(RB_MISSED_STORED, &dpage->commit);
 			}
-			local_add(RB_MISSED_EVENTS, &bpage->commit);
+			local_add(RB_MISSED_EVENTS, &dpage->commit);
 		} else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
 				      "Reader on commit with %ld missed events",
 				      missed_events)) {
-- 
2.53.0



^ permalink raw reply related

* [PATCH v21 9/9] ring-buffer: Show persistent buffer dropped events in trace_pipe file
From: Steven Rostedt @ 2026-05-22 17:09 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Ian Rogers
In-Reply-To: <20260522170857.263969486@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

When the persistent ring buffer is validated on boot up, if a subbuffer is
deemed invalid, it resets the buffer and continues. Have the code preserve
the RB_MISSED_EVENTS flag in the commit portion of the subbuffer header
and pass that back so that the trace_pipe file can show the missed events
like the trace file does.

For example:

   <...>-1242    [005] d....  4429.120116: page_fault_user: address=0x7ffaebb6e728 ip=0x7ffaeb9d4960 error_code=0x7
   <...>-1242    [005] .....  4429.120124: mm_page_alloc: page=00000000055254f3 pfn=0x1373bd order=0 migratetype=1 gfp_flags=GFP_HIGHUSER_MOVABLE|__GFP_COMP
   <...>-1242    [005] d..2.  4429.120132: tlb_flush: pages:1 reason:local MM shootdown (3)
CPU:5 [LOST EVENTS]
   <...>-1242    [005] d....  4429.120661: page_fault_user: address=0x55ba7c2d0944 ip=0x55ba7c20cd02 error_code=0x7
   <...>-1242    [005] .....  4429.120669: mm_page_alloc: page=0000000005a02500 pfn=0x12b6e4 order=0 migratetype=1 gfp_flags=GFP_HIGHUSER_MOVABLE|__GFP_COMP
   <...>-1242    [005] d..2.  4429.120680: tlb_flush: pages:1 reason:local MM shootdown (3)

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v20: https://patch.msgid.link/20260520185018.470465795@kernel.org

- Removed left over printk() (Masami Hiramatsu)

 kernel/trace/ring_buffer.c | 56 +++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 988915f035c7..910f6b3adf74 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5801,6 +5801,7 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	struct buffer_page *reader = NULL;
 	unsigned long overwrite;
 	unsigned long flags;
+	int missed_events = 0;
 	int nr_loops = 0;
 	bool ret;
 
@@ -5901,6 +5902,9 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	if (!ret)
 		goto spin;
 
+	if (rb_page_commit(reader) & RB_MISSED_EVENTS)
+		missed_events = -1;
+
 	if (cpu_buffer->ring_meta)
 		rb_update_meta_reader(cpu_buffer, reader);
 
@@ -5965,6 +5969,8 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 */
 	smp_rmb();
 
+	if (!cpu_buffer->lost_events)
+		cpu_buffer->lost_events = missed_events;
 
 	return reader;
 }
@@ -7066,6 +7072,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	struct buffer_page *reader;
 	long missed_events;
 	unsigned int commit;
+	unsigned int size;
 	unsigned int read;
 	u64 save_timestamp;
 	bool force_memcpy;
@@ -7101,7 +7108,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	event = rb_reader_event(cpu_buffer);
 
 	read = reader->read;
-	commit = rb_page_size(reader);
+	commit = rb_page_commit(reader);
+	size = rb_page_size(reader);
 
 	/* Check if any events were dropped */
 	missed_events = cpu_buffer->lost_events;
@@ -7115,13 +7123,14 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	 * we must copy the data from the page to the buffer.
 	 * Otherwise, we can simply swap the page with the one passed in.
 	 */
-	if (read || (len < (commit - read)) ||
+	if (read || (len < (size - read)) ||
 	    cpu_buffer->reader_page == cpu_buffer->commit_page ||
 	    force_memcpy) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
 		unsigned int rpos = read;
 		unsigned int pos = 0;
-		unsigned int size;
+		unsigned int event_size;
+		unsigned int flags = 0;
 
 		/*
 		 * If a full page is expected, this can still be returned
@@ -7130,19 +7139,22 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		 * the reader page.
 		 */
 		if (full &&
-		    (!read || (len < (commit - read)) ||
+		    (!read || (len < (size - read)) ||
 		     cpu_buffer->reader_page == cpu_buffer->commit_page))
 			return -1;
 
-		if (len > (commit - read))
-			len = (commit - read);
+		if (len > (size - read))
+			len = (size - read);
 
 		/* Always keep the time extend and data together */
-		size = rb_event_ts_length(event);
+		event_size = rb_event_ts_length(event);
 
-		if (len < size)
+		if (len < event_size)
 			return -1;
 
+		if (commit & RB_MISSED_EVENTS)
+			flags = RB_MISSED_EVENTS;
+
 		/* save the current timestamp, since the user will need it */
 		save_timestamp = cpu_buffer->read_stamp;
 
@@ -7154,25 +7166,25 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 			 * one or two events.
 			 * We have already ensured there's enough space if this
 			 * is a time extend. */
-			size = rb_event_length(event);
-			memcpy(dpage->data + pos, rpage->data + rpos, size);
+			event_size = rb_event_length(event);
+			memcpy(dpage->data + pos, rpage->data + rpos, event_size);
 
-			len -= size;
+			len -= event_size;
 
 			rb_advance_reader(cpu_buffer);
 			rpos = reader->read;
-			pos += size;
+			pos += event_size;
 
-			if (rpos >= commit)
+			if (rpos >= event_size)
 				break;
 
 			event = rb_reader_event(cpu_buffer);
 			/* Always keep the time extend and data together */
-			size = rb_event_ts_length(event);
-		} while (len >= size);
+			event_size = rb_event_ts_length(event);
+		} while (len >= event_size);
 
 		/* update dpage */
-		local_set(&dpage->commit, pos);
+		local_set(&dpage->commit, pos | flags);
 		dpage->time_stamp = save_timestamp;
 
 		/* we copied everything to the beginning */
@@ -7204,7 +7216,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 
 	cpu_buffer->lost_events = 0;
 
-	commit = rb_data_page_commit(dpage);
+	size = rb_data_page_size(dpage);
 	/*
 	 * Set a flag in the commit field if we lost events
 	 */
@@ -7214,11 +7226,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		 * missed events, then record it there.
 		 */
 		if (missed_events > 0 &&
-		    buffer->subbuf_size - commit >= sizeof(missed_events)) {
-			memcpy(&dpage->data[commit], &missed_events,
+		    buffer->subbuf_size - size >= sizeof(missed_events)) {
+			memcpy(&dpage->data[size], &missed_events,
 			       sizeof(missed_events));
 			local_add(RB_MISSED_STORED, &dpage->commit);
-			commit += sizeof(missed_events);
+			size += sizeof(missed_events);
 		}
 		local_add(RB_MISSED_EVENTS, &dpage->commit);
 	}
@@ -7226,8 +7238,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	/*
 	 * This page may be off to user land. Zero it out here.
 	 */
-	if (commit < buffer->subbuf_size)
-		memset(&dpage->data[commit], 0, buffer->subbuf_size - commit);
+	if (size < buffer->subbuf_size)
+		memset(&dpage->data[size], 0, buffer->subbuf_size - size);
 
 	return read;
 }
-- 
2.53.0



^ permalink raw reply related

* Re: [PATCH mm-hotfixes-unstable v18 00/14] khugepaged: add mTHP collapse support
From: Lorenzo Stoakes @ 2026-05-22 17:12 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, akpm, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <CAA1CXcCoyGzEyeFPW+zKiA2AOj=0Lm7R=odLtVru+dQa0P_2cQ@mail.gmail.com>

On Fri, May 22, 2026 at 10:31:41AM -0600, Nico Pache wrote:
> On Fri, May 22, 2026 at 10:20 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
> > There's some kind of confusion here.
> >
> > This series isn't suited for 7.2.
> >
> > Sorry but Zi's series, unless it depends on functionality here, will have
> > to be rebased.
> >
> > People have been at conferences, people have been on leave, I've had to
> > pace myself for health reasons and it seems there's been more than simply
> > review comment-based changes happening here.
> >
> > (Again I strongly encourage, at this stage, to ONLY be making changes based
> > on review, not adding ANYTHING else or changing ANYTHING else to avoid
> > delays :)
>
> All the changes are based on review points. Very small changes in this
> version; the largest being the one that you specifically argeed too.

16->17

 Documentation/admin-guide/mm/transhuge.rst |  24 +++++-------------
 include/linux/khugepaged.h                 |   7 ++---
 include/trace/events/huge_memory.h         |   3 ++-
 mm/huge_memory.c                           |   2 +-
 mm/khugepaged.c                            | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------
 mm/vma.c                                   |   6 ++---
 tools/testing/vma/include/stubs.h          |   3 ++-
 7 files changed, 103 insertions(+), 110 deletions(-)

17->18

 Documentation/admin-guide/mm/transhuge.rst |   5 +++--
 include/trace/events/huge_memory.h         |   3 +--
 mm/khugepaged.c                            | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------
 3 files changed, 66 insertions(+), 63 deletions(-)

These are not small 'very small changes'.

We're nearly at rc-5, and this is a major, invasive, dangerous change that
we have to get right.

You've also made changes unrelated to review, repeatedly, throughout this
process, which as I've told you, is causing delays.

You've also throughout the review of this series done stuff like make MAJOR
changes to things and _kept review tags_.

You're forcing us to use git range-diff etc. to forensically check that the
series is what is claimed.

Dude I mean you switched to using // comment style which is not used in mm
anywhere for instance? Don't do things like that and complain about
delays. Honestly.

Also, again, LSF happened. Other confeerences happened. Bandwidth is
reduced.

So again, I'm sorry, but you've been hit with some bad luck here.

I really wanted this in for 7.2, and I feel bad that we couldn't make it,
but you're also doing thing that's making it difficult for us.

I've spent double-digits hours on your series, and I've also had work
pushed out becasue of that leading me to work evenings and weekends as a
result.

And I'm not even going to get any credit for it :))

So while I sypmathise, really, please have empathy and realise it goes both
ways, please.

I'm not being mean for the sake of it, I'm pushing back because I feel this
is not at a stage where I'd feel confident in this being merged at this
time.

And it's very much a regret, as I _really_ wanted us to have it in this
time. But life and circumstances and the issues mentioned above have
intervened, sadly.

>
> >
> > Also - shouldn't mm-unstable already have mm-hotfixes-unstable in it?
> >
> > I think in mm-next we will have an stable branch, that everything is
> > based on, where things go once review is complete and things are mergeable.
> >
> > And a separate hotfixes branch based on Linus's tree.
> >
> > That would avoid issues like this :)
>
> Im sorry im new to this, but I really dont think this tiny error, and
> something that I'd confirmed with Andrew beforehand deserves NAKing
> and defering it. Ive worked through my PTO to clean up some of these
> review nits just to get it in 7.2. I even through this through my
> rounds of testing today before resending.

The issue wasn't the error (though it wasn't tiny...!), it's the state of
review. There was fresh review comments from a few days ago, and there's
big diffs between revisions.

You've also made unrelated changes as you have done throughout the series.

As I said above, I'm sorry that you spent time in your PTO on this, but we
cannot rush this in when things are not clearly ready yet, and I am not
confident in this being ready at this stage.

>
> >
> > >
> > > The intent wasn't that this is a hotfix, just that this was the
> > > closest base before the v17 that is already in the tree.
> >
> > The convention is that [PATCH ... <branch>] indicates the target of the
> > changes. Putting the hotfixes branch there implies it's a hotfix.
>
> Sorry I thought the <branch> was what base you used.

I mean, sure there's clearly confusion here as you sent [PATCH 7.2 v16 ...]
(against an unreleased kernel version) then a branch specifier then the
hotfixes one...

Anyway sure, it's fine, I've made vastly more dumb mistakes than that
myself, nobody minds, but it's concerning as by convention [PATCH
... <mm->hotfixes<whatever>] generally is taken to mean 'please rush this
to hotfixes!' :)

So be careful with that please!

>
> >
> > So please be careful with that in future :)
>
> Yes will do for sure.

Thanks!

>
> >
> > >
> > > Sorry for the confusion, hopefully Andrew can still apply it to the
> > > correct tree.
> >
> > I'm not even sure what's best for that at this stage given we have
> > conflicts and this has to be delayed until 7.3.
> >
> > I wonder if given that we should not have this in mm-unstable at all and
> > just wait it out until the next cycle begins? Review can happen
> > concurrently.
>
> I still dont see why this has to be deferred, I was working with
> Andrew to prevent merge headaches.

I've explained the why above, and David and I co-maintain THP so I feel
that ultimately given the blood, sweat and tears we've put into THP review
we ought to have some input on this :)

Thanks, Lorenzo

^ permalink raw reply

* Re: [PATCHv3 04/12] uprobes/x86: Move optimized uprobe from nop5 to nop10
From: Andrii Nakryiko @ 2026-05-22 18:50 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <20260521124411.31133-5-jolsa@kernel.org>

On Thu, May 21, 2026 at 5:44 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Andrii reported an issue with optimized uprobes [1] that can clobber
> redzone area with call instruction storing return address on stack
> where user code may keep temporary data without adjusting rsp.
>
> Fixing this by moving the optimized uprobes on top of 10-bytes nop
> instruction, so we can squeeze another instruction to escape the
> redzone area before doing the call, like:
>
>   lea -0x80(%rsp), %rsp
>   call tramp
>
> Note the lea instruction is used to adjust the rsp register without
> changing the flags.
>
> We use nop10 and following transofrmation to optimized instructions
> above and back as suggested by Peterz [2].
>
> Optimize path (int3_update_optimize):
>
>   1) Initial state after set_swbp() installed the uprobe:
>       cc 2e 0f 1f 84 00 00 00 00 00
>
>      From offset 0 this is INT3 followed by the tail of the original
>      10-byte NOP.
>
>   2) Trap the call slot before rewriting the NOP tail:
>       cc 2e 0f 1f 84 [cc] 00 00 00 00
>
>      From offset 0 this traps on the uprobe INT3.  A thread reaching
>      offset 5 traps on the temporary INT3 instead of seeing a partially
>      patched call.
>
>   3) Rewrite the LEA tail and call displacement, keeping both INT3 bytes:
>       cc [8d 64 24 80] cc [d0 d1 d2 d3]
>
>      From offset 0 and offset 5 this still traps.  The bytes between
>      them are not executable entry points while both traps are in place.
>
>   4) Restore the call opcode at offset 5:
>       cc 8d 64 24 80 [e8] d0 d1 d2 d3
>
>      From offset 0 this still traps.  From offset 5 the instruction is
>      the final CALL to the uprobe trampoline.
>

I'm sorry if I'm slow, but I don't understand why we need that second
cc at offset 5? Isn't original nop10 processed by CPU as single
instruction? So it will either be at ip of nop10, or at ip+10, no? If
we trap at ip and in int3 handler +10 from there while we are
installing lea+call, why do we need cc on byte 5?

I.e., I don't understand how CPU can end up being at ip+5 until we
finalize lea+call sequence? Can it?


>   5) Publish the first LEA byte:
>       [48] 8d 64 24 80 e8 d0 d1 d2 d3
>
>      From offset 0 this is:
>         lea -0x80(%rsp), %rsp
>         call <uprobe-trampoline>
>
> Unoptimize path (int3_update_unoptimize):
>
>   1) Initial optimized state:
>       48 8d 64 24 80 e8 d0 d1 d2 d3
>      Same as 5) above.
>
>   2) Trap new entries before restoring the NOP bytes:
>       [cc] 8d 64 24 80 e8 d0 d1 d2 d3
>
>      From offset 0 this traps. A thread that had already executed the
>      LEA can still reach the intact CALL at offset 5.
>
>   3) Restore bytes 1..4 of the original NOP while keeping byte 0 trapped
>      and byte 5 as CALL.
>       cc [2e 0f 1f 84] e8 d0 d1 d2 d3
>
>      From offset 0 this still traps. Offset 5 is still the CALL for any
>      thread that was already past the first LEA byte.
>
>   4) Publish the first byte of the original NOP:
>       [66] 2e 0f 1f 84 e8 d0 d1 d2 d3
>
>      From offset 0 this is the restored 10-byte NOP; the CALL opcode and
>      displacement are now only NOP operands.  Offset 5 still decodes as
>      CALL for a thread that was already there.

it's cool that we don't have to do jmp for the first byte, fancy :)

>
> Note as explained in [2] we need to use following nop10:
>        PF1   PF2   ESC   NOPL  MOD   SIB   DISP32
> NOP10: 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 -- cs nopw 0x00000000(%rax,%rax,1)
>
> which means we need to allow 0x2e prefix which maps to INAT_PFX_CS
> attribute in is_prefix_bad function.
>
> The optimized uprobe performance stays the same:
>
>         uprobe-nop     :    3.129 ± 0.013M/s
>         uprobe-push    :    3.045 ± 0.006M/s
>         uprobe-ret     :    1.095 ± 0.004M/s
>   -->   uprobe-nop10   :    7.170 ± 0.020M/s
>         uretprobe-nop  :    2.143 ± 0.021M/s
>         uretprobe-push :    2.090 ± 0.000M/s
>         uretprobe-ret  :    0.942 ± 0.000M/s
>   -->   uretprobe-nop10:    3.381 ± 0.003M/s
>         usdt-nop       :    3.245 ± 0.004M/s
>   -->   usdt-nop10     :    7.256 ± 0.023M/s
>
> [1] https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
> [2] https://lore.kernel.org/bpf/20260518104306.GU3102624@noisy.programming.kicks-ass.net/#t
> Reported-by: Andrii Nakryiko <andrii@kernel.org>
> Closes: https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
> Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
> Assisted-by: Codex:GPT-5.5
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/kernel/uprobes.c | 281 +++++++++++++++++++++++++++++---------
>  1 file changed, 217 insertions(+), 64 deletions(-)
>

[...]

^ permalink raw reply

* Re: [PATCHv3 01/12] uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
From: Andrii Nakryiko @ 2026-05-22 18:50 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <20260521124411.31133-2-jolsa@kernel.org>

On Thu, May 21, 2026 at 5:44 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> In the unregister path we use __in_uprobe_trampoline check with
> current->mm for the VMA lookup, which is wrong, because we are
> in the tracer context, not the traced process.
>
> Add mm_struct pointer argument to __in_uprobe_trampoline and
> changing related callers to pass proper mm_struct pointer.
>
> Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/kernel/uprobes.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
>

lgtm, thanks!

Acked-by: Andrii Nakryiko <andrii@kernel.org>

[...]

^ permalink raw reply

* Re: [PATCHv3 03/12] uprobes/x86: Allow to copy uprobe trampolines on fork
From: Andrii Nakryiko @ 2026-05-22 18:50 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <20260521124411.31133-4-jolsa@kernel.org>

On Thu, May 21, 2026 at 5:44 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> When we do fork or clone without CLONE_VM the new process won't
> have uprobe trampoline vma objects and at the same time it will
> have optimized code calling that trampoline and crash.
>
> Fixing this by allowing vma uprobe trampoline objects to be copied
> on fork to the new process.
>
> Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/kernel/uprobes.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 6824376e253d..11ec6b89b135 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -701,7 +701,7 @@ static struct vm_area_struct *get_uprobe_trampoline(unsigned long vaddr)
>                 return ERR_PTR(vaddr);
>
>         return _install_special_mapping(current->mm, vaddr, PAGE_SIZE,
> -                               VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
> +                               VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_IO,

so on fork we'll get sys_uprobe invocations which will go into uprobe
trampoline and syscall will just keep returning -EPROTO, is that
right?

what would happen in the similar situation for process with int3
uprobe being forked/cloned? Will it inherit int3 as well, and then
will keep hitting interrupts that would just do nothing?

is there a way to restore original memory page for clones? this
behavior (unless I'm misunderstanding) seems suboptimal
performance-wise

>                                 &tramp_mapping);
>  }

>
> --
> 2.53.0
>

^ permalink raw reply

* Re: [PATCHv3 02/12] uprobes/x86: Remove struct uprobe_trampoline object
From: Andrii Nakryiko @ 2026-05-22 18:50 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <20260521124411.31133-3-jolsa@kernel.org>

On Thu, May 21, 2026 at 5:44 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Removing struct uprobe_trampoline object and it's tracking code,
> because it's not needed. We can do same thing directly on top of
> struct vm_area_struct objects.
>
> This makes the code simpler and allows easy propagation of the
> trampoline vma object into child process in following change.
>
> Note the original code called destroy_uprobe_trampoline if the
> optimiation failed, but it only freed the struct uprobe_trampoline
> object, not the vma.
>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/kernel/uprobes.c | 102 ++++++++------------------------------
>  include/linux/uprobes.h   |   5 --
>  kernel/events/uprobes.c   |  10 ----
>  kernel/fork.c             |   1 -
>  4 files changed, 20 insertions(+), 98 deletions(-)
>

nice cleanup

Acked-by: Andrii Nakryiko <andrii@kernel.org>


[...]

^ permalink raw reply

* Re: [PATCHv3 05/12] libbpf: Change has_nop_combo to work on top of nop10
From: Andrii Nakryiko @ 2026-05-22 18:52 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, Jakub Sitnicki, bpf, linux-trace-kernel
In-Reply-To: <20260521124411.31133-6-jolsa@kernel.org>

On Thu, May 21, 2026 at 5:45 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> We now expect nop combo with 10 bytes nop instead of 5 bytes nop,
> fixing has_nop_combo to reflect that.
>
> Fixes: 41a5c7df4466 ("libbpf: Add support to detect nop,nop5 instructions combo for usdt probe")
> Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  tools/lib/bpf/usdt.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
> index e3710933fd52..484a4354e82b 100644
> --- a/tools/lib/bpf/usdt.c
> +++ b/tools/lib/bpf/usdt.c
> @@ -305,7 +305,7 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj)
>
>         /*
>          * Detect kernel support for uprobe() syscall, it's presence means we can
> -        * take advantage of faster nop5 uprobe handling.
> +        * take advantage of faster nop10 uprobe handling.
>          * Added in: 56101b69c919 ("uprobes/x86: Add uprobe syscall to speed up uprobe")

Would be nice to add commit that switches nop5 to nop10 (but until it
lands hash is not stable, so, hmmm, maybe we'll land this patch
separately? send it a bit later to bpf-next?)


but otherwise lgtm

Acked-by: Andrii Nakryiko <andrii@kernel.org>

>          */
>         man->has_uprobe_syscall = kernel_supports(obj, FEAT_UPROBE_SYSCALL);
> @@ -596,14 +596,14 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note,
>  #if defined(__x86_64__)
>  static bool has_nop_combo(int fd, long off)
>  {
> -       unsigned char nop_combo[6] = {
> -               0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 /* nop,nop5 */
> +       unsigned char nop_combo[11] = {
> +               0x90, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
>         };
> -       unsigned char buf[6];
> +       unsigned char buf[11];
>
> -       if (pread(fd, buf, 6, off) != 6)
> +       if (pread(fd, buf, 11, off) != 11)
>                 return false;
> -       return memcmp(buf, nop_combo, 6) == 0;
> +       return memcmp(buf, nop_combo, 11) == 0;
>  }
>  #else
>  static bool has_nop_combo(int fd, long off)
> @@ -814,8 +814,8 @@ static int collect_usdt_targets(struct usdt_manager *man, struct elf_fd *elf_fd,
>                 memset(target, 0, sizeof(*target));
>
>                 /*
> -                * We have uprobe syscall and usdt with nop,nop5 instructions combo,
> -                * so we can place the uprobe directly on nop5 (+1) and get this probe
> +                * We have uprobe syscall and usdt with nop,nop10 instructions combo,
> +                * so we can place the uprobe directly on nop10 (+1) and get this probe
>                  * optimized.
>                  */
>                 if (man->has_uprobe_syscall && has_nop_combo(elf_fd->fd, usdt_rel_ip)) {
> --
> 2.53.0
>

^ permalink raw reply

* Re: [PATCHv3 08/12] selftests/bpf: Change uprobe syscall tests to use nop10
From: Andrii Nakryiko @ 2026-05-22 18:57 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <20260521124411.31133-9-jolsa@kernel.org>

On Thu, May 21, 2026 at 5:45 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Optimized uprobes are now on top of 10-bytes nop instructions,
> reflect that in existing tests.
>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  .../selftests/bpf/benchs/bench_trigger.c      |  2 +-
>  .../selftests/bpf/prog_tests/uprobe_syscall.c | 28 ++++++++++---------
>  tools/testing/selftests/bpf/prog_tests/usdt.c | 25 ++++++++++-------
>  tools/testing/selftests/bpf/usdt_2.c          |  2 +-
>  4 files changed, 32 insertions(+), 25 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> index 2f22ec61667b..a60b8173cdc4 100644
> --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> @@ -398,7 +398,7 @@ static void *uprobe_producer_ret(void *input)
>  #ifdef __x86_64__
>  __nocf_check __weak void uprobe_target_nop5(void)

heh, nop5 -> nop_a_lot ;)


>  {
> -       asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00");
> +       asm volatile (".byte 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
>  }
>

[...]

> @@ -420,7 +421,8 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge
>         ASSERT_EQ(skel->bss->executed, executed, "executed");
>
>         /* .. and check the trampoline is as expected. */
> -       call = (struct __arch_relative_insn *) addr;
> +       ASSERT_OK(memcmp(addr, lea_rsp, 5), "lea_rsp");
> +       call = (struct __arch_relative_insn *)(addr + 5);
>         tramp = (void *) (call + 1) + call->raddr;
>         ASSERT_EQ(call->op, 0xe8, "call");
>         ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
> @@ -432,7 +434,7 @@ static void check_detach(void *addr, void *tramp)
>  {
>         /* [uprobes_trampoline] stays after detach */
>         ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
> -       ASSERT_OK(memcmp(addr, nop5, 5), "nop5");
> +       ASSERT_OK(memcmp(addr, jmp2B, 2), "jmp2B");

yeah, not jump anymore?

>  }
>
>  static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link,

[...]

^ permalink raw reply

* Re: [PATCH v2] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Rik van Riel @ 2026-05-22 20:39 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kernel-team, sashiko-bot, sashiko-reviews
In-Reply-To: <20260520164117.528d02fd@gandalf.local.home>

On Wed, 2026-05-20 at 16:41 -0400, Steven Rostedt wrote:
> 
> Rik,
> 
> Can you reply to Sashiko here?
> 
> The below comes from Sashiko:
> https://sashiko.dev/?list=org.kernel.vger.linux-trace-kernel#/patchset/20260513161916.04151502%40fangorn
> 
> Thanks,
> 
> -- Steve
> 
> 
> > commit f90ac36f39b79a19c015522bb1caf7ef903caf02
> > Author: Rik van Riel <riel@surriel.com>
> > perf/ftrace: Fix WARNING in __unregister_ftrace_function
> > This commit attempts to fix a WARN_ON triggered during
> > perf_event_alloc() error
> > cleanup by checking FTRACE_OPS_FL_ENABLED before unregistering the
> > ftrace
> > function. The warning occurs because the ftrace_ops registration
> > failed or was
> > torn down by perf_try_init_event() earlier, but the event->destroy
> > callback is
> > executed again during the error path.
> > > diff --git a/kernel/trace/trace_event_perf.c
> > > b/kernel/trace/trace_event_perf.c
> > > index 61e3a2620fa3c..8d64f17135e8f 100644
> > > --- a/kernel/trace/trace_event_perf.c
> > > +++ b/kernel/trace/trace_event_perf.c
> > > @@ -495,7 +495,11 @@ static int
> > > perf_ftrace_function_register(struct perf_event *event)
> > >  static int perf_ftrace_function_unregister(struct perf_event
> > > *event)
> > >  {
> > >  	struct ftrace_ops *ops = &event->ftrace_ops;
> > > -	int ret = unregister_ftrace_function(ops);
> > > +	int ret = 0;
> > > +
> > > +	if (ops->flags & FTRACE_OPS_FL_ENABLED)
> > > +		ret = unregister_ftrace_function(ops);
> > > +
> > >  	ftrace_free_filter(ops);
> 
> > Does calling ftrace_free_filter() unconditionally here mask a
> > double-teardown
> > regression while leaving the underlying double-free active?

I don't see how calling ftrace_free_filter() twice would
call issues, given that it sets the ->*_hash values to
EMPTY_HASH:

void ftrace_free_filter(struct ftrace_ops *ops)
{
        ftrace_ops_init(ops);
        if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
                return;
        free_ftrace_hash(ops->func_hash->filter_hash);
        free_ftrace_hash(ops->func_hash->notrace_hash);
        ops->func_hash->filter_hash = EMPTY_HASH;
        ops->func_hash->notrace_hash = EMPTY_HASH;
}

void free_ftrace_hash(struct ftrace_hash *hash)
{
        if (!hash || hash == EMPTY_HASH)
                return;
..


> > In perf_try_init_event(), if a PMU event_init() succeeds but a
> > subsequent
> > capability check fails, it explicitly calls event->destroy(event)
> > to roll back:
> > kernel/events/core.c:perf_try_init_event() {
> >     ...
> > 		if (ret && event->destroy)
> > 			event->destroy(event);
> >     ...
> > }

The error handling there all seems to "goto err_destroy"

err_destroy:
        if (event->destroy) {
                event->destroy(event);
                event->destroy = NULL;
        }


> > However, it does not set event->destroy to NULL.

... but it does?

I am not sure what code Sashiko is looking at,
but it does not look like the code I just pulled.

Is there a different tree I should be looking at
than upstream Linus?


-- 
All Rights Reversed.

^ permalink raw reply

* Re: [PATCH mm-hotfixes-unstable v18 00/14] khugepaged: add mTHP collapse support
From: Andrew Morton @ 2026-05-22 20:47 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>

On Fri, 22 May 2026 08:59:55 -0600 Nico Pache <npache@redhat.com> wrote:

> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.

Thanks, I've update mm.git's mm-unstable branch to this version.

It sounds like I might be dropping it soon, haven't started looking at
that yet.  But let's at least eyeball the latest version at this time.

Sashiko was able to apply this, so the base-it-on-hotfixes thing worked
well, thanks.  The AI checking made a few allegations:

	https://sashiko.dev/#/patchset/20260522150009.121603-1-npache@redhat.com

> V18 Changes:
> - Added RBs/Acks
> - [patch 02] Guard count_memcg_folio_events with is_pmd_order() to keep
>   THP_COLLAPSE_ALLOC PMD-only (Usama, Lance)
> - [patch 03] Convert C++ comments to C-style; fix "none-page" terminology
>   to "empty PTEs or PTEs mapping the shared zeropage"; drop unnecessary
>   userfaultfd comment; add const to local max_ptes_* variables; fix
>   "repect" typo (Lance, David)
> - [patch 04] collapse_max_ptes_none() now returns 0 instead of -EINVAL for
>   unsupported values; remove SCAN_INVALID_PTES_NONE; change return type
>   from int to unsigned int and propagate to all callers; add comment above
>   __collapse_huge_page_swapin explaining mTHP swap bail-out (David,
>   Lorenzo, Lance, Wei Yang, Usama)
> - [patch 05] Rewrite collapse_huge_page lock comment to David's suggested
>   wording (David)
> - [patch 11] Propagate unsigned int return type for max_ptes_none; remove
>   the now-unnecessary negative return check (consequence of patch 04);
>   Add optimization to the next_order goto that will prevent unnecessary
>   iterations if there are no lower orders enabled (Vernon); update locking
>   comment; pass VMA to mthp_collapse to improve uffd-armed detection, and
>   prevent unnecessary work. (Wei)
> - [patch 14] Update documentation to reflect fallback-to-0 behavior
> 

Below is how v18 altered mm.git.

Quite a lot of it seems to be replacement of "//"-style comments.  It's
unfortunate that this work isn't separated from the substantive
changes.  We could have done that with a few followup fixes rather than
a wholesale replacement of the series.


 Documentation/admin-guide/mm/transhuge.rst |    5 
 include/trace/events/huge_memory.h         |    3 
 mm/khugepaged.c                            |  121 +++++++++----------
 3 files changed, 66 insertions(+), 63 deletions(-)

--- a/Documentation/admin-guide/mm/transhuge.rst~b
+++ a/Documentation/admin-guide/mm/transhuge.rst
@@ -312,8 +312,9 @@ when collapsing a group of small pages i
 For PMD-sized THP collapse, this directly limits the number of empty pages
 allowed in the 2MB region.
 
-For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. Any other value
-will emit a warning and no mTHP collapse will be attempted.
+For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. At
+HPAGE_PMD_NR - 1, we collapse to the highest possible order. Any intermediate
+value will emit a warning and mTHP collapse will default to max_ptes_none=0.
 
 A higher value allows more empty pages, potentially leading to more memory
 usage but better THP performance. A lower value is more conservative and
--- a/include/trace/events/huge_memory.h~b
+++ a/include/trace/events/huge_memory.h
@@ -39,8 +39,7 @@
 	EM( SCAN_STORE_FAILED,		"store_failed")			\
 	EM( SCAN_COPY_MC,		"copy_poisoned_page")		\
 	EM( SCAN_PAGE_FILLED,		"page_filled")			\
-	EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")	\
-	EMe(SCAN_INVALID_PTES_NONE,	"invalid_ptes_none")
+	EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
 
 #undef EM
 #undef EMe
--- a/mm/khugepaged.c~b
+++ a/mm/khugepaged.c
@@ -61,7 +61,6 @@ enum scan_result {
 	SCAN_COPY_MC,
 	SCAN_PAGE_FILLED,
 	SCAN_PAGE_DIRTY_OR_WRITEBACK,
-	SCAN_INVALID_PTES_NONE,
 };
 
 #define CREATE_TRACE_POINTS
@@ -380,41 +379,43 @@ static bool pte_none_or_zero(pte_t pte)
 }
 
 /**
- * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
- * PTEs for the given collapse operation.
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs or PTEs mapping
+ * the shared zeropage for the given collapse operation.
  * @cc: The collapse control struct
  * @vma: The vma to check for userfaultfd
  * @order: The folio order being collapsed to
  *
- * Return: Maximum number of none-page or zero-page PTEs allowed for the
- * collapse operation.
+ * Return: Maximum number of empty/shared zeropage PTEs for the collapse operation
  */
-static int collapse_max_ptes_none(struct collapse_control *cc,
+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
 		struct vm_area_struct *vma, unsigned int order)
 {
 	unsigned int max_ptes_none = khugepaged_max_ptes_none;
-	// If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
+
 	if (vma && userfaultfd_armed(vma))
 		return 0;
-	// for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
+	/* for MADV_COLLAPSE, allow any empty/shared zeropage PTEs */
 	if (!cc->is_khugepaged)
 		return HPAGE_PMD_NR;
-	// for PMD collapse, respect the user defined maximum.
+	/* for PMD collapse, respect the user defined maximum */
 	if (is_pmd_order(order))
 		return max_ptes_none;
-	/* Zero/non-present collapse disabled. */
-	if (!max_ptes_none)
-		return 0;
-	// for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
-	// scale the maximum number of PTEs to the order of the collapse.
+	/*
+	 * for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
+	 * scale the maximum number of PTEs to the order of the collapse.
+	 */
 	if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
 		return (1 << order) - 1;
-
-	// We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
-	// Emit a warning and return -EINVAL.
-	pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
-		      KHUGEPAGED_MAX_PTES_LIMIT);
-	return -EINVAL;
+	if (!max_ptes_none)
+		return 0;
+	/*
+	 * For mTHP collapse of values other than 0 or KHUGEPAGED_MAX_PTES_LIMIT,
+	 * emit a warning and return 0.
+	 */
+	pr_warn_once("mTHP collapse does not support max_ptes_none values"
+		     " other than 0 or %u, defaulting to 0.\n",
+		     KHUGEPAGED_MAX_PTES_LIMIT);
+	return 0;
 }
 
 /**
@@ -429,15 +430,19 @@ static int collapse_max_ptes_none(struct
 static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
 		unsigned int order)
 {
-	// for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
-	// anonymous pages.
+	/*
+	 * For MADV_COLLAPSE, do not restrict the number of PTEs that map shared
+	 * anonymous pages.
+	 */
 	if (!cc->is_khugepaged)
 		return HPAGE_PMD_NR;
-	// for mTHP collapse do not allow collapsing anonymous memory pages that
-	// are shared between processes.
+	/*
+	 * for mTHP collapse do not allow collapsing anonymous memory pages that
+	 * are shared between processes.
+	 */
 	if (!is_pmd_order(order))
 		return 0;
-	// for PMD collapse, respect the user defined maximum.
+	/* for PMD collapse, respect the user defined maximum */
 	return khugepaged_max_ptes_shared;
 }
 
@@ -453,14 +458,16 @@ static unsigned int collapse_max_ptes_sh
 static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
 		unsigned int order)
 {
-	// for MADV_COLLAPSE, do not restrict the number PTEs entries or
-	// pagecache entries that are non-present.
+	/*
+	 * For MADV_COLLAPSE, do not restrict the number PTEs entries or
+	 * pagecache entries that are non-present.
+	 */
 	if (!cc->is_khugepaged)
 		return HPAGE_PMD_NR;
-	// for mTHP collapse do not allow any non-present PTEs or pagecache entries.
+	/* for mTHP collapse do not allow any non-present PTEs or pagecache entries */
 	if (!is_pmd_order(order))
 		return 0;
-	// for PMD collapse, respect the user defined maximum.
+	/* for PMD collapse, respect the user defined maximum */
 	return khugepaged_max_ptes_swap;
 }
 
@@ -593,9 +600,8 @@ static unsigned long collapse_allowable_
 void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  vm_flags_t vm_flags)
 {
-	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
-	    collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED) &&
-	    hugepage_enabled())
+	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_enabled()
+	    && collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED))
 		__khugepaged_enter(vma->vm_mm);
 }
 
@@ -670,6 +676,8 @@ static enum scan_result __collapse_huge_
 		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
 		unsigned int order, struct list_head *compound_pagelist)
 {
+	const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
+	const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
 	const unsigned long nr_pages = 1UL << order;
 	struct page *page = NULL;
 	struct folio *folio = NULL;
@@ -677,11 +685,6 @@ static enum scan_result __collapse_huge_
 	pte_t *_pte;
 	int none_or_zero = 0, shared = 0, referenced = 0;
 	enum scan_result result = SCAN_FAIL;
-	int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
-	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
-
-	if (max_ptes_none < 0)
-		return SCAN_INVALID_PTES_NONE;
 
 	for (_pte = pte; _pte < pte + nr_pages;
 	     _pte++, addr += PAGE_SIZE) {
@@ -1136,6 +1139,10 @@ static enum scan_result check_pmd_still_
  * Bring missing pages in from swap, to complete THP collapse.
  * Only done if khugepaged_scan_pmd believes it is worthwhile.
  *
+ * For mTHP orders the function bails on the first swap entry, because
+ * faulting pages back in during collapse could re-populate PTEs that
+ * push a later scan over the threshold for a higher-order collapse.
+ *
  * Called and returns without pte mapped or spinlocks held.
  * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
  */
@@ -1257,19 +1264,18 @@ static enum scan_result alloc_charge_fol
 		return SCAN_CGROUP_CHARGE_FAIL;
 	}
 
-	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+	if (is_pmd_order(order))
+		count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
 
 	*foliop = folio;
 	return SCAN_SUCCEED;
 }
 
 /*
- * collapse_huge_page expects the mmap_read_lock to be dropped before
- * entering this function. The function will also always return with the lock
- * dropped. The function starts by allocation a folio, which can potentially
- * take a long time if it involves sync compaction, and we do not need to hold
- * the mmap_lock during that. We must recheck the vma after taking it again in
- * write mode.
+ * collapse_huge_page expects the mmap_lock to be unlocked before entering and
+ * will always return with the lock unlocked, to avoid holding the mmap_lock
+ * while allocating a THP, as that could trigger direct reclaim/compaction.
+ * Note that the VMA must be rechecked after grabbing the mmap_lock again.
  */
 static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
 		int referenced, int unmapped, struct collapse_control *cc,
@@ -1500,12 +1506,12 @@ static unsigned int collapse_mthp_count_
  * If a collapse is permitted, we attempt to collapse the PTE range into a
  * mTHP.
  */
-static int mthp_collapse(struct mm_struct *mm, unsigned long address,
-		int referenced, int unmapped, struct collapse_control *cc,
-		unsigned long enabled_orders)
+static int mthp_collapse(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, int referenced, int unmapped,
+		struct collapse_control *cc, unsigned long enabled_orders)
 {
-	unsigned int nr_occupied_ptes, nr_ptes;
-	int max_ptes_none, collapsed = 0, stack_size = 0;
+	unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
+	int collapsed = 0, stack_size = 0;
 	unsigned long collapse_address;
 	struct mthp_range range;
 	u16 offset;
@@ -1522,10 +1528,7 @@ static int mthp_collapse(struct mm_struc
 		if (!test_bit(order, &enabled_orders))
 			goto next_order;
 
-		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
-
-		if (max_ptes_none < 0)
-			return collapsed;
+		max_ptes_none = collapse_max_ptes_none(cc, vma, order);
 
 		nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
 							       nr_ptes);
@@ -1565,7 +1568,7 @@ static int mthp_collapse(struct mm_struc
 		}
 
 next_order:
-		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
+		if ((BIT(order) - 1) & enabled_orders) {
 			const u8 next_order = order - 1;
 			const u16 mid_offset = offset + (nr_ptes / 2);
 
@@ -1582,9 +1585,9 @@ static enum scan_result collapse_scan_pm
 		struct vm_area_struct *vma, unsigned long start_addr,
 		bool *lock_dropped, struct collapse_control *cc)
 {
-	int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
 	const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
 	const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
+	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
 	enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
 	pmd_t *pmd;
 	pte_t *pte, *_pte, pteval;
@@ -1772,9 +1775,9 @@ out_unmap:
 	if (result == SCAN_SUCCEED) {
 		/* collapse_huge_page expects the lock to be dropped before calling */
 		mmap_read_unlock(mm);
-		nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
-					      cc, enabled_orders);
-		/* collapse_huge_page will return with the mmap_lock released */
+		nr_collapsed = mthp_collapse(mm, vma, start_addr, referenced,
+					     unmapped, cc, enabled_orders);
+		/* mmap_lock was released above, set lock_dropped */
 		*lock_dropped = true;
 		result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
 	}
@@ -2665,7 +2668,7 @@ static enum scan_result collapse_scan_fi
 		unsigned long addr, struct file *file, pgoff_t start,
 		struct collapse_control *cc)
 {
-	const int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+	const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
 	const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
 	struct folio *folio = NULL;
 	struct address_space *mapping = file->f_mapping;
_


^ permalink raw reply

* Re: [PATCH mm-hotfixes-unstable v18 00/14] khugepaged: add mTHP collapse support
From: David Hildenbrand (Arm) @ 2026-05-22 21:13 UTC (permalink / raw)
  To: Nico Pache, Vlastimil Babka (SUSE)
  Cc: linux-doc, akpm, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <CAA1CXcD373fFfo9YPWRj8mJ_rsnzyCrpn1uk3=k7kU=QuaLOgg@mail.gmail.com>

On 5/22/26 18:11, Nico Pache wrote:
> On Fri, May 22, 2026 at 9:13 AM Vlastimil Babka (SUSE)
> <vbabka@kernel.org> wrote:
>>
>> On 5/22/26 17:07, Nico Pache wrote:
>>>
>>> Whoops I manually changed the coverletter subject to reflect that this
>>> in on mm-hotfixes-unstable but never updated the others...
>>
>> But why? That branch is for hotfixes that would go to the current 7.1-rcX
>> series. mm-unstable would be the correct one for this, AFAICT.
> 
> Sorry this was a misunderstanding. The goal here was to base this off
> the closest base commit behind where my v17 already lies in the tree.

Ah, I guess this is a problem of "v17 is already in mm-unstable, so against what
to base v18".

Yeah, we touched on that problem in the LSF/MM process discussion ...

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v18 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
From: David Hildenbrand (Arm) @ 2026-05-22 21:16 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, Usama Arif
In-Reply-To: <20260522150009.121603-4-npache@redhat.com>


>  int hugepage_madvise(struct vm_area_struct *vma,
>  		     vm_flags_t *vm_flags, int advice)
>  {
> @@ -540,6 +598,8 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
>  		struct list_head *compound_pagelist)
>  {
> +	const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> +	const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);


Yeah, it's good that these are all const now.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCHv3 04/12] uprobes/x86: Move optimized uprobe from nop5 to nop10
From: Jiri Olsa @ 2026-05-22 21:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Masami Hiramatsu, Andrii Nakryiko,
	bpf, linux-trace-kernel
In-Reply-To: <20260521133548.GK3126523@noisy.programming.kicks-ass.net>

On Thu, May 21, 2026 at 03:35:48PM +0200, Peter Zijlstra wrote:
> On Thu, May 21, 2026 at 02:44:03PM +0200, Jiri Olsa wrote:
> > Andrii reported an issue with optimized uprobes [1] that can clobber
> > redzone area with call instruction storing return address on stack
> > where user code may keep temporary data without adjusting rsp.
> > 
> > Fixing this by moving the optimized uprobes on top of 10-bytes nop
> > instruction, so we can squeeze another instruction to escape the
> > redzone area before doing the call, like:
> > 
> >   lea -0x80(%rsp), %rsp
> >   call tramp
> > 
> > Note the lea instruction is used to adjust the rsp register without
> > changing the flags.
> > 
> > We use nop10 and following transofrmation to optimized instructions
> > above and back as suggested by Peterz [2].
> > 
> > Optimize path (int3_update_optimize):
> > 
> >   1) Initial state after set_swbp() installed the uprobe:
> >       cc 2e 0f 1f 84 00 00 00 00 00
> > 
> >      From offset 0 this is INT3 followed by the tail of the original
> >      10-byte NOP.
> > 
> >   2) Trap the call slot before rewriting the NOP tail:
> >       cc 2e 0f 1f 84 [cc] 00 00 00 00
> > 
> >      From offset 0 this traps on the uprobe INT3.  A thread reaching
> >      offset 5 traps on the temporary INT3 instead of seeing a partially
> >      patched call.
> > 
> >   3) Rewrite the LEA tail and call displacement, keeping both INT3 bytes:
> >       cc [8d 64 24 80] cc [d0 d1 d2 d3]
> > 
> >      From offset 0 and offset 5 this still traps.  The bytes between
> >      them are not executable entry points while both traps are in place.
> > 
> >   4) Restore the call opcode at offset 5:
> >       cc 8d 64 24 80 [e8] d0 d1 d2 d3
> > 
> >      From offset 0 this still traps.  From offset 5 the instruction is
> >      the final CALL to the uprobe trampoline.
> > 
> >   5) Publish the first LEA byte:
> >       [48] 8d 64 24 80 e8 d0 d1 d2 d3
> > 
> >      From offset 0 this is:
> >         lea -0x80(%rsp), %rsp
> >         call <uprobe-trampoline>
> > 
> > Unoptimize path (int3_update_unoptimize):
> > 
> >   1) Initial optimized state:
> >       48 8d 64 24 80 e8 d0 d1 d2 d3
> >      Same as 5) above.
> > 
> >   2) Trap new entries before restoring the NOP bytes:
> >       [cc] 8d 64 24 80 e8 d0 d1 d2 d3
> > 
> >      From offset 0 this traps. A thread that had already executed the
> >      LEA can still reach the intact CALL at offset 5.
> > 
> >   3) Restore bytes 1..4 of the original NOP while keeping byte 0 trapped
> >      and byte 5 as CALL.
> >       cc [2e 0f 1f 84] e8 d0 d1 d2 d3
> > 
> >      From offset 0 this still traps. Offset 5 is still the CALL for any
> >      thread that was already past the first LEA byte.
> > 
> >   4) Publish the first byte of the original NOP:
> >       [66] 2e 0f 1f 84 e8 d0 d1 d2 d3
> > 
> >      From offset 0 this is the restored 10-byte NOP; the CALL opcode and
> >      displacement are now only NOP operands.  Offset 5 still decodes as
> >      CALL for a thread that was already there.
> > 
> > Note as explained in [2] we need to use following nop10:
> >        PF1   PF2   ESC   NOPL  MOD   SIB   DISP32
> > NOP10: 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 -- cs nopw 0x00000000(%rax,%rax,1)
> > 
> > which means we need to allow 0x2e prefix which maps to INAT_PFX_CS
> > attribute in is_prefix_bad function.
> > 
> > The optimized uprobe performance stays the same:
> > 
> >         uprobe-nop     :    3.129 ± 0.013M/s
> >         uprobe-push    :    3.045 ± 0.006M/s
> >         uprobe-ret     :    1.095 ± 0.004M/s
> >   -->   uprobe-nop10   :    7.170 ± 0.020M/s
> >         uretprobe-nop  :    2.143 ± 0.021M/s
> >         uretprobe-push :    2.090 ± 0.000M/s
> >         uretprobe-ret  :    0.942 ± 0.000M/s
> >   -->   uretprobe-nop10:    3.381 ± 0.003M/s
> >         usdt-nop       :    3.245 ± 0.004M/s
> >   -->   usdt-nop10     :    7.256 ± 0.023M/s
> > 
> 
> > @@ -893,48 +918,134 @@ static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *
> >  }
> >  
> >  /*
> > + * Modify the optimized instruction by using INT3 breakpoints on SMP.
> >   * We completely avoid using stop_machine() here, and achieve the
> >   * synchronization using INT3 breakpoints and SMP cross-calls.
> >   * (borrowed comment from smp_text_poke_batch_finish)
> >   *
> > + * The way it is done for optimization (int3_update_optimize):
> > + *   1) Start with the uprobe INT3 trap already installed
> > + *   2) Add an INT3 trap to the call slot
> > + *   3) Update everything but the first byte and the call opcode
> > + *   4) Replace the call slot INT3 by the call opcode
> > + *   5) Replace the first INT3 by the first byte of the LEA instruction
> > + *
> > + * The way it is done for unoptimization (int3_update_unoptimize):
> > + *   1) Start with the optimized uprobe lea/call instructions
> > + *   2) Add an INT3 trap to the address that will be patched
> > + *   3) Restore the NOP bytes before the call opcode
> > + *   4) Replace the first INT3 by the first byte of the NOP instruction
> > + *
> > + * Note that unoptimization deliberately keeps the call opcode and displacement
> > + * in bytes 5..9. Those bytes become operands of the restored 10-byte NOP.
> >   */
> 
> One important thing to note is that (as earlier noted by Andrii) the
> CALL address is never changed. A new optimization pass will not change
> the CALL instruction again.
> 
> If you noted this anywhere, I failed to find it. This is crucially
> important for the correctness of the scheme and should not be emitted.
> 
> That is, please add something like:
> 
>   "Since there is only a single uprobe-trampoline, the CALL instruction
>   will not be changed across unoptimization/optimization cycles.
>   Therefore, any task that is preempted at the CALL instruction is
>   guaranteed to observe that CALL and not anything else."
> 

nope I did not mention it, will add

thanks,
jirka


^ permalink raw reply

* Re: [PATCHv3 04/12] uprobes/x86: Move optimized uprobe from nop5 to nop10
From: Jiri Olsa @ 2026-05-22 21:19 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <CAEf4BzZ+MToBKJS9Vdu=YZrX+2kRpUcmWejVhuoqhtu-ijqDAQ@mail.gmail.com>

On Fri, May 22, 2026 at 11:50:44AM -0700, Andrii Nakryiko wrote:
> On Thu, May 21, 2026 at 5:44 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > Andrii reported an issue with optimized uprobes [1] that can clobber
> > redzone area with call instruction storing return address on stack
> > where user code may keep temporary data without adjusting rsp.
> >
> > Fixing this by moving the optimized uprobes on top of 10-bytes nop
> > instruction, so we can squeeze another instruction to escape the
> > redzone area before doing the call, like:
> >
> >   lea -0x80(%rsp), %rsp
> >   call tramp
> >
> > Note the lea instruction is used to adjust the rsp register without
> > changing the flags.
> >
> > We use nop10 and following transofrmation to optimized instructions
> > above and back as suggested by Peterz [2].
> >
> > Optimize path (int3_update_optimize):
> >
> >   1) Initial state after set_swbp() installed the uprobe:
> >       cc 2e 0f 1f 84 00 00 00 00 00
> >
> >      From offset 0 this is INT3 followed by the tail of the original
> >      10-byte NOP.
> >
> >   2) Trap the call slot before rewriting the NOP tail:
> >       cc 2e 0f 1f 84 [cc] 00 00 00 00
> >
> >      From offset 0 this traps on the uprobe INT3.  A thread reaching
> >      offset 5 traps on the temporary INT3 instead of seeing a partially
> >      patched call.
> >
> >   3) Rewrite the LEA tail and call displacement, keeping both INT3 bytes:
> >       cc [8d 64 24 80] cc [d0 d1 d2 d3]
> >
> >      From offset 0 and offset 5 this still traps.  The bytes between
> >      them are not executable entry points while both traps are in place.
> >
> >   4) Restore the call opcode at offset 5:
> >       cc 8d 64 24 80 [e8] d0 d1 d2 d3
> >
> >      From offset 0 this still traps.  From offset 5 the instruction is
> >      the final CALL to the uprobe trampoline.
> >
> 
> I'm sorry if I'm slow, but I don't understand why we need that second
> cc at offset 5? Isn't original nop10 processed by CPU as single
> instruction? So it will either be at ip of nop10, or at ip+10, no? If
> we trap at ip and in int3 handler +10 from there while we are
> installing lea+call, why do we need cc on byte 5?
> 
> I.e., I don't understand how CPU can end up being at ip+5 until we
> finalize lea+call sequence? Can it?

hum, so I though it's for the case when you do unoptimize+optimize,
then you can have cpu executing the previous lea and hitting the int3
on +5 offset.. but as pointed by Peter (and you) the call instruction
never changes, so now I'm not sure why we need it

jirka

^ permalink raw reply

* Re: [PATCH mm-unstable v18 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-05-22 21:24 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260522150009.121603-5-npache@redhat.com>

On 5/22/26 16:59, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* and collapse_max_*
> functions to support future mTHP collapse.
> 
> The current mechanism for determining collapse with the
> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
> raises a key design issue: if we support user defined max_pte_none values
> (even those scaled by order), a collapse of a lower order can introduces
> an feedback loop, or "creep", when max_ptes_none is set to a value greater
> than HPAGE_PMD_NR / 2. [1]
> 
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
> 
> To fix this issue introduce a helper function that will limit mTHP
> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
> This effectively supports two modes: [2]
> 
> - max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
>   that maps the shared zeropage. Consequently, no memory bloat.
> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
>   available mTHP order.
> 
> This removes the possibility of "creep", and a warning will be emitted if
> any non-supported max_ptes_none value is configured with mTHP enabled.
> Any intermediate value will default mTHP collapse to max_ptes_none=0.
> 
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
> 
> No functional changes in this patch; however it defines future behavior
> for mTHP collapse.
> 
> [1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
> [2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
> 
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 121 +++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 88 insertions(+), 33 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 116f39518948..e98ba5b15163 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -353,30 +353,52 @@ static bool pte_none_or_zero(pte_t pte)
>   * the shared zeropage for the given collapse operation.
>   * @cc: The collapse control struct
>   * @vma: The vma to check for userfaultfd
> + * @order: The folio order being collapsed to
>   *
>   * Return: Maximum number of empty/shared zeropage PTEs for the collapse operation
>   */
>  static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> -		struct vm_area_struct *vma)
> +		struct vm_area_struct *vma, unsigned int order)
>  {
> +	unsigned int max_ptes_none = khugepaged_max_ptes_none;

Can be const, right?

> +
>  	if (vma && userfaultfd_armed(vma))
>  		return 0;
>  	/* for MADV_COLLAPSE, allow any empty/shared zeropage PTEs */
>  	if (!cc->is_khugepaged)
>  		return HPAGE_PMD_NR;
> -	/* For all other cases respect the user defined maximum */
> -	return khugepaged_max_ptes_none;
> +	/* for PMD collapse, respect the user defined maximum */
> +	if (is_pmd_order(order))
> +		return max_ptes_none;
> +	/*
> +	 * for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
> +	 * scale the maximum number of PTEs to the order of the collapse.
> +	 */
> +	if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
> +		return (1 << order) - 1;
> +	if (!max_ptes_none)
> +		return 0;
> +	/*
> +	 * For mTHP collapse of values other than 0 or KHUGEPAGED_MAX_PTES_LIMIT,
> +	 * emit a warning and return 0.
> +	 */
> +	pr_warn_once("mTHP collapse does not support max_ptes_none values"
> +		     " other than 0 or %u, defaulting to 0.\n",
> +		     KHUGEPAGED_MAX_PTES_LIMIT);
> +	return 0;

This might read slightly clearer as

/*
 * For mTHP ...
 */
if (max_ptes_none)
		pr_warn_once(...)
return 0;

IOW, have a single "return 0" label here and only special-case when to warn.

Acked-by: David Hildenbrand (arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* [PATCHv2] trace: allocate fields with elt struct
From: Rosen Penev @ 2026-05-22 21:26 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	open list:TRACING

Use a flexible array member to embed the fields array in the
tracing_map_elt allocation, reducing the number of allocations
per element.

Since the fields are now embedded in the struct, taking the address
of a field through a const-qualified elt pointer yields a
const-qualified pointer. Rather than adding casts, switch the
comparison functions to take const void * parameters. These are
all read-only operations.

Assisted-by: OpenCode:BigPickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 v2: Rebased.
 kernel/trace/tracing_map.c | 41 ++++++++++++++++----------------------
 kernel/trace/tracing_map.h |  8 ++++----
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index d7922f40dbe2..928948b50737 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -125,32 +125,32 @@ u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
 	return (u64)atomic64_read(&elt->vars[i]);
 }
 
-int tracing_map_cmp_string(void *val_a, void *val_b)
+int tracing_map_cmp_string(const void *val_a, const void *val_b)
 {
-	char *a = val_a;
-	char *b = val_b;
+	const char *a = val_a;
+	const char *b = val_b;
 
 	return strcmp(a, b);
 }
 
-int tracing_map_cmp_none(void *val_a, void *val_b)
+int tracing_map_cmp_none(const void *val_a, const void *val_b)
 {
 	return 0;
 }
 
-static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
+static int tracing_map_cmp_atomic64(const void *val_a, const void *val_b)
 {
-	u64 a = atomic64_read((atomic64_t *)val_a);
-	u64 b = atomic64_read((atomic64_t *)val_b);
+	u64 a = atomic64_read((const atomic64_t *)val_a);
+	u64 b = atomic64_read((const atomic64_t *)val_b);
 
 	return (a > b) ? 1 : ((a < b) ? -1 : 0);
 }
 
 #define DEFINE_TRACING_MAP_CMP_FN(type)					\
-static int tracing_map_cmp_##type(void *val_a, void *val_b)		\
+static int tracing_map_cmp_##type(const void *val_a, const void *val_b)	\
 {									\
-	type a = (type)(*(u64 *)val_a);					\
-	type b = (type)(*(u64 *)val_b);					\
+	type a = (type)(*(const u64 *)val_a);				\
+	type b = (type)(*(const u64 *)val_b);				\
 									\
 	return (a > b) ? 1 : ((a < b) ? -1 : 0);			\
 }
@@ -383,7 +383,6 @@ static void __tracing_map_elt_free(struct tracing_map_elt *elt)
 	if (!elt)
 		return;
 
-	kfree(elt->fields);
 	kfree(elt->vars);
 	kfree(elt->var_set);
 	kfree(elt->key);
@@ -406,7 +405,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
 	struct tracing_map_elt *elt;
 	int err = 0;
 
-	elt = kzalloc_obj(*elt);
+	elt = kzalloc_flex(*elt, fields, map->n_fields);
 	if (!elt)
 		return ERR_PTR(-ENOMEM);
 
@@ -418,12 +417,6 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
 		goto free;
 	}
 
-	elt->fields = kzalloc_objs(*elt->fields, map->n_fields);
-	if (!elt->fields) {
-		err = -ENOMEM;
-		goto free;
-	}
-
 	elt->vars = kzalloc_objs(*elt->vars, map->n_vars);
 	if (!elt->vars) {
 		err = -ENOMEM;
@@ -857,10 +850,10 @@ static int cmp_entries_sum(const void *A, const void *B)
 {
 	const struct tracing_map_elt *elt_a, *elt_b;
 	const struct tracing_map_sort_entry *a, *b;
-	struct tracing_map_sort_key *sort_key;
-	struct tracing_map_field *field;
+	const struct tracing_map_sort_key *sort_key;
+	const struct tracing_map_field *field;
 	tracing_map_cmp_fn_t cmp_fn;
-	void *val_a, *val_b;
+	const void *val_a, *val_b;
 	int ret = 0;
 
 	a = *(const struct tracing_map_sort_entry **)A;
@@ -888,10 +881,10 @@ static int cmp_entries_key(const void *A, const void *B)
 {
 	const struct tracing_map_elt *elt_a, *elt_b;
 	const struct tracing_map_sort_entry *a, *b;
-	struct tracing_map_sort_key *sort_key;
-	struct tracing_map_field *field;
+	const struct tracing_map_sort_key *sort_key;
+	const struct tracing_map_field *field;
 	tracing_map_cmp_fn_t cmp_fn;
-	void *val_a, *val_b;
+	const void *val_a, *val_b;
 	int ret = 0;
 
 	a = *(const struct tracing_map_sort_entry **)A;
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 18a02959d77b..90a7fde5dd02 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -13,7 +13,7 @@
 #define TRACING_MAP_VARS_MAX		16
 #define TRACING_MAP_SORT_KEYS_MAX	2
 
-typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
+typedef int (*tracing_map_cmp_fn_t) (const void *val_a, const void *val_b);
 
 /*
  * This is an overview of the tracing_map data structures and how they
@@ -137,11 +137,11 @@ struct tracing_map_field {
 
 struct tracing_map_elt {
 	struct tracing_map		*map;
-	struct tracing_map_field	*fields;
 	atomic64_t			*vars;
 	bool				*var_set;
 	void				*key;
 	void				*private_data;
+	struct tracing_map_field	fields[];
 };
 
 struct tracing_map_entry {
@@ -260,8 +260,8 @@ tracing_map_lookup(struct tracing_map *map, void *key);
 
 extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
 						int field_is_signed);
-extern int tracing_map_cmp_string(void *val_a, void *val_b);
-extern int tracing_map_cmp_none(void *val_a, void *val_b);
+extern int tracing_map_cmp_string(const void *val_a, const void *val_b);
+extern int tracing_map_cmp_none(const void *val_a, const void *val_b);
 
 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
 				   unsigned int i, u64 n);
-- 
2.54.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox