Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v2] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Josh Law @ 2026-03-17 18:06 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel

xbc_node_compose_key_after() passes a size_t buffer length to
snprintf(), but snprintf() returns int. Guard against size values above
INT_MAX before the loop so the existing truncation check can continue to
compare ret against (int)size safely.

Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
so the same source continues to build there.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c                            | 3 +++
 tools/bootconfig/include/linux/bootconfig.h | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 96cbe6738ffe..730209c83e62 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -313,6 +313,9 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
 	if (!node && root)
 		return -EINVAL;
 
+	if (WARN_ON_ONCE(size > INT_MAX))
+		return -EINVAL;
+
 	while (--depth >= 0) {
 		node = xbc_nodes + keys[depth];
 		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
index 6784296a0692..48383c10e036 100644
--- a/tools/bootconfig/include/linux/bootconfig.h
+++ b/tools/bootconfig/include/linux/bootconfig.h
@@ -8,6 +8,7 @@
 #include <stdbool.h>
 #include <ctype.h>
 #include <errno.h>
+#include <limits.h>
 #include <string.h>
 
 
@@ -19,6 +20,10 @@
 	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
 			__FILE__, __LINE__, __func__, #cond) : 0)
 
+#ifndef WARN_ON_ONCE
+#define WARN_ON_ONCE(cond)	WARN_ON(cond)
+#endif
+
 #define unlikely(cond)	(cond)
 
 /* Copied from lib/string.c */
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v2] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Steven Rostedt @ 2026-03-17 18:13 UTC (permalink / raw)
  To: Josh Law
  Cc: Masami Hiramatsu, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317180605.50970-1-objecting@objecting.org>

On Tue, 17 Mar 2026 18:06:05 +0000
Josh Law <objecting@objecting.org> wrote:

> xbc_node_compose_key_after() passes a size_t buffer length to
> snprintf(), but snprintf() returns int. Guard against size values above
> INT_MAX before the loop so the existing truncation check can continue to
> compare ret against (int)size safely.
> 
> Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
> so the same source continues to build there.
> 
> Signed-off-by: Josh Law <objecting@objecting.org>
> ---

BTW, you can add here:

Changes since v1: https://lore.kernel.org/all/20260317173703.46092-1-objecting@objecting.org/

- Removed typecasting ret to size_t, as it is not needed (Steven Rostedt)

>  lib/bootconfig.c                            | 3 +++
>  tools/bootconfig/include/linux/bootconfig.h | 5 +++++
>  2 files changed, 8 insertions(+)
> 
> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
> index 96cbe6738ffe..730209c83e62 100644
> --- a/lib/bootconfig.c
> +++ b/lib/bootconfig.c
> @@ -313,6 +313,9 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>  	if (!node && root)
>  		return -EINVAL;
>  

I wonder if this should have a comment here:

	/*
	 * A buffer larger than 2G is never needed for bootconfig. Warn and
	 * bail out if it is bigger, so that we do not need to worry about
	 * overflowing snprintf()'s int return value.
	 */

> +	if (WARN_ON_ONCE(size > INT_MAX))
> +		return -EINVAL;
> +

-- Steve

>  	while (--depth >= 0) {
>  		node = xbc_nodes + keys[depth];
>  		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
> diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
> index 6784296a0692..48383c10e036 100644
> --- a/tools/bootconfig/include/linux/bootconfig.h
> +++ b/tools/bootconfig/include/linux/bootconfig.h
> @@ -8,6 +8,7 @@
>  #include <stdbool.h>
>  #include <ctype.h>
>  #include <errno.h>
> +#include <limits.h>
>  #include <string.h>
>  
>  
> @@ -19,6 +20,10 @@
>  	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
>  			__FILE__, __LINE__, __func__, #cond) : 0)
>  
> +#ifndef WARN_ON_ONCE
> +#define WARN_ON_ONCE(cond)	WARN_ON(cond)
> +#endif
> +
>  #define unlikely(cond)	(cond)
>  
>  /* Copied from lib/string.c */


^ permalink raw reply

* [PATCH v3] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Josh Law @ 2026-03-17 18:15 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel

xbc_node_compose_key_after() passes a size_t buffer length to
snprintf(), but snprintf() returns int. Guard against size values above
INT_MAX before the loop so the existing truncation check can continue to
compare ret against (int)size safely.

Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
so the same source continues to build there.

Changes since v2:
 - Added a comment explaining the INT_MAX guard.

Changes since v1:
 - Removed casting ret to size_t; with the INT_MAX guard, the existing
   ret >= (int)size check is sufficient, per Steven Rostedt.
 - Link to v1:
   https://lore.kernel.org/all/20260317173703.46092-1-objecting@objecting.org/

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c                            | 8 ++++++++
 tools/bootconfig/include/linux/bootconfig.h | 5 +++++
 2 files changed, 13 insertions(+)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 96cbe6738ffe..2a54b51dec5c 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -313,6 +313,14 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
 	if (!node && root)
 		return -EINVAL;
 
+	/*
+	 * Bootconfig strings never need multi-GB buffers. Reject sizes
+	 * above INT_MAX so snprintf()'s int return value cannot overflow
+	 * the truncation check below.
+	 */
+	if (WARN_ON_ONCE(size > INT_MAX))
+		return -EINVAL;
+
 	while (--depth >= 0) {
 		node = xbc_nodes + keys[depth];
 		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
index 6784296a0692..48383c10e036 100644
--- a/tools/bootconfig/include/linux/bootconfig.h
+++ b/tools/bootconfig/include/linux/bootconfig.h
@@ -8,6 +8,7 @@
 #include <stdbool.h>
 #include <ctype.h>
 #include <errno.h>
+#include <limits.h>
 #include <string.h>
 
 
@@ -19,6 +20,10 @@
 	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
 			__FILE__, __LINE__, __func__, #cond) : 0)
 
+#ifndef WARN_ON_ONCE
+#define WARN_ON_ONCE(cond)	WARN_ON(cond)
+#endif
+
 #define unlikely(cond)	(cond)
 
 /* Copied from lib/string.c */
-- 
2.34.1


^ permalink raw reply related

* [PATCH] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-03-17 18:28 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: johannes.thumshirn, kch, bvanassche, dlemoal, ritesh.list, neelx,
	sean, mproche, linux-block, linux-kernel, linux-trace-kernel

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait static tracepoint in
the tag allocation slow-path. It triggers immediately before the
thread yields the CPU, exposing the exact hardware context (hctx)
that is starved, the total pool size, and the current active request
count.

This provides storage engineers and performance monitoring agents
with a zero-configuration, low-overhead mechanism to definitively
identify shared-tag bottlenecks and tune I/O schedulers or cgroup
throttling accordingly.

Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
 block/blk-mq-tag.c           |  3 +++
 include/trace/events/block.h | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..f50993e86ca5 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -187,6 +188,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		trace_block_rq_tag_wait(data->q, data->hctx);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..48e2ba433c87 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,42 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when an I/O request is starved of a tag
+ * @q: queue containing the request
+ * @hctx: hardware context (queue) experiencing starvation
+ *
+ * Called immediately before the submitting thread is forced to block due
+ * to the exhaustion of available hardware tags. This tracepoint indicates
+ * that the thread will be placed into an uninterruptible state via
+ * io_schedule() until an active block I/O operation completes and
+ * relinquishes its assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx),
+
+	TP_ARGS(q, hctx),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( u32,		active_requests		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		  = q->disk ? disk_devt(q->disk) : 0;
+		__entry->hctx_id	  = hctx ? hctx->queue_num : 0;
+		__entry->nr_tags	  = hctx && hctx->tags ? hctx->tags->nr_tags : 0;
+		__entry->active_requests  = hctx ? atomic_read(&hctx->nr_active) : 0;
+	),
+
+	TP_printk("%d,%d hctx=%u starved (active=%u/%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id, __entry->active_requests, __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v3 0/8] RDMA: Enable operation with DMA debug enabled
From: Leon Romanovsky @ 2026-03-17 19:05 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel, Will Deacon,
	Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260316-dma-debug-overlap-v3-0-1dde90a7f08b@nvidia.com>

On Mon, Mar 16, 2026 at 09:06:44PM +0200, Leon Romanovsky wrote:
> Add a new DMA_ATTR_REQUIRE_COHERENT attribute to the DMA API to mark
> mappings that must run on a DMA‑coherent system. Such buffers cannot
> use the SWIOTLB path, may overlap with CPU caches, and do not depend on
> explicit cache flushing.
> 
> Mappings using this attribute are rejected on systems where cache
> side‑effects could lead to data corruption, and therefore do not need
> the cache‑overlap debugging logic. This series also includes fixes for
> DMA_ATTR_CPU_CACHE_CLEAN handling.
> Thanks.

<...>

> ---
> Leon Romanovsky (8):
>       dma-debug: Allow multiple invocations of overlapping entries
>       dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
>       dma-mapping: Clarify valid conditions for CPU cache line overlap
>       dma-mapping: Introduce DMA require coherency attribute
>       dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
>       iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
>       RDMA/umem: Tell DMA mapping that UMEM requires coherency
>       mm/hmm: Indicate that HMM requires DMA coherency
> 
>  Documentation/core-api/dma-attributes.rst | 38 ++++++++++++++++++++++++-------
>  drivers/infiniband/core/umem.c            |  5 ++--
>  drivers/iommu/dma-iommu.c                 | 21 +++++++++++++----
>  drivers/virtio/virtio_ring.c              | 10 ++++----
>  include/linux/dma-mapping.h               | 15 ++++++++----
>  include/trace/events/dma.h                |  4 +++-
>  kernel/dma/debug.c                        |  9 ++++----
>  kernel/dma/direct.h                       |  7 +++---
>  kernel/dma/mapping.c                      |  6 +++++
>  mm/hmm.c                                  |  4 ++--
>  10 files changed, 86 insertions(+), 33 deletions(-)

Marek,

Despite the "RDMA ..." tag in the subject, the diffstat clearly shows that
you are the appropriate person to take this patch.

Thanks.


> ---
> base-commit: 11439c4635edd669ae435eec308f4ab8a0804808
> change-id: 20260305-dma-debug-overlap-21487c3fa02c
> 
> Best regards,
> --  
> Leon Romanovsky <leonro@nvidia.com>
> 

^ permalink raw reply

* Re: [PATCH v3] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Steven Rostedt @ 2026-03-17 20:37 UTC (permalink / raw)
  To: Josh Law
  Cc: Masami Hiramatsu, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317181556.53417-1-objecting@objecting.org>

On Tue, 17 Mar 2026 18:15:56 +0000
Josh Law <objecting@objecting.org> wrote:

> xbc_node_compose_key_after() passes a size_t buffer length to
> snprintf(), but snprintf() returns int. Guard against size values above
> INT_MAX before the loop so the existing truncation check can continue to
> compare ret against (int)size safely.
> 
> Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
> so the same source continues to build there.
> 
> Changes since v2:
>  - Added a comment explaining the INT_MAX guard.
> 
> Changes since v1:
>  - Removed casting ret to size_t; with the INT_MAX guard, the existing
>    ret >= (int)size check is sufficient, per Steven Rostedt.
>  - Link to v1:
>    https://lore.kernel.org/all/20260317173703.46092-1-objecting@objecting.org/

The changes need to be below the '---' so that they don't get pulled into
the git commit.

> 
> Signed-off-by: Josh Law <objecting@objecting.org>
> ---

  <here>

>  lib/bootconfig.c                            | 8 ++++++++
>  tools/bootconfig/include/linux/bootconfig.h | 5 +++++
>  2 files changed, 13 insertions(+)
> 
> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
> index 96cbe6738ffe..2a54b51dec5c 100644
> --- a/lib/bootconfig.c
> +++ b/lib/bootconfig.c
> @@ -313,6 +313,14 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>  	if (!node && root)
>  		return -EINVAL;
>  
> +	/*
> +	 * Bootconfig strings never need multi-GB buffers. Reject sizes
> +	 * above INT_MAX so snprintf()'s int return value cannot overflow
> +	 * the truncation check below.
> +	 */
> +	if (WARN_ON_ONCE(size > INT_MAX))
> +		return -EINVAL;
> +
>  	while (--depth >= 0) {
>  		node = xbc_nodes + keys[depth];
>  		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
> diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
> index 6784296a0692..48383c10e036 100644
> --- a/tools/bootconfig/include/linux/bootconfig.h
> +++ b/tools/bootconfig/include/linux/bootconfig.h
> @@ -8,6 +8,7 @@
>  #include <stdbool.h>
>  #include <ctype.h>
>  #include <errno.h>
> +#include <limits.h>
>  #include <string.h>
>  
>  
> @@ -19,6 +20,10 @@
>  	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
>  			__FILE__, __LINE__, __func__, #cond) : 0)
>  
> +#ifndef WARN_ON_ONCE
> +#define WARN_ON_ONCE(cond)	WARN_ON(cond)
> +#endif
> +
>  #define unlikely(cond)	(cond)
>  
>  /* Copied from lib/string.c */

Other than that.

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve

^ permalink raw reply

* Re: [PATCH v3] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Josh Law @ 2026-03-17 20:43 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317163738.3a7863dd@gandalf.local.home>



On 17 March 2026 20:37:38 GMT, Steven Rostedt <rostedt@goodmis.org> wrote:
>On Tue, 17 Mar 2026 18:15:56 +0000
>Josh Law <objecting@objecting.org> wrote:
>
>> xbc_node_compose_key_after() passes a size_t buffer length to
>> snprintf(), but snprintf() returns int. Guard against size values above
>> INT_MAX before the loop so the existing truncation check can continue to
>> compare ret against (int)size safely.
>> 
>> Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
>> so the same source continues to build there.
>> 
>> Changes since v2:
>>  - Added a comment explaining the INT_MAX guard.
>> 
>> Changes since v1:
>>  - Removed casting ret to size_t; with the INT_MAX guard, the existing
>>    ret >= (int)size check is sufficient, per Steven Rostedt.
>>  - Link to v1:
>>    https://lore.kernel.org/all/20260317173703.46092-1-objecting@objecting.org/
>
>The changes need to be below the '---' so that they don't get pulled into
>the git commit.
>
>> 
>> Signed-off-by: Josh Law <objecting@objecting.org>
>> ---
>
>  <here>
>
>>  lib/bootconfig.c                            | 8 ++++++++
>>  tools/bootconfig/include/linux/bootconfig.h | 5 +++++
>>  2 files changed, 13 insertions(+)
>> 
>> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
>> index 96cbe6738ffe..2a54b51dec5c 100644
>> --- a/lib/bootconfig.c
>> +++ b/lib/bootconfig.c
>> @@ -313,6 +313,14 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>>  	if (!node && root)
>>  		return -EINVAL;
>>  
>> +	/*
>> +	 * Bootconfig strings never need multi-GB buffers. Reject sizes
>> +	 * above INT_MAX so snprintf()'s int return value cannot overflow
>> +	 * the truncation check below.
>> +	 */
>> +	if (WARN_ON_ONCE(size > INT_MAX))
>> +		return -EINVAL;
>> +
>>  	while (--depth >= 0) {
>>  		node = xbc_nodes + keys[depth];
>>  		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
>> diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
>> index 6784296a0692..48383c10e036 100644
>> --- a/tools/bootconfig/include/linux/bootconfig.h
>> +++ b/tools/bootconfig/include/linux/bootconfig.h
>> @@ -8,6 +8,7 @@
>>  #include <stdbool.h>
>>  #include <ctype.h>
>>  #include <errno.h>
>> +#include <limits.h>
>>  #include <string.h>
>>  
>>  
>> @@ -19,6 +20,10 @@
>>  	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
>>  			__FILE__, __LINE__, __func__, #cond) : 0)
>>  
>> +#ifndef WARN_ON_ONCE
>> +#define WARN_ON_ONCE(cond)	WARN_ON(cond)
>> +#endif
>> +
>>  #define unlikely(cond)	(cond)
>>  
>>  /* Copied from lib/string.c */
>
>Other than that.
>
>Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
>
>-- Steve


To keep this convenient, I'll send a v4 that fixes just that; you can simply carry over the Reviewed-by tag. Thanks a lot.


V/R


Josh Law

^ permalink raw reply

* [PATCH v4] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Josh Law @ 2026-03-17 20:44 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel

xbc_node_compose_key_after() passes a size_t buffer length to
snprintf(), but snprintf() returns int. Guard against size values above
INT_MAX before the loop so the existing truncation check can continue to
compare ret against (int)size safely.

Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
so the same source continues to build there.

Signed-off-by: Josh Law <objecting@objecting.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v3:
 - Moved the revision history below the --- separator so it does not
   become part of the git commit.
 - Added Reviewed-by from Steven Rostedt.

Changes since v2:
 - Added a comment explaining the INT_MAX guard.

Changes since v1:
 - Removed casting ret to size_t; with the INT_MAX guard, the existing
   ret >= (int)size check is sufficient, per Steven Rostedt.
 - Link to v1:
   https://lore.kernel.org/all/20260317173703.46092-1-objecting@objecting.org/

 lib/bootconfig.c                            | 8 ++++++++
 tools/bootconfig/include/linux/bootconfig.h | 5 +++++
 2 files changed, 13 insertions(+)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 96cbe6738ffe..2a54b51dec5c 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -313,6 +313,14 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
 	if (!node && root)
 		return -EINVAL;
 
+	/*
+	 * Bootconfig strings never need multi-GB buffers. Reject sizes
+	 * above INT_MAX so snprintf()'s int return value cannot overflow
+	 * the truncation check below.
+	 */
+	if (WARN_ON_ONCE(size > INT_MAX))
+		return -EINVAL;
+
 	while (--depth >= 0) {
 		node = xbc_nodes + keys[depth];
 		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
index 6784296a0692..48383c10e036 100644
--- a/tools/bootconfig/include/linux/bootconfig.h
+++ b/tools/bootconfig/include/linux/bootconfig.h
@@ -8,6 +8,7 @@
 #include <stdbool.h>
 #include <ctype.h>
 #include <errno.h>
+#include <limits.h>
 #include <string.h>
 
 
@@ -19,6 +20,10 @@
 	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
 			__FILE__, __LINE__, __func__, #cond) : 0)
 
+#ifndef WARN_ON_ONCE
+#define WARN_ON_ONCE(cond)	WARN_ON(cond)
+#endif
+
 #define unlikely(cond)	(cond)
 
 /* Copied from lib/string.c */
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH mm-unstable v15 10/13] mm/khugepaged: Introduce mTHP collapse support
From: Lorenzo Stoakes (Oracle) @ 2026-03-17 21:36 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032605.234046-1-npache@redhat.com>

On Wed, Feb 25, 2026 at 08:26:05PM -0700, Nico Pache wrote:
> Enable khugepaged to collapse to mTHP orders. This patch implements the
> main scanning logic using a bitmap to track occupied pages and a stack
> structure that allows us to find optimal collapse sizes.
>
> Previous to this patch, PMD collapse had 3 main phases, a light weight
> scanning phase (mmap_read_lock) that determines a potential PMD
> collapse, a alloc phase (mmap unlocked), then finally heavier collapse

-> an alloc phase

> phase (mmap_write_lock).
>
> To enabled mTHP collapse we make the following changes:
>
> During PMD scan phase, track occupied pages in a bitmap. When mTHP
> orders are enabled, we remove the restriction of max_ptes_none during the
> scan phase to avoid missing potential mTHP collapse candidates. Once we
> have scanned the full PMD range and updated the bitmap to track occupied
> pages, we use the bitmap to find the optimal mTHP size.

Right, but we reinstate it later, right? :) Though it's now simpler as we have only two
modes.

>
> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> and determine the best eligible order for the collapse. A stack structure
> is used instead of traditional recursion to manage the search. The

Maybe worth saying due to limited kernel stack size.

> algorithm recursively splits the bitmap into smaller chunks to find the
> highest order mTHPs that satisfy the collapse criteria. We start by
> attempting the PMD order, then moved on the consecutively lower orders
> (mTHP collapse). The stack maintains a pair of variables (offset, order),
> indicating the number of PTEs from the start of the PMD, and the order of

Probably worth saying the stack is kept in the collapse_control now?

Having nitted all this, thanks for writing this up much appreciated :)

> the potential collapse candidate.
>
> The algorithm for consuming the bitmap works as such:
>     1) push (0, HPAGE_PMD_ORDER) onto the stack
>     2) pop the stack
>     3) check if the number of set bits in that (offset,order) pair
>        statisfy the max_ptes_none threshold for that order
>     4) if yes, attempt collapse
>     5) if no (or collapse fails), push two new stack items representing
>        the left and right halves of the current bitmap range, at the
>        next lower order
>     6) repeat at step (2) until stack is empty.
>
> Below is a diagram representing the algorithm and stack items:
>
>                            offset       mid_offset
>                             |         |
>                             |         |
>                             v         v
>           __________________^_________________ <-- I think better as -
>          |          PTE Page.Table            |
>          -------------------.------------------
> 			    <-.-----><------->
>                           ^ .order-1  order- <-- not sure
                            . .            if I accidentally
                            . .       deleted or missing :P
                            . .
			    . .. Doesn't line up with..|
			    .this......................|
                                       ^
                                       |
                                       |
That's nice, but as a connoisseur of the ASCII diagram, a few nits :)

I also wonder if the offset, mid-offset is correct to put there?

Because you start off with:

(0, HPAGE_PMD_ORDER)

       offset
          |
          |
          v
          |------------------------------------|
          |                 PTE                |
          |------------------------------------|
	  <------------------------------------>
	                1 << order

Right? Trying to get the PMD sized one, then:

(offset=0, order=HPAGE_PMD_ORDER -1)
(mid_offset=HPAGE_PMD_NR >> 1, order=HPAGE_PMD_ORDER -1)

       offset           mid_offset
          |                  |
          |                  |
          v                  v
          |------------------------------------|
          |                 PTE                |
          |------------------------------------|
	  <------------------><---------------->
	       1 << order          1 << order

And etc.

So probably worth making that clear.

>
> We currently only support mTHP collapse for max_ptes_none values of 0
> and HPAGE_PMD_NR - 1. resulting in the following behavior:
>
>     - max_ptes_none=0: Never introduce new empty pages during collapse
>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>       available mTHP order

Probably worth slightly expanding on what introducing 'new empty pages'
entails in practice here.

>
> Any other max_ptes_none value will emit a warning and skip mTHP collapse
> attempts. There should be no behavior change for PMD collapse.

Maybe worth saying we are doing this for the time being as it avoids issues
with the algorithm tending towards PMD collapse etc.?

>
> Once we determine what mTHP sizes fits best in that PMD range a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order (m)THP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will

I think it's confusing to mention this here; it's probably better to move it
above.

> satisfy the promotion condition once again. This issue is prevented via
> the collapse_max_ptes_none() function which imposes the max_ptes_none
> restrictions above.
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
>
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.

Overall a great commit message and THANK YOU so much for that excellent
description of the algorithm, much appreciated!

>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 189 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 180 insertions(+), 9 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2fdfb6d42cf9..1c3711ed4513 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -99,6 +99,32 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
>  static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +#define KHUGEPAGED_MIN_MTHP_ORDER	2
> +/*
> + * The maximum number of mTHP ranges that can be stored on the stack.
> + * This is calculated based on the number of PTE entries in a PTE page table
> + * and the minimum mTHP order.
> + *
> + * ilog2(MAX_PTRS_PER_PTE) is log2 of the maximum number of PTE entries.

I think this line is superfluous and can be removed :)

> + * This gives you the PMD_ORDER, and is needed in place of HPAGE_PMD_ORDER due
> + * to restrictions of some architectures (ie ppc64le).

Hm this is vague, why exactly?

> + *
> + * At most there will be 1 << (PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER) mTHP ranges

Maybe worth moving this around and saying 'the absolute most number of mTHP
ranges we can encounter is 1 << (PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER),
but due to some architectures restricting the number of pointers per PTE,
we have to derive the actual maximum from MAX_PTRS_PER_PTE'

And then add a paragraph on why arches do this (if it's just PPC, just
replace arches with PPC) etc.

> + */
> +#define MTHP_STACK_SIZE	(1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))

I think it'd be nice to have a separate define for ilog2(MAX_PTRS_PER_PTE),
maybe MAX_ORDER_PER_PTE?

> +
> +/*
> + * Defines a range of PTE entries in a PTE page table which are being
> + * considered for (m)THP collapse.

Probably we can drop the parens :)

> + *
> + * @offset: the offset of the first PTE entry in a PMD range.
> + * @order: the order of the PTE entries being considered for collapse.
> + */
> +struct mthp_range {
> +	u16 offset;
> +	u8 order;
> +};
> +
>  struct collapse_control {
>  	bool is_khugepaged;
>
> @@ -107,6 +133,11 @@ struct collapse_control {
>
>  	/* nodemask for allocation fallback */
>  	nodemask_t alloc_nmask;
> +
> +	/* bitmap used for mTHP collapse */

This is still super vague. Also you have 2 bitmaps here :)

Something like:

	/* Each bit set represents a present PTE entry  */
> +	DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
	/* A mask of the current range being considered for mTHP collapse */
> +	DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> +	struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];

This time no heart attack as I know this is not on the stack (despite a
misunderstanding on a series where somebody did seem to want to try to do
that recently to reintroduce a coronary event :P)

>  };
>
>  /**
> @@ -1361,17 +1392,138 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
>  	return result;
>  }
>
> +static void mthp_stack_push(struct collapse_control *cc, int *stack_size,
> +				   u16 offset, u8 order)
> +{
> +	const int size = *stack_size;
> +	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
> +
> +	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
> +	stack->order = order;
> +	stack->offset = offset;
> +	(*stack_size)++;
> +}
> +
> +static struct mthp_range mthp_stack_pop(struct collapse_control *cc, int *stack_size)
> +{
> +	const int size = *stack_size;
> +
> +	VM_WARN_ON_ONCE(size <= 0);
> +	(*stack_size)--;
> +	return cc->mthp_bitmap_stack[size - 1];
> +}


I know I love helper structs, but I wonder if we shouldn't have a broader
stack object like:

struct mthp_stack_state {
	struct mthp_range *arr; // assigned to cc->mthp_bitmap_stack
	int size;
};

And just make these functions like:

static void mthp_stack_push(struct mthp_stack_state *stack)
{
	struct mthp_range *range = &stack->arr[stack->size++];

	VM_WARN_ON_ONCE(stack->size >= MTHP_STACK_SIZE);
	range->order = order;
	range->offset = offset;
}

static struct mthp_range mthp_stack_pop(struct mthp_stack_state *stack)
{
	struct mthp_range *range = &stack->arr[--stack->size];

	VM_WARN_ON_ONCE(stack->size <= 0);
	return *range;
}

I also fold some other cleanups into that, I think e.g. (*stack_size)--,
followed by using size (which we copied first) - 1 indexed into the array
is overwrought.

> +
> +static unsigned int mthp_nr_occupied_pte_entries(struct collapse_control *cc,

This name is a bit overwrought. pte_count_present()? I think we maybe don't
even need a mthp_ prefix given the params should give context?

> +						 u16 offset, unsigned long nr_pte_entries)

This line is super long, can we just put the 2nd line 2 tabs indented?

> +{
> +	bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);

Why are we zeroing HPAGE_PMD_NR bits, but mthp_bitmap_mask has
MAX_PTRS_PER_PTE entries?

I thought the comment above was saying how MAX_PTRS_PER_PTE can be <
HPAGE_PMD_NR right? So couldn't this be problematic on ppc?

We should be zeroing the same number of bits as is defined for the bitmap.

> +	bitmap_set(cc->mthp_bitmap_mask, offset, nr_pte_entries);
> +	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> +}

We could pass in a stack_state pointer if we add pointers to
cc->mthp_bitmap_mask and cc->mthp_bitmap to the struct:

static unsigned int pte_count_present_top(struct stack_state *stack)
{
	struct mthp_range *range = stack->arr[stack->size - 1];

	VM_WARN_ON_ONCE(!stack->size);

	/* set up mask for offset, range */
	bitmap_zero(stack->mask, MAX_PTRS_PER_PTE);
	bitmap_set(stack->mask, arr->offset, 1U << arr->order);

	/* Hamming weight of mask & bitmap = count of PTE entries in range. */
	return bitmap_weight_and(stack->bitmap, stack->mask, MAX_PTRS_PER_PTE);
}

Which could also simplify the calling code a lot, which then could pop
afterwards, leaving this to read the top of the stack.

Then again, you use order, nr_pte_entries elsewhere so could be:

static unsigned int pte_count_present(struct stack_state *stack,
		struct mthp_range *range) { ... }

Instead?

> +
> +/*
> + * mthp_collapse() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.

Probably worth dropping the (!none/zero) bit for clarity.

> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> + * of the bitmap for collapse eligibility. The stack maintains a pair of
> + * variables (offset, order), indicating the number of PTEs from the start of
> + * the PMD, and the order of the potential collapse candidate respectively. We
> + * start at the PMD order and check if it is eligible for collapse; if not, we
> + * add two entries to the stack at a lower order to represent the left and right
> + * halves of the PTE page table we are examining.
> + *
> + *                         offset       mid_offset
> + *                         |         |
> + *                         |         |
> + *                         v         v
> + *      --------------------------------------
> + *      |          cc->mthp_bitmap            |
> + *      --------------------------------------
> + *                         <-------><------->
> + *                          order-1  order-1
> + *
> + * For each of these, we determine how many PTE entries are occupied in the
> + * range of PTE entries we propose to collapse, then we compare this to a
> + * threshold number of PTE entries which would need to be occupied for a
> + * collapse to be permitted at that order (accounting for max_ptes_none).
> +
> + * If a collapse is permitted, we attempt to collapse the PTE range into a
> + * mTHP.

So this is pretty much the same as the commit message and obviously my
comments are the same here as for there :)

But again thanks for doing this!

> + */
> +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		bool *mmap_locked, unsigned long enabled_orders)
> +{
> +	unsigned int max_ptes_none, nr_occupied_ptes;
> +	struct mthp_range range;
> +	unsigned long collapse_address;
> +	int collapsed = 0, stack_size = 0;
> +	unsigned long nr_pte_entries;
> +	u16 offset;
> +	u8 order;

I hate to now be the one saying it since somebody got me OCD about it
before, and it doesn't really matter and is a nit, but like reverse xmas
tree would be nice :P

> +
> +	mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
> +
> +	while (stack_size > 0) {

I think we can be kernel-y and make this:

while (stack_size)

As going < 0 would be a bug anyway right?

> +		range = mthp_stack_pop(cc, &stack_size);
> +		order = range.order;
> +		offset = range.offset;
> +		nr_pte_entries = 1UL << order;

See above idea for using stack state type and just reading off top of stack
in calculation.

> +
> +		if (!test_bit(order, &enabled_orders))
> +			goto next_order;
> +
> +		if (cc->is_khugepaged)
> +			max_ptes_none = collapse_max_ptes_none(order);
> +		else
> +			max_ptes_none = COLLAPSE_MAX_PTES_LIMIT;

Hm, should we even be executing this loop at all for MADV_COLLAPSE? Could
we just separate that out as its own thing that just does a PMD-sized entry
and simplify this?

But then hmm you use order, nr_pte_entries elsewhere so maybe could just
pop range and pass that in with stack_state ptr?

> +
> +		if (max_ptes_none == -EINVAL)

Shouldn't we rather do something like IS_ERR_VALUE(max_ptes_none)?

> +			return collapsed;

> +
> +		nr_occupied_ptes = mthp_nr_occupied_pte_entries(cc, offset, nr_pte_entries);
> +
> +		if (nr_occupied_ptes >= nr_pte_entries - max_ptes_none) {

Be nicer to have nr_pte_entries - max_ptes_none pre-calculated as a value,
e.g. min_occupied_ptes?

> +			int ret;
> +
> +			collapse_address = address + offset * PAGE_SIZE;
> +			ret = collapse_huge_page(mm, collapse_address, referenced,
> +						 unmapped, cc, mmap_locked,
> +						 order);

My kingdom for a helper struct here :)

> +			if (ret == SCAN_SUCCEED) {
> +				collapsed += nr_pte_entries;
> +				continue;
> +			}

I guess we don't care about which flavour of scan failure happens here?

> +		}
> +
> +next_order:
> +		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
> +			const u8 next_order = order - 1;
> +			const u16 mid_offset = offset + (nr_pte_entries / 2);
> +
> +			mthp_stack_push(cc, &stack_size, mid_offset, next_order);
> +			mthp_stack_push(cc, &stack_size, offset, next_order);

All this could be a helper function, like:

static void push_next_order_range(struct stack_state *stack, u8 order,
		u16 offset)
{
	const u8 next_order = order - 1;

	mthp_stack_push(stack, offset, next_order);
	offset += (1 << next_order);
	mthp_stack_push(stack, offset, next_order);
}

> +		}
> +	}
> +	return collapsed;
> +}

Overall MUCH more understandable thanks for that!

> +
>  static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
>  		unsigned int *cur_progress, struct collapse_control *cc)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, *_pte;
> -	int none_or_zero = 0, shared = 0, referenced = 0;
> +	int i;
> +	int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
>  	struct page *page = NULL;
> +	unsigned int max_ptes_none;
>  	struct folio *folio = NULL;
>  	unsigned long addr;
> +	unsigned long enabled_orders;

Kinda hate how much state we're putting here throughout function scope, but
can address that in follow up cleanups I guess.

>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;

>
> @@ -1384,8 +1536,21 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>
> +	bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);

Again, shouldn't this be MAX_PTRS_PER_PTE?

Be nicer to separate into a helper function esp if you have a stack_state
object, to initialise it separately rather than having a random open-coded
bitmap_zero() here.

>  	memset(cc->node_load, 0, sizeof(cc->node_load));
>  	nodes_clear(cc->alloc_nmask);
> +
> +	enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);

Too long line :) please keep to max 80, with some small exceptions going
over by like 1 or 2 chars.

> +
> +	/*
> +	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> +	 * scan all pages to populate the bitmap for mTHP collapse.
> +	 */
> +	if (cc->is_khugepaged && enabled_orders == BIT(HPAGE_PMD_ORDER))

Isn't BIT(HPAGE_PMD_ORDER) the same as HPAGE_PMD_NR?

> +		max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER);
> +	else
> +		max_ptes_none = COLLAPSE_MAX_PTES_LIMIT;
> +

Could we separate this function into a helper struct please, rather than
piling on more open coded stuff?

>  	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>  	if (!pte) {
>  		if (cur_progress)
> @@ -1394,17 +1559,18 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>
> -	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> -	     _pte++, addr += PAGE_SIZE) {
> +	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +		_pte = pte + i;

(Still hate this underscore b.s. but it's legacy stuff we should deal with
separately)

> +		addr = start_addr + i * PAGE_SIZE;
> +		pte_t pteval = ptep_get(_pte);

Err you're declaring a type under 2 just-assignments? That's not kernel
style :)

Should be:

<type decls>
newline
<everything else>

> +
>  		if (cur_progress)
>  			*cur_progress += 1;
>
> -		pte_t pteval = ptep_get(_pte);
>  		if (pte_none_or_zero(pteval)) {
>  			++none_or_zero;
>  			if (!userfaultfd_armed(vma) &&
> -			    (!cc->is_khugepaged ||

Why are we dropping this?

> -			     none_or_zero <= khugepaged_max_ptes_none)) {
> +			    none_or_zero <= max_ptes_none) {
>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> @@ -1478,6 +1644,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  			}
>  		}
>
> +		/* Set bit for occupied pages */
> +		bitmap_set(cc->mthp_bitmap, i, 1);

Again, let's use a helper for this kind of thing rather than open
coding. The stack_state object will help.

>  		/*
>  		 * Record which node the original page is from and save this
>  		 * information to cc->node_load[].
> @@ -1534,9 +1702,12 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  out_unmap:
>  	pte_unmap_unlock(pte, ptl);
>  	if (result == SCAN_SUCCEED) {
> -		result = collapse_huge_page(mm, start_addr, referenced,
> -					    unmapped, cc, mmap_locked,
> -					    HPAGE_PMD_ORDER);
> +		nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
> +					      cc, mmap_locked, enabled_orders);
> +		if (nr_collapsed > 0)

if (nr_collapsed) is more kernelly :)

> +			result = SCAN_SUCCEED;
> +		else
> +			result = SCAN_FAIL;

I mean maybe we can just use a ?: assignment here like:

result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;

?

>  	}
>  out:
>  	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> --
> 2.53.0
>

Overall we're much much closer now to having this series in, sorry for
delays etc. in review but now I've (finally) got to review everything and
David has had a look too on both series we are honing in on completion!

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH v4] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Masami Hiramatsu @ 2026-03-17 23:03 UTC (permalink / raw)
  To: Josh Law; +Cc: Andrew Morton, Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317204403.72375-1-objecting@objecting.org>

On Tue, 17 Mar 2026 20:44:03 +0000
Josh Law <objecting@objecting.org> wrote:

> xbc_node_compose_key_after() passes a size_t buffer length to
> snprintf(), but snprintf() returns int. Guard against size values above
> INT_MAX before the loop so the existing truncation check can continue to
> compare ret against (int)size safely.
> 
> Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
> so the same source continues to build there.

NACK.

Don't do such over engineering effort.

Thanks,

> 
> Signed-off-by: Josh Law <objecting@objecting.org>
> Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Changes since v3:
>  - Moved the revision history below the --- separator so it does not
>    become part of the git commit.
>  - Added Reviewed-by from Steven Rostedt.
> 
> Changes since v2:
>  - Added a comment explaining the INT_MAX guard.
> 
> Changes since v1:
>  - Removed casting ret to size_t; with the INT_MAX guard, the existing
>    ret >= (int)size check is sufficient, per Steven Rostedt.
>  - Link to v1:
>    https://lore.kernel.org/all/20260317173703.46092-1-objecting@objecting.org/
> 
>  lib/bootconfig.c                            | 8 ++++++++
>  tools/bootconfig/include/linux/bootconfig.h | 5 +++++
>  2 files changed, 13 insertions(+)
> 
> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
> index 96cbe6738ffe..2a54b51dec5c 100644
> --- a/lib/bootconfig.c
> +++ b/lib/bootconfig.c
> @@ -313,6 +313,14 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>  	if (!node && root)
>  		return -EINVAL;
>  
> +	/*
> +	 * Bootconfig strings never need multi-GB buffers. Reject sizes
> +	 * above INT_MAX so snprintf()'s int return value cannot overflow
> +	 * the truncation check below.
> +	 */
> +	if (WARN_ON_ONCE(size > INT_MAX))
> +		return -EINVAL;
> +
>  	while (--depth >= 0) {
>  		node = xbc_nodes + keys[depth];
>  		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
> diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
> index 6784296a0692..48383c10e036 100644
> --- a/tools/bootconfig/include/linux/bootconfig.h
> +++ b/tools/bootconfig/include/linux/bootconfig.h
> @@ -8,6 +8,7 @@
>  #include <stdbool.h>
>  #include <ctype.h>
>  #include <errno.h>
> +#include <limits.h>
>  #include <string.h>
>  
>  
> @@ -19,6 +20,10 @@
>  	((cond) ? printf("Internal warning(%s:%d, %s): %s\n",	\
>  			__FILE__, __LINE__, __func__, #cond) : 0)
>  
> +#ifndef WARN_ON_ONCE
> +#define WARN_ON_ONCE(cond)	WARN_ON(cond)
> +#endif
> +
>  #define unlikely(cond)	(cond)
>  
>  /* Copied from lib/string.c */
> -- 
> 2.34.1
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v6 16/17] lib/bootconfig: fix sign-compare in xbc_node_compose_key_after()
From: Masami Hiramatsu @ 2026-03-17 23:15 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Josh Law, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317121507.30735331@gandalf.local.home>

On Tue, 17 Mar 2026 12:15:07 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 17 Mar 2026 16:55:49 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > > --- a/lib/bootconfig.c
> > > +++ b/lib/bootconfig.c
> > > @@ -319,10 +319,10 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
> > >  			       depth ? "." : "");
> > >  		if (ret < 0)
> > >  			return ret;
> > > -		if (ret >= size) {
> > > +		if (ret >= (int)size) {  
> > 
> > nit:
> > 
> > 	if ((size_t)ret >= size) {
> > 
> > because sizeof(size_t) > sizeof(int).
> 
> I don't think we need to worry about this. But this does bring up an issue.
> ret comes from:
> 
> 		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
> 			       depth ? "." : "");
> 
> Where size is of type size_t
> 
> snprintf() takes size_t but returns int.
> 
> snprintf() calls vsnprintf() which has:
> 
> 	size_t len, pos;
> 
> Where pos is incremented based on fmt, and vsnprintf() returns:
> 
> 	return pos;
> 
> Which can overflow.

I think that is a vsnprintf() (maybe POSIX) design issue.
I believe we're simply using the size_t to represent size of memory
out of convention.

> 
> Now, honestly, we should never have a 2Gig string as that would likely
> cause other horrible things. Does size really need to be size_t?

Even if so, it should be done in vsnprintf() instead of this.
This function just believes that the caller gives the correct size
and enough amount of memory. Or, we need to check "INT_MAX > size"
everywhere.

> 
> Perhaps we should have:
> 
> 	if (WARN_ON_ONCE(size > MAX_INT))
> 		return -EINVAL;

I think this is an over engineering effort especially in
caller side. This overflow should be checked in vsnprintf() and
should return -EINVAL. (and the caller checks the return value.)

Thank you,

> 
> ?
> 
> -- Steve
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Steven Rostedt @ 2026-03-17 23:16 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Josh Law, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260318080351.dae637f4b5909bd9f81b27d2@kernel.org>

On Wed, 18 Mar 2026 08:03:51 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Tue, 17 Mar 2026 20:44:03 +0000
> Josh Law <objecting@objecting.org> wrote:
> 
> > xbc_node_compose_key_after() passes a size_t buffer length to
> > snprintf(), but snprintf() returns int. Guard against size values above
> > INT_MAX before the loop so the existing truncation check can continue to
> > compare ret against (int)size safely.
> > 
> > Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
> > so the same source continues to build there.  
> 
> NACK.
> 
> Don't do such over engineering effort.

Hi Masami,

This was somewhat my idea. Why do you think it's over engineering?

This is your code, so you have final say. I'm not going to push it. I'm
just curious about your thoughts.

It is interesting that snprintf() takes a size_t size, and the iterator
inside is also size_t, but then it returns the value as an int.

That itself just looks wrong (and has nothing to do with your code).

-- Steve

^ permalink raw reply

* Re: [PATCH v6 16/17] lib/bootconfig: fix sign-compare in xbc_node_compose_key_after()
From: Josh Law @ 2026-03-17 23:18 UTC (permalink / raw)
  To: Masami Hiramatsu, Steven Rostedt
  Cc: Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260318081540.44c164f2c67d80acf14eaf2e@kernel.org>



On 17 March 2026 23:15:40 GMT, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>On Tue, 17 Mar 2026 12:15:07 -0400
>Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Tue, 17 Mar 2026 16:55:49 +0900
>> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>> 
>> > > --- a/lib/bootconfig.c
>> > > +++ b/lib/bootconfig.c
>> > > @@ -319,10 +319,10 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>> > >  			       depth ? "." : "");
>> > >  		if (ret < 0)
>> > >  			return ret;
>> > > -		if (ret >= size) {
>> > > +		if (ret >= (int)size) {  
>> > 
>> > nit:
>> > 
>> > 	if ((size_t)ret >= size) {
>> > 
>> > because sizeof(size_t) > sizeof(int).
>> 
>> I don't think we need to worry about this. But this does bring up an issue.
>> ret comes from:
>> 
>> 		ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
>> 			       depth ? "." : "");
>> 
>> Where size is of type size_t
>> 
>> snprintf() takes size_t but returns int.
>> 
>> snprintf() calls vsnprintf() which has:
>> 
>> 	size_t len, pos;
>> 
>> Where pos is incremented based on fmt, and vsnprintf() returns:
>> 
>> 	return pos;
>> 
>> Which can overflow.
>
>I think that is vsnprintf() (maybe POSIX) design issue.
>I believe we're simply using the size_t to represent size of memory
>out of convention.
>
>> 
>> Now, honestly, we should never have a 2Gig string as that would likely
>> cause other horrible things. Does size really need to be size_t?
>
>Even if so, it should be done in vsnprintf() instead of this.
>This function just believes that the caller gives collect size
>and enough amount of memory. Or, we need to check "INT_MAX > size"
>in everywhere.
>
>> 
>> Perhaps we should have:
>> 
>> 	if (WARN_ON_ONCE(size > MAX_INT))
>> 		return -EINVAL;
>
>I think this is an over engineering effort especially in
>caller side. This overflow should be checked in vsnprintf() and
>should return -EINVAL. (and the caller checks the return value.)
>
>Thank you,
>
>> 
>> ?
>> 
>> -- Steve
>> 
>
>


I submitted V7 dropping all those patches anyway, V7 should be perfect now.


V/R


Josh Law

^ permalink raw reply

* Re: [PATCH] blk-mq: add tracepoint block_rq_tag_wait
From: Damien Le Moal @ 2026-03-17 23:38 UTC (permalink / raw)
  To: Aaron Tomlin, axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: johannes.thumshirn, kch, bvanassche, ritesh.list, neelx, sean,
	mproche, linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317182835.258183-1-atomlin@atomlin.com>

On 2026/03/18 3:28, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
> latency spikes can occur when fast devices (SSDs) are starved of hardware
> tags when sharing the same blk_mq_tag_set.
> 
> Currently, diagnosing this specific hardware queue contention is
> difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
> forces the current thread to block uninterruptible via io_schedule().
> While this can be inferred via sched:sched_switch or dynamically
> traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
> dedicated, out-of-the-box observability for this event.
> 
> This patch introduces the block_rq_tag_wait static tracepoint in
> the tag allocation slow-path. It triggers immediately before the
> thread yields the CPU, exposing the exact hardware context (hctx)
> that is starved, the total pool size, and the current active request
> count.
> 
> This provides storage engineers and performance monitoring agents
> with a zero-configuration, low-overhead mechanism to definitively
> identify shared-tag bottlenecks and tune I/O schedulers or cgroup
> throttling accordingly.
> 
> Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>

Looks OK to me, but I have some suggestions below.

> ---
>  block/blk-mq-tag.c           |  3 +++
>  include/trace/events/block.h | 36 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 39 insertions(+)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 33946cdb5716..f50993e86ca5 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -13,6 +13,7 @@
>  #include <linux/kmemleak.h>
>  
>  #include <linux/delay.h>
> +#include <trace/events/block.h>
>  #include "blk.h"
>  #include "blk-mq.h"
>  #include "blk-mq-sched.h"
> @@ -187,6 +188,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  		if (tag != BLK_MQ_NO_TAG)
>  			break;
>  
> +		trace_block_rq_tag_wait(data->q, data->hctx);
> +
>  		bt_prev = bt;
>  		io_schedule();
>  
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index 6aa79e2d799c..48e2ba433c87 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -226,6 +226,42 @@ DECLARE_EVENT_CLASS(block_rq,
>  		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
>  );
>  
> +/**
> + * block_rq_tag_wait - triggered when an I/O request is starved of a tag

when an I/O request -> when a request

> + * @q: queue containing the request

request queue of the target device

("containing" is odd here)

> + * @hctx: hardware context (queue) experiencing starvation

hardware context of the request

> + *
> + * Called immediately before the submitting thread is forced to block due

the submitting thread -> the submitting context

> + * to the exhaustion of available hardware tags. This tracepoint indicates

s/tracepoint/trace point

> + * that the thread will be placed into an uninterruptible state via

s/thread/context

> + * io_schedule() until an active block I/O operation completes and
> + * relinquishes its assigned tag.

until an active request completes

(BIOs do not have tags).

> + */
> +TRACE_EVENT(block_rq_tag_wait,
> +
> +	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx),
> +
> +	TP_ARGS(q, hctx),
> +
> +	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
> +		__field( u32,		hctx_id			)
> +		__field( u32,		nr_tags			)
> +		__field( u32,		active_requests		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		  = q->disk ? disk_devt(q->disk) : 0;

I do not think that q->disk can ever be NULL when there is a request being
submitted.

> +		__entry->hctx_id	  = hctx ? hctx->queue_num : 0;
> +		__entry->nr_tags	  = hctx && hctx->tags ? hctx->tags->nr_tags : 0;
> +		__entry->active_requests  = hctx ? atomic_read(&hctx->nr_active) : 0;
> +	),
> +
> +	TP_printk("%d,%d hctx=%u starved (active=%u/%u)",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->hctx_id, __entry->active_requests, __entry->nr_tags)
> +);
> +
>  /**
>   * block_rq_insert - insert block operation request into queue
>   * @rq: block IO operation request


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply

* Re: [PATCH v4] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Masami Hiramatsu @ 2026-03-18  0:02 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Josh Law, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317191626.5b6172a9@robin>

On Tue, 17 Mar 2026 19:16:26 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 18 Mar 2026 08:03:51 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > On Tue, 17 Mar 2026 20:44:03 +0000
> > Josh Law <objecting@objecting.org> wrote:
> > 
> > > xbc_node_compose_key_after() passes a size_t buffer length to
> > > snprintf(), but snprintf() returns int. Guard against size values above
> > > INT_MAX before the loop so the existing truncation check can continue to
> > > compare ret against (int)size safely.
> > > 
> > > Add a small WARN_ON_ONCE shim for the tools/bootconfig userspace build
> > > so the same source continues to build there.  
> > 
> > NACK.
> > 
> > Don't do such over engineering effort.
> 
> Hi Masami,
> 
> This was somewhat my idea. Why do you think it's over engineering?
> 
> This is your code, so you have final say. I'm not going to push it. I'm
> just curious to your thoughts.

I sent a mail explaining why I thought this is over engineering. I think this
comes from vsnprintf() interface design. If all users of that need
to do this, that is not fair. It should be checked in vsnprintf()
and caller should just check the returned error.

> 
> It is interesting that snprintf() takes a size_t size, and the iterator
> inside is also size_t, but then it returns the value as an int.

Yes, that is checked in vsnprintf(), not its caller.
I think the Linux kernel should ensure the return value is smaller
than INT_MAX, and return -EOVERFLOW if not.

Thank you,

> 
> That itself just looks wrong (and has nothing to do with your code).
> 
> -- Steve


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Steven Rostedt @ 2026-03-18  0:43 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Josh Law, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260318090243.7c437f2c5e07a1ce00375102@kernel.org>

On Wed, 18 Mar 2026 09:02:43 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Yes, that is checked in vsnprintf(), not its caller.
> I think linux kernel should ensure the the return value is smaller
> than INT_MAX, and return -EOVERFLOW if not.

Well, there's very few places that could have a buffer size of > 2G.

What's the max bootconfig limit? Could you create a bootconfig that is
greater than 2G?

If not, then yeah, we shouldn't really care about overflows (and that
includes not worrying about typecasting the size variable to int).

-- Steve

^ permalink raw reply

* Re: [PATCH v4] lib/bootconfig: guard xbc_node_compose_key_after() buffer size
From: Masami Hiramatsu @ 2026-03-18  3:07 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Josh Law, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317204327.3c61d0ea@robin>

On Tue, 17 Mar 2026 20:43:27 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 18 Mar 2026 09:02:43 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > Yes, that is checked in vsnprintf(), not its caller.
> > I think linux kernel should ensure the the return value is smaller
> > than INT_MAX, and return -EOVERFLOW if not.
> 
> Well, there's very few places that could have a buffer size of > 2G.
> 
> What's the max bootconfig limit? Could you create a bootconfig that is
> greater than 2G?

It's just 32KB. So we don't need it.
Anyway, I sent a patch about that. 

https://lore.kernel.org/all/177379678638.535490.18200744206158553364.stgit@devnote2/

Thank you,

> 
> If not, then yeah, we shouldn't really care about overflows (and that
> includes not worrying about typecasting the size variable to int).
> 
> -- Steve
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [RFC] Coding style consequences for multi-line statements?
From: Markus Elfring @ 2026-03-18  7:30 UTC (permalink / raw)
  To: Steven Rostedt, kernel-janitors, linux-doc, linux-trace-kernel
  Cc: Josh Law, Andrew Morton, Masami Hiramatsu, LKML
In-Reply-To: <20260317111026.62345f9e@gandalf.local.home>

> The brackets *are* appropriate. The rule of omitting the brackets is for
> *single line* statements. The above return statement is long and there's a
> line break, which means, curly brackets *are* required for visibility reasons.

Would any contributors like to clarify and adjust development documentation accordingly?
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/coding-style.rst?h=v7.0-rc4#n197

Regards,
Markus

^ permalink raw reply

* Re: [PATCH v3 0/8] RDMA: Enable operation with DMA debug enabled
From: Marek Szyprowski @ 2026-03-18  8:03 UTC (permalink / raw)
  To: Leon Romanovsky, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel, Will Deacon,
	Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260317190538.GD61385@unreal>

Hi Leon,

On 17.03.2026 20:05, Leon Romanovsky wrote:
> On Mon, Mar 16, 2026 at 09:06:44PM +0200, Leon Romanovsky wrote:
>> Add a new DMA_ATTR_REQUIRE_COHERENT attribute to the DMA API to mark
>> mappings that must run on a DMA‑coherent system. Such buffers cannot
>> use the SWIOTLB path, may overlap with CPU caches, and do not depend on
>> explicit cache flushing.
>>
>> Mappings using this attribute are rejected on systems where cache
>> side‑effects could lead to data corruption, and therefore do not need
>> the cache‑overlap debugging logic. This series also includes fixes for
>> DMA_ATTR_CPU_CACHE_CLEAN handling.
>> Thanks.
> <...>
>
>> ---
>> Leon Romanovsky (8):
>>        dma-debug: Allow multiple invocations of overlapping entries
>>        dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
>>        dma-mapping: Clarify valid conditions for CPU cache line overlap
>>        dma-mapping: Introduce DMA require coherency attribute
>>        dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
>>        iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
>>        RDMA/umem: Tell DMA mapping that UMEM requires coherency
>>        mm/hmm: Indicate that HMM requires DMA coherency
>>
>>   Documentation/core-api/dma-attributes.rst | 38 ++++++++++++++++++++++++-------
>>   drivers/infiniband/core/umem.c            |  5 ++--
>>   drivers/iommu/dma-iommu.c                 | 21 +++++++++++++----
>>   drivers/virtio/virtio_ring.c              | 10 ++++----
>>   include/linux/dma-mapping.h               | 15 ++++++++----
>>   include/trace/events/dma.h                |  4 +++-
>>   kernel/dma/debug.c                        |  9 ++++----
>>   kernel/dma/direct.h                       |  7 +++---
>>   kernel/dma/mapping.c                      |  6 +++++
>>   mm/hmm.c                                  |  4 ++--
>>   10 files changed, 86 insertions(+), 33 deletions(-)
> Marek,
>
> Despite the "RDMA ..." tag in the subject, the diffstat clearly shows that
> you are the appropriate person to take this patch.

I plan to take the first 2 patches to the dma-mapping-fixes branch 
(v7.0-rc) and the next to dma-mapping-for-next. Should I also take the 
RDMA and HMM patches, or do You want a stable branch for merging them 
via respective subsystem trees?

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply

* Re: [PATCH v3 0/8] RDMA: Enable operation with DMA debug enabled
From: Leon Romanovsky @ 2026-03-18  8:18 UTC (permalink / raw)
  To: Marek Szyprowski
  Cc: Robin Murphy, Michael S. Tsirkin, Petr Tesarik, Jonathan Corbet,
	Shuah Khan, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Jason Gunthorpe, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Joerg Roedel, Will Deacon, Andrew Morton,
	iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <de23ccf6-75ef-48af-8c69-2f416c564f2d@samsung.com>

On Wed, Mar 18, 2026 at 09:03:00AM +0100, Marek Szyprowski wrote:
> Hi Leon,
> 
> On 17.03.2026 20:05, Leon Romanovsky wrote:
> > On Mon, Mar 16, 2026 at 09:06:44PM +0200, Leon Romanovsky wrote:
> >> Add a new DMA_ATTR_REQUIRE_COHERENT attribute to the DMA API to mark
> >> mappings that must run on a DMA‑coherent system. Such buffers cannot
> >> use the SWIOTLB path, may overlap with CPU caches, and do not depend on
> >> explicit cache flushing.
> >>
> >> Mappings using this attribute are rejected on systems where cache
> >> side‑effects could lead to data corruption, and therefore do not need
> >> the cache‑overlap debugging logic. This series also includes fixes for
> >> DMA_ATTR_CPU_CACHE_CLEAN handling.
> >> Thanks.
> > <...>
> >
> >> ---
> >> Leon Romanovsky (8):
> >>        dma-debug: Allow multiple invocations of overlapping entries
> >>        dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
> >>        dma-mapping: Clarify valid conditions for CPU cache line overlap
> >>        dma-mapping: Introduce DMA require coherency attribute
> >>        dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
> >>        iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
> >>        RDMA/umem: Tell DMA mapping that UMEM requires coherency
> >>        mm/hmm: Indicate that HMM requires DMA coherency
> >>
> >>   Documentation/core-api/dma-attributes.rst | 38 ++++++++++++++++++++++++-------
> >>   drivers/infiniband/core/umem.c            |  5 ++--
> >>   drivers/iommu/dma-iommu.c                 | 21 +++++++++++++----
> >>   drivers/virtio/virtio_ring.c              | 10 ++++----
> >>   include/linux/dma-mapping.h               | 15 ++++++++----
> >>   include/trace/events/dma.h                |  4 +++-
> >>   kernel/dma/debug.c                        |  9 ++++----
> >>   kernel/dma/direct.h                       |  7 +++---
> >>   kernel/dma/mapping.c                      |  6 +++++
> >>   mm/hmm.c                                  |  4 ++--
> >>   10 files changed, 86 insertions(+), 33 deletions(-)
> > Marek,
> >
> > Despite the "RDMA ..." tag in the subject, the diffstat clearly shows that
> > you are the appropriate person to take this patch.
> 
> I plan to take the first 2 patches to the dma-mapping-fixes branch 
> (v7.0-rc) and the next to dma-mapping-for-next. Should I also take the 
> RDMA and HMM patches, or do You want a stable branch for merging them 
> via respective subsystem trees?

I suggest taking all patches into the -fixes branch, as the "RDMA/..." patch
also resolves the dmesg splat. With -fixes, there is no need to worry about
a shared branch since we do not expect merge conflicts in that area.

If you still prefer to split the series between -fixes and -next, it would be
better to use a shared branch in that case. There are patches on the RDMA
list targeted for -next that touch ib_umem_get().

Thanks

> 
> Best regards
> -- 
> Marek Szyprowski, PhD
> Samsung R&D Institute Poland
> 
> 

^ permalink raw reply

* [PATCH 0/8] memblock: improve late freeing of reserved memory
From: Mike Rapoport @ 2026-03-18 10:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
	Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
	Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
	David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
	Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
	Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
	Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
	Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
	Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
	Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
	iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
	linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
	sparclinux, x86

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Hi,

Following a recent discussion about leaks in x86 EFI [1], I audited usage of
memblock_free_late() and free_reserved_area() and made some improvements to how
we handle late freeing of the memory allocated with memblock.

[1] https://lore.kernel.org/all/ec2aaef14783869b3be6e3c253b2dcbf67dbc12a.camel@kernel.crashing.org/

Mike Rapoport (Microsoft) (8):
  powerpc: fadump: pair alloc_pages_exact() with free_pages_exact()
  powerpc: opal-core: pair alloc_pages_exact() with free_pages_exact()
  mm: move free_reserved_area() to mm/memblock.c
  memblock: make free_reserved_area() more robust
  memblock: extract page freeing from free_reserved_area() into a helper
  memblock: make free_reserved_area() update memblock if ARCH_KEEP_MEMBLOCK=y
  memblock, treewide: make memblock_free() handle late freeing
  memblock: warn when freeing reserved memory before memory map is
    initialized

 arch/arm64/mm/init.c                       |   3 -
 arch/powerpc/kernel/fadump.c               |  16 +--
 arch/powerpc/platforms/powernv/opal-core.c |   9 +-
 arch/sparc/kernel/mdesc.c                  |   4 +-
 arch/x86/kernel/setup.c                    |   2 +-
 arch/x86/platform/efi/memmap.c             |   5 +-
 arch/x86/platform/efi/quirks.c             |   2 +-
 drivers/firmware/efi/apple-properties.c    |   2 +-
 drivers/of/kexec.c                         |   2 +-
 include/linux/memblock.h                   |   2 -
 init/initramfs.c                           |   7 --
 kernel/dma/swiotlb.c                       |   6 +-
 lib/bootconfig.c                           |   2 +-
 mm/internal.h                              |  10 ++
 mm/kfence/core.c                           |   4 +-
 mm/memblock.c                              | 110 ++++++++++++++-------
 mm/page_alloc.c                            |  46 ---------
 17 files changed, 102 insertions(+), 130 deletions(-)


base-commit: 1f318b96cc84d7c2ab792fcc0bfd42a7ca890681
--
2.51.0

^ permalink raw reply

* [PATCH 1/8] powerpc: fadump: pair alloc_pages_exact() with free_pages_exact()
From: Mike Rapoport @ 2026-03-18 10:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
	Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
	Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
	David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
	Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
	Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
	Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
	Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
	Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
	Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
	iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
	linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
	sparclinux, x86
In-Reply-To: <20260318105827.1358927-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

fadump allocates buffers with alloc_pages_exact(), but then marks them
as reserved and frees using free_reserved_area().

This is completely unnecessary and the pages allocated with
alloc_pages_exact() can be naturally freed with free_pages_exact().

Replace freeing of memory in fadump_free_buffer() with
free_pages_exact() and simplify allocation code so that it won't mark
allocated pages as reserved.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 arch/powerpc/kernel/fadump.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4ebc333dd786..501d43bf18f3 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -775,24 +775,12 @@ void __init fadump_update_elfcore_header(char *bufp)
 
 static void *__init fadump_alloc_buffer(unsigned long size)
 {
-	unsigned long count, i;
-	struct page *page;
-	void *vaddr;
-
-	vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
-	if (!vaddr)
-		return NULL;
-
-	count = PAGE_ALIGN(size) / PAGE_SIZE;
-	page = virt_to_page(vaddr);
-	for (i = 0; i < count; i++)
-		mark_page_reserved(page + i);
-	return vaddr;
+	return  alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
 }
 
 static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
 {
-	free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL);
+	free_pages_exact((void *)vaddr, size);
 }
 
 s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus)
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH 00/15] tracepoint: Avoid double static_branch evaluation at guarded call sites
From: Vineeth Remanan Pillai @ 2026-03-18 10:58 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Steven Rostedt, Andrii Nakryiko, Peter Zijlstra, Dmitry Ilvokhin,
	Masami Hiramatsu, Ingo Molnar, Jens Axboe, io-uring,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Marcelo Ricardo Leitner,
	Xin Long, Jon Maloy, Aaron Conole, Eelco Chaudron, Ilya Maximets,
	netdev, bpf, linux-sctp, tipc-discussion, dev, Oded Gabbay,
	Koby Elbaz, dri-devel, Rafael J. Wysocki, Viresh Kumar,
	Gautham R. Shenoy, Huang Rui, Mario Limonciello, Len Brown,
	Srinivas Pandruvada, linux-pm, MyungJoo Ham, Kyungmin Park,
	Chanwoo Choi, Christian König, Sumit Semwal, linaro-mm-sig,
	Eddie James, Andrew Jeffery, Joel Stanley, linux-fsi,
	David Airlie, Simona Vetter, Alex Deucher, Danilo Krummrich,
	Matthew Brost, Philipp Stanner, Harry Wentland, Leo Li, amd-gfx,
	Jiri Kosina, Benjamin Tissoires, linux-input, Wolfram Sang,
	linux-i2c, Mark Brown, Michael Hennerich, Nuno Sá, linux-spi,
	James E.J. Bottomley, Martin K. Petersen, linux-scsi, Chris Mason,
	David Sterba, linux-btrfs, linux-trace-kernel, linux-kernel
In-Reply-To: <6ca9f884-9566-4a82-9995-4c802a0bf8a0@efficios.com>

On Tue, Mar 17, 2026 at 12:02 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> On 2026-03-17 12:00, Steven Rostedt wrote:
> > On Fri, 13 Mar 2026 10:02:32 -0400
> > Vineeth Remanan Pillai <vineeth@bitbyteword.org> wrote:
> >
> >>>
> >>> Perhaps: call_trace_foo() ?
> >>>
> >> call_trace_foo has one collision with the tracepoint
> >> sched_update_nr_running and a function
> >> call_trace_sched_update_nr_running. I had considered this and later
> >> moved to trace_invoke_foo() because of the collision. But I can rename
> >> call_trace_sched_update_nr_running to something else if call_trace_foo
> >> is the general consensus.
> >
> > OK, then lets go with: trace_call__foo()
> >
> > The double underscore should prevent any name collisions.
> >
> > Does anyone have any objections?
> I'm OK with it.
>
Great, thanks! I shall send a v2 with s/trace_invoke_foo/trace_call__foo/ soon.

Thanks,
Vineeth

^ permalink raw reply

* [PATCH 2/8] powerpc: opal-core: pair alloc_pages_exact() with free_pages_exact()
From: Mike Rapoport @ 2026-03-18 10:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
	Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
	Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
	David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
	Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
	Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
	Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
	Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
	Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
	Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
	iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
	linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
	sparclinux, x86
In-Reply-To: <20260318105827.1358927-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

opal-core allocates buffers with alloc_pages_exact(), but then
marks them as reserved and frees using free_reserved_area().

This is completely unnecessary and the pages allocated with
alloc_pages_exact() can be naturally freed with free_pages_exact().

Replace freeing of memory in opalcore_cleanup() with
free_pages_exact() and simplify allocation code so that it won't mark
allocated pages as reserved.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 arch/powerpc/platforms/powernv/opal-core.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index e76e462f55f6..abd99ddbf21f 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -303,7 +303,6 @@ static int __init create_opalcore(void)
 	struct device_node *dn;
 	struct opalcore *new;
 	loff_t opalcore_off;
-	struct page *page;
 	Elf64_Phdr *phdr;
 	Elf64_Ehdr *elf;
 	int i, ret;
@@ -329,9 +328,6 @@ static int __init create_opalcore(void)
 		return -ENOMEM;
 	}
 	count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
-	page = virt_to_page(oc_conf->opalcorebuf);
-	for (i = 0; i < count; i++)
-		mark_page_reserved(page + i);
 
 	pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
 
@@ -437,10 +433,7 @@ static void opalcore_cleanup(void)
 
 	/* free the buffer used for setting up OPAL core */
 	if (oc_conf->opalcorebuf) {
-		void *end = (void *)((u64)oc_conf->opalcorebuf +
-				     oc_conf->opalcorebuf_sz);
-
-		free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+		free_pages_exact(oc_conf->opalcorebuf, oc_conf->opalcorebuf_sz);
 		oc_conf->opalcorebuf = NULL;
 		oc_conf->opalcorebuf_sz = 0;
 	}
-- 
2.51.0


^ permalink raw reply related

* [PATCH 3/8] mm: move free_reserved_area() to mm/memblock.c
From: Mike Rapoport @ 2026-03-18 10:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
	Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
	Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
	David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
	Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
	Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
	Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
	Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
	Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
	Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
	iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
	linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
	sparclinux, x86
In-Reply-To: <20260318105827.1358927-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

free_reserved_area() is related to memblock as it frees reserved memory
back to the buddy allocator, similar to what memblock_free_late() does.

Move free_reserved_area() to mm/memblock.c to prepare for further
consolidation of the functions that free reserved memory.

No functional changes.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/memblock.c   | 37 ++++++++++++++++++++++++++++++++++++-
 mm/page_alloc.c | 36 ------------------------------------
 2 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index b3ddfdec7a80..8f3010dddc58 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -893,6 +893,42 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 	return memblock_remove_range(&memblock.memory, base, size);
 }
 
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+	void *pos;
+	unsigned long pages = 0;
+
+	start = (void *)PAGE_ALIGN((unsigned long)start);
+	end = (void *)((unsigned long)end & PAGE_MASK);
+	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+		struct page *page = virt_to_page(pos);
+		void *direct_map_addr;
+
+		/*
+		 * 'direct_map_addr' might be different from 'pos'
+		 * because some architectures' virt_to_page()
+		 * work with aliases.  Getting the direct map
+		 * address ensures that we get a _writeable_
+		 * alias for the memset().
+		 */
+		direct_map_addr = page_address(page);
+		/*
+		 * Perform a kasan-unchecked memset() since this memory
+		 * has not been initialized.
+		 */
+		direct_map_addr = kasan_reset_tag(direct_map_addr);
+		if ((unsigned int)poison <= 0xFF)
+			memset(direct_map_addr, poison, PAGE_SIZE);
+
+		free_reserved_page(page);
+	}
+
+	if (pages && s)
+		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
+
+	return pages;
+}
+
 /**
  * memblock_free - free boot memory allocation
  * @ptr: starting address of the  boot memory allocation
@@ -1776,7 +1812,6 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
 		totalram_pages_inc();
 	}
 }
-
 /*
  * Remaining API functions
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..df3d61253001 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6234,42 +6234,6 @@ void adjust_managed_page_count(struct page *page, long count)
 }
 EXPORT_SYMBOL(adjust_managed_page_count);
 
-unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
-{
-	void *pos;
-	unsigned long pages = 0;
-
-	start = (void *)PAGE_ALIGN((unsigned long)start);
-	end = (void *)((unsigned long)end & PAGE_MASK);
-	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
-		struct page *page = virt_to_page(pos);
-		void *direct_map_addr;
-
-		/*
-		 * 'direct_map_addr' might be different from 'pos'
-		 * because some architectures' virt_to_page()
-		 * work with aliases.  Getting the direct map
-		 * address ensures that we get a _writeable_
-		 * alias for the memset().
-		 */
-		direct_map_addr = page_address(page);
-		/*
-		 * Perform a kasan-unchecked memset() since this memory
-		 * has not been initialized.
-		 */
-		direct_map_addr = kasan_reset_tag(direct_map_addr);
-		if ((unsigned int)poison <= 0xFF)
-			memset(direct_map_addr, poison, PAGE_SIZE);
-
-		free_reserved_page(page);
-	}
-
-	if (pages && s)
-		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
-
-	return pages;
-}
-
 void free_reserved_page(struct page *page)
 {
 	clear_page_tag_ref(page);
-- 
2.51.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox