Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v2 1/8] dma-debug: Allow multiple invocations of overlapping entries
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Repeated DMA mappings with DMA_ATTR_CPU_CACHE_CLEAN trigger the
following splat. This prevents using the attribute in cases where a DMA
region is shared and reused more than seven times.

 ------------[ cut here ]------------
 DMA-API: exceeded 7 overlapping mappings of cacheline 0x000000000438c440
 WARNING: kernel/dma/debug.c:467 at add_dma_entry+0x219/0x280, CPU#4: ibv_rc_pingpong/1644
 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core
 CPU: 4 UID: 2733 PID: 1644 Comm: ibv_rc_pingpong Not tainted 6.19.0+ #129 PREEMPT
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 RIP: 0010:add_dma_entry+0x221/0x280
 Code: c0 0f 84 f2 fe ff ff 83 e8 01 89 05 6d 99 11 01 e9 e4 fe ff ff 0f 8e 1f ff ff ff 48 8d 3d 07 ef 2d 01 be 07 00 00 00 48 89 e2 <67> 48 0f b9 3a e9 06 ff ff ff 48 c7 c7 98 05 2b 82 c6 05 72 92 28
 RSP: 0018:ff1100010e657970 EFLAGS: 00010002
 RAX: 0000000000000007 RBX: ff1100010234eb00 RCX: 0000000000000000
 RDX: ff1100010e657970 RSI: 0000000000000007 RDI: ffffffff82678660
 RBP: 000000000438c440 R08: 0000000000000228 R09: 0000000000000000
 R10: 00000000000001be R11: 000000000000089d R12: 0000000000000800
 R13: 00000000ffffffef R14: 0000000000000202 R15: ff1100010234eb00
 FS:  00007fb15f3f6740(0000) GS:ff110008dcc19000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007fb15f32d3a0 CR3: 0000000116f59001 CR4: 0000000000373eb0
 Call Trace:
  <TASK>
  debug_dma_map_sg+0x1b4/0x390
  __dma_map_sg_attrs+0x6d/0x1a0
  dma_map_sgtable+0x19/0x30
  ib_umem_get+0x284/0x3b0 [ib_uverbs]
  mlx5_ib_reg_user_mr+0x68/0x2a0 [mlx5_ib]
  ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs]
  ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc2/0x130 [ib_uverbs]
  ib_uverbs_cmd_verbs+0xa0b/0xae0 [ib_uverbs]
  ? ib_uverbs_handler_UVERBS_METHOD_QUERY_PORT_SPEED+0xe0/0xe0 [ib_uverbs]
  ? mmap_region+0x7a/0xb0
  ? do_mmap+0x3b8/0x5c0
  ib_uverbs_ioctl+0xa7/0x110 [ib_uverbs]
  __x64_sys_ioctl+0x14f/0x8b0
  ? ksys_mmap_pgoff+0xc5/0x190
  do_syscall_64+0x8c/0xbf0
  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 RIP: 0033:0x7fb15f5e4eed
 Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
 RSP: 002b:00007ffe09a5c540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 00007ffe09a5c5d0 RCX: 00007fb15f5e4eed
 RDX: 00007ffe09a5c5f0 RSI: 00000000c0181b01 RDI: 0000000000000003
 RBP: 00007ffe09a5c590 R08: 0000000000000028 R09: 00007ffe09a5c794
 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffe09a5c794
 R13: 000000000000000c R14: 0000000025a49170 R15: 000000000000000c
  </TASK>
 ---[ end trace 0000000000000000 ]---

Fixes: 61868dc55a11 ("dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 kernel/dma/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 86f87e43438c3..be207be749968 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
 	return overlap;
 }
 
-static void active_cacheline_inc_overlap(phys_addr_t cln)
+static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean)
 {
 	int overlap = active_cacheline_read_overlap(cln);
 
@@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
 	/* If we overflowed the overlap counter then we're potentially
 	 * leaking dma-mappings.
 	 */
-	WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+	WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
 		  pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
 		  ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
 }
@@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry,
 	if (rc == -EEXIST) {
 		struct dma_debug_entry *existing;
 
-		active_cacheline_inc_overlap(cln);
+		active_cacheline_inc_overlap(cln, entry->is_cache_clean);
 		existing = radix_tree_lookup(&dma_active_cacheline, cln);
 		/* A lookup failure here after we got -EEXIST is unexpected. */
 		WARN_ON(!existing);

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 3/8] dma-mapping: Clarify valid conditions for CPU cache line overlap
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Rename the DMA_ATTR_CPU_CACHE_CLEAN attribute to better reflect that it
is a debugging aid to inform DMA core code that CPU cache line overlaps are
allowed, and refine the documentation describing its use.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 Documentation/core-api/dma-attributes.rst | 22 ++++++++++++++--------
 drivers/virtio/virtio_ring.c              | 10 +++++-----
 include/linux/dma-mapping.h               |  8 ++++----
 include/trace/events/dma.h                |  2 +-
 kernel/dma/debug.c                        |  2 +-
 5 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst
index 1d7bfad73b1c7..48cfe86cc06d7 100644
--- a/Documentation/core-api/dma-attributes.rst
+++ b/Documentation/core-api/dma-attributes.rst
@@ -149,11 +149,17 @@ For architectures that require cache flushing for DMA coherence
 DMA_ATTR_MMIO will not perform any cache flushing. The address
 provided must never be mapped cacheable into the CPU.
 
-DMA_ATTR_CPU_CACHE_CLEAN
-------------------------
-
-This attribute indicates the CPU will not dirty any cacheline overlapping this
-DMA_FROM_DEVICE/DMA_BIDIRECTIONAL buffer while it is mapped. This allows
-multiple small buffers to safely share a cacheline without risk of data
-corruption, suppressing DMA debug warnings about overlapping mappings.
-All mappings sharing a cacheline should have this attribute.
+DMA_ATTR_DEBUGGING_IGNORE_CACHELINES
+------------------------------------
+
+This attribute indicates that CPU cache lines may overlap for buffers mapped
+with DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+
+Such overlap may occur when callers map multiple small buffers that reside
+within the same cache line. In this case, callers must guarantee that the CPU
+will not dirty these cache lines after the mappings are established. When this
+condition is met, multiple buffers can safely share a cache line without risking
+data corruption.
+
+All mappings that share a cache line must set this attribute to suppress DMA
+debug warnings about overlapping mappings.
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 335692d41617a..fbca7ce1c6bf0 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2912,10 +2912,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
  * @data: the token identifying the buffer.
  * @gfp: how to do memory allocations (if necessary).
  *
- * Same as virtqueue_add_inbuf but passes DMA_ATTR_CPU_CACHE_CLEAN to indicate
- * that the CPU will not dirty any cacheline overlapping this buffer while it
- * is available, and to suppress overlapping cacheline warnings in DMA debug
- * builds.
+ * Same as virtqueue_add_inbuf but passes DMA_ATTR_DEBUGGING_IGNORE_CACHELINES
+ * to indicate that the CPU will not dirty any cacheline overlapping this buffer
+ * while it is available, and to suppress overlapping cacheline warnings in DMA
+ * debug builds.
  *
  * Caller must ensure we don't call this with other virtqueue operations
  * at the same time (except where noted).
@@ -2928,7 +2928,7 @@ int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq,
 				    gfp_t gfp)
 {
 	return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp,
-			     DMA_ATTR_CPU_CACHE_CLEAN);
+			     DMA_ATTR_DEBUGGING_IGNORE_CACHELINES);
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_cache_clean);
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 29973baa05816..da44394b3a1a7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -80,11 +80,11 @@
 #define DMA_ATTR_MMIO		(1UL << 10)
 
 /*
- * DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline
- * overlapping this buffer while it is mapped for DMA. All mappings sharing
- * a cacheline must have this attribute for this to be considered safe.
+ * DMA_ATTR_DEBUGGING_IGNORE_CACHELINES: Indicates the CPU cache line can be
+ * overlapped. All mappings sharing a cacheline must have this attribute for
+ * this to be considered safe.
  */
-#define DMA_ATTR_CPU_CACHE_CLEAN	(1UL << 11)
+#define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES	(1UL << 11)
 
 /*
  * A dma_addr_t can hold any valid DMA or bus address for the platform.  It can
diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h
index 69cb3805ee81c..8c64bc0721fe4 100644
--- a/include/trace/events/dma.h
+++ b/include/trace/events/dma.h
@@ -33,7 +33,7 @@ TRACE_DEFINE_ENUM(DMA_NONE);
 		{ DMA_ATTR_NO_WARN, "NO_WARN" }, \
 		{ DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \
 		{ DMA_ATTR_MMIO, "MMIO" }, \
-		{ DMA_ATTR_CPU_CACHE_CLEAN, "CACHE_CLEAN" })
+		{ DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" })
 
 DECLARE_EVENT_CLASS(dma_map,
 	TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr,
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index be207be749968..83e1cfe05f08d 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -601,7 +601,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
 	unsigned long flags;
 	int rc;
 
-	entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN);
+	entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES;
 
 	bucket = get_hash_bucket(entry, &flags);
 	hash_bucket_add(bucket, entry);

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 4/8] dma-mapping: Introduce DMA require coherency attribute
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Mapping buffers that carry this attribute require a DMA coherent system.
This means that they can't take the SWIOTLB path, can perform CPU cache
overlap, and don't perform cache flushing.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 Documentation/core-api/dma-attributes.rst | 12 ++++++++++++
 include/linux/dma-mapping.h               |  7 +++++++
 include/trace/events/dma.h                |  3 ++-
 kernel/dma/debug.c                        |  3 ++-
 kernel/dma/mapping.c                      |  6 ++++++
 5 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst
index 48cfe86cc06d7..69d094f144c70 100644
--- a/Documentation/core-api/dma-attributes.rst
+++ b/Documentation/core-api/dma-attributes.rst
@@ -163,3 +163,15 @@ data corruption.
 
 All mappings that share a cache line must set this attribute to suppress DMA
 debug warnings about overlapping mappings.
+
+DMA_ATTR_REQUIRE_COHERENT
+-------------------------
+
+Mapping buffers that carry this attribute require a DMA coherent system. This
+means that they can't take the SWIOTLB path, can perform CPU cache overlap, and
+don't perform cache flushing.
+
+If the mapping has this attribute then it is prevented from running on systems
+where these cache artifacts can cause corruption, and as such doesn't need
+cache overlapping debugging code (same behavior as for
+DMA_ATTR_DEBUGGING_IGNORE_CACHELINES).
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index da44394b3a1a7..482b919f040f7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -86,6 +86,13 @@
  */
 #define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES	(1UL << 11)
 
+/*
+ * DMA_ATTR_REQUIRE_COHERENT: Indicates that DMA coherency is required.
+ * All mappings that carry this attribute can't work with SWIOTLB and cache
+ * flushing.
+ */
+#define DMA_ATTR_REQUIRE_COHERENT	(1UL << 12)
+
 /*
  * A dma_addr_t can hold any valid DMA or bus address for the platform.  It can
  * be given to a device to use as a DMA source or target.  It is specific to a
diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h
index 8c64bc0721fe4..63597b0044247 100644
--- a/include/trace/events/dma.h
+++ b/include/trace/events/dma.h
@@ -33,7 +33,8 @@ TRACE_DEFINE_ENUM(DMA_NONE);
 		{ DMA_ATTR_NO_WARN, "NO_WARN" }, \
 		{ DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \
 		{ DMA_ATTR_MMIO, "MMIO" }, \
-		{ DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" })
+		{ DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }, \
+		{ DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" })
 
 DECLARE_EVENT_CLASS(dma_map,
 	TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr,
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 83e1cfe05f08d..0677918f06a80 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
 	unsigned long flags;
 	int rc;
 
-	entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES;
+	entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES |
+					 DMA_ATTR_REQUIRE_COHERENT);
 
 	bucket = get_hash_bucket(entry, &flags);
 	hash_bucket_add(bucket, entry);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3928a509c44c2..6d3dd0bd3a886 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -164,6 +164,9 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 	if (WARN_ON_ONCE(!dev->dma_mask))
 		return DMA_MAPPING_ERROR;
 
+	if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+		return DMA_MAPPING_ERROR;
+
 	if (dma_map_direct(dev, ops) ||
 	    (!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
 		addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
@@ -235,6 +238,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 
 	BUG_ON(!valid_dma_direction(dir));
 
+	if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+		return -EOPNOTSUPP;
+
 	if (WARN_ON_ONCE(!dev->dma_mask))
 		return 0;
 

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 2/8] dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Tracing prints decoded DMA attribute flags, but it does not yet
include the recently added DMA_ATTR_CPU_CACHE_CLEAN. Add support
for decoding and displaying this attribute in the trace output.

Fixes: 61868dc55a11 ("dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/trace/events/dma.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h
index 33e99e792f1aa..69cb3805ee81c 100644
--- a/include/trace/events/dma.h
+++ b/include/trace/events/dma.h
@@ -32,7 +32,8 @@ TRACE_DEFINE_ENUM(DMA_NONE);
 		{ DMA_ATTR_ALLOC_SINGLE_PAGES, "ALLOC_SINGLE_PAGES" }, \
 		{ DMA_ATTR_NO_WARN, "NO_WARN" }, \
 		{ DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \
-		{ DMA_ATTR_MMIO, "MMIO" })
+		{ DMA_ATTR_MMIO, "MMIO" }, \
+		{ DMA_ATTR_CPU_CACHE_CLEAN, "CACHE_CLEAN" })
 
 DECLARE_EVENT_CLASS(dma_map,
 	TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr,

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 6/8] iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Add support for the DMA_ATTR_REQUIRE_COHERENT attribute to the exported
functions. This attribute indicates that the SWIOTLB path must not be
used and that no sync operations should be performed.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/iommu/dma-iommu.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 5dac64be61bb2..94d5141696424 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1211,7 +1211,7 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 	 */
 	if (dev_use_swiotlb(dev, size, dir) &&
 	    iova_unaligned(iovad, phys, size)) {
-		if (attrs & DMA_ATTR_MMIO)
+		if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
 			return DMA_MAPPING_ERROR;
 
 		phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs);
@@ -1223,7 +1223,8 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 		arch_sync_dma_for_device(phys, size, dir);
 
 	iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
-	if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO))
+	if (iova == DMA_MAPPING_ERROR &&
+	    !(attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)))
 		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 	return iova;
 }
@@ -1233,7 +1234,7 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle,
 {
 	phys_addr_t phys;
 
-	if (attrs & DMA_ATTR_MMIO) {
+	if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) {
 		__iommu_dma_unmap(dev, dma_handle, size);
 		return;
 	}
@@ -1945,9 +1946,21 @@ int dma_iova_link(struct device *dev, struct dma_iova_state *state,
 	if (WARN_ON_ONCE(iova_start_pad && offset > 0))
 		return -EIO;
 
+	/*
+	 * DMA_IOVA_USE_SWIOTLB is set on state after some entry
+	 * took SWIOTLB path, which we were supposed to prevent
+	 * for DMA_ATTR_REQUIRE_COHERENT attribute.
+	 */
+	if (WARN_ON_ONCE((state->__size & DMA_IOVA_USE_SWIOTLB) &&
+			 (attrs & DMA_ATTR_REQUIRE_COHERENT)))
+		return -EOPNOTSUPP;
+
+	if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+		return -EOPNOTSUPP;
+
 	if (dev_use_swiotlb(dev, size, dir) &&
 	    iova_unaligned(iovad, phys, size)) {
-		if (attrs & DMA_ATTR_MMIO)
+		if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
 			return -EPERM;
 
 		return iommu_dma_iova_link_swiotlb(dev, state, phys, offset,

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 7/8] RDMA/umem: Tell DMA mapping that UMEM requires coherency
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

The RDMA subsystem exposes DMA regions through the verbs interface, which
assumes a coherent system. Use the DMA_ATTR_REQUIRE_COHERENT attribute to
ensure coherency and avoid taking the SWIOTLB path.

In addition, a given region may be exported multiple times, which will trigger
warnings about cacheline overlaps. These warnings are suppressed when using
the new attribute.

infiniband rocep8s0f0: mlx5_ib_reg_user_mr:1592:(pid 5812): start 0x2b28c000, iova 0x2b28c000, length 0x1000, access_flags 0x1
infiniband rocep8s0f0: mlx5_ib_reg_user_mr:1592:(pid 5812): start 0x2b28c001, iova 0x2b28c001, length 0xfff, access_flags 0x1
 ------------[ cut here ]------------
 DMA-API: mlx5_core 0000:08:00.0: cacheline tracking EEXIST, overlapping mappings aren't supported
 WARNING: kernel/dma/debug.c:620 at add_dma_entry+0x1bb/0x280, CPU#6: ibv_rc_pingpong/5812
 Modules linked in: veth xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core
 CPU: 6 UID: 2733 PID: 5812 Comm: ibv_rc_pingpong Tainted: G        W           6.19.0+ #129 PREEMPT
 Tainted: [W]=WARN
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 RIP: 0010:add_dma_entry+0x1be/0x280
 Code: 8b 7b 10 48 85 ff 0f 84 c3 00 00 00 48 8b 6f 50 48 85 ed 75 03 48 8b 2f e8 ff 8e 6a 00 48 89 c6 48 8d 3d 55 ef 2d 01 48 89 ea <67> 48 0f b9 3a 48 85 db 74 1a 48 c7 c7 b0 00 2b 82 e8 9c 25 fd ff
 RSP: 0018:ff11000138717978 EFLAGS: 00010286
 RAX: ffffffffa02d7831 RBX: ff1100010246de00 RCX: 0000000000000000
 RDX: ff110001036fac30 RSI: ffffffffa02d7831 RDI: ffffffff82678650
 RBP: ff110001036fac30 R08: ff11000110dcb4a0 R09: ff11000110dcb478
 R10: 0000000000000000 R11: ffffffff824b30a8 R12: 0000000000000000
 R13: 00000000ffffffef R14: 0000000000000202 R15: ff1100010246de00
 FS:  00007f59b411c740(0000) GS:ff110008dcc99000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007ffe538f7000 CR3: 000000010e066005 CR4: 0000000000373eb0
 Call Trace:
  <TASK>
  debug_dma_map_sg+0x1b4/0x390
  __dma_map_sg_attrs+0x6d/0x1a0
  dma_map_sgtable+0x19/0x30
  ib_umem_get+0x254/0x380 [ib_uverbs]
  mlx5_ib_reg_user_mr+0x68/0x2a0 [mlx5_ib]
  ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs]
  ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc2/0x130 [ib_uverbs]
  ib_uverbs_cmd_verbs+0xa0b/0xae0 [ib_uverbs]
  ? ib_uverbs_handler_UVERBS_METHOD_QUERY_PORT_SPEED+0xe0/0xe0 [ib_uverbs]
  ? mmap_region+0x7a/0xb0
  ? do_mmap+0x3b8/0x5c0
  ib_uverbs_ioctl+0xa7/0x110 [ib_uverbs]
  __x64_sys_ioctl+0x14f/0x8b0
  ? ksys_mmap_pgoff+0xc5/0x190
  do_syscall_64+0x8c/0xbf0
  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 RIP: 0033:0x7f59b430aeed
 Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
 RSP: 002b:00007ffe538f9430 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 00007ffe538f94c0 RCX: 00007f59b430aeed
 RDX: 00007ffe538f94e0 RSI: 00000000c0181b01 RDI: 0000000000000003
 RBP: 00007ffe538f9480 R08: 0000000000000028 R09: 00007ffe538f9684
 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffe538f9684
 R13: 000000000000000c R14: 000000002b28d170 R15: 000000000000000c
  </TASK>
 ---[ end trace 0000000000000000 ]---

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/umem.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index cff4fcca2c345..edc34c69f0f23 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -55,7 +55,8 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 
 	if (dirty)
 		ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
-					   DMA_BIDIRECTIONAL, 0);
+					   DMA_BIDIRECTIONAL,
+					   DMA_ATTR_REQUIRE_COHERENT);
 
 	for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
 		unpin_user_page_range_dirty_lock(sg_page(sg),
@@ -169,7 +170,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 	unsigned long lock_limit;
 	unsigned long new_pinned;
 	unsigned long cur_base;
-	unsigned long dma_attr = 0;
+	unsigned long dma_attr = DMA_ATTR_REQUIRE_COHERENT;
 	struct mm_struct *mm;
 	unsigned long npages;
 	int pinned, ret;

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 8/8] mm/hmm: Indicate that HMM requires DMA coherency
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

HMM mirroring can work only on coherent systems without the SWIOTLB path.
Until the introduction of DMA_ATTR_REQUIRE_COHERENT, there was no reliable
way to indicate that, and various approximations were made:

int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
                      size_t nr_entries, size_t dma_entry_size)
{
<...>
        /*
         * The HMM API violates our normal DMA buffer ownership rules and can't
         * transfer buffer ownership.  The dma_addressing_limited() check is a
         * best approximation to ensure no swiotlb buffering happens.
         */
        dma_need_sync = !dev->dma_skip_sync;
        if (dma_need_sync || dma_addressing_limited(dev))
                return -EOPNOTSUPP;

So let's mark mapped buffers with DMA_ATTR_REQUIRE_COHERENT attribute
to prevent DMA debugging warnings for cache overlapped entries.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 mm/hmm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index f6c4ddff4bd61..5955f2f0c83db 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -778,7 +778,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
 	struct page *page = hmm_pfn_to_page(pfns[idx]);
 	phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
 	size_t offset = idx * map->dma_entry_size;
-	unsigned long attrs = 0;
+	unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT;
 	dma_addr_t dma_addr;
 	int ret;
 
@@ -871,7 +871,7 @@ bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
 	struct dma_iova_state *state = &map->state;
 	dma_addr_t *dma_addrs = map->dma_list;
 	unsigned long *pfns = map->pfn_list;
-	unsigned long attrs = 0;
+	unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT;
 
 	if ((pfns[idx] & valid_dma) != valid_dma)
 		return false;

-- 
2.53.0


^ permalink raw reply related

* [PATCH v2 5/8] dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
From: Leon Romanovsky @ 2026-03-11 19:08 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

DMA_ATTR_REQUIRE_COHERENT indicates that SWIOTLB must not be used.
Ensure the SWIOTLB path is declined whenever the DMA direct path is
selected.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 kernel/dma/direct.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index e89f175e9c2d0..6184ff303f080 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -84,7 +84,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 	dma_addr_t dma_addr;
 
 	if (is_swiotlb_force_bounce(dev)) {
-		if (attrs & DMA_ATTR_MMIO)
+		if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
 			return DMA_MAPPING_ERROR;
 
 		return swiotlb_map(dev, phys, size, dir, attrs);
@@ -98,7 +98,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 		dma_addr = phys_to_dma(dev, phys);
 		if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
 		    dma_kmalloc_needs_bounce(dev, size, dir)) {
-			if (is_swiotlb_active(dev))
+			if (is_swiotlb_active(dev) &&
+			    !(attrs & DMA_ATTR_REQUIRE_COHERENT))
 				return swiotlb_map(dev, phys, size, dir, attrs);
 
 			goto err_overflow;
@@ -123,7 +124,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
 {
 	phys_addr_t phys;
 
-	if (attrs & DMA_ATTR_MMIO)
+	if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
 		/* nothing to do: uncached and no swiotlb */
 		return;
 

-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 4/4] trace/preemptirq: Implement trace_irqflags hooks
From: Peter Zijlstra @ 2026-03-11 19:43 UTC (permalink / raw)
  To: Wander Lairson Costa
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton, open list,
	open list:TRACING, acme, williams, gmonaco
In-Reply-To: <20260311125021.197638-5-wander@redhat.com>

On Wed, Mar 11, 2026 at 09:50:18AM -0300, Wander Lairson Costa wrote:
> +#define local_irq_enable()				\
> +	do {						\
> +		if (tracepoint_enabled(irq_enable))	\
> +			trace_local_irq_enable();	\

I'm thinking you didn't even look at the assembly generated :/

Otherwise you would have written this like:

		if (tracepoint_enabled(irq_enable))
			__do_trace_local_irq_enable();

> +		raw_local_irq_enable();			\
> +	} while (0)

Again, this was one instruction, and you clearly didn't bother looking
at the mess you've generated :/

^ permalink raw reply

* Re: [PATCH v3 1/4] tracing/preemptirq: Optimize preempt_disable/enable() tracepoint overhead
From: Peter Zijlstra @ 2026-03-11 19:35 UTC (permalink / raw)
  To: Wander Lairson Costa
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton,
	open list:SCHEDULER, open list:TRACING, acme, williams, gmonaco
In-Reply-To: <20260311125021.197638-2-wander@redhat.com>

On Wed, Mar 11, 2026 at 09:50:15AM -0300, Wander Lairson Costa wrote:

> +extern void __trace_preempt_on(void);
> +extern void __trace_preempt_off(void);
> +
> +DECLARE_TRACEPOINT(preempt_enable);
> +DECLARE_TRACEPOINT(preempt_disable);
> +
> +#define __preempt_trace_enabled(type, val) \
> +	(tracepoint_enabled(preempt_##type) && preempt_count() == (val))
> +
> +static __always_inline void preempt_count_add(int val)
> +{
> +	__preempt_count_add(val);
> +
> +	if (__preempt_trace_enabled(disable, val))
> +		__trace_preempt_off();
> +}
> +
> +static __always_inline void preempt_count_sub(int val)
> +{
> +	if (__preempt_trace_enabled(enable, val))
> +		__trace_preempt_on();
> +
> +	__preempt_count_sub(val);
> +}
>  #else
>  #define preempt_count_add(val)	__preempt_count_add(val)
>  #define preempt_count_sub(val)	__preempt_count_sub(val)
>  #define preempt_count_dec_and_test() __preempt_count_dec_and_test()
>  #endif
>  
> +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
> +#define preempt_count_dec_and_test() \
> +	({ preempt_count_sub(1); should_resched(0); })
> +#endif

Why!?!

Why can't you simply have:

static __always_inline bool preempt_count_dec_and_test(void)
{
	if (__preempt_trace_enabled(enable, 1))
		__trace_preempt_on();

	return __preempt_count_dec_and_test();
}

Also, given how !x86 architectures were just complaining about how
terrible their preempt_enable() is, I'm really not liking this much at
all.

Currently the x86 preempt_disable() is _1_ instruction and
preempt_enable() is all of 3. Adding in these tracepoints will bloat
every single such site by at least another 4-5.

That's significant bloat, for really very little gain. Realistically
nobody is going to need these.

^ permalink raw reply

* Re: [PATCH v3 4/4] trace/preemptirq: Implement trace_irqflags hooks
From: Steven Rostedt @ 2026-03-11 19:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Wander Lairson Costa, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton, open list,
	open list:TRACING, acme, williams, gmonaco, Vineeth Pillai
In-Reply-To: <20260311194305.GT606826@noisy.programming.kicks-ass.net>

On Wed, 11 Mar 2026 20:43:05 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Mar 11, 2026 at 09:50:18AM -0300, Wander Lairson Costa wrote:
> > +#define local_irq_enable()				\
> > +	do {						\
> > +		if (tracepoint_enabled(irq_enable))	\
> > +			trace_local_irq_enable();	\  
> 
> I'm thinking you didn't even look at the assembly generated :/
> 
> Otherwise you would have written this like:
> 
> 		if (tracepoint_enabled(irq_enable))
> 			__do_trace_local_irq_enable();

Please don't use the internal functions outside of the tracepoint.h

Vineeth is currently working on a patch set to properly do that. It's going
to introduce:

  trace_invoke_<event>()

Which basically is just __do_trace_<event>(), but as a wrapper that can
handle updates that may be needed, but supplies a proper API where things
won't randomly break when __do_trace_<event>() changes.

-- Steve


> 
> > +		raw_local_irq_enable();			\
> > +	} while (0)  
> 
> Again, this was one instruction, and you clearly didn't bother looking
> at the mess you've generated :/

^ permalink raw reply

* Re: [PATCH v3 4/4] trace/preemptirq: Implement trace_irqflags hooks
From: Peter Zijlstra @ 2026-03-11 19:53 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Wander Lairson Costa, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton, open list,
	open list:TRACING, acme, williams, gmonaco, Vineeth Pillai
In-Reply-To: <20260311154842.0823790f@gandalf.local.home>

On Wed, Mar 11, 2026 at 03:48:42PM -0400, Steven Rostedt wrote:
> On Wed, 11 Mar 2026 20:43:05 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Wed, Mar 11, 2026 at 09:50:18AM -0300, Wander Lairson Costa wrote:
> > > +#define local_irq_enable()				\
> > > +	do {						\
> > > +		if (tracepoint_enabled(irq_enable))	\
> > > +			trace_local_irq_enable();	\  
> > 
> > I'm thinking you didn't even look at the assembly generated :/
> > 
> > Otherwise you would have written this like:
> > 
> > 		if (tracepoint_enabled(irq_enable))
> > 			__do_trace_local_irq_enable();
> 
> Please don't use the internal functions outside of the tracepoint.h
> 
> Vineeth is currently working on a patch set to properly do that. It's going
> to introduce:
> 
>   trace_invoke_<event>()
> 
> Which basically is just __do_trace_<event>(), but as a wrapper that can
> handle updates that may be needed, but supplies a proper API where thing
> wont randomly break when __do_trace_<event>() changes.

That's like a 3 line patch, hardly worth the effort. It's not like it'll
be hard to find and fix any users if you do ever change that.

^ permalink raw reply

* Re: [PATCH v3 4/4] trace/preemptirq: Implement trace_irqflags hooks
From: Steven Rostedt @ 2026-03-11 20:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Wander Lairson Costa, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton, open list,
	open list:TRACING, acme, williams, gmonaco, Vineeth Pillai
In-Reply-To: <20260311195310.GU606826@noisy.programming.kicks-ass.net>

On Wed, 11 Mar 2026 20:53:10 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> > Which basically is just __do_trace_<event>(), but as a wrapper that can
> > handle updates that may be needed, but supplies a proper API where thing
> > wont randomly break when __do_trace_<event>() changes.  
> 
> That's like a 3 line patch, hardly worth the effort. Its not like it'll
> be hard to find and fix any users if you do ever change that.

No, but I prefer clean code, and not hacks that use internal functions with
underscores in their names. Not to mention, it properly handles different
cases:

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 22ca1c8b54f3..07219316a8e1 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -294,6 +294,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			WARN_ONCE(!rcu_is_watching(),			\
 				  "RCU not watching for tracepoint");	\
 		}							\
+	}								\
+	static inline void trace_invoke_##name(proto)			\
+	{								\
+		__do_trace_##name(args);				\
 	}
 
 #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto)		\
@@ -313,6 +317,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			WARN_ONCE(!rcu_is_watching(),			\
 				  "RCU not watching for tracepoint");	\
 		}							\
+	}								\
+	static inline void trace_invoke_##name(proto)			\
+	{								\
+		might_fault();						\
+		__do_trace_##name(args);				\
 	}


Then it goes through and updates every location that has a:

	if (trace_<event>_enabled()) {
		[..]
		trace_<event>();
	}

With the new proper API.

-- Steve

^ permalink raw reply related

* Re: [PATCH v3 4/4] trace/preemptirq: Implement trace_irqflags hooks
From: Peter Zijlstra @ 2026-03-11 20:46 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Wander Lairson Costa, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton, open list,
	open list:TRACING, acme, williams, gmonaco, Vineeth Pillai
In-Reply-To: <20260311160714.1e6b7a37@gandalf.local.home>

On Wed, Mar 11, 2026 at 04:07:14PM -0400, Steven Rostedt wrote:
> On Wed, 11 Mar 2026 20:53:10 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > Which basically is just __do_trace_<event>(), but as a wrapper that can
> > > handle updates that may be needed, but supplies a proper API where thing
> > > wont randomly break when __do_trace_<event>() changes.  
> > 
> > That's like a 3 line patch, hardly worth the effort. Its not like it'll
> > be hard to find and fix any users if you do ever change that.
> 
> No, but I prefer clean code, and not hacks that use internal functions with
> underscores in their names. Not to mention, it properly handles different
> cases:
> 
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 22ca1c8b54f3..07219316a8e1 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -294,6 +294,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  			WARN_ONCE(!rcu_is_watching(),			\
>  				  "RCU not watching for tracepoint");	\
>  		}							\
> +	}								\
> +	static inline void trace_invoke_##name(proto)			\
> +	{								\
> +		__do_trace_##name(args);				\
>  	}
>  
>  #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto)		\
> @@ -313,6 +317,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  			WARN_ONCE(!rcu_is_watching(),			\
>  				  "RCU not watching for tracepoint");	\
>  		}							\
> +	}								\
> +	static inline void trace_invoke_##name(proto)			\
> +	{								\
> +		might_fault();						\
> +		__do_trace_##name(args);				\
>  	}
> 
> 
> Then it goes through and updates every location that has a:
> 
> 	if (trace_<event>_enabled()) {
> 		[..]
> 		trace_<event>();
> 	}

We have Coccinelle for that :-), and while I absolutely suck at writing
Coccinelle, I had some limited success using Gemini to write some for me
the other day.

^ permalink raw reply

* Re: [PATCH v14 18/30] tracing: Check for undefined symbols in simple_ring_buffer
From: Nathan Chancellor @ 2026-03-11 22:18 UTC (permalink / raw)
  To: Vincent Donnefort
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui, kvmarm,
	linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel
In-Reply-To: <20260309162516.2623589-19-vdonnefort@google.com>

Hi Vincent,

On Mon, Mar 09, 2026 at 04:25:04PM +0000, Vincent Donnefort wrote:
> The simple_ring_buffer implementation must remain simple enough to be
> used by the pKVM hypervisor. Prevent the object build if unresolved
> symbols are found.
> 
> Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
> 
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index d106beca8d7f..3182e1bc1cf7 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -132,4 +132,20 @@ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
>  obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
>  obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
>  
> +#
> +# simple_ring_buffer is used by the pKVM hypervisor which does not have access
> +# to all kernel symbols. Fail the build if forbidden symbols are found.
> +#
> +UNDEFINED_ALLOWLIST := memset alt_cb_patch_nops __x86 __ubsan __asan __kasan __gcov __aeabi_unwind
> +UNDEFINED_ALLOWLIST += __stack_chk_fail stackleak_track_stack __ref_stack __sanitizer
> +UNDEFINED_ALLOWLIST := $(addprefix -e , $(UNDEFINED_ALLOWLIST))
> +
> +quiet_cmd_check_undefined = NM      $<
> +      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(UNDEFINED_ALLOWLIST)`"

This check triggers when building allmodconfig targeting arm, arm64,
powerpc, and x86_64 (at least, I did not test more at the moment) with
clang. If this is a hard failure, this really needs to print something
out to the developer/user to help them debug off the bat, versus having
to manually dig the $(NM) command out from the .cmd file or V=1. I came
up with

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3182e1bc1cf7..c725b06876bc 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -141,7 +141,13 @@ UNDEFINED_ALLOWLIST += __stack_chk_fail stackleak_track_stack __ref_stack __sani
 UNDEFINED_ALLOWLIST := $(addprefix -e , $(UNDEFINED_ALLOWLIST))
 
 quiet_cmd_check_undefined = NM      $<
-      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(UNDEFINED_ALLOWLIST)`"
+      cmd_check_undefined = \
+          undefsyms=$$($(NM) -u $< | grep -v $(UNDEFINED_ALLOWLIST) || true); \
+          if [ -n "$$undefsyms" ]; then \
+              echo "Unexpected symbols in $<:" >&2; \
+              echo "$$undefsyms" >&2; \
+              false; \
+          fi
 
 $(obj)/%.o.checked: $(obj)/%.o FORCE
 	$(call if_changed,check_undefined)
--

which prints

  Unexpected symbols in kernel/trace/simple_ring_buffer.o:
                   U llvm_gcda_emit_arcs
                   U llvm_gcda_emit_function
                   U llvm_gcda_end_file
                   U llvm_gcda_start_file
                   U llvm_gcda_summary_info
                   U llvm_gcov_init

for arm64, which makes sense since these are LLVM specific GCOV symbols,
so they should probably get the same treatment as the other ones:

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c725b06876bc..d464e3aa5bdd 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -136,8 +136,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
 # simple_ring_buffer is used by the pKVM hypervisor which does not have access
 # to all kernel symbols. Fail the build if forbidden symbols are found.
 #
-UNDEFINED_ALLOWLIST := memset alt_cb_patch_nops __x86 __ubsan __asan __kasan __gcov __aeabi_unwind
-UNDEFINED_ALLOWLIST += __stack_chk_fail stackleak_track_stack __ref_stack __sanitizer
+UNDEFINED_ALLOWLIST := memset alt_cb_patch_nops __x86 __ubsan __asan __kasan __gcov llvm_gcda llvm_gcov
+UNDEFINED_ALLOWLIST += __aeabi_unwind __stack_chk_fail stackleak_track_stack __ref_stack __sanitizer
 UNDEFINED_ALLOWLIST := $(addprefix -e , $(UNDEFINED_ALLOWLIST))
 
 quiet_cmd_check_undefined = NM      $<
--

For x86_64, I see

  Unexpected symbols in kernel/trace/simple_ring_buffer.o:
                   U __clear_pages_unrolled
                   U __memmove
                   U copy_page

which comes from the use of KCFI_ADDRESSABLE(), since allmodconfig has
CONFIG_CFI=y.

For powerpc (with both clang and GCC), I see

  Unexpected symbols in kernel/trace/simple_ring_buffer.o:
                   U .TOC.

For arm (with both clang and GCC), I see

  Unexpected symbols in kernel/trace/simple_ring_buffer.o:
           U __stack_chk_guard
           U warn_slowpath_fmt

Presumably adding all of those should be fine as well?

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d464e3aa5bdd..4f120cb8c79c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -137,7 +137,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
 # to all kernel symbols. Fail the build if forbidden symbols are found.
 #
 UNDEFINED_ALLOWLIST := memset alt_cb_patch_nops __x86 __ubsan __asan __kasan __gcov llvm_gcda llvm_gcov
-UNDEFINED_ALLOWLIST += __aeabi_unwind __stack_chk_fail stackleak_track_stack __ref_stack __sanitizer
+UNDEFINED_ALLOWLIST += __aeabi_unwind __stack_chk_fail __stack_chk_guard stackleak_track_stack __ref_stack __sanitizer
+UNDEFINED_ALLOWLIST += \.TOC\. __clear_pages_unrolled __memmove copy_page warn_slowpath_fmt
 UNDEFINED_ALLOWLIST := $(addprefix -e , $(UNDEFINED_ALLOWLIST))
 
 quiet_cmd_check_undefined = NM      $<
--

I don't mind sending a series for these, I just wanted to make sure I
was reasoning about everything correctly.

Cheers,
Nathan

^ permalink raw reply related

* Re: [PATCH 49/61] media: Prefer IS_ERR_OR_NULL over manual NULL check
From: Kieran Bingham @ 2026-03-11 23:03 UTC (permalink / raw)
  To: Philipp Hahn, amd-gfx, apparmor, bpf, ceph-devel, cocci, dm-devel,
	dri-devel, gfs2, intel-gfx, intel-wired-lan, iommu, kvm,
	linux-arm-kernel, linux-block, linux-bluetooth, linux-btrfs,
	linux-cifs, linux-clk, linux-erofs, linux-ext4, linux-fsdevel,
	linux-gpio, linux-hyperv, linux-input, linux-kernel, linux-leds,
	linux-media, linux-mips, linux-mm, linux-modules, linux-mtd,
	linux-nfs, linux-omap, linux-phy, lin 
  Cc: Shuah Khan, Mauro Carvalho Chehab
In-Reply-To: <20260310-b4-is_err_or_null-v1-49-bd63b656022d@avm.de>

Quoting Philipp Hahn (2026-03-10 11:49:15)
> Prefer using IS_ERR_OR_NULL() over using IS_ERR() and a manual NULL
> check.
> 
> Change generated with coccinelle.
> 
> To: Shuah Khan <skhan@linuxfoundation.org>
> To: Kieran Bingham <kieran.bingham@ideasonboard.com>
> To: Mauro Carvalho Chehab <mchehab@kernel.org>
> Cc: linux-media@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Philipp Hahn <phahn-oss@avm.de>
> ---
>  drivers/media/test-drivers/vimc/vimc-streamer.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/media/test-drivers/vimc/vimc-streamer.c b/drivers/media/test-drivers/vimc/vimc-streamer.c
> index 15d863f97cbf96b7ca7fbf3d7b6b6ec39fcc8ae3..da5aca50bcb4990c06f28e5a883eb398606991e9 100644
> --- a/drivers/media/test-drivers/vimc/vimc-streamer.c
> +++ b/drivers/media/test-drivers/vimc/vimc-streamer.c
> @@ -167,7 +167,7 @@ static int vimc_streamer_thread(void *data)
>                 for (i = stream->pipe_size - 1; i >= 0; i--) {
>                         frame = stream->ved_pipeline[i]->process_frame(
>                                         stream->ved_pipeline[i], frame);
> -                       if (!frame || IS_ERR(frame))
> +                       if (IS_ERR_OR_NULL(frame))

Reviewed-by: Kieran Bingham <kieran.bingham@ideasonboard.com>

>                                 break;
>                 }
>                 //wait for 60hz
> 
> -- 
> 2.43.0
>

^ permalink raw reply

* Re: [PATCH v3 4/4] trace/preemptirq: Implement trace_irqflags hooks
From: Steven Rostedt @ 2026-03-11 23:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Wander Lairson Costa, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Ben Segall, Mel Gorman, Valentin Schneider,
	Masami Hiramatsu, Mathieu Desnoyers, Andrew Morton, open list,
	open list:TRACING, acme, williams, gmonaco, Vineeth Pillai
In-Reply-To: <20260311204607.GF2277644@noisy.programming.kicks-ass.net>

On Wed, 11 Mar 2026 21:46:07 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> > Then it goes through and updates every location that has a:
> > 
> > 	if (trace_<event>_enabled()) {
> > 		[..]
> > 		trace_<event>();
> > 	}  
> 
> We have Cocinelle for that :-), and while I absolutely suck at writing
> Cocinelle, I had some limited success using Gemini to write some for me
> the other day.

Heh, I believe Vineeth used claude ;-)

-- Steve

^ permalink raw reply

* Re: [PATCH net 0/7] tcp: preserve advertised rwnd accounting across receive-memory decisions
From: Jakub Kicinski @ 2026-03-12  0:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wesley Atwell, Simon Baatz, davem, pabeni, ncardwell, dsahern,
	matttbe, martineau, netdev, mptcp, kuniyu, horms, geliang, corbet,
	skhan, rostedt, mhiramat, mathieu.desnoyers, 0x7f454c46,
	linux-doc, linux-trace-kernel, linux-kselftest, linux-kernel,
	linux-api
In-Reply-To: <CANn89i+dojcg=TDh6E1++g_TM7qdcpnyu47n2Q9DRW_w73TjzA@mail.gmail.com>

On Wed, 11 Mar 2026 09:34:32 +0100 Eric Dumazet wrote:
> Your series will heavily conflict with Simon's one
> 
> https://patchwork.kernel.org/project/netdevbpf/list/?series=1063486&state=%2A&archive=both
> 
> I suggest you rebase/retest/resend after we merge it.

Would it make sense to extend netdevsim and packetdrill to be able to
exercise scaling ratio a little more? Having it optionally clone the
skb and truesize += X would be trivial. IDK how many bugs this would
let us catch tho :(

^ permalink raw reply

* Re: [PATCH net 0/7] tcp: preserve advertised rwnd accounting across receive-memory decisions
From: Jakub Kicinski @ 2026-03-12  0:43 UTC (permalink / raw)
  To: Wesley Atwell
  Cc: davem, pabeni, edumazet, ncardwell, dsahern, matttbe, martineau,
	netdev, mptcp, kuniyu, horms, geliang, corbet, skhan, rostedt,
	mhiramat, mathieu.desnoyers, 0x7f454c46, linux-doc,
	linux-trace-kernel, linux-kselftest, linux-kernel, linux-api
In-Reply-To: <20260311075600.948413-1-atwellwea@gmail.com>

On Wed, 11 Mar 2026 01:55:53 -0600 Wesley Atwell wrote:
> Subject: [PATCH net 0/7] tcp: preserve advertised rwnd accounting across receive-memory decisions

when you repost please make sure you use "PATCH net-next v2" 
as the tag / prefix. "net" is a tree we use to fast track fixes.

^ permalink raw reply

* Fwd: [PATCH 09/12] dt-bindings: input: Document hid-over-spi DT schema
From: Jingyuan Liang @ 2026-03-12  0:58 UTC (permalink / raw)
  To: Rob Herring
  Cc: Jiri Kosina, Benjamin Tissoires, Jonathan Corbet, Mark Brown,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Dmitry Torokhov, Krzysztof Kozlowski, Conor Dooley, linux-input,
	linux-doc, LKML, linux-spi, linux-trace-kernel, devicetree,
	Henry Barnor, Dmitry Antipov, Jarrett Schultz
In-Reply-To: <CAEe3GZHSqepvjjopLwrWX3_n4+RnCeVVQnAO=Swixgu2z3OpUw@mail.gmail.com>

(Resending to the list. Apologies, I accidentally dropped the CCs on
my initial reply!)

On Tue, Mar 3, 2026 at 5:53 AM Rob Herring <robh@kernel.org> wrote:
>
> On Tue, Mar 3, 2026 at 12:14 AM Jingyuan Liang <jingyliang@chromium.org> wrote:
> >
> > Documentation describes the required and optional properties for
> > implementing Device Tree for a Microsoft G6 Touch Digitizer that
> > supports HID over SPI Protocol 1.0 specification.
> >
> > The properties are common to HID over SPI.
> >
> > Signed-off-by: Dmitry Antipov <dmanti@microsoft.com>
> > Signed-off-by: Jarrett Schultz <jaschultz@microsoft.com>
> > Signed-off-by: Jingyuan Liang <jingyliang@chromium.org>
> > ---
> >  .../devicetree/bindings/input/hid-over-spi.yaml    | 153 +++++++++++++++++++++
> >  1 file changed, 153 insertions(+)
> >
> > diff --git a/Documentation/devicetree/bindings/input/hid-over-spi.yaml b/Documentation/devicetree/bindings/input/hid-over-spi.yaml
> > new file mode 100644
> > index 000000000000..b623629ed9d3
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/input/hid-over-spi.yaml
> > @@ -0,0 +1,153 @@
> > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > +%YAML 1.2
> > +---
> > +$id: http://devicetree.org/schemas/input/hid-over-spi.yaml#
> > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > +
> > +title: HID over SPI Devices
> > +
> > +maintainers:
> > +  - Benjamin Tissoires <benjamin.tissoires@redhat.com>
> > +  - Jiri Kosina <jkosina@suse.cz>
> > +
> > +description: |+
> > +  HID over SPI provides support for various Human Interface Devices over the
> > +  SPI bus. These devices can be for example touchpads, keyboards, touch screens
> > +  or sensors.
> > +
> > +  The specification has been written by Microsoft and is currently available here:
> > +  https://www.microsoft.com/en-us/download/details.aspx?id=103325
> > +
> > +  If this binding is used, the kernel module spi-hid will handle the communication
> > +  with the device and the generic hid core layer will handle the protocol.
> > +
> > +allOf:
> > +  - $ref: /schemas/input/touchscreen/touchscreen.yaml#
> > +
> > +properties:
> > +  compatible:
> > +    oneOf:
> > +      - items:
> > +          - enum:
> > +              - microsoft,g6-touch-digitizer
> > +          - const: hid-over-spi
> > +      - description: Just "hid-over-spi" alone is allowed, but not recommended.
> > +        const: hid-over-spi
> > +
> > +  reg:
> > +    maxItems: 1
> > +
> > +  interrupts:
> > +    maxItems: 1
> > +
> > +  reset-gpios:
> > +    maxItems: 1
> > +    description:
> > +      GPIO specifier for the digitizer's reset pin (active low). The line must
> > +      be flagged with GPIO_ACTIVE_LOW.
> > +
> > +  vdd-supply:
> > +    description:
> > +      Regulator for the VDD supply voltage.
>
> Is this part of the spec? This won't scale for multiple devices with
> different power rails.

This is not part of the spec but is needed for power management. Is it okay if I
mark it as optional? Thank you.

>
> > +
> > +  input-report-header-address:
> > +    $ref: /schemas/types.yaml#/definitions/uint32
> > +    minimum: 0
> > +    maximum: 0xffffff
> > +    description:
> > +      A value to be included in the Read Approval packet, listing an address of
> > +      the input report header to be put on the SPI bus. This address has 24
> > +      bits.
> > +
> > +  input-report-body-address:
> > +    $ref: /schemas/types.yaml#/definitions/uint32
> > +    minimum: 0
> > +    maximum: 0xffffff
> > +    description:
> > +     A value to be included in the Read Approval packet, listing an address of
> > +      the input report body to be put on the SPI bus. This address has 24 bits.
> > +
> > +  output-report-address:
> > +    $ref: /schemas/types.yaml#/definitions/uint32
> > +    minimum: 0
> > +    maximum: 0xffffff
> > +    description:
> > +      A value to be included in the Output Report sent by the host, listing an
> > +      address where the output report on the SPI bus is to be written to. This
> > +      address has 24 bits.
> > +
> > +  post-power-on-delay-ms:
> > +    description:
> > +      Optional time in ms required by the device after enabling its regulators
> > +      or powering it on, before it is ready for communication.
>
> Drop. This should be implied by the compatible.

Thank you, I will fix this in v2.

>
> > +
> > +  minimal-reset-delay-ms:
> > +    description:
> > +      Optional minimum amount of time in ms that device needs to be in reset
> > +      state for the reset to take effect.
>
> Drop. This should be implied by the compatible.

I will fix this in v2.

>
> > +
> > +  read-opcode:
> > +  $ref: /schemas/types.yaml#/definitions/uint8
> > +    description:
> > +      Value to be used in Read Approval packets. 1 byte.
> > +
> > +  write-opcode:
> > +  $ref: /schemas/types.yaml#/definitions/uint8
> > +    description:
> > +      Value to be used in Write Approval packets. 1 byte.
>
> Why are these and the address properties above not defined by the
> spec? Do they vary for a specific device? If not, then they should be
> implied by the compatible.

These properties are not defined by the spec:

"The Input Report Address (header or body) and READ opcode are retrieved
from ACPI."

Same for the output report address and write opcode. I will drop these in v2.

>
> > +
> > +  hid-over-spi-flags:
> > +  $ref: /schemas/types.yaml#/definitions/uint16
> > +    description:
> > +      16 bits.
> > +      Bits 0-12 - Reserved (must be 0)
> > +      Bit 13 - SPI Write Mode. Possible values -
> > +        * 0b0- Writes are carried out in Single-SPI mode
> > +        * 0b1- Writes are carried out in the Multi-SPI mode specified by bits
> > +               14-15
> > +      Bits 14-15 - Multi-SPI Mode. Possible values -
> > +        * 0b00- Single SPI
> > +        * 0b01- Dual SPI
> > +        * 0b10- Quad SPI
>
> We already have SPI properties to define the bus width for read and write.

Will fix this in v2.

>
> > +
> > +required:
> > +  - compatible
> > +  - interrupts
> > +  - reset-gpios
> > +  - vdd-supply
> > +  - input-report-header-address
> > +  - input-report-body-address
> > +  - output-report-address
> > +  - read-opcode
> > +  - write-opcode
> > +  - hid-over-spi-flags
> > +
> > +additionalProperties: false
> > +
> > +examples:
> > +  - |
> > +    #include <dt-bindings/interrupt-controller/irq.h>
> > +    #include <dt-bindings/gpio/gpio.h>
> > +
> > +    spi {
> > +      #address-cells = <1>;
> > +      #size-cells = <0>;
> > +
> > +      hid@0 {
> > +        compatible = "hid-over-spi";
> > +        reg = <0x0>;
> > +        interrupts-extended = <&gpio 42 IRQ_TYPE_EDGE_FALLING>;
> > +        reset-gpios = <&gpio 27 GPIO_ACTIVE_LOW>;
> > +        vdd-supply = <&pm8350c_l3>;
> > +        pinctrl-names = "default";
> > +        pinctrl-0 = <&ts_d6_reset_assert &ts_d6_int_bias>;
> > +        input-report-header-address = <0x1000>;
> > +        input-report-body-address = <0x1004>;
> > +        output-report-address = <0x2000>;
> > +        read-opcode = <0x0b>;
> > +        write-opcode = <0x02>;
> > +        hid-over-spi-flags = <0x0000>;
> > +        post-power-on-delay-ms = <5>;
> > +        minimal-reset-delay-ms = <5>;
> > +      };
> > +    };
> > \ No newline at end of file
>
> Fix this.

Will fix this in v2.

>
> Rob

^ permalink raw reply

* Re: [PATCH net 0/7] tcp: preserve advertised rwnd accounting across receive-memory decisions
From: Eric Dumazet @ 2026-03-12  1:49 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Wesley Atwell, Simon Baatz, davem, pabeni, ncardwell, dsahern,
	matttbe, martineau, netdev, mptcp, kuniyu, horms, geliang, corbet,
	skhan, rostedt, mhiramat, mathieu.desnoyers, 0x7f454c46,
	linux-doc, linux-trace-kernel, linux-kselftest, linux-kernel,
	linux-api
In-Reply-To: <20260311174154.5fadb207@kernel.org>

On Thu, Mar 12, 2026 at 1:41 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 11 Mar 2026 09:34:32 +0100 Eric Dumazet wrote:
> > Your series will heavily conflict with Simon's one
> >
> > https://patchwork.kernel.org/project/netdevbpf/list/?series=1063486&state=%2A&archive=both
> >
> > I suggest you rebase/retest/resend after we merge it.
>
> Would it make sense to extend netdevsim and packetdrill to be able to
> exercise scaling ratio a little more? Having it optionally clone the
> skb and truesize += X would be trivial. IDK how many bugs this would
> let us catch tho :(

Yes, I think we mentioned this at some point.
packetdrill uses tun device.
Adding a TUN ioctl() to control how many additional bytes are added to
skb->truesize after tun allocates an skb is doable.

^ permalink raw reply

* Re: [PATCH v2 2/3] tracing: Update futex syscall trace event to show more commands
From: Masami Hiramatsu @ 2026-03-12  5:34 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Brian Geffon, John Stultz, Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260310201036.713629966@kernel.org>

On Tue, 10 Mar 2026 16:09:56 -0400
Steven Rostedt <rostedt@kernel.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> Make the futex syscall trace event a little more smart. Have it read the
> futex_op instruction to determine what else it can save and print. For the
> appropriate options, it will read the utime (timespec) parameter and show
> its output as well as the uaddr2.
> 
>  futex_requeue_p-1154    [004] .....   144.568339: sys_futex(uaddr: 0x5652b178d834 (0x482), FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
>  futex_requeue_p-1162    [002] .....   144.568696: sys_futex(uaddr: 0x7f763b7fece0 (2), FUTEX_WAIT|FUTEX_PRIVATE_FLAG, val: 2)
>  futex_requeue_p-1151    [000] .....   144.568700: sys_futex(uaddr: 0x7f763b7fece0 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
>  futex_requeue_p-1162    [002] .....   144.568705: sys_futex(uaddr: 0x7f763b7fece0 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
>  futex_requeue_p-1151    [000] .....   144.568715: sys_futex(uaddr: 0x7f764369e990 (0x483), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 1155)
>  futex_requeue_p-1155    [005] .....   144.569420: sys_futex(uaddr: 0x5652b178d838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7ffdacfba500 (143.890024054), uaddr2: 0x5652b178d834 (0), val3: 0)
>  futex_requeue_p-1155    [005] .....   144.569454: sys_futex(uaddr: 0x5652b178d834 (0), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

This looks good to me.

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

> ---
> Changes since v1: https://lore.kernel.org/all/20260303214942.587739736@kernel.org/
> 
> - Updated to have the print processing in kernel/futex/syscalls.c
> 
>  include/linux/futex.h         |  35 ++++++++-
>  kernel/futex/syscalls.c       |  48 ++++++-------
>  kernel/trace/trace_syscalls.c | 129 +++++++++++++++++++++++++++++-----
>  3 files changed, 164 insertions(+), 48 deletions(-)
> 
> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 9fc47aa01a8b..976fa257ab5c 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -82,8 +82,35 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
>  	      u32 __user *uaddr2, u32 val2, u32 val3);
>  int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
>  
> +static __always_inline bool futex_cmd_has_timeout(u32 cmd)
> +{
> +	switch (cmd) {
> +	case FUTEX_WAIT:
> +	case FUTEX_LOCK_PI:
> +	case FUTEX_LOCK_PI2:
> +	case FUTEX_WAIT_BITSET:
> +	case FUTEX_WAIT_REQUEUE_PI:
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static __always_inline bool futex_cmd_has_addr2(u32 cmd)
> +{
> +	switch (cmd) {
> +	case FUTEX_REQUEUE:
> +	case FUTEX_CMP_REQUEUE:
> +	case FUTEX_WAKE_OP:
> +	case FUTEX_WAIT_REQUEUE_PI:
> +		return true;
> +	}
> +	return false;
> +}
> +
>  #ifdef CONFIG_FTRACE_SYSCALLS
> -void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u32 *ptr);
> +void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
> +			 u32 *ptr1, u32 *ptr2, unsigned long *ts1,
> +			 unsigned long *ts2);
>  #endif
>  
>  #ifdef CONFIG_FUTEX_PRIVATE_HASH
> @@ -119,7 +146,11 @@ static inline int futex_hash_allocate_default(void)
>  static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
>  static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
>  static inline void futex_print_syscall(struct seq_buf *s, int nr_args,
> -				       unsigned long *args, u32 *ptr) { }
> +				       unsigned long *args, u32 *ptr1,
> +				       u32 *ptr2, unsigned long *ts1,
> +				       unsigned long *ts2) { }
> +static __always_inline bool futex_cmd_has_timeout(u32 cmd) { return false; }
> +static __always_inline bool futex_cmd_has_addr2(u32 cmd) { return false; }
>  #endif
>  
>  #endif
> diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
> index a1cd512aa502..a46706d6bc6c 100644
> --- a/kernel/futex/syscalls.c
> +++ b/kernel/futex/syscalls.c
> @@ -158,31 +158,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
>  	return -ENOSYS;
>  }
>  
> -static __always_inline bool futex_cmd_has_timeout(u32 cmd)
> -{
> -	switch (cmd) {
> -	case FUTEX_WAIT:
> -	case FUTEX_LOCK_PI:
> -	case FUTEX_LOCK_PI2:
> -	case FUTEX_WAIT_BITSET:
> -	case FUTEX_WAIT_REQUEUE_PI:
> -		return true;
> -	}
> -	return false;
> -}
> -
> -static __always_inline bool futex_cmd_has_addr2(u32 cmd)
> -{
> -	switch (cmd) {
> -	case FUTEX_REQUEUE:
> -	case FUTEX_CMP_REQUEUE:
> -	case FUTEX_WAKE_OP:
> -	case FUTEX_WAIT_REQUEUE_PI:
> -		return true;
> -	}
> -	return false;
> -}
> -
>  static __always_inline int
>  futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
>  {
> @@ -229,7 +204,9 @@ static const char * __futex_cmds[] =
>  	"FUTEX_LOCK_PI2",
>  };
>  
> -void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u32 *ptr)
> +void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
> +			 u32 *ptr1, u32 *ptr2, unsigned long *ts1,
> +			 unsigned long *ts2)
>  {
>  	unsigned int op, cmd;
>  	bool done = false;
> @@ -242,8 +219,8 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u3
>  		switch (i) {
>  		case 0:
>  			seq_buf_printf(s, "uaddr: 0x%lx", args[i]);
> -			if (ptr) {
> -				u32 val = *ptr;
> +			if (ptr1) {
> +				u32 val = *ptr1;
>  				if (val < 10)
>  					seq_buf_printf(s, " (%u)", val);
>  				else
> @@ -279,6 +256,15 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u3
>  				continue;
>  
>  			seq_buf_printf(s, ", timespec: 0x%lx", args[i]);
> +
> +			if (!ts1 || !ts2)
> +				continue;
> +
> +			if (!*ts1 && !*ts2) {
> +				seq_buf_puts(s, " (0)");
> +				continue;
> +			}
> +			seq_buf_printf(s, " (%lu.%09lu)", *ts1, *ts2);
>  			continue;
>  		case 4:
>  			if (!futex_cmd_has_addr2(cmd)) {
> @@ -286,6 +272,12 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u3
>  				continue;
>  			}
>  			seq_buf_printf(s, ", uaddr2: 0x%lx", args[i]);
> +
> +			if (!ptr2)
> +				continue;
> +
> +			seq_buf_printf(s, " (%x)", *ptr2);
> +
>  			continue;
>  		case 5:
>  			seq_buf_printf(s, ", val3: %lu", args[i]);
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 8cb3af569157..de1fa97547a3 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -239,21 +239,35 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
>  	return trace_handle_return(s);
>  }
>  
> +struct futex_data {
> +	u32		val1;
> +	u32		val2;
> +	unsigned long	ts1;
> +	unsigned long	ts2;
> +};
> +
>  static enum print_line_t
>  sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
>  		      struct trace_seq *s, struct trace_event *event, int ent_size)
>  {
> +	struct futex_data *data;
>  	void *end = (void *)trace + ent_size;
> -	void *ptr;
> +	unsigned long *ts1 = NULL, *ts2 = NULL;
> +	u32 *ptr1 = NULL, *ptr2 = NULL;
>  
>  	/* Set ptr to the user space copied area */
> -	ptr = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
> -	if (ptr + 4 > end)
> -		ptr = NULL;
> +	data = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
> +	if ((void *)data + sizeof(*data) <= end) {
> +		ptr1 = &data->val1;
> +		ptr2 = &data->val2;
> +		ts1 = &data->ts1;
> +		ts2 = &data->ts2;
> +	}
>  
>  	trace_seq_printf(s, "%s(", entry->name);
>  
> -	futex_print_syscall(&s->seq, entry->nb_args, trace->args, ptr);
> +	futex_print_syscall(&s->seq, entry->nb_args, trace->args, ptr1, ptr2,
> +			    ts1, ts2);
>  
>  	trace_seq_puts(s, ")\n");
>  
> @@ -472,9 +486,9 @@ sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
>  	pos += snprintf(buf + pos, LEN_OR_ZERO,
>  			"\"uaddr: 0x%%lx (0x%%lx) cmd=%%s%%s%%s");
>  	pos += snprintf(buf + pos, LEN_OR_ZERO,
> -			"  val: 0x%%x timeout/val2: 0x%%llx");
> +			"  val: 0x%%x timeout/val2: 0x%%llx (%%lu.%%lu)");
>  	pos += snprintf(buf + pos, LEN_OR_ZERO,
> -			" uaddr2: 0x%%lx val3: 0x%%x\", ");
> +			" uaddr2: 0x%%lx (0x%%lx) val3: 0x%%x\", ");
>  
>  	pos += snprintf(buf + pos, LEN_OR_ZERO,
>  			" REC->uaddr,");
> @@ -520,10 +534,12 @@ sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
>  			FUTEX_CLOCK_REALTIME);
>  
>  	pos += snprintf(buf + pos, LEN_OR_ZERO,
> -			" REC->val, REC->utime,");
> +			" REC->val, REC->utime, REC->__ts1, REC->__ts2,");
>  
>  	pos += snprintf(buf + pos, LEN_OR_ZERO,
> -			" REC->uaddr, REC->val3");
> +			" REC->uaddr,");
> +	pos += snprintf(buf + pos, LEN_OR_ZERO,
> +			" REC->__value2, REC->val3");
>  	return pos;
>  }
>  
> @@ -626,7 +642,39 @@ static int __init futex_fields(struct trace_event_call *call, int offset)
>  	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
>  				 FILTER_OTHER);
>  	if (ret)
> -		kfree(arg);
> +		goto free;
> +	offset += sizeof(int);
> +
> +	arg = kstrdup("__value2", GFP_KERNEL);
> +	if (WARN_ON_ONCE(!arg))
> +		return -ENOMEM;
> +	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
> +				 FILTER_OTHER);
> +	if (ret)
> +		goto free;
> +	offset += sizeof(int);
> +
> +	arg = kstrdup("__ts1", GFP_KERNEL);
> +	if (WARN_ON_ONCE(!arg))
> +		return -ENOMEM;
> +	ret = trace_define_field(call, "unsigned long", arg, offset,
> +				 sizeof(unsigned long), 0, FILTER_OTHER);
> +	if (ret)
> +		goto free;
> +	offset += sizeof(long);
> +
> +	arg = kstrdup("__ts2", GFP_KERNEL);
> +	if (WARN_ON_ONCE(!arg))
> +		return -ENOMEM;
> +	ret = trace_define_field(call, "unsigned long", arg, offset,
> +				 sizeof(unsigned long), 0, FILTER_OTHER);
> +	if (ret)
> +		goto free;
> +
> +	return 0;
> +
> +free:
> +	kfree(arg);
>  	return ret;
>  }
>  
> @@ -799,11 +847,51 @@ static int syscall_copy_user_array(char *buf, const char __user *ptr,
>  	return 0;
>  }
>  
> +struct tp_futex_data {
> +	u32			cmd;
> +	const u32		__user *val1;
> +	const u32 		__user *val2;
> +	void			__user *timeout;
> +};
> +
> +static int syscall_copy_futex(char *buf, const char __user *ptr,
> +			      size_t size, void *data)
> +{
> +	struct tp_futex_data *tp_data = data;
> +	struct futex_data *fdata = (void *)buf;
> +	int cmd = tp_data->cmd & FUTEX_CMD_MASK;
> +	int ret;
> +
> +	memset(fdata, 0, sizeof(*fdata));
> +
> +	if (tp_data->val1) {
> +		ret = __copy_from_user(&fdata->val1, tp_data->val1, 4);
> +		if (ret)
> +			return -1;
> +	}
> +
> +	if (tp_data->val2 && futex_cmd_has_addr2(cmd)) {
> +		ret = __copy_from_user(&fdata->val2, tp_data->val2, 4);
> +		if (ret)
> +			return -1;
> +	}
> +
> +	if (tp_data->timeout && futex_cmd_has_timeout(cmd)) {
> +		/* Copies both ts1 and ts2 */
> +		ret = __copy_from_user(&fdata->ts1, tp_data->timeout,
> +				       sizeof(long) * 2);
> +		if (ret)
> +			return -1;
> +	}
> +
> +	return 0;
> +}
> +
>  static int
>  syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
>  {
>  	struct syscall_user_buffer *sbuf;
> -	const char __user *ptr;
> +	struct tp_futex_data tp_data;
>  	char *buf;
>  
>  	/* buf_size of zero means user doesn't want user space read */
> @@ -815,14 +903,18 @@ syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
>  	if (!sbuf)
>  		return -1;
>  
> -	ptr = (char __user *)args[0];
> +	tp_data.cmd = args[1];
> +	tp_data.val1 = (u32 __user *)args[0];
> +	tp_data.val2 = (u32 __user *)args[4];
> +	tp_data.timeout = (u64 __user *)args[3];
>  
> -	*buffer = trace_user_fault_read(&sbuf->buf, ptr, 4, NULL, NULL);
> +	*buffer = trace_user_fault_read(&sbuf->buf, NULL, 0,
> +					syscall_copy_futex, &tp_data);
>  	if (!*buffer)
>  		return -1;
>  
> -	/* Add room for the value */
> -	*size += 4;
> +	/* Add room for values */
> +	*size += sizeof(struct futex_data);
>  
>  	buf = *buffer;
>  
> @@ -833,12 +925,13 @@ static void syscall_put_futex(struct syscall_metadata *sys_data,
>  			      struct syscall_trace_enter *entry,
>  			      char *buffer)
>  {
> -	u32 *ptr;
> +	struct futex_data *fdata = (void *)buffer;
> +	struct futex_data *data;
>  
>  	/* Place the futex key into the storage */
> -	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
> +	data = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
>  
> -	*ptr = *(u32 *)buffer;
> +	*data = *fdata;
>  }
>  
>  static char *sys_fault_user(unsigned int buf_size,
> -- 
> 2.51.0
> 
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [RFC PATCH v2 09/37] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
From: Ackerley Tng @ 2026-03-12  5:44 UTC (permalink / raw)
  To: kvm, linux-doc, linux-kernel, linux-kselftest, linux-trace-kernel,
	x86
  Cc: aik, andrew.jones, binbin.wu, bp, brauner, chao.p.peng,
	chao.p.peng, chenhuacai, corbet, dave.hansen, david, hpa,
	ira.weiny, jgg, jmattson, jroedel, jthoughton, maobibo,
	mathieu.desnoyers, maz, mhiramat, michael.roth, mingo, mlevitsk,
	oupton, pankaj.gupta, pbonzini, prsampat, qperret, ricarkol,
	rick.p.edgecombe, rientjes, rostedt, seanjc, shivankg, shuah,
	steven.price, tabba, tglx, vannapurve, vbabka, willy, wyihan,
	yan.y.zhao
In-Reply-To: <CAEvNRgFMNywpDRr+WeNsVj=MnsbhZp9H3j0QRDo_eOP+kGCNJw@mail.gmail.com>

Ackerley Tng <ackerleytng@google.com> writes:

Here's iteration 2 of the attributes, after getting a much clearer idea
of the use cases across platforms at the last guest_memfd biweekly.

Please comment in this context! I'm planning for this text to make it to
Documentation/virt/kvm/api.rst.

> Ackerley Tng <ackerleytng@google.com> writes:
>
>>
>> [...snip...]
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 23ec0b0c3e22..26e80745c8b4 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -117,7 +117,7 @@ description:
>>        x86 includes both i386 and x86_64.
>>
>>    Type:
>> -      system, vm, or vcpu.
>> +      system, vm, vcpu or guest_memfd.
>>
>>    Parameters:
>>        what parameters are accepted by the ioctl.
>> @@ -6523,11 +6523,22 @@ the capability to be present.
>>  ---------------------------------
>>
>>  :Capability: KVM_CAP_MEMORY_ATTRIBUTES2
>> -:Architectures: x86
>> -:Type: vm ioctl
>> +:Architectures: all
>> +:Type: vm, guest_memfd ioctl
>>  :Parameters: struct kvm_memory_attributes2 (in/out)
>>  :Returns: 0 on success, <0 on error
>>
>> +Errors:
>> +
>> +  ========== ===============================================================
>> +  EINVAL     The specified `offset` or `size` were invalid (e.g. not
>> +             page aligned, causes an overflow, or size is zero).
>> +  EFAULT     The parameter address was invalid.
>> +  EAGAIN     Some page within requested range had unexpected refcounts. The
>> +             offset of the page will be returned in `error_offset`.
>> +  ENOMEM     Ran out of memory trying to track private/shared state
      EOPNOTSUPP The specified content policy is not supported while
                 setting the requested attribute
>> +  ========== ===============================================================
>> +
>>  KVM_SET_MEMORY_ATTRIBUTES2 is an extension to
>>  KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to
>>  userspace.  The original (pre-extension) fields are shared with
>> @@ -6538,15 +6549,42 @@ Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES.
>>  ::
>>
>>    struct kvm_memory_attributes2 {
>> -	__u64 address;
>> +	/* in */
>> +	union {
>> +		__u64 address;
>> +		__u64 offset;
>> +	};
>>  	__u64 size;
>>  	__u64 attributes;
>>  	__u64 flags;
>> -	__u64 reserved[12];
>> +	/* out */
>> +	__u64 error_offset;
>> +	__u64 reserved[11];
>>    };
>>
>>    #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>>
>> +Set attributes for a range of offsets within a guest_memfd to
>> +KVM_MEMORY_ATTRIBUTE_PRIVATE to limit the specified guest_memfd backed
>> +memory range for guest use. Even if KVM_CAP_GUEST_MEMFD_MMAP is
>> +supported, after a successful call to set
>> +KVM_MEMORY_ATTRIBUTE_PRIVATE, the requested range will not be mappable
>> +into host userspace and will only be mappable by the guest.
>> +
>> +To allow the range to be mappable into host userspace again, call
>> +KVM_SET_MEMORY_ATTRIBUTES2 on the guest_memfd again with
>> +KVM_MEMORY_ATTRIBUTE_PRIVATE unset.
>> +
>> +If this ioctl returns -EAGAIN, the offset of the page with unexpected
>> +refcounts will be returned in `error_offset`. This can occur if there
>> +are transient refcounts on the pages, taken by other parts of the
>> +kernel.
>> +
>> +Userspace is expected to figure out how to remove all known refcounts
>> +on the shared pages, such as refcounts taken by get_user_pages(), and
>> +try the ioctl again. A possible source of these long term refcounts is
>> +if the guest_memfd memory was pinned in IOMMU page tables.
>> +

Memory *content* policies can be requested while setting memory
attributes. This defines:

  - What the host reads after a private to shared conversion
  - What the guest reads after a shared to private conversion (if
    applicable)

The policy definitions below provide more details:

``KVM_SET_MEMORY_ATTRIBUTES2_CONTENT_POLICY_ZERO`` (default)

  On a private to shared conversion, the host will read zeros from the
  converted memory on the next fault after successful return of the
  KVM_SET_MEMORY_ATTRIBUTES2 ioctl.

  This is not supported (-EOPNOTSUPP) for a shared to private
  conversion. While some CoCo implementations do zero memory contents
  such that the guest reads zeros after conversion, the guest is not
  expected to trust host-provided zeroing, hence as a UAPI policy, KVM
  does not make any such guarantees.

  For testing purposes, the KVM_X86_SW_PROTECTED_VM testing vehicle
  will support this policy and ensure zeroing for conversions in both
  directions.

``KVM_SET_MEMORY_ATTRIBUTES2_CONTENT_POLICY_PRESERVE``

  On private/shared conversions in both directions, memory contents
  will be preserved and readable. As a concrete example, if the host
  writes ``0xbeef`` to memory and converts the memory to shared, the
  guest will also read ``0xbeef``, after any necessary hardware or
  software provided decryption. After a reverse shared to private
  conversion, the host will also read ``0xbeef``.

  pKVM (ARM) is the first user of this policy. Since pKVM does not
  protect memory with encryption, a content policy to preserve memory
  will not involve any decryption. The guest will be able to
  read what the host wrote with full content preservation.

  For testing purposes, the KVM_X86_SW_PROTECTED_VM testing vehicle
  will support this policy and the contents of converted memory will
  be preserved.

``KVM_SET_MEMORY_ATTRIBUTES2_CONTENT_POLICY_NONE``

  This is an explicit request that KVM provide no guarantees on memory
  contents after conversion. Neither host nor guest should expect any
  guarantees about the memory contents after conversion.

  For testing purposes, the KVM_X86_SW_PROTECTED_VM testing vehicle will
  support this policy and every byte of converted memory will read
  ``0xab``.

>>  See also: :ref: `KVM_SET_MEMORY_ATTRIBUTES`.
>>
>
> [...snip...]
>

^ permalink raw reply

* Re: [RFC PATCH v2 0/3] disable optimistic spinning for ftrace_lock
From: Masami Hiramatsu @ 2026-03-12  8:06 UTC (permalink / raw)
  To: Yafang Shao
  Cc: David Laight, Peter Zijlstra, mingo, will, boqun, longman,
	rostedt, mhiramat, mark.rutland, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <CALOAHbDqYjJngQmmOaPRA=k4Bb8Or39YNp5R98f_op4dti2_TQ@mail.gmail.com>

Hi,

On Wed, 11 Mar 2026 21:40:32 +0800
Yafang Shao <laoar.shao@gmail.com> wrote:

> > Although there is a bigger issue of why on earth the code is reading the
> > list of filter functions at all - never mind all the time.
> 
> bpftrace reads the complete list of available functions into
> userspace, then performs matching against the target function to
> determine if it is traceable.

What about changing the bpftrace userspace tool to cache the list of available
functions? (Or just add an option to pass in the available function list?)
Then, you can just copy the function list somewhere and use it.

Of course we can do the same thing in the kernel, but I don't think
there is any reason to do it in the kernel instead of user space.

Thank you,

> 
> > I'll do it by hand when debugging, but I'd have thought anything using bpf
> > will know exactly where to add its hooks.
> 
> 
> -- 
> Regards
> Yafang


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v2 3/3] tracing: Show TID and flags for PI futex system call trace event
From: Masami Hiramatsu @ 2026-03-12  8:15 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Brian Geffon, John Stultz, Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260310201036.879130564@kernel.org>

On Tue, 10 Mar 2026 16:09:57 -0400
Steven Rostedt <rostedt@kernel.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> For the futex system call trace event for FUTEX_LOCK_PI and
> FUTEX_UNLOCK_PI commands, show the TID from the value (which is usually in
> hex) as well as translate the flags (DIED and WAITERS).
> 
>  pi_mutex_hammer-1098    [000] .....   121.876928: sys_futex(uaddr: 0x560f40cc8180 (0x450) tid: 1104, FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7f2f9d4b1e50 (0.000100000))
>  pi_mutex_hammer-1128    [000] .....   121.877120: sys_futex(uaddr: 0x560f40cc8180 (0x8000042a) tid: 1066 (WAITERS), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7f2f8e493e50 (0.000100000))
>  pi_mutex_hammer-1106    [000] .....   121.877242: sys_futex(uaddr: 0x560f40cc8180 (0x80000452) tid: 1106 (WAITERS), FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
> 
> This makes it easier to see the hand off of a mutex and who the owner was.
> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

Looks good to me.

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks!


> ---
> Changes since v1: https://lore.kernel.org/all/20260303174442.548b6524@gandalf.local.home/
> 
> - Updated to have the print processing in kernel/futex/syscalls.c
> 
>  kernel/futex/syscalls.c | 26 ++++++++++++++++++++++++--
>  1 file changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
> index a46706d6bc6c..24d912d9b20d 100644
> --- a/kernel/futex/syscalls.c
> +++ b/kernel/futex/syscalls.c
> @@ -211,6 +211,9 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
>  	unsigned int op, cmd;
>  	bool done = false;
>  
> +	op = args[1];
> +	cmd = op & FUTEX_CMD_MASK;
> +
>  	for (int i = 0; !done && i < nr_args; i++) {
>  
>  		if (seq_buf_has_overflowed(s))
> @@ -225,11 +228,30 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
>  					seq_buf_printf(s, " (%u)", val);
>  				else
>  					seq_buf_printf(s, " (0x%x)", val);
> +
> +				switch(cmd) {
> +				case FUTEX_LOCK_PI:
> +				case FUTEX_UNLOCK_PI:
> +					seq_buf_printf(s, " tid: %d",
> +						       val & FUTEX_TID_MASK);
> +
> +					if (!(val & (FUTEX_OWNER_DIED|FUTEX_WAITERS)))
> +						break;
> +
> +					seq_buf_puts(s, " (");
> +					if (val & FUTEX_WAITERS)
> +						seq_buf_puts(s, "WAITERS");
> +					if (val & FUTEX_OWNER_DIED) {
> +						if (op & FUTEX_WAITERS)
> +							seq_buf_putc(s, '|');
> +						seq_buf_puts(s, "DIED");
> +					}
> +					seq_buf_putc(s, ')');
> +					break;
> +				}
>  			}
>  			continue;
>  		case 1:
> -			op = args[i];
> -			cmd = op & FUTEX_CMD_MASK;
>  			if (cmd <= FUTEX_LOCK_PI2)
>  				seq_buf_printf(s, ", %s", __futex_cmds[cmd]);
>  			else
> -- 
> 2.51.0
> 
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox