Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [PATCH v2 4/6] io_uring/zcrx: add shared-memory notification statistics
From: Clément Léger @ 2026-05-18 15:35 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260518153532.2835502-1-cleger@meta.com>

Add support for an optional stats struct embedded in the refill queue
region, allowing userspace to monitor copy-fallback in real-time.

Userspace queries the stats struct size and alignment via
IO_URING_QUERY_ZCRX_NOTIF (notif_stats_size / notif_stats_off_alignment),
then provides a stats_offset in zcrx_notification_desc pointing to a
location within the refill queue region.

The kernel updates the stats counters in-place on every copy-fallback
event.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 include/uapi/linux/io_uring/query.h | 12 +++++++
 include/uapi/linux/io_uring/zcrx.h  | 15 ++++++--
 io_uring/query.c                    | 16 +++++++++
 io_uring/zcrx.c                     | 54 +++++++++++++++++++++++++++--
 io_uring/zcrx.h                     |  1 +
 5 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 95500759cc13..1a68eca7c6b4 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -23,6 +23,7 @@ enum {
 	IO_URING_QUERY_OPCODES			= 0,
 	IO_URING_QUERY_ZCRX			= 1,
 	IO_URING_QUERY_SCQ			= 2,
+	IO_URING_QUERY_ZCRX_NOTIF		= 3,
 
 	__IO_URING_QUERY_MAX,
 };
@@ -62,6 +63,17 @@ struct io_uring_query_zcrx {
 	__u64 __resv2;
 };
 
+struct io_uring_query_zcrx_notif {
+	/* Bitmask of supported ZCRX_NOTIF_* flags */
+	__u32 notif_flags;
+	/* Size of io_uring_zcrx_notif_stats */
+	__u32 notif_stats_size;
+	/* Required alignment for the stats struct within the region (ie stats_offset) */
+	__u32 notif_stats_off_alignment;
+	__u32 __resv1;
+	__u64 __resv2[4];
+};
+
 struct io_uring_query_scq {
 	/* The SQ/CQ rings header size */
 	__u64 hdr_size;
diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index 3f7b72b09878..384e185a180c 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -75,11 +75,22 @@ enum zcrx_notification_type {
 	__ZCRX_NOTIF_TYPE_LAST,
 };
 
+enum zcrx_notification_desc_flags {
+	/* If set, stats_offset holds a valid offset to a notif_stats struct */
+	ZCRX_NOTIF_DESC_FLAG_STATS = 1 << 0,
+};
+
+struct io_uring_zcrx_notif_stats {
+	__u64	copy_count;	/* cumulative copy-fallback CQEs */
+	__u64	copy_bytes;	/* cumulative bytes copied */
+};
+
 struct zcrx_notification_desc {
 	__u64	user_data;
 	__u32	type_mask;
-	__u32	__resv1;
-	__u64	__resv2[10];
+	__u32	flags; /* see enum zcrx_notification_desc_flags */
+	__u64	stats_offset; /* offset from the beginning of refill ring region for stats */
+	__u64	__resv2[9];
 };
 
 /*
diff --git a/io_uring/query.c b/io_uring/query.c
index c1704d088374..d17a83645bcd 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -9,6 +9,7 @@
 union io_query_data {
 	struct io_uring_query_opcode opcodes;
 	struct io_uring_query_zcrx zcrx;
+	struct io_uring_query_zcrx_notif zcrx_notif;
 	struct io_uring_query_scq scq;
 };
 
@@ -44,6 +45,18 @@ static ssize_t io_query_zcrx(union io_query_data *data)
 	return sizeof(*e);
 }
 
+static ssize_t io_query_zcrx_notif(union io_query_data *data)
+{
+	struct io_uring_query_zcrx_notif *e = &data->zcrx_notif;
+
+	e->notif_flags = ZCRX_NOTIF_TYPE_MASK;
+	e->notif_stats_size = sizeof(struct io_uring_zcrx_notif_stats);
+	e->notif_stats_off_alignment = __alignof__(struct io_uring_zcrx_notif_stats);
+	e->__resv1 = 0;
+	memset(&e->__resv2, 0, sizeof(e->__resv2));
+	return sizeof(*e);
+}
+
 static ssize_t io_query_scq(union io_query_data *data)
 {
 	struct io_uring_query_scq *e = &data->scq;
@@ -83,6 +96,9 @@ static int io_handle_query_entry(union io_query_data *data, void __user *uhdr,
 	case IO_URING_QUERY_ZCRX:
 		ret = io_query_zcrx(data);
 		break;
+	case IO_URING_QUERY_ZCRX_NOTIF:
+		ret = io_query_zcrx_notif(data);
+		break;
 	case IO_URING_QUERY_SCQ:
 		ret = io_query_scq(data);
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index f31f2ca0f7ec..2881ad76bacc 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -415,6 +415,7 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 	io_free_region(ifq->user, &ifq->rq_region);
 	ifq->rq.ring = IO_URING_PTR_POISON;
 	ifq->rq.rqes = IO_URING_PTR_POISON;
+	ifq->notif_stats = IO_URING_PTR_POISON;
 }
 
 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
@@ -855,6 +856,33 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
 	return ret;
 }
 
+static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq,
+				     const struct io_uring_zcrx_ifq_reg *reg,
+				     const struct zcrx_notification_desc *notif)
+{
+	size_t stats_off = notif->stats_offset;
+	size_t used, end;
+
+	used = reg->offsets.rqes +
+	       sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+
+	if (!IS_ALIGNED(stats_off, __alignof__(struct io_uring_zcrx_notif_stats)))
+		return -EINVAL;
+	if (stats_off < used)
+		return -ERANGE;
+	if (check_add_overflow(stats_off,
+			       sizeof(struct io_uring_zcrx_notif_stats),
+			       &end))
+		return -ERANGE;
+	if (end > io_region_size(&ifq->rq_region))
+		return -ERANGE;
+
+	ifq->notif_stats = io_region_get_ptr(&ifq->rq_region) + stats_off;
+	memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats));
+
+	return 0;
+}
+
 int io_register_zcrx(struct io_ring_ctx *ctx,
 		     struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -908,7 +936,13 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
 		return -EINVAL;
-	if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
+	if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS)
+		return -EINVAL;
+	if (!(notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS)) {
+		if (notif.stats_offset)
+			return -EINVAL;
+	}
+	if (!mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
 		return -EINVAL;
 
 	ifq = io_zcrx_ifq_alloc(ctx);
@@ -939,6 +973,12 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
 	if (ret)
 		goto err;
 
+	if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) {
+		ret = zcrx_validate_notif_stats(ifq, &reg, &notif);
+		if (ret)
+			goto err;
+	}
+
 	ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
 
 	if (!(reg.flags & ZCRX_REG_NODEV)) {
@@ -1154,6 +1194,11 @@ static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
 	kmem_cache_free(req_cachep, req);
 }
 
+static void zcrx_stat_add(__u64 *p, s64 v)
+{
+	WRITE_ONCE(*p, READ_ONCE(*p) + v);
+}
+
 static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
 {
 	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
@@ -1537,8 +1582,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 	int ret;
 
 	ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
-	if (ret > 0)
+	if (ret > 0) {
+		if (ifq->notif_stats) {
+			zcrx_stat_add(&ifq->notif_stats->copy_count, 1);
+			zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret);
+		}
 		zcrx_send_notif(ifq, ZCRX_NOTIF_COPY);
+	}
 
 	return ret;
 }
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 203b3049e14b..e1aab76c310d 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -81,6 +81,7 @@ struct io_zcrx_ifq {
 	u32				allowed_notif_mask;
 	u32				fired_notifs;
 	u64				notif_data;
+	struct io_uring_zcrx_notif_stats *notif_stats;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v2 5/6] Documentation: networking: document zcrx notifications and statistics
From: Clément Léger @ 2026-05-18 15:35 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260518153532.2835502-1-cleger@meta.com>

Document the zcrx notification system and shared-memory statistics
that were introduced to let userspace monitor zero-copy receive health.
The notification section covers the two notification types
(ZCRX_NOTIF_NO_BUFFERS, ZCRX_NOTIF_COPY), registration via
zcrx_notification_desc, and the fire-once / re-arm mechanism via
ZCRX_CTRL_ARM_NOTIFICATION. The statistics section covers the optional
shared-memory io_uring_zcrx_notif_stats structure placed in the refill
ring region, including how to query its layout via
IO_URING_QUERY_ZCRX_NOTIF.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 Documentation/networking/iou-zcrx.rst | 121 ++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst
index 7f3f4b2e6cf2..442760a1ca03 100644
--- a/Documentation/networking/iou-zcrx.rst
+++ b/Documentation/networking/iou-zcrx.rst
@@ -196,6 +196,127 @@ Return buffers back to the kernel to be used again::
   rqe->len = cqe->res;
   IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail);
 
+Notifications
+-------------
+
+When zero-copy receive encounters conditions that impact performance or
+functionality, the kernel can notify userspace via dedicated CQE notifications.
+The application must register a notification descriptor during
+``IORING_REGISTER_ZCRX_IFQ`` to receive them. Notifications are sent
+individually and are not batched with other CQEs. Each notification CQE reports
+a single notification in ``cqe->res``.
+
+Supported features can be detected by checking for ``ZCRX_FEATURE_NOTIFICATION``
+in the features bitmask returned by ``IO_URING_QUERY_ZCRX``.
+
+**Notification types**
+
+``ZCRX_NOTIF_NO_BUFFERS``
+  Fired when the page pool fails to allocate because the zcrx buffer area is
+  exhausted.
+
+``ZCRX_NOTIF_COPY``
+  Fired when a received fragment could not be delivered zero-copy and was
+  instead copied into a buffer.
+
+**Registering notifications**
+
+Allocate and fill a ``struct zcrx_notification_desc``::
+
+  struct zcrx_notification_desc notif = {
+    .user_data = MY_NOTIF_USER_DATA,
+    .type_mask = ZCRX_NOTIF_NO_BUFFERS | ZCRX_NOTIF_COPY,
+  };
+
+  reg.notif_desc = (__u64)(unsigned long)&notif;
+
+``user_data`` is the value that will appear in the notification CQE's
+``user_data`` field. ``type_mask`` selects which notification types the
+application wants to receive.
+
+When a registered event occurs, the kernel posts a CQE with the specified
+``user_data`` and ``cqe->res`` set to a bitmask of the triggered notification
+types.
+
+**Rate limiting**
+
+Each notification type fires once until the application explicitly re-arms it.
+To re-arm, issue ``IORING_REGISTER_ZCRX_CTRL`` with
+``ZCRX_CTRL_ARM_NOTIFICATION``::
+
+  struct zcrx_ctrl ctrl = {
+    .zcrx_id = zcrx_id,
+    .op = ZCRX_CTRL_ARM_NOTIFICATION,
+    .zc_arm_notif = {
+      .notif_type = ZCRX_NOTIF_NO_BUFFERS,
+    },
+  };
+
+  io_uring_register(ring_fd, IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
+
+Only notification types that have previously fired can be re-armed.
+
+Notification statistics
+-----------------------
+
+In addition to CQE-based notifications, the kernel can maintain a shared-memory
+statistics structure that is updated on every relevant event. All stats are
+updated regardless of which notification flags were registered.
+
+The statistics structure layout and alignment requirements can be queried via
+``IO_URING_QUERY_ZCRX_NOTIF``. The application must query the structure size
+and alignment requirements so that it allocates enough memory for the region
+to fit both the refill ring and the stats structure::
+
+  struct io_uring_query_zcrx_notif notif_query = {};
+  struct io_uring_query_hdr hdr = {
+    .query_op = IO_URING_QUERY_ZCRX_NOTIF,
+    .size = sizeof(notif_query),
+    .query_data = (__u64)(unsigned long)&notif_query,
+  };
+
+  io_uring_register(ring_fd, IORING_REGISTER_QUERY, &hdr, 1);
+
+  __u32 notif_stats_size = notif_query.notif_stats_size;
+  __u32 notif_stats_off_alignment = notif_query.notif_stats_off_alignment;
+
+To enable statistics, place the stats structure after the refill ring entries
+within the same mapped region, and set the ``ZCRX_NOTIF_DESC_FLAG_STATS`` flag
+in the notification descriptor::
+
+  /* Compute offset for the stats struct (after refill ring entries) */
+  size_t stats_offset = ALIGN_UP(ring_size, notif_stats_off_alignment);
+  ring_size = stats_offset + notif_stats_size;
+  ring_size = ALIGN_UP(ring_size, PAGE_SIZE);
+
+  /* Map the region with the extra space */
+  ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+  struct zcrx_notification_desc notif = {
+    .user_data = MY_NOTIF_USER_DATA,
+    .type_mask = ZCRX_NOTIF_COPY,
+    .flags = ZCRX_NOTIF_DESC_FLAG_STATS,
+    .stats_offset = stats_offset,
+  };
+
+The ``stats_offset`` must satisfy the alignment reported by
+``notif_stats_off_alignment`` and must point to a location within the mapped
+region that does not overlap with the refill ring header or entries.
+
+Applications can read the stat counters at any time::
+
+  volatile struct io_uring_zcrx_notif_stats *stats =
+    (void *)((char *)ring_ptr + stats_offset);
+
+  printf("copy fallbacks: %llu (%llu bytes)\n",
+         IO_URING_READ_ONCE(stats->copy_count),
+         IO_URING_READ_ONCE(stats->copy_bytes));
+
+``copy_count`` is incremented each time a fragment is copied instead of being
+delivered via zero-copy. ``copy_bytes`` accumulates the total number of bytes
+copied.
+
 Area chunking
 -------------
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v2 6/6] selftests: iou-zcrx: add notification and stats test for zcrx
From: Clément Léger @ 2026-05-18 15:35 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260518153532.2835502-1-cleger@meta.com>

Add a selftest to verify that ZCRX notifications are properly delivered
to userspace and that the shared-memory notification stats (copy_count,
copy_bytes) are correctly incremented when zero-copy RX falls back to
copying or when it runs out of buffers.

The test registers a notification descriptor during
IORING_REGISTER_ZCRX_IFQ with a stats region placed after the refill
queue entries. A new -n flag verifies that the copy fallback is
triggered, and the -b/-a flags allow checking for the out-of-buffer
notification.

To reliably trigger copy fallback, the Python test uses a new
single_no_flow() setup variant that configures tcp-data-split and RSS
but without ethtool flow rule. Without flow steering, traffic arrives
on non-zcrx queues as regular pages, forcing the kernel copy-fallback
path in io_zcrx_copy_frag().

Out-of-buffer notification is verified by using a smaller receive area
and by avoiding recycling the buffers so that the kernel runs out of
buffers quickly.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 .../selftests/drivers/net/hw/iou-zcrx.c       | 114 ++++++++++++++++--
 .../selftests/drivers/net/hw/iou-zcrx.py      |  49 +++++++-
 2 files changed, 151 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 240d13dbc54e..78a43ede77ed 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -52,7 +52,27 @@ struct t_io_uring_zcrx_ifq_reg {
 	struct io_uring_zcrx_offsets offsets;
 	__u32	zcrx_id;
 	__u32	rx_buf_len;
-	__u64	__resv[3];
+	__u64	notif_desc;
+	__u64	__resv[2];
+};
+
+#define ZCRX_NOTIF_NO_BUFFERS		0
+#define ZCRX_NOTIF_COPY			1
+#define ZCRX_NOTIF_DESC_FLAG_STATS	(1 << 0)
+
+#define NOTIF_USER_DATA			3
+
+struct t_zcrx_notification_desc {
+	__u64	user_data;
+	__u32	type_mask;
+	__u32	flags;
+	__u64	stats_offset;
+	__u64	__resv2[9];
+};
+
+struct t_io_uring_zcrx_notif_stats {
+	__u64	copy_count;
+	__u64	copy_bytes;
 };
 
 static long page_size;
@@ -84,7 +104,10 @@ static int cfg_oneshot_recvs;
 static int cfg_send_size = SEND_SIZE;
 static struct sockaddr_in6 cfg_addr;
 static unsigned int cfg_rx_buf_len;
+static size_t cfg_area_size;
 static bool cfg_dry_run;
+static bool cfg_copy_fallback;
+static bool cfg_no_buffers;
 
 static char *payload;
 static void *area_ptr;
@@ -95,6 +118,9 @@ static unsigned long area_token;
 static int connfd;
 static bool stop;
 static size_t received;
+static unsigned int received_notif_type;
+static bool received_notif;
+static size_t notif_stats_offset;
 
 static unsigned long gettimeofday_ms(void)
 {
@@ -142,6 +168,7 @@ static void setup_zcrx(struct io_uring *ring)
 {
 	unsigned int ifindex;
 	unsigned int rq_entries = 4096;
+	size_t area_size = cfg_area_size ? cfg_area_size : AREA_SIZE;
 	int ret;
 
 	ifindex = if_nametoindex(cfg_ifname);
@@ -150,7 +177,7 @@ static void setup_zcrx(struct io_uring *ring)
 
 	if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) {
 		area_ptr = mmap(NULL,
-				AREA_SIZE,
+				area_size,
 				PROT_READ | PROT_WRITE,
 				MAP_ANONYMOUS | MAP_PRIVATE |
 				MAP_HUGETLB | MAP_HUGE_2MB,
@@ -162,7 +189,7 @@ static void setup_zcrx(struct io_uring *ring)
 		}
 	} else {
 		area_ptr = mmap(NULL,
-				AREA_SIZE,
+				area_size,
 				PROT_READ | PROT_WRITE,
 				MAP_ANONYMOUS | MAP_PRIVATE,
 				0,
@@ -172,6 +199,12 @@ static void setup_zcrx(struct io_uring *ring)
 	}
 
 	ring_size = get_refill_ring_size(rq_entries);
+
+	if (cfg_copy_fallback) {
+		notif_stats_offset = ring_size;
+		ring_size += ALIGN_UP(sizeof(struct t_io_uring_zcrx_notif_stats), page_size);
+	}
+
 	ring_ptr = mmap(NULL,
 			ring_size,
 			PROT_READ | PROT_WRITE,
@@ -187,10 +220,11 @@ static void setup_zcrx(struct io_uring *ring)
 
 	struct io_uring_zcrx_area_reg area_reg = {
 		.addr = (__u64)(unsigned long)area_ptr,
-		.len = AREA_SIZE,
+		.len = area_size,
 		.flags = 0,
 	};
 
+	struct t_zcrx_notification_desc notif_desc;
 	struct t_io_uring_zcrx_ifq_reg reg = {
 		.if_idx = ifindex,
 		.if_rxq = cfg_queue_id,
@@ -200,11 +234,32 @@ static void setup_zcrx(struct io_uring *ring)
 		.rx_buf_len = cfg_rx_buf_len,
 	};
 
+	if (cfg_copy_fallback || cfg_no_buffers) {
+		__u32 type_mask = 0;
+
+		if (cfg_copy_fallback)
+			type_mask = 1 << ZCRX_NOTIF_COPY;
+		if (cfg_no_buffers)
+			type_mask = 1 << ZCRX_NOTIF_NO_BUFFERS;
+
+		memset(&notif_desc, 0, sizeof(notif_desc));
+		notif_desc.user_data = NOTIF_USER_DATA;
+		notif_desc.type_mask = type_mask;
+		if (cfg_copy_fallback) {
+			notif_desc.flags = ZCRX_NOTIF_DESC_FLAG_STATS;
+			notif_desc.stats_offset = notif_stats_offset;
+		}
+		reg.notif_desc = (__u64)(unsigned long)&notif_desc;
+	}
+
 	ret = io_uring_register_ifq(ring, (void *)&reg);
 	if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP ||
 			       ret == -ERANGE)) {
 		printf("Large chunks are not supported %i\n", ret);
 		exit(SKIP_CODE);
+	} else if ((cfg_copy_fallback || cfg_no_buffers) && ret == -EINVAL) {
+		printf("Notifications not supported %i\n", ret);
+		exit(SKIP_CODE);
 	} else if (ret) {
 		error(1, 0, "io_uring_register_ifq(): %d", ret);
 	}
@@ -304,10 +359,13 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe)
 	}
 	received += n;
 
-	rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)];
-	rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token;
-	rqe->len = cqe->res;
-	io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail);
+	/* Skip ring refill so that we run out of buffers quickly */
+	if (!cfg_no_buffers) {
+		rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)];
+		rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token;
+		rqe->len = cqe->res;
+		io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail);
+	}
 }
 
 static void server_loop(struct io_uring *ring)
@@ -324,8 +382,16 @@ static void server_loop(struct io_uring *ring)
 			process_accept(ring, cqe);
 		else if (cqe->user_data == 2)
 			process_recvzc(ring, cqe);
-		else
+		else if ((cfg_copy_fallback || cfg_no_buffers) &&
+			 cqe->user_data == NOTIF_USER_DATA) {
+			received_notif_type |= cqe->res;
+			received_notif = true;
+			if (cfg_no_buffers &&
+			    (cqe->res == ZCRX_NOTIF_NO_BUFFERS))
+				stop = true;
+		} else {
 			error(1, 0, "unknown cqe");
+		}
 		count++;
 	}
 	io_uring_cq_advance(ring, count);
@@ -374,6 +440,23 @@ static void run_server(void)
 
 	if (!stop)
 		error(1, 0, "test failed\n");
+
+	if (cfg_copy_fallback) {
+		struct t_io_uring_zcrx_notif_stats *stats =
+			(void *)((char *)ring_ptr + notif_stats_offset);
+
+		if (!received_notif || received_notif_type != ZCRX_NOTIF_COPY)
+			error(1, 0, "expected copy fallback notification");
+		if (!IO_URING_READ_ONCE(stats->copy_count))
+			error(1, 0, "expected copy_count > 0");
+		if (!IO_URING_READ_ONCE(stats->copy_bytes))
+			error(1, 0, "expected copy_bytes > 0");
+	}
+
+	if (cfg_no_buffers) {
+		if (!received_notif || received_notif_type != ZCRX_NOTIF_NO_BUFFERS)
+			error(1, 0, "expected no-buffers notification");
+	}
 }
 
 static void run_client(void)
@@ -425,7 +508,7 @@ static void parse_opts(int argc, char **argv)
 		usage(argv[0]);
 	cfg_payload_len = max_payload_len;
 
-	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) {
+	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:a:dnb")) != -1) {
 		switch (c) {
 		case 's':
 			if (cfg_client)
@@ -466,8 +549,19 @@ static void parse_opts(int argc, char **argv)
 		case 'd':
 			cfg_dry_run = true;
 			break;
+		case 'n':
+			cfg_copy_fallback = true;
+			break;
+		case 'b':
+			cfg_no_buffers = true;
+			break;
+		case 'a':
+			cfg_area_size = strtoul(optarg, NULL, 0) * page_size;
+			break;
 		}
 	}
+	if (cfg_copy_fallback && cfg_no_buffers)
+		error(1, 0, "Pass one of -n or -b");
 
 	if (cfg_server && addr)
 		error(1, 0, "Receiver cannot have -h specified");
diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
index e81724cb5542..82b4f4777182 100755
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
@@ -41,7 +41,9 @@ def set_flow_rule_rss(cfg, rss_ctx_id):
     return int(values)
 
 
-def single(cfg):
+def single_no_flow(cfg):
+    """Like single() but without a flow rule."""
+
     channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
     channels = channels['combined-count']
     if channels < 2:
@@ -65,6 +67,9 @@ def single(cfg):
     ethtool(f"-X {cfg.ifname} equal {cfg.target}")
     defer(ethtool, f"-X {cfg.ifname} default")
 
+def single(cfg):
+    single_no_flow(cfg)
+
     flow_rule_id = set_flow_rule(cfg)
     defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
 
@@ -130,6 +135,26 @@ def test_zcrx_oneshot(cfg, setup) -> None:
         cmd(tx_cmd, host=cfg.remote)
 
 
+@ksft_variants([
+    KsftNamedVariant("single", single_no_flow),
+])
+def test_zcrx_notif_copy_fallback(cfg, setup) -> None:
+    """Test zcrx copy fallback notification.
+
+    Omits the flow rule so traffic arrives on non-zcrx queues as regular
+    pages, forcing the kernel copy-fallback path. Asserts that the
+    ZCRX_NOTIF_COPY notification CQE is delivered."""
+
+    cfg.require_ipver('6')
+
+    setup(cfg)
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -n"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(cfg.port, proto="tcp")
+        cmd(tx_cmd, host=cfg.remote)
+
+
 def test_zcrx_large_chunks(cfg) -> None:
     """Test zcrx with large buffer chunks."""
 
@@ -157,6 +182,25 @@ def test_zcrx_large_chunks(cfg) -> None:
         cmd(tx_cmd, host=cfg.remote)
 
 
+@ksft_variants([
+    KsftNamedVariant("single", single),
+])
+def test_zcrx_notif_no_buffers(cfg, setup) -> None:
+    """Test zcrx out-of-buffer notification.
+
+    Skips buffer refill so the pool is quickly exhausted, triggering
+    a ZCRX_NOTIF_NO_BUFFERS notification CQE."""
+
+    cfg.require_ipver('6')
+
+    setup(cfg)
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -b -a 64"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(cfg.port, proto="tcp")
+        cmd(tx_cmd, host=cfg.remote, fail=False)
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__) as cfg:
         cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
@@ -166,7 +210,8 @@ def main() -> None:
         cfg.netnl = NetdevFamily()
         cfg.port = rand_port()
         ksft_run(globs=globals(), cases=[test_zcrx, test_zcrx_oneshot,
-                                        test_zcrx_large_chunks], args=(cfg, ))
+                                        test_zcrx_large_chunks, test_zcrx_notif_copy_fallback,
+                                        test_zcrx_notif_no_buffers], args=(cfg, ))
     ksft_exit()
 
 
-- 
2.53.0-Meta


^ permalink raw reply related

* Re: [PATCH] MAINTAINERS: nvdimm: Include maintainer profile
From: Dave Jiang @ 2026-05-18 15:43 UTC (permalink / raw)
  To: Krzysztof Kozlowski, Dan Williams, Vishal Verma, Ira Weiny,
	Jonathan Corbet, Shuah Khan, nvdimm, linux-doc, linux-kernel
In-Reply-To: <20260518104306.39289-2-krzysztof.kozlowski@oss.qualcomm.com>



On 5/18/26 3:43 AM, Krzysztof Kozlowski wrote:
> No dedicated NVDIMM maintainers are returned by get_maintainers.pl for
> the subsystem maintainer profile, thus patches changing that file miss
> the actual owners of the file.
> 
> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@oss.qualcomm.com>

Acked-by: Dave Jiang <dave.jiang@intel.com>


> ---
>  MAINTAINERS | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7a65b220d93f..294909f6d488 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -14751,6 +14751,7 @@ S:	Supported
>  Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
>  P:	Documentation/nvdimm/maintainer-entry-profile.rst
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git
> +F:	Documentation/nvdimm/maintainer-entry-profile.rst
>  F:	drivers/acpi/nfit/*
>  F:	drivers/nvdimm/*
>  F:	include/linux/libnvdimm.h


^ permalink raw reply

* Re: (subset) [PATCH v3 00/28] vfs/nfsd: add support for CB_NOTIFY callbacks in directory delegations
From: Chuck Lever @ 2026-05-18 16:05 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton, Chuck Lever
  Cc: Alexander Viro, Jan Kara, Alexander Aring, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Amir Goldstein, Calum Mackay,
	linux-fsdevel, linux-kernel, linux-trace-kernel, linux-doc,
	linux-nfs
In-Reply-To: <20260515-weltschmerz-folgen-68ca0db1ef84@brauner>



On Fri, May 15, 2026, at 1:26 PM, Christian Brauner wrote:
> On Tue, 28 Apr 2026 08:09:44 +0100, Jeff Layton wrote:
>> Re-posting the set per Christian's request. The only difference in this
>> version is a small error handling fix in alloc_init_dir_deleg(). The old
>> version could crash since release_pages() can't handle an array with
>> NULL pointers in it.
>> 
>> ---------------------------------8<------------------------------------
>> 
>> [...]
>
> @Chuck, @Jeff, I've only merged the vfs specific changes into a stable branch.
> You can pull it I won't touch it again. You can pull the nfsd work in in
> whatever form you like. Same procedure I use with io_uring et al.
>
> Let me know if that work for you.
>
> ---
>
> Applied to the vfs-7.2.directory.delegations branch of the vfs/vfs.git 
> tree.
> Patches in the vfs-7.2.directory.delegations branch should appear in 
> linux-next soon.
>
> Please report any outstanding bugs that were missed during review in a
> new review to the original patch series allowing us to drop it.
>
> It's encouraged to provide Acked-bys and Reviewed-bys even though the
> patch has now been applied. If possible patch trailers will be updated.
>
> Note that commit hashes shown below are subject to change due to rebase,
> trailer updates or similar. If in doubt, please check the listed branch.
>
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
> branch: vfs-7.2.directory.delegations
>
> [01/28] filelock: pass current blocking lease to 
> trace_break_lease_block() rather than "new_fl"
>         https://git.kernel.org/vfs/vfs/c/89330d3a60f7
> [02/28] filelock: add support for ignoring deleg breaks for dir change 
> events
>         https://git.kernel.org/vfs/vfs/c/24cbf43337f4
> [03/28] filelock: add a tracepoint to start of break_lease()
>         https://git.kernel.org/vfs/vfs/c/e39026a86b48
> [04/28] filelock: add an inode_lease_ignore_mask helper
>         https://git.kernel.org/vfs/vfs/c/95825fdcc0b0
> [05/28] fsnotify: new tracepoint in fsnotify()
>         https://git.kernel.org/vfs/vfs/c/ad4489dcd08d
> [06/28] fsnotify: add fsnotify_modify_mark_mask()
>         https://git.kernel.org/vfs/vfs/c/12ffbb117b64
> [07/28] fsnotify: add FSNOTIFY_EVENT_RENAME data type
>         https://git.kernel.org/vfs/vfs/c/010043003c0c

Looks good.

To make the NFSD pieces apply, I need v7.1-rc4 and
vfs-7.2.directory.delegations merged into vfs.all. Given your
regular merge cadence over the past few weeks, I expect that
will happen end of this week? Early next?


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v17 02/11] cxl/ras: Unify Endpoint and Port AER trace events
From: Jonathan Cameron @ 2026-05-18 16:09 UTC (permalink / raw)
  To: Dan Williams (nvidia)
  Cc: Bowman, Terry, dave, dave.jiang, alison.schofield, bhelgaas,
	shiju.jose, ming.li, Smita.KoralahalliChannabasappa, rrichter,
	dan.carpenter, PradeepVineshReddy.Kodamati, lukas,
	Benjamin.Cheatham, sathyanarayanan.kuppuswamy, vishal.l.verma,
	alucerop, ira.weiny, corbet, rafael, xueshuai, linux-cxl,
	linux-kernel, linux-pci, linux-acpi, linux-doc,
	Mauro Carvalho Chehab
In-Reply-To: <69feaebd471c3_1b86a100b@djbw-dev.notmuch>

On Fri, 08 May 2026 20:49:17 -0700
"Dan Williams (nvidia)" <djbw@kernel.org> wrote:

> Jonathan Cameron wrote:
> > On Thu, 7 May 2026 13:33:45 -0500
> > "Bowman, Terry" <terry.bowman@amd.com> wrote:  
> [..]
> > > > This concerns me (sorry I wasn't paying attention to the v16 thread).
> > > > It is a userspace regression against code that is out in the wild and typically
> > > > not updated in sync with the kernel.
> > > > 
> > > > If you are suggesting breaking ras-daemon at the very least +CC the maintainer.  
> 
> Sorry, that was not the intent, see below.


Sorry for slow reply - getting a bit buried in other kernel work so haven't been
checking CXL stuff as often as normal.

Anyhow direction looks good to me.

Jonathan


^ permalink raw reply

* Re: [PATCH] docs: submitting-patches: Clarify that in English "reviewer" is a person
From: Randy Dunlap @ 2026-05-18 16:25 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE), Krzysztof Kozlowski, Jonathan Corbet,
	Shuah Khan, workflows, linux-doc, linux-kernel
  Cc: Greg Kroah-Hartman, Andrew Morton, David Hildenbrand,
	Linus Torvalds, Guenter Roeck
In-Reply-To: <ce1e5e9b-83d0-4971-aee3-dc5a8f85ce22@kernel.org>



On 5/16/26 7:39 AM, Vlastimil Babka (SUSE) wrote:
> On 5/16/26 14:38, Krzysztof Kozlowski wrote:
>> Common understanding of word "Reviewer" is: a person performing a review
>> work [1]. Tools are not persons, thus cannot be reviewers in this term.
>> Also tools cannot make statements ("A Reviewed-by tag is a statement of
>> opinion"), since making a statement needs some sort of conscious mind.
>>
>> Our docs already clearly mark that "Reviewed-by" must come from a
>> person:
>>
>>  - "By offering my Reviewed-by: tag, I state that:"
>>
>>    Usage of first person "I" and word "state"
>>
>>  - "A Reviewed-by tag is *a statement of opinion* that the patch is an
>>     appropriate modification of the kernel without any remaining serious"
>>
>>    Only a person can make a statement of opinion.
>>
>>  - "Any interested reviewer (who has done the work) can offer a
>>    Reviewed-by"
>>
>>    A person can offer a tag thus above does not grant the tool
>>    permission to offer a tag.
>>
>> However this is not enough and apparently English is not that precise,
>> so let's clarify that only a person can state the "Reviewer's statement
>> of oversight".
>>
>> Link: https://en.wiktionary.org/wiki/reviewer [1]
>> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> Cc: Vlastimil Babka <vbabka@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: David Hildenbrand <david@kernel.org>
>> Cc: Linus Torvalds <torvalds@linux-foundation.org>
>> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@oss.qualcomm.com>
> 
> I agree with the intent that the tag is for people (whether they use a tool
> or not to help them). We also don't put "Tested-by: kernel test robot" or
> syzkaller on every commit that they test and find no bugs. Review is also
> not just about absence of bugs, but agreeing with the larger design and
> whether the change makes sense to do in the first place.

Ack that also.

> So whether that's achieved with this particular wording or differently,
> 
> Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>

Acked-by: Randy Dunlap <rdunlap@infradead.org>
Thanks.

> 
>>
>> ---
>>
>> I find it silly to need to describe English, but it seems it is needed.
>>
>> https://lore.kernel.org/all/fd3b2ca7-4d64-4c4b-98a3-7d3285fa6826@roeck-us.net/
>> ---
>>  Documentation/process/submitting-patches.rst | 8 ++++----
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
>> index d7290e208e72..a989de43f3db 100644
>> --- a/Documentation/process/submitting-patches.rst
>> +++ b/Documentation/process/submitting-patches.rst
>> @@ -581,10 +581,10 @@ By offering my Reviewed-by: tag, I state that:
>>  
>>  A Reviewed-by tag is a statement of opinion that the patch is an
>>  appropriate modification of the kernel without any remaining serious
>> -technical issues.  Any interested reviewer (who has done the work) can
>> -offer a Reviewed-by tag for a patch.  This tag serves to give credit to
>> -reviewers and to inform maintainers of the degree of review which has been
>> -done on the patch.  Reviewed-by: tags, when supplied by reviewers known to
>> +technical issues.  Any interested reviewer (who has done the work and is a
>> +person) can offer a Reviewed-by tag for a patch.  This tag serves to give
>> +credit to reviewers and to inform maintainers of the degree of review which has
>> +been done on the patch.  Reviewed-by: tags, when supplied by reviewers known to
>>  understand the subject area and to perform thorough reviews, will normally
>>  increase the likelihood of your patch getting into the kernel.
>>  
> 
> 

-- 
~Randy

^ permalink raw reply

* Re: [PATCH v4 02/16] vfio/pci: Preserve vfio-pci device files across Live Update
From: Vipin Sharma @ 2026-05-18 16:37 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Samiullah Khawaja, David Matlack, kvm, linux-doc, linux-kernel,
	linux-kselftest, linux-pci, ajayachandra, alex, amastro, ankita,
	apopple, chrisl, corbet, graf, jacob.pan, jgg, jgg, jrhilke,
	julianr, kevin.tian, leon, leonro, lukas, michal.winiarski, parav,
	pasha.tatashin, praan, rananta, rientjes, rodrigo.vivi, rppt,
	saeedm, skhan, vivek.kasireddy, witu, yanjun.zhu, yi.l.liu
In-Reply-To: <2vxzcxyy9fpd.fsf@kernel.org>

On Thu, May 14, 2026 at 05:24:46PM +0200, Pratyush Yadav wrote:
> On Wed, May 13 2026, Samiullah Khawaja wrote:
> 
> > On Tue, May 12, 2026 at 02:29:19PM -0700, Vipin Sharma wrote:
> >>On Tue, May 12, 2026 at 01:59:51PM -0700, David Matlack wrote:
> >>> On Mon, May 11, 2026 at 4:48 PM Vipin Sharma <vipinsh@google.com> wrote:
> >>>
> >>> > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> >>> > index c12d614fc6c4..019de053f116 100644
> >>> > --- a/drivers/vfio/pci/Kconfig
> >>> > +++ b/drivers/vfio/pci/Kconfig
> >>> > @@ -45,13 +45,15 @@ config VFIO_PCI_IGD
> >>> >
> >>> >  config VFIO_PCI_LIVEUPDATE
> >>> >         bool "VFIO PCI support for Live Update (EXPERIMENTAL)"
> >>> > -       depends on PCI_LIVEUPDATE
> >>> > +       depends on PCI_LIVEUPDATE && VFIO_DEVICE_CDEV
> >>> >         help
> >>> >           Support for preserving devices bound to vfio-pci across a Live
> >>> >           Update. This option should only be enabled by developers working on
> >>> >           implementing this support. Once enough support has landed in the
> >>> >           kernel, this option will no longer be marked EXPERIMENTAL.
> >>> >
> >>> > +         Enabling this will disable support for VFIO PCI DMA buffer.
> >>> > +
> >>> >           If you don't know what to do here, say N.
> >>> >
> >>> >  endif
> >>> > @@ -68,7 +70,7 @@ config VFIO_PCI_ZDEV_KVM
> >>> >           To enable s390x KVM vfio-pci extensions, say Y.
> >>> >
> >>> >  config VFIO_PCI_DMABUF
> >>> > -       def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
> >>> > +       def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER && !VFIO_PCI_LIVEUPDATE
> >>>
> >>> Why does enabling VFIO_PCI_LIVEUPDATE require disabling
> >>> VFIO_PCI_DMABUF? I saw the cover letter says "to keep things simple",
> >>> but what specific problem does this solve or simplify?
> >>
> >>I should have provided more details there.
> >>
> >>When device is getting reset in vfio_pci_liveupdate_freeze(), we are
> >>zapping userspace mapped bars, we also need to use
> >>vfio_pci_dma_buf_move() to revoke dma buffer access or
> >>vfio_pci_dma_buf_cleanup() combination. Cleanup takes the memory lock
> >>which freeze already takes, and there are some refcounts which are
> >>managed in both of these APIs. This was causing complexities with code
> >>flow based on result of pci_load_saved_state(). All this was adding more
> >>refactoring than I wanted in the series.
> >
> > Maybe we can return -EOPNOTSUPP if any dmabufs for this vfio cdev are
> > exported during preserve?

Currently, there is no API to check whether dmabufs are exported or not.
I will add a patch to this series to return EOPNOTSUPP and remove the
condition from the config.

> 
> Whichever way you go with, a TODO/comment would be nice to have so
> someone (including future you) looking at this code knows why this
> restriction exists.
> 

I will add a comment in the next version.

^ permalink raw reply

* Re: [PATCH v4 05/16] vfio: Enforce preserved devices are retrieved via LIVEUPDATE_SESSION_RETRIEVE_FD
From: Vipin Sharma @ 2026-05-18 16:47 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: kvm, linux-doc, linux-kernel, linux-kselftest, linux-pci,
	ajayachandra, alex, amastro, ankita, apopple, chrisl, corbet,
	dmatlack, graf, jacob.pan, jgg, jgg, jrhilke, julianr, kevin.tian,
	leon, leonro, lukas, michal.winiarski, parav, pasha.tatashin,
	praan, pratyush, rananta, rientjes, rodrigo.vivi, rppt, saeedm,
	skhan, skhawaja, vivek.kasireddy, witu, yi.l.liu
In-Reply-To: <65228806-6ed3-4577-9037-13fd5eb8f9b6@linux.dev>

On Sun, May 17, 2026 at 12:04:04PM -0700, Zhu Yanjun wrote:
> 
> 在 2026/5/11 16:47, Vipin Sharma 写道:
> > From: David Matlack <dmatlack@google.com>
> > 
> > Enforce that files for incoming (preserved by previous kernel) VFIO
> > devices are retrieved via LIVEUPDATE_SESSION_RETRIEVE_FD rather than by
> > opening the corresponding VFIO character device or via
> > VFIO_GROUP_GET_DEVICE_FD.
> > 
> > Both of these methods would result in VFIO initializing the device
> > without access to the preserved state of the device passed by the
> > previous kernel.
> > 
> > Reviewed-by: Pranjal Shrivastava <praan@google.com>
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > Co-developed-by: Vipin Sharma <vipinsh@google.com>
> > Signed-off-by: Vipin Sharma <vipinsh@google.com>
> > ---
> >   drivers/vfio/device_cdev.c             |  8 ++++++++
> >   drivers/vfio/group.c                   |  9 +++++++++
> >   drivers/vfio/pci/vfio_pci_liveupdate.c |  6 ++++++
> >   drivers/vfio/vfio.h                    | 18 ++++++++++++++++++
> >   4 files changed, 41 insertions(+)
> > 
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index 1ab07ccaf3ab..4df0495941c6 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -49,6 +49,14 @@ static int vfio_device_cdev_open(struct vfio_device *device, struct file **filep
> >   		}
> >   		*filep = file;
> > +	} else if (vfio_liveupdate_incoming_is_preserved(device)) {
> > +		/*
> > +		 * Since it is live update preserved device, it must be
> > +		 * retrieved via LIVEUPDATE_SESSION_RETRIEVE_FD instead of
> > +		 * opening /dev/vfio/devices/vfioX.
> > +		 */
> > +		ret = -EBUSY;
> > +		goto err_free_device_file;
> 
> When vfio_liveupdate_incoming_is_preserved(device) returns true,
> vfio_device_put_registration(device) is not called in this path.
> 
> Is vfio_device_put_registration(device) instead invoked from the
> err_free_device_file error handling path?

Yes, at the end of vfio_device_cdev_open(), goto label first frees the
device file object and then calls the vfio_device_put_registration().
This is the same error handling flow as in the if (!file) {} code in the
above function.


^ permalink raw reply

* Re: [PATCH mm-unstable v17 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Usama Arif @ 2026-05-18 17:00 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260511185817.686831-7-npache@redhat.com>

On Mon, 11 May 2026 12:58:06 -0600 Nico Pache <npache@redhat.com> wrote:

> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates were in the PMD to
> start the collapse attempt.
> 
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 93 +++++++++++++++++++++++++++++--------------------
>  1 file changed, 55 insertions(+), 38 deletions(-)
> 

The patch did 2 things:

Make it work with any order and not just PMD order.

Keeps anon_vma_write held across the copy and install for non-PMD orders,
as mTHP leaves the out-of-range PTEs mapped while the PMD is temporarily none.
rmap walkers cannot reach here until the PMD is installed.

Acked-by: Usama Arif <usama.arif@linux.dev>

^ permalink raw reply

* Re: [PATCH v3 2/2] cpufreq: CPPC: add autonomous mode boot parameter support
From: Sumit Gupta @ 2026-05-18 17:22 UTC (permalink / raw)
  To: Mario Limonciello, rafael, viresh.kumar, pierre.gondois,
	ionela.voinescu, zhenglifeng1, zhanjie9, corbet, skhan, rdunlap,
	linux-pm, linux-doc, linux-kernel
  Cc: linux-tegra, treding, jonathanh, vsethi, ksitaraman, sanjayc,
	mochs, bbasu
In-Reply-To: <7d7a6ab6-b1ea-484c-a275-19acca50c483@amd.com>


On 18/05/26 19:51, Mario Limonciello wrote:
> External email: Use caution opening links or attachments
>
>
> On 5/18/26 09:15, Sumit Gupta wrote:
>>
>> On 18/05/26 19:20, Mario Limonciello wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On 5/18/26 08:44, Sumit Gupta wrote:
>>>> Hi Mario,
>>>>
>>>>
>>>> On 16/05/26 02:43, Mario Limonciello wrote:
>>>>> External email: Use caution opening links or attachments
>>>>>
>>>>>
>>>>> On 5/15/26 07:26, Sumit Gupta wrote:
>>>>>> Add a kernel boot parameter 'cppc_cpufreq.auto_sel_mode' to enable
>>>>>> CPPC autonomous performance selection on all CPUs at system startup.
>>>>>> When autonomous mode is enabled, the hardware automatically adjusts
>>>>>> CPU performance based on workload demands using Energy Performance
>>>>>> Preference (EPP) hints.
>>>>>>
>>>>>> When the parameter is set:
>>>>>> - Configure all CPUs for autonomous operation on first init
>>>>>> - Use HW min/max_perf when available; otherwise initialize from caps
>>>>>> - Initialize desired_perf to max_perf as a starting hint
>>>>>> - Hardware controls frequency instead of the OS governor
>>>>>> - EPP behavior depends on parameter value:
>>>>>>    - performance (or 1): override EPP to performance preference 
>>>>>> (0x0)
>>>>>>    - default_epp (or 2): preserve EPP value programmed by BIOS/
>>>>>> firmware
>>>>>>
>>>>>> The boot parameter is applied only during first policy 
>>>>>> initialization.
>>>>>> Skip applying it on CPU hotplug to preserve runtime sysfs
>>>>>> configuration.
>>>>>>
>>>>>> This patch depends on patch series [1] ("cpufreq: Set policy->min 
>>>>>> and
>>>>>> max as real QoS constraints") so that the policy->min/max set in
>>>>>> cppc_cpufreq_cpu_init() are not overridden by cpufreq_set_policy()
>>>>>> during init.
>>>>>>
>>>>>> Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
>>>>>> ---
>>>>>> [1] https://lore.kernel.org/lkml/20260511135538.522653-1-
>>>>>> pierre.gondois@arm.com/
>>>>>> ---
>>>>>>   .../admin-guide/kernel-parameters.txt         |  16 +++
>>>>>>   drivers/cpufreq/cppc_cpufreq.c                | 122 +++++++++++++
>>>>>> ++++-
>>>>>>   2 files changed, 133 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/
>>>>>> Documentation/admin-guide/kernel-parameters.txt
>>>>>> index 0eb64aab3685..7e4b3a8fd76f 100644
>>>>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>>>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>>>>> @@ -1048,6 +1048,22 @@ Kernel parameters
>>>>>>                       policy to use. This governor must be 
>>>>>> registered
>>>>>> in the
>>>>>>                       kernel before the cpufreq driver probes.
>>>>>>
>>>>>> +     cppc_cpufreq.auto_sel_mode=
>>>>>> +                     [CPU_FREQ] Enable ACPI CPPC autonomous
>>>>>> performance
>>>>>> +                     selection. When enabled, hardware 
>>>>>> automatically
>>>>>> adjusts
>>>>>> +                     CPU frequency on all CPUs based on workload
>>>>>> demands.
>>>>>> +                     In Autonomous mode, Energy Performance
>>>>>> Preference (EPP)
>>>>>> +                     hints guide hardware toward performance (0x0)
>>>>>> or energy
>>>>>> +                     efficiency (0xff).
>>>>>> +                     Requires ACPI CPPC autonomous selection 
>>>>>> register
>>>>>> +                     support.
>>>>>> +                     Accepts:
>>>>>> +                       performance, 1: enable auto_sel + set EPP to
>>>>>> +                                       performance (0x0)
>>>>>> +                       default_epp, 2: enable auto_sel, preserve 
>>>>>> EPP
>>>>>> value
>>>>>> +                                       programmed by BIOS/firmware
>>>>>> +                     Unset: cpufreq governors are used (auto_sel
>>>>>> disabled).
>>>>>
>>>>> Rather than unset doing nothing, have you considered having it take a
>>>>> midpoint like 128?  That's what we do in amd-pstate (default to
>>>>> balance_performance).  I think it turns into a reasonable balance.
>>>>
>>>> Thanks for the suggestion.
>>>> I can add balance_performance that enables auto_sel with EPP=128 in 
>>>> v4.
>>>>
>>>> On changing the driver default (no param behavior) to auto enable
>>>> balance_performance, it would be good to keep the current behavior for
>>>> now since cppc_cpufreq is generic across ARM64/RISC-V platforms where
>>>> EPP and Autonomous Selection registers are optional.
>>>> A default change would affect existing users relying on governors.
>>>>
>>>> Thank you,
>>>> Sumit Gupta
>>>
>>> But couldn't you make the "no module parameter set" follow the behavior
>>> to only set the registers if they're available?
>>>
>>> So the systems that support it start using it, the ones that don't it's
>>> a NOP.
>>>
>>
>> Would it work to add balance_performance as a new mode in v4,
>> and discuss changing the default separately as a follow-up?
>>
>
> Sure.
>
>> Runtime detection helps for unsupported platforms. But platforms which
>> support the registers use OS governors today, and silently switching
>> them to autonomous mode on a kernel update is a behavior change for
>> existing users. They would also have no way to boot into sw governor.
>>
>
> But hopefully it should be better battery life/responsiveness for those
> scenarios too, right?
>

Yes in many cases, but if some workloads rely on specific OS governor
configurations, then that would get impacted.
I will send a separate change later to seek broader consensus on
enabling auto_sel as default without any param.

Thank you,
Sumit Gupta

....



^ permalink raw reply

* Re: [PATCH v2 0/3] mm/hmm: Add mmap lock-drop support for userfaultfd-backed mappings
From: Andrew Morton @ 2026-05-18 17:48 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: kys, Liam.Howlett, david, jgg, corbet, leon, ljs, mhocko, rppt,
	shuah, skhan, surenb, vbabka, skinsburskii, linux-doc,
	linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <177863991557.82528.15288076059759579141.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

On Wed, 13 May 2026 02:40:11 +0000 Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> wrote:

> This series extends the HMM framework to support userfaultfd-backed memory
> by allowing the mmap read lock to be dropped during hmm_range_fault().
> 
> Some page fault handlers — most notably userfaultfd — require the mmap lock
> to be released so that userspace can resolve the fault. The current HMM
> interface never sets FAULT_FLAG_ALLOW_RETRY, making it impossible to fault
> in pages from userfaultfd-registered regions.
> 
> This series follows the established int *locked pattern from
> get_user_pages_remote() in mm/gup.c. A new entry point,
> hmm_range_fault_unlockable(), accepts an int *locked parameter. When the
> mmap lock is dropped during fault resolution (VM_FAULT_RETRY or
> VM_FAULT_COMPLETED), the function returns 0 with *locked = 0, signalling
> the caller to restart its walk. The existing hmm_range_fault() is
> refactored into a thin wrapper that passes NULL, preserving current
> behavior for all existing callers.
> 
> Faulting hugetlb pages on the unlockable path is not supported because
> walk_hugetlb_range() unconditionally holds and releases
> hugetlb_vma_lock_read across the callback; if the mmap lock is dropped
> inside the callback, the VMA may be freed before the walk framework's
> unlock. Hugetlb pages already present in page tables are handled normally.
> Possible approaches to lift this limitation are documented in
> Documentation/mm/hmm.rst.

Thanks.  AI review asked some questions:
	https://sashiko.dev/#/patchset/177863991557.82528.15288076059759579141.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net

I'd ignore the first one: don't write buggy fault handlers!



^ permalink raw reply

* [PATCH v1 00/12] Introduce nova-core mm prerequisites
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes

This series introduces the prerequisite memory-management infrastructure for
the nova-core driver: a centralized GpuMm manager, types for addressing VRAM
(Pfn, VramAddress), the PRAMIN aperture for indirect VRAM access from the CPU,
and the GSP plumbing that surfaces the usable FB region and total VRAM extent
at boot. It also picks up two small Rust enablers (pci::Bar::resource_flags()
and a cast+shift accessor form of bitfield!) that the rest of the nova-core
mm code relies on.

This series is based on drm-rust-next.

Dependencies (not yet merged):

- Alex Courbot's bitfield series. Tested on v2:
  https://lore.kernel.org/all/20260409-bitfield-v2-0-23ac400071cb@nvidia.com/
  A newer v3 of bitfield is available and should also work (haven't tested):
  https://lore.kernel.org/all/20260501-bitfield-v3-0-aa1076c3337d@nvidia.com/

- rust: maple_tree: implement Send and Sync for MapleTree (v3):
  https://lore.kernel.org/all/20260511143604.3848176-1-joelagnelf@nvidia.com/

The git tree (containing the dependencies above, this series, and the
follow-on page-table/VMM/BAR1 series) can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git (tag: nova-mm-v1-20260518)

Change log:

Changes from v12 to v1 (split-out):

- Part 1 of 2; the v12 series was split for easier review. Page-table/VMM/BAR1 patches in companion series.
- Broke v12's "Add common memory management types" into atomic patches: Pfn, VramAddress, VramAddress arithmetic.
- New prereq: "rust: pci: add resource_flags accessor".
- New prereq: "rust: bitfield: support cast+shift accessor syntax".
- "Add GpuMm centralized memory manager" scoped to scaffolding only; buddy/TLB wiring deferred to companion series.
- Squashed v12's "pramin: drop useless as_ref()" cleanup into "Add PRAMIN aperture self-tests".
- Moved "rust: maple_tree: Send and Sync" out as a standalone dependency.
- Smaller code touch-ups across most carried-over patches.

Link to v12: https://lore.kernel.org/all/20260425211454.174696-1-joelagnelf@nvidia.com/

Joel Fernandes (12):
  rust: pci: add resource_flags accessor
  rust: bitfield: support cast+shift accessor syntax
  gpu: nova-core: gsp: Return GspStaticInfo from boot()
  gpu: nova-core: gsp: Extract usable FB region from GSP
  gpu: nova-core: gsp: Expose total physical VRAM end from FB region
    info
  gpu: nova-core: mm: Add Pfn (Physical Frame Number) type
  gpu: nova-core: mm: Add VramAddress type and conversion traits
  gpu: nova-core: mm: Add VramAddress arithmetic and ordering
  gpu: nova-core: mm: Add support to use PRAMIN windows to write to VRAM
  docs: gpu: nova-core: Document the PRAMIN aperture mechanism
  gpu: nova-core: mm: Add GpuMm centralized memory manager
  gpu: nova-core: mm: Add PRAMIN aperture self-tests

 Documentation/gpu/nova/core/pramin.rst   | 123 ++++++
 Documentation/gpu/nova/index.rst         |   1 +
 drivers/gpu/nova-core/Kconfig            |  10 +
 drivers/gpu/nova-core/driver.rs          |   2 +
 drivers/gpu/nova-core/gpu.rs             |  48 ++-
 drivers/gpu/nova-core/gsp/boot.rs        |  12 +-
 drivers/gpu/nova-core/gsp/commands.rs    |  16 +-
 drivers/gpu/nova-core/gsp/fw/commands.rs |  49 ++-
 drivers/gpu/nova-core/mm.rs              | 247 +++++++++++
 drivers/gpu/nova-core/mm/pramin.rs       | 512 +++++++++++++++++++++++
 drivers/gpu/nova-core/nova_core.rs       |   1 +
 drivers/gpu/nova-core/regs.rs            | 122 ++++++
 rust/helpers/pci.c                       |   6 +
 rust/kernel/bitfield.rs                  |  67 +++
 rust/kernel/io/resource.rs               |   8 +
 rust/kernel/pci.rs                       |  14 +
 16 files changed, 1228 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/gpu/nova/core/pramin.rst
 create mode 100644 drivers/gpu/nova-core/mm.rs
 create mode 100644 drivers/gpu/nova-core/mm/pramin.rs


base-commit: 9bd99adf7cee4b8ed4adecd53269010250a0d2ec
-- 
2.34.1


^ permalink raw reply

* [PATCH v1 01/12] rust: pci: add resource_flags accessor
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add a `Device::resource_flags()` method to the PCI Rust abstraction,
wrapping the C-side static inline `pci_resource_flags()`.

The flags returned correspond to the `IORESOURCE` bitmask carried by a
PCI BAR's `struct resource`.

The immediate motivation is BAR layout discovery on NVIDIA GPUs: a
64-bit BAR consumes two consecutive Linux PCI resource slots (the lower
32 bits at index N and the upper 32 bits at index N+1, with the latter
having no flags or size of its own).

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 rust/helpers/pci.c         |  6 ++++++
 rust/kernel/io/resource.rs |  8 ++++++++
 rust/kernel/pci.rs         | 14 ++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/rust/helpers/pci.c b/rust/helpers/pci.c
index e44905317d75..51148987618a 100644
--- a/rust/helpers/pci.c
+++ b/rust/helpers/pci.c
@@ -19,6 +19,12 @@ __rust_helper resource_size_t rust_helper_pci_resource_len(struct pci_dev *pdev,
 	return pci_resource_len(pdev, bar);
 }
 
+__rust_helper unsigned long rust_helper_pci_resource_flags(const struct pci_dev *pdev,
+							   int bar)
+{
+	return pci_resource_flags(pdev, bar);
+}
+
 __rust_helper bool rust_helper_dev_is_pci(const struct device *dev)
 {
 	return dev_is_pci(dev);
diff --git a/rust/kernel/io/resource.rs b/rust/kernel/io/resource.rs
index b7ac9faf141d..78f353d1605b 100644
--- a/rust/kernel/io/resource.rs
+++ b/rust/kernel/io/resource.rs
@@ -226,10 +226,18 @@ impl Flags {
     /// Resource represents a memory region that must be ioremaped using `ioremap_np`.
     pub const IORESOURCE_MEM_NONPOSTED: Flags = Flags::new(bindings::IORESOURCE_MEM_NONPOSTED);
 
+    /// Memory region uses a 64-bit address (consumes two consecutive PCI resource slots).
+    pub const IORESOURCE_MEM_64: Flags = Flags::new(bindings::IORESOURCE_MEM_64);
+
     // Always inline to optimize out error path of `build_assert`.
     #[inline(always)]
     const fn new(value: u32) -> Self {
         crate::build_assert!(value as u64 <= c_ulong::MAX as u64);
         Flags(value as c_ulong)
     }
+
+    /// Wrap a raw `c_ulong` value returned by a C API into [`Flags`].
+    pub(crate) const fn from_raw(value: c_ulong) -> Self {
+        Flags(value)
+    }
 }
diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs
index af74ddff6114..d76a1377195e 100644
--- a/rust/kernel/pci.rs
+++ b/rust/kernel/pci.rs
@@ -17,6 +17,7 @@
         from_result,
         to_result, //
     },
+    io::resource,
     prelude::*,
     str::CStr,
     types::Opaque,
@@ -437,6 +438,19 @@ pub fn resource_len(&self, bar: u32) -> Result<bindings::resource_size_t> {
         Ok(unsafe { bindings::pci_resource_len(self.as_raw(), bar.try_into()?) })
     }
 
+    /// Returns the resource flags (`IORESOURCE_*`) of the given PCI BAR.
+    pub fn resource_flags(&self, bar: u32) -> Result<resource::Flags> {
+        if !Bar::index_is_valid(bar) {
+            return Err(EINVAL);
+        }
+
+        // SAFETY:
+        // - `bar` is a valid bar number, as guaranteed by the above call to `Bar::index_is_valid`,
+        // - by its type invariant `self.as_raw` is always a valid pointer to a `struct pci_dev`.
+        let raw = unsafe { bindings::pci_resource_flags(self.as_raw(), bar.try_into()?) };
+        Ok(resource::Flags::from_raw(raw))
+    }
+
     /// Returns the PCI class as a `Class` struct.
     #[inline]
     pub fn pci_class(&self) -> Class {
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 04/12] gpu: nova-core: gsp: Extract usable FB region from GSP
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add first_usable_fb_region() to GspStaticConfigInfo to extract the first
usable FB region from GSP's fbRegionInfoParams. Usable regions are those
that are not reserved or protected.

The extracted region is stored in GetGspStaticInfoReply and exposed as
usable_fb_region field for use by the memory subsystem.

Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/gsp/commands.rs    | 11 +++++--
 drivers/gpu/nova-core/gsp/fw/commands.rs | 42 +++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/nova-core/gsp/commands.rs b/drivers/gpu/nova-core/gsp/commands.rs
index c89c7b57a751..049fff337611 100644
--- a/drivers/gpu/nova-core/gsp/commands.rs
+++ b/drivers/gpu/nova-core/gsp/commands.rs
@@ -4,6 +4,7 @@
     array,
     convert::Infallible,
     ffi::FromBytesUntilNulError,
+    ops::Range,
     str::Utf8Error, //
 };
 
@@ -189,15 +190,18 @@ fn init(&self) -> impl Init<Self::Command, Self::InitError> {
     }
 }
 
-/// The reply from the GSP to the [`GetGspInfo`] command.
+/// The reply from the GSP to the [`GetGspStaticInfo`] command.
 pub(crate) struct GetGspStaticInfoReply {
     gpu_name: [u8; 64],
+    /// Usable FB (VRAM) region for driver memory allocation.
+    #[expect(dead_code)]
+    pub(crate) usable_fb_region: Range<u64>,
 }
 
 impl MessageFromGsp for GetGspStaticInfoReply {
     const FUNCTION: MsgFunction = MsgFunction::GetGspStaticInfo;
     type Message = GspStaticConfigInfo;
-    type InitError = Infallible;
+    type InitError = Error;
 
     fn read(
         msg: &Self::Message,
@@ -205,6 +209,7 @@ fn read(
     ) -> Result<Self, Self::InitError> {
         Ok(GetGspStaticInfoReply {
             gpu_name: msg.gpu_name_str(),
+            usable_fb_region: msg.usable_fb_regions_iter().next().ok_or(ENODEV)?,
         })
     }
 }
@@ -233,7 +238,7 @@ pub(crate) fn gpu_name(&self) -> core::result::Result<&str, GpuNameError> {
     }
 }
 
-/// Send the [`GetGspInfo`] command and awaits for its reply.
+/// Send the [`GetGspStaticInfo`] command and awaits for its reply.
 pub(crate) fn get_gsp_info(cmdq: &Cmdq, bar: &Bar0) -> Result<GetGspStaticInfoReply> {
     cmdq.send_command(bar, GetGspStaticInfo)
 }
diff --git a/drivers/gpu/nova-core/gsp/fw/commands.rs b/drivers/gpu/nova-core/gsp/fw/commands.rs
index db46276430be..50b9c205566f 100644
--- a/drivers/gpu/nova-core/gsp/fw/commands.rs
+++ b/drivers/gpu/nova-core/gsp/fw/commands.rs
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+use core::ops::Range;
+
 use kernel::{
     device,
     pci,
@@ -10,7 +12,10 @@
     }, //
 };
 
-use crate::gsp::GSP_PAGE_SIZE;
+use crate::{
+    gsp::GSP_PAGE_SIZE,
+    num::IntoSafeCast, //
+};
 
 use super::bindings;
 
@@ -121,6 +126,41 @@ impl GspStaticConfigInfo {
     pub(crate) fn gpu_name_str(&self) -> [u8; 64] {
         self.0.gpuNameString
     }
+
+    /// Returns an iterator over valid FB regions from GSP firmware data.
+    fn fb_regions(
+        &self,
+    ) -> impl Iterator<Item = &bindings::NV2080_CTRL_CMD_FB_GET_FB_REGION_FB_REGION_INFO> {
+        let fb_info = &self.0.fbRegionInfoParams;
+        fb_info
+            .fbRegion
+            .iter()
+            .take(fb_info.numFBRegions.into_safe_cast())
+            .filter(|reg| reg.limit >= reg.base)
+    }
+
+    /// Iterates over usable FB regions from GSP firmware data.
+    ///
+    /// Each yielded region is a [`Range<u64>`] suitable for driver memory allocation.
+    /// Usable regions are those that satisfy all the following properties:
+    /// - Are not reserved for firmware internal use.
+    /// - Are not protected (hardware-enforced access restrictions).
+    /// - Support compression (can use GPU memory compression for bandwidth).
+    /// - Support ISO (isochronous memory for display requiring guaranteed bandwidth).
+    pub(crate) fn usable_fb_regions_iter(&self) -> impl Iterator<Item = Range<u64>> + '_ {
+        self.fb_regions().filter_map(|reg| {
+            // Filter: not reserved, not protected, supports compression and ISO.
+            if reg.reserved == 0
+                && reg.bProtected == 0
+                && reg.supportCompressed != 0
+                && reg.supportISO != 0
+            {
+                reg.limit.checked_add(1).map(|end| reg.base..end)
+            } else {
+                None
+            }
+        })
+    }
 }
 
 // SAFETY: Padding is explicit and will not contain uninitialized data.
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 02/12] rust: bitfield: support cast+shift accessor syntax
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

The `bitfield!` macro previously generated accessors that returned the
field's value as a `Bounded<$storage, N>` for its raw N-bit width. For
fields whose logical interpretation is a wider value built by widening
the storage type and shifting left (e.g., a 24-bit register field that
stores bits 16..40 of a 40-bit address), callers had to chain
`cast::<TargetType>()` and `shl::<SHIFT, RES>()` (or worse, raw shift
operators) at every read site, and the inverse for writes.

Add a new field declaration shape:

  $hi:$lo $field as Bounded<$target, $res> shl $shift;

The macro generates:

  - A getter `$field(self) -> Bounded<$target, $res>` that extracts the
    raw N-bit field, widens it to $target, and shifts left by $shift.
  - A setter `with_$field(self, value: Bounded<$target, $res>) -> Self`
    that shifts right by $shift, narrows to the storage type, and writes.

Also add a KUnit test mirroring the nova-core driver's PRAMIN window
register pattern, which is the motivating use case for this syntax.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 rust/kernel/bitfield.rs | 67 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/rust/kernel/bitfield.rs b/rust/kernel/bitfield.rs
index 9ab8dafff36c..1c1fc86441f2 100644
--- a/rust/kernel/bitfield.rs
+++ b/rust/kernel/bitfield.rs
@@ -57,6 +57,8 @@
 //!         hi:lo field_2 => ConvertedType;
 //!         // `field_3` documentation.
 //!         hi:lo field_3 ?=> ConvertedType;
+//!         // `field_4` documentation.
+//!         hi:lo field_4 as Bounded<TargetType, RES> shl SHIFT;
 //!         ...
 //!     }
 //! }
@@ -66,6 +68,8 @@
 //! - `hi:lo`: Bit range (inclusive), where `hi >= lo`.
 //! - `=> Type`: Optional infallible conversion (see [below](#infallible-conversion-)).
 //! - `?=> Type`: Optional fallible conversion (see [below](#fallible-conversion-)).
+//! - `as Bounded<T, RES> shl SHIFT`: Optional cast-and-shift accessor (see
+//!   [below](#cast-and-shift-accessors-as-bounded-t-res-shl-shift)).
 //! - Documentation strings and attributes are optional.
 //!
 //! # Generated code
@@ -299,6 +303,7 @@ fn from(val: $storage) -> $name {
         $($(#[doc = $doc:expr])* $hi:literal:$lo:literal $field:ident
             $(?=> $try_into_type:ty)?
             $(=> $into_type:ty)?
+            $(as Bounded<$target:ty, $res:literal> shl $shift:literal)?
         ;
         )*
     }
@@ -311,6 +316,7 @@ impl $name {
             @public_field_accessors $(#[doc = $doc])* $vis $name $storage : $hi:$lo $field
             $(?=> $try_into_type)?
             $(=> $into_type)?
+            $(as Bounded<$target, $res> shl $shift)?
         );
         )*
         }
@@ -475,6 +481,43 @@ const fn [<__with_ $field>](
         );
     };
 
+    // Public accessors for fields cast to a wider type and left-shifted, exposing them as
+    // `Bounded<$target, $res>` where `$res == ($hi + 1 - $lo) + $shift`.
+    (
+        @public_field_accessors $(#[doc = $doc:expr])* $vis:vis $name:ident $storage:ty :
+            $hi:literal:$lo:literal $field:ident
+            as Bounded<$target:ty, $res:literal> shl $shift:literal
+    ) => {
+        ::kernel::macros::paste!(
+
+        $(#[doc = $doc])*
+        #[doc = "Returns the value of this field, cast to the target type and shifted left."]
+        #[inline(always)]
+        $vis fn $field(self) -> ::kernel::num::Bounded<$target, $res> {
+            $crate::const_assert!($res == ($hi + 1 - $lo) + $shift);
+
+            self.[<__ $field>]()
+                .cast::<$target>()
+                .shl::<$shift, $res>()
+        }
+
+        $(#[doc = $doc])*
+        #[doc = "Sets this field from a target-typed, pre-shifted `Bounded` value."]
+        #[inline(always)]
+        $vis fn [<with_ $field>](
+            self,
+            value: ::kernel::num::Bounded<$target, $res>,
+        ) -> Self {
+            $crate::const_assert!($res == ($hi + 1 - $lo) + $shift);
+
+            self.[<__with_ $field>](
+                value.shr::<$shift, { $hi + 1 - $lo }>().cast::<$storage>()
+            )
+        }
+
+        );
+    };
+
     // `Debug` implementation.
     (@debug $name:ident { $($field:ident;)* }) => {
         impl ::kernel::fmt::Debug for $name {
@@ -582,6 +625,15 @@ struct TestStatusRegister(u8) {
         }
     }
 
+    // Mirrors the PRAMIN window register pattern: a 24-bit field in a `u32` storage that
+    // represents bits 16..40 of a 40-bit address. The accessor exposes it as the full
+    // 40-bit `Bounded<u64, 40>`.
+    bitfield! {
+        struct TestWindowReg(u32) {
+            23:0      window_base as Bounded<u64, 40> shl 16;
+        }
+    }
+
     #[test]
     fn test_single_bits() {
         let mut pte = TestPageTableEntry::zeroed();
@@ -806,4 +858,19 @@ fn test_u8_bitfield() {
         assert_eq!(status4.reserved(), 0xF);
         assert_eq!(status4.full_byte(), 0xFF);
     }
+
+    #[test]
+    fn test_cast_shift_accessor() {
+        // Set a value via the pre-shifted setter and read it back via the getter.
+        let addr = Bounded::<u64, 40>::new::<0x12_3456_0000>();
+        let reg = TestWindowReg::zeroed().with_window_base(addr);
+        assert_eq!(reg.window_base().get(), 0x12_3456_0000u64);
+        assert_eq!(u32::from(reg), 0x0012_3456u32);
+
+        // Setting and reading the largest 40-bit aligned value.
+        let max_addr = Bounded::<u64, 40>::new::<0xFF_FFFF_0000>();
+        let reg = TestWindowReg::zeroed().with_window_base(max_addr);
+        assert_eq!(reg.window_base().get(), 0xFF_FFFF_0000u64);
+        assert_eq!(u32::from(reg), 0x00FF_FFFFu32);
+    }
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 05/12] gpu: nova-core: gsp: Expose total physical VRAM end from FB region info
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add `total_fb_end()` to `GspStaticConfigInfo`, which computes the
exclusive end address of the highest valid FB region, taking both
usable and GSP-reserved regions into account.

This allows callers to know the full physical VRAM extent, not just
the allocatable portion.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/gpu.rs             | 11 ++++++++++-
 drivers/gpu/nova-core/gsp/commands.rs    |  5 +++++
 drivers/gpu/nova-core/gsp/fw/commands.rs |  7 +++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index 775cdb653830..d9d1a7417a2e 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -8,6 +8,7 @@
     num::Bounded,
     pci,
     prelude::*,
+    sizes::SizeConstants,
     sync::Arc, //
 };
 
@@ -295,7 +296,15 @@ pub(crate) fn new<'a>(
 
             gsp <- Gsp::new(pdev),
 
-            gsp_static_info: { gsp.boot(pdev, bar, spec.chipset, gsp_falcon, sec2_falcon)? },
+            gsp_static_info: gsp
+                .boot(pdev, bar, spec.chipset, gsp_falcon, sec2_falcon)
+                .inspect(|info| {
+                    dev_info!(
+                        pdev.as_ref(),
+                        "Total physical VRAM: {} MiB\n",
+                        info.total_fb_end / u64::SZ_1M
+                    );
+                })?,
 
             bar: devres_bar,
         })
diff --git a/drivers/gpu/nova-core/gsp/commands.rs b/drivers/gpu/nova-core/gsp/commands.rs
index 049fff337611..172411d7b475 100644
--- a/drivers/gpu/nova-core/gsp/commands.rs
+++ b/drivers/gpu/nova-core/gsp/commands.rs
@@ -196,6 +196,8 @@ pub(crate) struct GetGspStaticInfoReply {
     /// Usable FB (VRAM) region for driver memory allocation.
     #[expect(dead_code)]
     pub(crate) usable_fb_region: Range<u64>,
+    /// Exclusive end address of physical VRAM (highest valid FB region limit plus one).
+    pub(crate) total_fb_end: u64,
 }
 
 impl MessageFromGsp for GetGspStaticInfoReply {
@@ -207,9 +209,12 @@ fn read(
         msg: &Self::Message,
         _sbuffer: &mut SBufferIter<array::IntoIter<&[u8], 2>>,
     ) -> Result<Self, Self::InitError> {
+        let total_fb_end = msg.total_fb_end().ok_or(ENODEV)?;
+
         Ok(GetGspStaticInfoReply {
             gpu_name: msg.gpu_name_str(),
             usable_fb_region: msg.usable_fb_regions_iter().next().ok_or(ENODEV)?,
+            total_fb_end,
         })
     }
 }
diff --git a/drivers/gpu/nova-core/gsp/fw/commands.rs b/drivers/gpu/nova-core/gsp/fw/commands.rs
index 50b9c205566f..ea663079d95c 100644
--- a/drivers/gpu/nova-core/gsp/fw/commands.rs
+++ b/drivers/gpu/nova-core/gsp/fw/commands.rs
@@ -161,6 +161,13 @@ pub(crate) fn usable_fb_regions_iter(&self) -> impl Iterator<Item = Range<u64>>
             }
         })
     }
+
+    /// Compute the end of physical VRAM from all FB regions.
+    pub(crate) fn total_fb_end(&self) -> Option<u64> {
+        self.fb_regions()
+            .filter_map(|reg| reg.limit.checked_add(1))
+            .max()
+    }
 }
 
 // SAFETY: Padding is explicit and will not contain uninitialized data.
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 03/12] gpu: nova-core: gsp: Return GspStaticInfo from boot()
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Refactor the GSP boot function to return GetGspStaticInfoReply.

This gives callers access to the data required for memory management
initialization:
- bar1_pde_base: BAR1 page directory base.
- bar2_pde_base: BAR2 page directory base.
- usable memory regions in video memory.

Reviewed-by: Eliot Courtney <ecourtney@nvidia.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/gpu.rs      |  8 ++++++--
 drivers/gpu/nova-core/gsp/boot.rs | 12 ++++++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index 659f6a24ee13..775cdb653830 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -20,7 +20,10 @@
         Falcon, //
     },
     fb::SysmemFlush,
-    gsp::Gsp,
+    gsp::{
+        commands::GetGspStaticInfoReply,
+        Gsp, //
+    },
     regs,
 };
 
@@ -260,6 +263,7 @@ pub(crate) struct Gpu {
     /// GSP runtime data. Temporarily an empty placeholder.
     #[pin]
     gsp: Gsp,
+    gsp_static_info: GetGspStaticInfoReply,
 }
 
 impl Gpu {
@@ -291,7 +295,7 @@ pub(crate) fn new<'a>(
 
             gsp <- Gsp::new(pdev),
 
-            _: { gsp.boot(pdev, bar, spec.chipset, gsp_falcon, sec2_falcon)? },
+            gsp_static_info: { gsp.boot(pdev, bar, spec.chipset, gsp_falcon, sec2_falcon)? },
 
             bar: devres_bar,
         })
diff --git a/drivers/gpu/nova-core/gsp/boot.rs b/drivers/gpu/nova-core/gsp/boot.rs
index df105ef4b371..842aef403f07 100644
--- a/drivers/gpu/nova-core/gsp/boot.rs
+++ b/drivers/gpu/nova-core/gsp/boot.rs
@@ -36,7 +36,10 @@
         Chipset, //
     },
     gsp::{
-        commands,
+        commands::{
+            self,
+            GetGspStaticInfoReply, //
+        },
         sequencer::{
             GspSequencer,
             GspSequencerParams, //
@@ -148,7 +151,7 @@ pub(crate) fn boot(
         chipset: Chipset,
         gsp_falcon: &Falcon<Gsp>,
         sec2_falcon: &Falcon<Sec2>,
-    ) -> Result {
+    ) -> Result<GetGspStaticInfoReply> {
         // The FSP boot process of Hopper+ is not supported for now.
         if matches!(
             chipset.arch(),
@@ -243,12 +246,13 @@ pub(crate) fn boot(
         commands::wait_gsp_init_done(&self.cmdq)?;
 
         // Obtain and display basic GPU information.
-        let info = commands::get_gsp_info(&self.cmdq, bar)?;
+        let info = commands::get_gsp_info(&self.cmdq, bar)
+            .inspect_err(|e| dev_err!(pdev, "Failed to obtain GSP static info ({:?})\n", e))?;
         match info.gpu_name() {
             Ok(name) => dev_info!(pdev, "GPU name: {}\n", name),
             Err(e) => dev_warn!(pdev, "GPU name unavailable: {:?}\n", e),
         }
 
-        Ok(())
+        Ok(info)
     }
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 06/12] gpu: nova-core: mm: Add Pfn (Physical Frame Number) type
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add the `Pfn` (Physical Frame Number) type representing a physical page
in VRAM, along with the macros used by frame-number types to interop with
the `Bounded<u64, N>` representation used by bitfield-derived PTE/PDE
fields.

In later patches in the series, we will use `Pfn` in the page table
structures.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/mm.rs | 70 +++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 drivers/gpu/nova-core/mm.rs

diff --git a/drivers/gpu/nova-core/mm.rs b/drivers/gpu/nova-core/mm.rs
new file mode 100644
index 000000000000..3b131aedf2f9
--- /dev/null
+++ b/drivers/gpu/nova-core/mm.rs
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Memory management subsystems for nova-core.
+
+#![expect(dead_code)]
+
+/// Implements `From` conversions between a frame-number type and `Bounded<u64, N>`.
+///
+/// Each MMU version module should invoke this for the specific bit widths used by that version's
+/// PTE/PDE bitfield definitions.
+macro_rules! impl_frame_number_bounded {
+    ($type:ty, $bits:literal) => {
+        impl From<Bounded<u64, $bits>> for $type {
+            fn from(val: Bounded<u64, $bits>) -> Self {
+                Self::new(val.get())
+            }
+        }
+
+        impl From<$type> for Bounded<u64, $bits> {
+            fn from(v: $type) -> Self {
+                Bounded::from_expr(v.raw() & ::kernel::bits::genmask_u64(0..=($bits - 1)))
+            }
+        }
+    };
+}
+
+/// Implements `From` conversions between [`Pfn`] and `Bounded<u64, N>` for bitfield interop.
+macro_rules! impl_pfn_bounded {
+    ($bits:literal) => {
+        impl_frame_number_bounded!(Pfn, $bits);
+    };
+}
+
+use kernel::{
+    num::Bounded,
+    prelude::*, //
+};
+
+/// Physical Frame Number.
+///
+/// Represents a physical page in VRAM.
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub(crate) struct Pfn(u64);
+
+impl Pfn {
+    /// Create a new PFN from a frame number.
+    pub(crate) const fn new(frame_number: u64) -> Self {
+        Self(frame_number)
+    }
+
+    /// Get the raw frame number.
+    pub(crate) const fn raw(self) -> u64 {
+        self.0
+    }
+}
+
+impl From<u64> for Pfn {
+    fn from(val: u64) -> Self {
+        Self(val)
+    }
+}
+
+impl From<Pfn> for u64 {
+    fn from(pfn: Pfn) -> Self {
+        pfn.0
+    }
+}
+
+impl_pfn_bounded!(52);
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 07/12] gpu: nova-core: mm: Add VramAddress type and conversion traits
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add the `VramAddress` bitfield-backed type representing a physical address
in VRAM. The bitfield layout splits the address into a 12-bit intra-page
offset and a 52-bit physical frame number, matching the GPU MMU
addressing scheme. Also add a few conversion traits required by later
patches.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/mm.rs | 57 +++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/drivers/gpu/nova-core/mm.rs b/drivers/gpu/nova-core/mm.rs
index 3b131aedf2f9..f8a70f93bc03 100644
--- a/drivers/gpu/nova-core/mm.rs
+++ b/drivers/gpu/nova-core/mm.rs
@@ -31,11 +31,62 @@ macro_rules! impl_pfn_bounded {
     };
 }
 
+use core::ops::Range;
+
 use kernel::{
+    bitfield,
     num::Bounded,
     prelude::*, //
 };
 
+bitfield! {
+    /// Physical VRAM address in GPU video memory.
+    pub(crate) struct VramAddress(u64) {
+        /// Offset within 4KB page.
+        11:0    offset;
+        /// Physical frame number.
+        63:12   frame_number => Pfn;
+    }
+}
+
+impl VramAddress {
+    /// Create a new VRAM address from a raw value.
+    pub(crate) const fn new(addr: u64) -> Self {
+        Self::from_raw(addr)
+    }
+
+    /// Get the raw address value as `u64`.
+    pub(crate) const fn raw(&self) -> u64 {
+        self.into_raw()
+    }
+}
+
+// Allow VRAM addresses to be printed with the `{:#x}` format specifier.
+impl core::fmt::LowerHex for VramAddress {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        core::fmt::LowerHex::fmt(&self.raw(), f)
+    }
+}
+
+impl From<Pfn> for VramAddress {
+    fn from(pfn: Pfn) -> Self {
+        Self::zeroed().with_frame_number(pfn)
+    }
+}
+
+/// Extension trait to convert a `Range<u64>` of byte addresses into a
+/// `Range<VramAddress>`.
+pub(crate) trait IntoVramRange {
+    /// Convert this range of byte addresses into a `Range<VramAddress>`.
+    fn into_vram_range(self) -> Range<VramAddress>;
+}
+
+impl IntoVramRange for Range<u64> {
+    fn into_vram_range(self) -> Range<VramAddress> {
+        VramAddress::new(self.start)..VramAddress::new(self.end)
+    }
+}
+
 /// Physical Frame Number.
 ///
 /// Represents a physical page in VRAM.
@@ -55,6 +106,12 @@ pub(crate) const fn raw(self) -> u64 {
     }
 }
 
+impl From<VramAddress> for Pfn {
+    fn from(addr: VramAddress) -> Self {
+        addr.frame_number()
+    }
+}
+
 impl From<u64> for Pfn {
     fn from(val: u64) -> Self {
         Self(val)
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 09/12] gpu: nova-core: mm: Add support to use PRAMIN windows to write to VRAM
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

PRAMIN apertures are a crucial mechanism for direct CPU read/write to
VRAM. Add support for PRAMIN windows on all supported GPU architectures:
Turing, Ampere, Ada (via `NV_PBUS_BAR0_WINDOW`), Hopper (via
`gh100::NV_XAL_EP_BAR0_WINDOW`), and Blackwell (via
`gb100::NV_XAL_EP_BAR0_WINDOW`). Architecture-dispatched
`pramin_window_{read,write}_base()` helpers in `regs.rs` encapsulate the
per-arch register selection.

The Hopper/Blackwell window-base dispatch is based on Eliot Courtney's
off-list reference patch.

Cc: Eliot Courtney <ecourtney@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/mm.rs        |   2 +
 drivers/gpu/nova-core/mm/pramin.rs | 298 +++++++++++++++++++++++++++++
 drivers/gpu/nova-core/nova_core.rs |   1 +
 drivers/gpu/nova-core/regs.rs      | 122 ++++++++++++
 4 files changed, 423 insertions(+)
 create mode 100644 drivers/gpu/nova-core/mm/pramin.rs

diff --git a/drivers/gpu/nova-core/mm.rs b/drivers/gpu/nova-core/mm.rs
index 3bc9befab397..f425467281d3 100644
--- a/drivers/gpu/nova-core/mm.rs
+++ b/drivers/gpu/nova-core/mm.rs
@@ -31,6 +31,8 @@ macro_rules! impl_pfn_bounded {
     };
 }
 
+pub(crate) mod pramin;
+
 use core::ops::Range;
 
 use kernel::{
diff --git a/drivers/gpu/nova-core/mm/pramin.rs b/drivers/gpu/nova-core/mm/pramin.rs
new file mode 100644
index 000000000000..38758ca971be
--- /dev/null
+++ b/drivers/gpu/nova-core/mm/pramin.rs
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Direct VRAM access through the PRAMIN aperture.
+//!
+//! PRAMIN provides a 1MB sliding window into VRAM through BAR0, allowing the CPU to access
+//! video memory directly. Access is managed through a two-level API:
+//!
+//! - [`Pramin`]: The parent object that owns the BAR0 reference and synchronization lock.
+//! - [`PraminWindow`]: A guard object that holds exclusive PRAMIN access for its lifetime.
+//!
+//! The PRAMIN aperture is a 1MB region at a fixed offset from BAR0. The window base is
+//! controlled by an architecture-specific register and is 64KB aligned.
+//!
+//! # Examples
+//!
+//! ## Basic read/write
+//!
+//! ```no_run
+//! use crate::driver::Bar0;
+//! use crate::gpu::Chipset;
+//! use crate::mm::{pramin, VramAddress};
+//! use kernel::device;
+//! use kernel::devres::Devres;
+//! use kernel::prelude::*;
+//! use kernel::sync::Arc;
+//!
+//! fn example(
+//!     devres_bar: Arc<Devres<Bar0>>,
+//!     dev: &device::Device<device::Bound>,
+//!     chipset: Chipset,
+//!     vram_region: core::ops::Range<VramAddress>,
+//! ) -> Result<()> {
+//!     let pramin = Arc::pin_init(
+//!         pramin::Pramin::new(devres_bar, dev, chipset, vram_region)?,
+//!         GFP_KERNEL,
+//!     )?;
+//!     let mut window = pramin.get_window(dev)?;
+//!
+//!     // Write and read back.
+//!     window.try_write32(0x100u64, 0xDEADBEEF)?;
+//!     let val = window.try_read32(0x100u64)?;
+//!     assert_eq!(val, 0xDEADBEEF);
+//!
+//!     Ok(())
+//! }
+//! ```
+//!
+//! ## Auto-repositioning across VRAM regions
+//!
+//! ```no_run
+//! use crate::driver::Bar0;
+//! use crate::gpu::Chipset;
+//! use crate::mm::{pramin, VramAddress};
+//! use kernel::device;
+//! use kernel::devres::Devres;
+//! use kernel::prelude::*;
+//! use kernel::sync::Arc;
+//!
+//! fn example(
+//!     devres_bar: Arc<Devres<Bar0>>,
+//!     dev: &device::Device<device::Bound>,
+//!     chipset: Chipset,
+//!     vram_region: core::ops::Range<VramAddress>,
+//! ) -> Result<()> {
+//!     let pramin = Arc::pin_init(
+//!         pramin::Pramin::new(devres_bar, dev, chipset, vram_region)?,
+//!         GFP_KERNEL,
+//!     )?;
+//!     let mut window = pramin.get_window(dev)?;
+//!
+//!     // Access first 1MB region.
+//!     window.try_write32(0x100u64, 0x11111111)?;
+//!
+//!     // Access at 2MB - window auto-repositions.
+//!     window.try_write32(0x200000u64, 0x22222222)?;
+//!
+//!     // Back to first region - window repositions again.
+//!     let val = window.try_read32(0x100u64)?;
+//!     assert_eq!(val, 0x11111111);
+//!
+//!     Ok(())
+//! }
+//! ```
+
+#![expect(unused)]
+
+use core::ops::Range;
+
+use crate::{
+    bounded_enum,
+    driver::Bar0,
+    gpu::Chipset,
+    mm::VramAddress,
+    num::IntoSafeCast,
+    regs, //
+};
+
+use kernel::{
+    device,
+    devres::Devres,
+    io::Io,
+    new_mutex,
+    prelude::*,
+    sizes::{
+        SZ_1M,
+        SZ_64K, //
+    },
+    sync::{
+        lock::mutex::MutexGuard,
+        Arc,
+        Mutex, //
+    },
+};
+
+bounded_enum! {
+    /// Target memory type for the BAR0 window register.
+    ///
+    /// Only VRAM is supported; Hopper+ GPUs do not support other targets.
+    #[derive(Debug)]
+    pub(crate) enum Bar0WindowTarget with TryFrom<Bounded<u32, 2>> {
+        /// Video RAM (GPU framebuffer memory).
+        Vram = 0,
+    }
+}
+
+/// PRAMIN aperture base offset in BAR0.
+const PRAMIN_BASE: usize = 0x700000;
+
+/// PRAMIN aperture size (1MB).
+const PRAMIN_SIZE: usize = SZ_1M;
+
+/// Generate a PRAMIN read accessor that takes an absolute VRAM address.
+///
+/// `$name` matches the underlying [`Bar0`] method (e.g. `try_read32`).
+macro_rules! define_pramin_read {
+    ($name:ident, $ty:ty) => {
+        #[doc = concat!("Read a `", stringify!($ty), "` from VRAM at the given address.")]
+        pub(crate) fn $name(&mut self, vram_addr: impl Into<VramAddress>) -> Result<$ty> {
+            let (bar_offset, new_base) =
+                self.compute_window(vram_addr.into(), ::core::mem::size_of::<$ty>())?;
+
+            if let Some(base) = new_base {
+                regs::pramin_window_write_base(self.chipset.arch(), self.bar, base)?;
+                *self.state = base;
+            }
+            self.bar.$name(bar_offset)
+        }
+    };
+}
+
+/// Generate a PRAMIN write accessor that takes an absolute VRAM address.
+///
+/// `$name` matches the underlying [`Bar0`] method (e.g. `try_write32`).
+macro_rules! define_pramin_write {
+    ($name:ident, $ty:ty) => {
+        #[doc = concat!("Write a `", stringify!($ty), "` to VRAM at the given address.")]
+        pub(crate) fn $name(&mut self, vram_addr: impl Into<VramAddress>, value: $ty) -> Result {
+            let (bar_offset, new_base) =
+                self.compute_window(vram_addr.into(), ::core::mem::size_of::<$ty>())?;
+
+            if let Some(base) = new_base {
+                regs::pramin_window_write_base(self.chipset.arch(), self.bar, base)?;
+                *self.state = base;
+            }
+            self.bar.$name(value, bar_offset)
+        }
+    };
+}
+
+/// PRAMIN aperture manager.
+///
+/// Call [`Pramin::get_window()`] to acquire exclusive PRAMIN access.
+#[pin_data]
+pub(crate) struct Pramin {
+    bar: Arc<Devres<Bar0>>,
+    chipset: Chipset,
+    /// Valid VRAM region. Accesses outside this range are rejected.
+    vram_region: Range<VramAddress>,
+    /// PRAMIN aperture state, protected by a mutex.
+    ///
+    /// # Invariants
+    ///
+    /// This lock is acquired during the DMA fence signaling critical path.
+    /// It must NEVER be held across any CPU memory allocation that can enter
+    /// reclaim (`GFP_KERNEL`), because the memory reclaim path can call
+    /// `dma_fence_wait()`, which would deadlock with this lock held.
+    #[pin]
+    state: Mutex<VramAddress>,
+}
+
+impl Pramin {
+    /// Create a pin-initializer for PRAMIN.
+    ///
+    /// `vram_region` specifies the valid VRAM address range.
+    pub(crate) fn new(
+        bar: Arc<Devres<Bar0>>,
+        dev: &device::Device<device::Bound>,
+        chipset: Chipset,
+        vram_region: Range<VramAddress>,
+    ) -> Result<impl PinInit<Self>> {
+        let bar_access = bar.access(dev)?;
+        let current_base = regs::pramin_window_read_base(chipset.arch(), bar_access);
+
+        Ok(pin_init!(Self {
+            bar,
+            chipset,
+            vram_region,
+            state <- new_mutex!(current_base, "pramin_state"),
+        }))
+    }
+
+    /// Returns the valid VRAM region for this PRAMIN instance.
+    fn vram_region(&self) -> &Range<VramAddress> {
+        &self.vram_region
+    }
+
+    /// Acquire exclusive PRAMIN access.
+    ///
+    /// Returns a [`PraminWindow`] guard that provides VRAM read/write accessors.
+    /// The [`PraminWindow`] is exclusive and only one can exist at a time.
+    pub(crate) fn get_window<'a>(
+        &'a self,
+        dev: &'a device::Device<device::Bound>,
+    ) -> Result<PraminWindow<'a>> {
+        let bar = self.bar.access(dev)?;
+        let state = self.state.lock();
+        Ok(PraminWindow {
+            bar,
+            chipset: self.chipset,
+            vram_region: self.vram_region.clone(),
+            state,
+        })
+    }
+}
+
+/// PRAMIN window guard for direct VRAM access.
+///
+/// This guard holds exclusive access to the PRAMIN aperture. The window auto-repositions
+/// when accessing VRAM offsets outside the current 1MB range.
+///
+/// Only one [`PraminWindow`] can exist at a time per [`Pramin`] instance (enforced by the
+/// internal `MutexGuard`).
+pub(crate) struct PraminWindow<'a> {
+    bar: &'a Bar0,
+    chipset: Chipset,
+    vram_region: Range<VramAddress>,
+    state: MutexGuard<'a, VramAddress>,
+}
+
+impl PraminWindow<'_> {
+    /// Compute window parameters for a VRAM access.
+    ///
+    /// Returns (`bar_offset`, `new_base`) where:
+    /// - `bar_offset`: The BAR0 offset to use for the access.
+    /// - `new_base`: `Some(base)` if window needs repositioning, `None` otherwise.
+    fn compute_window(
+        &self,
+        vram_addr: VramAddress,
+        access_size: usize,
+    ) -> Result<(usize, Option<VramAddress>)> {
+        // Validate VRAM address is within the valid VRAM region.
+        let end_addr = vram_addr.checked_add(access_size).ok_or(EINVAL)?;
+        if vram_addr < self.vram_region.start || end_addr > self.vram_region.end {
+            return Err(EINVAL);
+        }
+
+        // Check if access fits within the current 1MB window.
+        let current_base = *self.state;
+        if vram_addr >= current_base {
+            let offset_within: usize = (vram_addr - current_base).into_safe_cast();
+            if offset_within + access_size <= PRAMIN_SIZE {
+                return Ok((PRAMIN_BASE + offset_within, None));
+            }
+        }
+
+        // Access doesn't fit in current window - reposition.
+        // Hardware requires 64KB alignment for the window base register.
+        let needed_base = vram_addr.align_down(SZ_64K as u64);
+        let offset_within: usize = (vram_addr - needed_base).into_safe_cast();
+
+        // Verify access fits in the 1MB window from the new base.
+        if offset_within + access_size > PRAMIN_SIZE {
+            return Err(EINVAL);
+        }
+
+        Ok((PRAMIN_BASE + offset_within, Some(needed_base)))
+    }
+
+    define_pramin_read!(try_read8, u8);
+    define_pramin_read!(try_read16, u16);
+    define_pramin_read!(try_read32, u32);
+    define_pramin_read!(try_read64, u64);
+
+    define_pramin_write!(try_write8, u8);
+    define_pramin_write!(try_write16, u16);
+    define_pramin_write!(try_write32, u32);
+    define_pramin_write!(try_write64, u64);
+}
diff --git a/drivers/gpu/nova-core/nova_core.rs b/drivers/gpu/nova-core/nova_core.rs
index 38b8aeb750ba..8bff10dbf327 100644
--- a/drivers/gpu/nova-core/nova_core.rs
+++ b/drivers/gpu/nova-core/nova_core.rs
@@ -16,6 +16,7 @@
 mod firmware;
 mod gpu;
 mod gsp;
+mod mm;
 #[macro_use]
 mod num;
 mod regs;
diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs
index 6faeed73901d..fb42d96a59b2 100644
--- a/drivers/gpu/nova-core/regs.rs
+++ b/drivers/gpu/nova-core/regs.rs
@@ -6,6 +6,10 @@
         register::WithBase,
         Io, //
     },
+    num::{
+        Bounded,
+        TryIntoBounded, //
+    },
     prelude::*,
     sizes::SizeConstants,
     time, //
@@ -31,6 +35,10 @@
         Architecture,
         Chipset, //
     },
+    mm::{
+        pramin::Bar0WindowTarget,
+        VramAddress, //
+    },
 };
 
 // PMC
@@ -115,6 +123,15 @@ fn fmt(&self, f: &mut kernel::fmt::Formatter<'_>) -> kernel::fmt::Result {
     }
 }
 
+register! {
+    /// BAR0 window control for PRAMIN access.
+    pub(crate) NV_PBUS_BAR0_WINDOW(u32) @ 0x00001700 {
+        25:24   target ?=> Bar0WindowTarget;
+        /// PRAMIN window base byte address (40-bit FB addr; bits 39:16 stored in 23:0).
+        23:0    window_base as Bounded<u64, 40> shl 16;
+    }
+}
+
 // PFB
 
 register! {
@@ -537,3 +554,108 @@ pub(crate) mod ga100 {
         }
     }
 }
+
+pub(crate) mod gh100 {
+    use kernel::io::register;
+
+    register! {
+        /// Hopper register for PRAMIN window.
+        pub(crate) NV_XAL_EP_BAR0_WINDOW(u32) @ 0x0010_fd40 {
+            /// PRAMIN window base byte address (38-bit FB addr; bits 37:16 stored in 21:0).
+            21:0    window_base as Bounded<u64, 38> shl 16;
+        }
+    }
+}
+
+pub(crate) mod gb100 {
+    use kernel::io::register;
+
+    register! {
+        /// Blackwell+ register for PRAMIN window.
+        pub(crate) NV_XAL_EP_BAR0_WINDOW(u32) @ 0x0010_fd40 {
+            /// PRAMIN window base byte address (39-bit FB addr; bits 38:16 stored in 22:0).
+            22:0    window_base as Bounded<u64, 39> shl 16;
+        }
+    }
+}
+
+/// Common interface for all PRAMIN window registers across GPU architectures.
+pub(crate) trait PraminWindow {
+    /// Reads the current PRAMIN window base address from this register.
+    fn read_base(bar: &Bar0) -> VramAddress;
+
+    /// Writes a new PRAMIN window base address into this register.
+    fn write_base(bar: &Bar0, base: VramAddress) -> Result;
+}
+
+impl PraminWindow for NV_PBUS_BAR0_WINDOW {
+    fn read_base(bar: &Bar0) -> VramAddress {
+        VramAddress::new(bar.read(NV_PBUS_BAR0_WINDOW).window_base().into())
+    }
+
+    fn write_base(bar: &Bar0, base: VramAddress) -> Result {
+        let bounded: Bounded<u64, 40> = base.raw().try_into_bounded().ok_or(EINVAL)?;
+        bar.write_reg(
+            NV_PBUS_BAR0_WINDOW::zeroed()
+                .with_target(Bar0WindowTarget::Vram)
+                .with_window_base(bounded),
+        );
+        Ok(())
+    }
+}
+
+impl PraminWindow for gh100::NV_XAL_EP_BAR0_WINDOW {
+    fn read_base(bar: &Bar0) -> VramAddress {
+        VramAddress::new(bar.read(gh100::NV_XAL_EP_BAR0_WINDOW).window_base().into())
+    }
+
+    fn write_base(bar: &Bar0, base: VramAddress) -> Result {
+        let bounded: Bounded<u64, 38> = base.raw().try_into_bounded().ok_or(EINVAL)?;
+        bar.write_reg(gh100::NV_XAL_EP_BAR0_WINDOW::zeroed().with_window_base(bounded));
+        Ok(())
+    }
+}
+
+impl PraminWindow for gb100::NV_XAL_EP_BAR0_WINDOW {
+    fn read_base(bar: &Bar0) -> VramAddress {
+        VramAddress::new(bar.read(gb100::NV_XAL_EP_BAR0_WINDOW).window_base().into())
+    }
+
+    fn write_base(bar: &Bar0, base: VramAddress) -> Result {
+        let bounded: Bounded<u64, 39> = base.raw().try_into_bounded().ok_or(EINVAL)?;
+        bar.write_reg(gb100::NV_XAL_EP_BAR0_WINDOW::zeroed().with_window_base(bounded));
+        Ok(())
+    }
+}
+
+/// Reads the current BAR0 PRAMIN window base address, dispatching to the
+/// register variant appropriate for `arch`.
+pub(crate) fn pramin_window_read_base(arch: Architecture, bar: &Bar0) -> VramAddress {
+    match arch {
+        Architecture::Turing | Architecture::Ampere | Architecture::Ada => {
+            NV_PBUS_BAR0_WINDOW::read_base(bar)
+        }
+        Architecture::Hopper => gh100::NV_XAL_EP_BAR0_WINDOW::read_base(bar),
+        Architecture::BlackwellGB10x | Architecture::BlackwellGB20x => {
+            gb100::NV_XAL_EP_BAR0_WINDOW::read_base(bar)
+        }
+    }
+}
+
+/// Writes a new BAR0 PRAMIN window base address, dispatching to the register
+/// variant appropriate for `arch`.
+pub(crate) fn pramin_window_write_base(
+    arch: Architecture,
+    bar: &Bar0,
+    base: VramAddress,
+) -> Result {
+    match arch {
+        Architecture::Turing | Architecture::Ampere | Architecture::Ada => {
+            NV_PBUS_BAR0_WINDOW::write_base(bar, base)
+        }
+        Architecture::Hopper => gh100::NV_XAL_EP_BAR0_WINDOW::write_base(bar, base),
+        Architecture::BlackwellGB10x | Architecture::BlackwellGB20x => {
+            gb100::NV_XAL_EP_BAR0_WINDOW::write_base(bar, base)
+        }
+    }
+}
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 08/12] gpu: nova-core: mm: Add VramAddress arithmetic and ordering
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add arithmetic helpers, comparison, and operator overloads for
`VramAddress` which are required in later patches for address
arithmetic.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/mm.rs | 60 +++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/drivers/gpu/nova-core/mm.rs b/drivers/gpu/nova-core/mm.rs
index f8a70f93bc03..3bc9befab397 100644
--- a/drivers/gpu/nova-core/mm.rs
+++ b/drivers/gpu/nova-core/mm.rs
@@ -59,6 +59,38 @@ pub(crate) const fn new(addr: u64) -> Self {
     pub(crate) const fn raw(&self) -> u64 {
         self.into_raw()
     }
+
+    /// Align the address down to the given power-of-two `alignment`.
+    pub(crate) const fn align_down(self, alignment: u64) -> Self {
+        Self::new(self.raw() & !(alignment - 1))
+    }
+
+    /// Add `rhs` to this address, returning `None` on overflow.
+    pub(crate) fn checked_add<O: IntoVramOffset>(self, rhs: O) -> Option<Self> {
+        self.raw()
+            .checked_add(rhs.into_vram_offset())
+            .map(Self::new)
+    }
+}
+
+/// Lossless conversion into a `u64` byte offset, for use as a [`VramAddress`] `checked_add()`
+/// operand which can be either a `u64` or a `usize`.
+pub(crate) trait IntoVramOffset {
+    /// Convert `self` into a `u64` byte offset.
+    fn into_vram_offset(self) -> u64;
+}
+
+impl IntoVramOffset for u64 {
+    fn into_vram_offset(self) -> u64 {
+        self
+    }
+}
+
+impl IntoVramOffset for usize {
+    fn into_vram_offset(self) -> u64 {
+        use crate::num::IntoSafeCast;
+        self.into_safe_cast()
+    }
 }
 
 // Allow VRAM addresses to be printed with the `{:#x}` format specifier.
@@ -68,12 +100,40 @@ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
     }
 }
 
+impl PartialOrd for VramAddress {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for VramAddress {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.into_raw().cmp(&other.into_raw())
+    }
+}
+
 impl From<Pfn> for VramAddress {
     fn from(pfn: Pfn) -> Self {
         Self::zeroed().with_frame_number(pfn)
     }
 }
 
+impl core::ops::Add<u64> for VramAddress {
+    type Output = Self;
+
+    fn add(self, rhs: u64) -> Self {
+        Self::new(self.raw() + rhs)
+    }
+}
+
+impl core::ops::Sub<VramAddress> for VramAddress {
+    type Output = u64;
+
+    fn sub(self, rhs: VramAddress) -> u64 {
+        self.raw() - rhs.raw()
+    }
+}
+
 /// Extension trait to convert a `Range<u64>` of byte addresses into a
 /// `Range<VramAddress>`.
 pub(crate) trait IntoVramRange {
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 10/12] docs: gpu: nova-core: Document the PRAMIN aperture mechanism
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add documentation for the PRAMIN aperture mechanism used by nova-core
for direct VRAM access.

Nova only uses TARGET=VRAM for VRAM access. The SYS_MEM target values
are documented for completeness but not used by the driver.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 Documentation/gpu/nova/core/pramin.rst | 123 +++++++++++++++++++++++++
 Documentation/gpu/nova/index.rst       |   1 +
 2 files changed, 124 insertions(+)
 create mode 100644 Documentation/gpu/nova/core/pramin.rst

diff --git a/Documentation/gpu/nova/core/pramin.rst b/Documentation/gpu/nova/core/pramin.rst
new file mode 100644
index 000000000000..f6cbb0811163
--- /dev/null
+++ b/Documentation/gpu/nova/core/pramin.rst
@@ -0,0 +1,123 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================
+PRAMIN aperture mechanism
+=========================
+
+.. note::
+   The following description is approximate and current as of the Ampere family.
+   It may change for future generations and is intended to assist in understanding
+   the driver code.
+
+Introduction
+============
+
+PRAMIN is a hardware aperture mechanism that provides CPU access to GPU Video RAM (VRAM) before
+the GPU's Memory Management Unit (MMU) and page tables are initialized. This 1MB sliding window,
+located at a fixed offset within BAR0, is essential for setting up page tables and other critical
+GPU data structures without relying on the GPU's MMU.
+
+Architecture Overview
+=====================
+
+The PRAMIN aperture mechanism is logically implemented by the GPU's PBUS (PCIe Bus Controller Unit)
+and provides a CPU-accessible window into VRAM through the PCIe interface::
+
+    +-----------------+    PCIe     +------------------------------+
+    |      CPU        |<----------->|           GPU                |
+    +-----------------+             |                              |
+                                    |  +----------------------+    |
+                                    |  |       PBUS           |    |
+                                    |  |  (Bus Controller)    |    |
+                                    |  |                      |    |
+                                    |  |  +--------------+<------------ (window starts at
+                                    |  |  |   PRAMIN     |    |    |     BAR0 + 0x700000)
+                                    |  |  |   Window     |    |    |
+                                    |  |  |   (1MB)      |    |    |
+                                    |  |  +--------------+    |    |
+                                    |  |         |            |    |
+                                    |  +---------|------------+    |
+                                    |            |                 |
+                                    |            v                 |
+                                    |  +----------------------+<------------ (Program PRAMIN to any
+                                    |  |       VRAM           |    |    64KB-aligned VRAM boundary)
+                                    |  |    (Several GBs)     |    |
+                                    |  |                      |    |
+                                    |  |   FB[0x0000000000]   |    |
+                                    |  |          ...         |    |
+                                    |  |   FB[0xFFFFFFFFFF]   |    |
+                                    |  +----------------------+    |
+                                    +------------------------------+
+
+PBUS (PCIe Bus Controller Unit) is responsible for, among other things, handling
+MMIO accesses to the BAR registers.
+
+PRAMIN Window Operation
+=======================
+
+The PRAMIN window provides a 1MB sliding aperture that can be repositioned over
+the entire VRAM address space using the ``NV_PBUS_BAR0_WINDOW`` register.
+
+Window Control Mechanism
+-------------------------
+
+::
+
+    NV_PBUS_BAR0_WINDOW Register (0x1700):
+    +-------+--------+--------------------------------------+
+    | 31:26 | 25:24  |               23:0                   |
+    | RSVD  | TARGET |            BASE_ADDR                 |
+    |       |        |        (bits 39:16 of VRAM address)  |
+    +-------+--------+--------------------------------------+
+
+    The 24-bit BASE_ADDR field encodes bits [39:16] of the target VRAM address,
+    providing 40-bit (1TB) address space coverage with 64KB alignment.
+
+    TARGET field (bits 25:24):
+    - 0x0: VRAM (Video Memory)
+    - 0x1: SYS_MEM_COH (Coherent System Memory)
+    - 0x2: SYS_MEM_NONCOH (Non-coherent System Memory)
+    - 0x3: Reserved
+
+.. note::
+   Nova only uses TARGET=VRAM (0x0) for video memory access. The SYS_MEM
+   target values are documented here for hardware completeness but are
+   not used by the driver.
+
+64KB Alignment Requirement
+---------------------------
+
+The PRAMIN window must be aligned to 64KB boundaries in VRAM. This is enforced
+by the ``BASE_ADDR`` field representing bits [39:16] of the target address::
+
+    VRAM Address Calculation:
+    actual_vram_addr = (BASE_ADDR << 16) + pramin_offset
+    Where:
+    - BASE_ADDR: 24-bit value from NV_PBUS_BAR0_WINDOW[23:0]
+    - pramin_offset: 20-bit offset within the PRAMIN window [0x00000-0xFFFFF]
+
+    Example Window Positioning:
+    +---------------------------------------------------------+
+    |                    VRAM Space                           |
+    |                                                         |
+    |  0x0000000000 +-----------------+ <-- 64KB aligned      |
+    |               | PRAMIN Window   |                       |
+    |               |    (1MB)        |                       |
+    |  0x00000FFFFF +-----------------+                       |
+    |                                                         |
+    |       |              ^                                  |
+    |       |              | Window can slide                 |
+    |       v              | to any 64KB-aligned boundary     |
+    |                                                         |
+    |  0x0123400000 +-----------------+ <-- 64KB aligned      |
+    |               | PRAMIN Window   |                       |
+    |               |    (1MB)        |                       |
+    |  0x01234FFFFF +-----------------+                       |
+    |                                                         |
+    |                       ...                               |
+    |                                                         |
+    |  0xFFFFF00000 +-----------------+ <-- 64KB aligned      |
+    |               | PRAMIN Window   |                       |
+    |               |    (1MB)        |                       |
+    |  0xFFFFFFFFFF +-----------------+                       |
+    +---------------------------------------------------------+
diff --git a/Documentation/gpu/nova/index.rst b/Documentation/gpu/nova/index.rst
index e39cb3163581..b8254b1ffe2a 100644
--- a/Documentation/gpu/nova/index.rst
+++ b/Documentation/gpu/nova/index.rst
@@ -32,3 +32,4 @@ vGPU manager VFIO driver and the nova-drm driver.
    core/devinit
    core/fwsec
    core/falcon
+   core/pramin
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 11/12] gpu: nova-core: mm: Add GpuMm centralized memory manager
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Introduce GpuMm as the centralized GPU memory manager. At this point in
the series, GpuMm only owns the PRAMIN window for direct VRAM access;
the buddy allocator and TLB manager are added later when those backing
types become available.

This establishes a clean ownership model in which GpuMm exposes accessor
methods for its components that can be used for memory management
operations, and lets follow-on patches (such as the PRAMIN aperture
self-tests) reference `self.mm.pramin()` cleanly.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/gpu.rs | 22 +++++++++++++++++
 drivers/gpu/nova-core/mm.rs  | 46 ++++++++++++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index d9d1a7417a2e..38544c38d660 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -25,6 +25,10 @@
         commands::GetGspStaticInfoReply,
         Gsp, //
     },
+    mm::{
+        GpuMm,
+        IntoVramRange, //
+    },
     regs,
 };
 
@@ -261,6 +265,8 @@ pub(crate) struct Gpu {
     gsp_falcon: Falcon<GspFalcon>,
     /// SEC2 falcon instance, used for GSP boot up and cleanup.
     sec2_falcon: Falcon<Sec2Falcon>,
+    /// GPU memory manager owning memory management resources.
+    mm: Arc<GpuMm>,
     /// GSP runtime data. Temporarily an empty placeholder.
     #[pin]
     gsp: Gsp,
@@ -306,6 +312,22 @@ pub(crate) fn new<'a>(
                     );
                 })?,
 
+            // Create GPU memory manager owning memory management resources.
+            mm: {
+                // PRAMIN covers all physical VRAM (including GSP-reserved areas
+                // above the usable region, e.g. the BAR1 page directory).
+                let pramin_vram_region = (0..gsp_static_info.total_fb_end).into_vram_range();
+                Arc::pin_init(
+                    GpuMm::new(
+                        devres_bar.clone(),
+                        pdev.as_ref(),
+                        spec.chipset,
+                        pramin_vram_region,
+                    )?,
+                    GFP_KERNEL,
+                )?
+            },
+
             bar: devres_bar,
         })
     }
diff --git a/drivers/gpu/nova-core/mm.rs b/drivers/gpu/nova-core/mm.rs
index f425467281d3..5c1941d20d1b 100644
--- a/drivers/gpu/nova-core/mm.rs
+++ b/drivers/gpu/nova-core/mm.rs
@@ -2,7 +2,7 @@
 
 //! Memory management subsystems for nova-core.
 
-#![expect(dead_code)]
+#![allow(dead_code)]
 
 /// Implements `From` conversions between a frame-number type and `Bounded<u64, N>`.
 ///
@@ -37,10 +37,52 @@ macro_rules! impl_pfn_bounded {
 
 use kernel::{
     bitfield,
+    device,
+    devres::Devres,
     num::Bounded,
-    prelude::*, //
+    prelude::*,
+    sync::Arc, //
 };
 
+use crate::{
+    driver::Bar0,
+    gpu::Chipset, //
+};
+
+/// GPU Memory Manager - owns all core MM components.
+///
+/// Provides centralized ownership of memory management resources:
+/// - [`pramin::Pramin`] for direct VRAM access.
+#[pin_data]
+pub(crate) struct GpuMm {
+    #[pin]
+    pramin: pramin::Pramin,
+}
+
+impl GpuMm {
+    /// Create a pin-initializer for `GpuMm`.
+    ///
+    /// `pramin_vram_region` is the full physical VRAM range (including GSP-reserved
+    /// areas). PRAMIN window accesses are validated against this range.
+    pub(crate) fn new(
+        bar: Arc<Devres<Bar0>>,
+        dev: &device::Device<device::Bound>,
+        chipset: Chipset,
+        pramin_vram_region: Range<VramAddress>,
+    ) -> Result<impl PinInit<Self>> {
+        let pramin_init = pramin::Pramin::new(bar, dev, chipset, pramin_vram_region)?;
+
+        Ok(pin_init!(Self {
+            pramin <- pramin_init,
+        }))
+    }
+
+    /// Access the [`pramin::Pramin`].
+    pub(crate) fn pramin(&self) -> &pramin::Pramin {
+        &self.pramin
+    }
+}
+
 bitfield! {
     /// Physical VRAM address in GPU video memory.
     pub(crate) struct VramAddress(u64) {
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 12/12] gpu: nova-core: mm: Add PRAMIN aperture self-tests
From: Joel Fernandes @ 2026-05-18 18:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Bjorn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Dave Airlie, Daniel Almeida, dri-devel, rust-for-linux, nova-gpu,
	Nikola Djukic, David Airlie, Boqun Feng, John Hubbard,
	Alistair Popple, Timur Tabi, Edwin Peer, Alexandre Courbot,
	Andrea Righi, Andy Ritger, Zhi Wang, Balbir Singh,
	Philipp Stanner, alexeyi, Eliot Courtney, joel, linux-doc,
	Joel Fernandes
In-Reply-To: <20260518180342.2387845-1-joelagnelf@nvidia.com>

Add self-tests for the PRAMIN aperture mechanism to verify correct
operation during GPU probe. The tests validate various alignment
requirements and corner cases.

The tests are disabled by default and gated behind CONFIG_NOVA_MM_SELFTESTS.
When enabled, they run after GSP boot during probe.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 drivers/gpu/nova-core/Kconfig      |  10 ++
 drivers/gpu/nova-core/driver.rs    |   2 +
 drivers/gpu/nova-core/gpu.rs       |   9 ++
 drivers/gpu/nova-core/mm.rs        |  16 +++
 drivers/gpu/nova-core/mm/pramin.rs | 214 +++++++++++++++++++++++++++++
 5 files changed, 251 insertions(+)

diff --git a/drivers/gpu/nova-core/Kconfig b/drivers/gpu/nova-core/Kconfig
index f918f69e0599..abf10e82647b 100644
--- a/drivers/gpu/nova-core/Kconfig
+++ b/drivers/gpu/nova-core/Kconfig
@@ -15,3 +15,13 @@ config NOVA_CORE
 	  This driver is work in progress and may not be functional.
 
 	  If M is selected, the module will be called nova-core.
+
+config NOVA_MM_SELFTESTS
+	bool "Memory management self-tests"
+	depends on NOVA_CORE
+	help
+	  Enable self-tests for the memory management subsystem. When enabled,
+	  tests are run during GPU probe to verify PRAMIN aperture access,
+	  page table walking, and BAR1 virtual memory mapping functionality.
+
+	  This is a testing option and is default-disabled.
diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs
index 84b0e1703150..77746d6949d7 100644
--- a/drivers/gpu/nova-core/driver.rs
+++ b/drivers/gpu/nova-core/driver.rs
@@ -96,6 +96,8 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
 
             Ok(try_pin_init!(Self {
                 gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?),
+                // Run optional GPU selftests.
+                _: { gpu.run_selftests(pdev)? },
                 _reg <- auxiliary::Registration::new(
                     pdev.as_ref(),
                     c"nova-drm",
diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index 38544c38d660..aa047fe91054 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -342,4 +342,13 @@ pub(crate) fn unbind(&self, dev: &device::Device<device::Core>) {
             .inspect(|bar| self.sysmem_flush.unregister(bar))
             .is_err());
     }
+
+    /// Run selftests on the constructed [`Gpu`].
+    pub(crate) fn run_selftests(
+        self: Pin<&mut Self>,
+        pdev: &pci::Device<device::Bound>,
+    ) -> Result {
+        crate::mm::run_mm_selftests(pdev, &self.mm, self.spec.chipset)?;
+        Ok(())
+    }
 }
diff --git a/drivers/gpu/nova-core/mm.rs b/drivers/gpu/nova-core/mm.rs
index 5c1941d20d1b..08d74710f790 100644
--- a/drivers/gpu/nova-core/mm.rs
+++ b/drivers/gpu/nova-core/mm.rs
@@ -40,6 +40,7 @@ macro_rules! impl_pfn_bounded {
     device,
     devres::Devres,
     num::Bounded,
+    pci,
     prelude::*,
     sync::Arc, //
 };
@@ -83,6 +84,21 @@ pub(crate) fn pramin(&self) -> &pramin::Pramin {
     }
 }
 
+/// Run MM subsystem self-tests during probe.
+///
+/// No-op when `CONFIG_NOVA_MM_SELFTESTS` is not enabled.
+#[cfg_attr(not(CONFIG_NOVA_MM_SELFTESTS), allow(unused_variables))]
+pub(crate) fn run_mm_selftests(
+    pdev: &pci::Device<device::Bound>,
+    mm: &Arc<GpuMm>,
+    chipset: Chipset,
+) -> Result {
+    #[cfg(CONFIG_NOVA_MM_SELFTESTS)]
+    pramin::run_self_test(pdev.as_ref(), mm.pramin(), chipset)?;
+
+    Ok(())
+}
+
 bitfield! {
     /// Physical VRAM address in GPU video memory.
     pub(crate) struct VramAddress(u64) {
diff --git a/drivers/gpu/nova-core/mm/pramin.rs b/drivers/gpu/nova-core/mm/pramin.rs
index 38758ca971be..73d516c91c15 100644
--- a/drivers/gpu/nova-core/mm/pramin.rs
+++ b/drivers/gpu/nova-core/mm/pramin.rs
@@ -296,3 +296,217 @@ fn compute_window(
     define_pramin_write!(try_write32, u32);
     define_pramin_write!(try_write64, u64);
 }
+
+#[cfg(CONFIG_NOVA_MM_SELFTESTS)]
+mod selftest {
+    use super::*;
+    use crate::{
+        mm::VramAddress,
+        num::IntoSafeCast, //
+    };
+    use kernel::{
+        device,
+        prelude::*, //
+    };
+
+    /// Offset within the VRAM region to use as the self-test area.
+    const SELFTEST_REGION_OFFSET: u64 = 0x1000;
+
+    /// Test read/write at byte-aligned locations.
+    fn test_byte_readwrite(
+        dev: &kernel::device::Device,
+        win: &mut PraminWindow<'_>,
+        base: VramAddress,
+    ) -> Result {
+        for i in 0u8..4 {
+            let offset = base + 1 + u64::from(i);
+            let val = 0xA0 + i;
+            win.try_write8(offset, val)?;
+            let read_val = win.try_read8(offset)?;
+            if read_val != val {
+                dev_err!(
+                    dev,
+                    "PRAMIN: FAIL - offset {:#x}: wrote {:#x}, read {:#x}\n",
+                    offset,
+                    val,
+                    read_val
+                );
+                return Err(EIO);
+            }
+        }
+        Ok(())
+    }
+
+    /// Test writing a `u32` and reading back as individual `u8`s.
+    fn test_u32_as_bytes(
+        dev: &kernel::device::Device,
+        win: &mut PraminWindow<'_>,
+        base: VramAddress,
+    ) -> Result {
+        let offset = base + 0x10;
+        let val: u32 = 0xDEADBEEF;
+        win.try_write32(offset, val)?;
+
+        // Read back as individual bytes (little-endian: EF BE AD DE).
+        let expected_bytes: [u8; 4] = [0xEF, 0xBE, 0xAD, 0xDE];
+        for (i, &expected) in expected_bytes.iter().enumerate() {
+            let i_u64: u64 = i.into_safe_cast();
+            let read_val = win.try_read8(offset + i_u64)?;
+            if read_val != expected {
+                dev_err!(
+                    dev,
+                    "PRAMIN: FAIL - offset {:#x}: expected {:#x}, read {:#x}\n",
+                    offset + i_u64,
+                    expected,
+                    read_val
+                );
+                return Err(EIO);
+            }
+        }
+        Ok(())
+    }
+
+    /// Test window repositioning across 1MB boundaries.
+    fn test_window_reposition(
+        dev: &kernel::device::Device,
+        win: &mut PraminWindow<'_>,
+        base: VramAddress,
+    ) -> Result {
+        let offset_a = base;
+        let offset_b = base + 0x200000; // base + 2MB (different 1MB region).
+        let val_a: u32 = 0x11111111;
+        let val_b: u32 = 0x22222222;
+
+        win.try_write32(offset_a, val_a)?;
+        win.try_write32(offset_b, val_b)?;
+
+        let read_b = win.try_read32(offset_b)?;
+        if read_b != val_b {
+            dev_err!(
+                dev,
+                "PRAMIN: FAIL - offset {:#x}: expected {:#x}, read {:#x}\n",
+                offset_b,
+                val_b,
+                read_b
+            );
+            return Err(EIO);
+        }
+
+        let read_a = win.try_read32(offset_a)?;
+        if read_a != val_a {
+            dev_err!(
+                dev,
+                "PRAMIN: FAIL - offset {:#x}: expected {:#x}, read {:#x}\n",
+                offset_a,
+                val_a,
+                read_a
+            );
+            return Err(EIO);
+        }
+        Ok(())
+    }
+
+    /// Test that offsets outside the VRAM region are rejected.
+    fn test_invalid_offset(
+        dev: &kernel::device::Device,
+        win: &mut PraminWindow<'_>,
+        vram_end: VramAddress,
+    ) -> Result {
+        let result = win.try_read32(vram_end);
+        if result.is_ok() {
+            dev_err!(
+                dev,
+                "PRAMIN: FAIL - read at invalid offset {:#x} should have failed\n",
+                vram_end
+            );
+            return Err(EIO);
+        }
+        Ok(())
+    }
+
+    /// Test that misaligned multi-byte accesses are rejected.
+    fn test_misaligned_access(
+        dev: &kernel::device::Device,
+        win: &mut PraminWindow<'_>,
+        base: VramAddress,
+    ) -> Result {
+        // `u16` at odd offset (not 2-byte aligned).
+        let offset_u16 = base + 0x21;
+        if win.try_write16(offset_u16, 0xABCD).is_ok() {
+            dev_err!(
+                dev,
+                "PRAMIN: FAIL - misaligned u16 write at {:#x} should have failed\n",
+                offset_u16
+            );
+            return Err(EIO);
+        }
+
+        // `u32` at 2-byte-aligned (not 4-byte-aligned) offset.
+        let offset_u32 = base + 0x32;
+        if win.try_write32(offset_u32, 0x12345678).is_ok() {
+            dev_err!(
+                dev,
+                "PRAMIN: FAIL - misaligned u32 write at {:#x} should have failed\n",
+                offset_u32
+            );
+            return Err(EIO);
+        }
+
+        // `u64` read at 4-byte-aligned (not 8-byte-aligned) offset.
+        let offset_u64 = base + 0x44;
+        if win.try_read64(offset_u64).is_ok() {
+            dev_err!(
+                dev,
+                "PRAMIN: FAIL - misaligned u64 read at {:#x} should have failed\n",
+                offset_u64
+            );
+            return Err(EIO);
+        }
+        Ok(())
+    }
+
+    /// Run PRAMIN self-tests during boot if self-tests are enabled.
+    pub(crate) fn run_self_test(
+        pdev: &device::Device<device::Bound>,
+        pramin: &Pramin,
+        chipset: crate::gpu::Chipset,
+    ) -> Result {
+        use crate::gpu::Architecture;
+
+        let dev = pdev;
+
+        // These self-tests are only run on pre-Hopper GPUs, which use NV_PBUS_BAR0_WINDOW.
+        // NOTE(review): Hopper+ (NV_XAL_EP_BAR0_WINDOW) accessors are added to regs.rs earlier
+        // in this series - confirm whether this skip is still required.
+        if !matches!(
+            chipset.arch(),
+            Architecture::Turing | Architecture::Ampere | Architecture::Ada
+        ) {
+            dev_info!(
+                dev,
+                "PRAMIN: Skipping self-tests for {:?} (only pre-Hopper supported)\n",
+                chipset
+            );
+            return Ok(());
+        }
+
+        dev_info!(dev, "PRAMIN: Starting self-test...\n");
+
+        let vram_region = pramin.vram_region();
+        let base = vram_region.start + SELFTEST_REGION_OFFSET;
+        let vram_end = vram_region.end;
+        let mut win = pramin.get_window(pdev)?;
+
+        test_byte_readwrite(dev, &mut win, base)?;
+        test_u32_as_bytes(dev, &mut win, base)?;
+        test_window_reposition(dev, &mut win, base)?;
+        test_invalid_offset(dev, &mut win, vram_end)?;
+        test_misaligned_access(dev, &mut win, base)?;
+
+        dev_info!(dev, "PRAMIN: All self-tests PASSED\n");
+        Ok(())
+    }
+}
+
+#[cfg(CONFIG_NOVA_MM_SELFTESTS)]
+pub(crate) use selftest::run_self_test;
-- 
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox