From: Jason Gunthorpe <jgg@nvidia.com>
To: Alex Williamson <alex@shazbot.org>,
David Matlack <dmatlack@google.com>,
kvm@vger.kernel.org, Leon Romanovsky <leon@kernel.org>,
linux-kselftest@vger.kernel.org, linux-rdma@vger.kernel.org,
Mark Bloch <mbloch@nvidia.com>,
netdev@vger.kernel.org, Saeed Mahameed <saeedm@nvidia.com>,
Shuah Khan <shuah@kernel.org>, Tariq Toukan <tariqt@nvidia.com>
Cc: patches@lists.linux.dev
Subject: [PATCH 11/11] vfio: selftests: mlx5 driver - add send_msi support
Date: Thu, 30 Apr 2026 21:08:37 -0300
Message-ID: <11-v1-dc5fa250ca1d+3213-mlx5st_jgg@nvidia.com>
In-Reply-To: <0-v1-dc5fa250ca1d+3213-mlx5st_jgg@nvidia.com>
Wire an MSI-X vector to a dedicated EQ so the mlx5 driver supports
send_msi().
Each EQ can be linked to an MSI-X vector, and the CQ can be set up
to deliver an event to the EQ. Thus, when everything is armed, an
RDMA WRITE posted to the QP generates a CQE, which generates an
EQE, which raises an MSI-X interrupt.
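In mlx5 command terms the linkage is just two fields, condensed here
from the hunks below:

    /* EQ -> MSI-X vector (CREATE_EQ) */
    MLX5_SET(eqc, eqc, intr, MSI_VECTOR);

    /* CQ -> EQ (CREATE_CQ) */
    MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->msi_eqn);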
To keep things simple this re-uses the existing QP and CQ, so
memcpy operations also generate single MSIs on completion.
send_msi() drains any accumulated MSI EQ events from prior memcpy
completions, posts a small signaled RDMA Write, then polls the CQ to
consume the resulting CQE (avoiding stale completions on subsequent
test cycles).
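As a rough sketch of the intended test-side use (the eventfd accessor
below is a hypothetical placeholder; only vfio_pci_msix_enable() and
mlx5st_ops come from this series):

    /* Hypothetical check, not part of this patch */
    u64 count;
    int efd = test_msix_eventfd(device, MSI_VECTOR); /* placeholder */

    mlx5st_ops.send_msi(device);
    VFIO_ASSERT_EQ(read(efd, &count, sizeof(count)),
                   (ssize_t)sizeof(count));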
Assisted-by: Claude:claude-opus-4.6
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
.../selftests/vfio/lib/drivers/mlx5/mlx5.c | 165 +++++++++++++++++-
.../selftests/vfio/lib/drivers/mlx5/mlx5_hw.h | 6 +
2 files changed, 168 insertions(+), 3 deletions(-)
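The init-time ordering the patch relies on, condensed from the
mlx5st_init() hunk below:

    mlx5st_create_msi_eq(dev);      /* must precede CQ creation */
    mlx5st_setup_datapath(dev);     /* CQ's c_eqn references dev->msi_eqn */
    vfio_pci_msix_enable(device, MSI_VECTOR, 1);
    device->driver.msi = MSI_VECTOR;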
diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
index 39c5414e2c743c..cf6c436a6df0de 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
@@ -56,17 +56,23 @@ struct mlx5st_device {
/* CQ */
u32 cqn;
u32 cq_ci;
+ u32 cq_arm_sn;
/* UAR */
u32 uar_page;
void __iomem *uar_base;
unsigned int uar_bf_offset;
- /* EQ */
+ /* EQ (cmd/pages events; polled, not interrupt-driven) */
u32 eqn;
u32 eq_cons_index;
bool have_eq;
+ /* MSI EQ (CQ completion events; fires MSI-X) */
+ u32 msi_eqn;
+ u32 msi_eq_cons_index;
+ bool have_msi_eq;
+
/* Async pages slot state */
bool pages_slot_in_use;
bool pages_slot_is_reclaim;
@@ -89,6 +95,10 @@ struct mlx5st_device {
/* Capabilities */
bool fl_supported;
+ /* Buffers used by send_msi() to trigger an interrupt */
+ u64 send_msi_src;
+ u64 send_msi_dst;
+
/*
* HW-visible DMA buffers below — device reads/writes via DMA.
*/
@@ -111,6 +121,9 @@ struct mlx5st_device {
/* EQ does not support page_offset */
struct mlx5st_eqe eq_buf[EQ_NENT] __aligned(MLX5_HW_PAGE_SIZE);
+ /* MSI EQ buffer: CQ completions generate EQEs here -> MSI-X */
+ struct mlx5st_eqe msi_eq_buf[MSI_EQ_NENT] __aligned(MLX5_HW_PAGE_SIZE);
+
u8 fw_pages[MAX_FW_PAGES][MLX5_HW_PAGE_SIZE]
__aligned(MLX5_HW_PAGE_SIZE);
};
@@ -133,6 +146,9 @@ static_assert(offsetof(struct mlx5st_device, qp_dbrec) % 64 == 0,
static_assert(offsetof(struct mlx5st_device, eq_buf) %
MLX5_HW_PAGE_SIZE == 0,
"eq_buf must be page-aligned");
+static_assert(offsetof(struct mlx5st_device, msi_eq_buf) %
+ MLX5_HW_PAGE_SIZE == 0,
+ "msi_eq_buf must be page-aligned");
static_assert(offsetof(struct mlx5st_device, fw_pages) %
MLX5_HW_PAGE_SIZE == 0,
"fw_pages must be page-aligned");
@@ -1012,6 +1028,85 @@ static void mlx5st_process_events(struct mlx5st_device *dev)
mlx5st_eq_update_ci(dev, cc, 0);
}
+/*
+ * MSI EQ: a dedicated EQ for CQ completion events that fires MSI-X.
+ * Separate from the cmd/pages EQ so that only CQ completions (from
+ * send_msi or memcpy) trigger the interrupt vector.
+ */
+
+static void mlx5st_msi_eq_drain(struct mlx5st_device *dev)
+{
+ u32 cc = 0;
+ u32 val;
+
+ while (cc < MSI_EQ_NENT) {
+ u32 ci = dev->msi_eq_cons_index + cc;
+ struct mlx5st_eqe *eqe =
+ &dev->msi_eq_buf[ci % MSI_EQ_NENT];
+
+ if (MLX5_GET_ONCE(eqe, eqe, owner) != !!(ci & MSI_EQ_NENT))
+ break;
+ cc++;
+ }
+
+ /* Update consumer index and re-arm for next interrupt */
+ dev->msi_eq_cons_index += cc;
+ val = (dev->msi_eq_cons_index & 0xffffff) | (dev->msi_eqn << 24);
+ iowrite32be(val, (u8 __iomem *)dev->uar_base + MLX5_EQ_DOORBELL_OFFSET);
+}
+
+static void mlx5st_create_msi_eq(struct mlx5st_device *dev)
+{
+ struct vfio_pci_device *device = dev->device;
+ u64 in[MLX5_ST_SZ_QW(create_eq_in) + 1] = {};
+ u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {};
+ struct mlx5_ifc_eqc_bits *eqc;
+ unsigned int i;
+ __be64 *pas;
+
+ /* Initialize EQE owner bits */
+ for (i = 0; i < MSI_EQ_NENT; i++) {
+ struct mlx5st_eqe *eqe = &dev->msi_eq_buf[i];
+
+ MLX5_SET_ONCE(eqe, eqe, owner, 1);
+ }
+
+ MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
+
+ /*
+ * No event_bitmask: completion events are routed to this EQ via
+ * the CQ's c_eqn field, not through CREATE_EQ subscription.
+ */
+ eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
+ MLX5_SET(eqc, eqc, log_eq_size, LOG_MSI_EQ_SIZE);
+ MLX5_SET(eqc, eqc, uar_page, dev->uar_page);
+ MLX5_SET(eqc, eqc, intr, MSI_VECTOR);
+ pas = MLX5_ADDR_OF(create_eq_in, in, pas);
+ VFIO_ASSERT_EQ(mlx5st_fill_pas(device, dev->msi_eq_buf, pas), 0u);
+ MLX5_SET(eqc, eqc, log_page_size, 0);
+
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+
+ dev->msi_eqn = MLX5_GET(create_eq_out, out, eq_number);
+ dev->msi_eq_cons_index = 0;
+ dev->have_msi_eq = true;
+ mlx5st_msi_eq_drain(dev);
+
+ dev_dbg(device,
+ "Created MSI EQ: eqn=%u, %d entries (COMP), vector=%d\n",
+ dev->msi_eqn, MSI_EQ_NENT, MSI_VECTOR);
+}
+
+static void mlx5st_destroy_msi_eq(struct mlx5st_device *dev)
+{
+ u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {};
+
+ MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
+ MLX5_SET(destroy_eq_in, in, eq_number, dev->msi_eqn);
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
/*
* HCA init / teardown
*/
@@ -1366,7 +1461,7 @@ static void mlx5st_create_cq(struct mlx5st_device *dev)
cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
MLX5_SET(cqc, cqc, log_cq_size, LOG_CQ_SIZE);
MLX5_SET(cqc, cqc, uar_page, dev->uar_page);
- MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->eqn);
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->msi_eqn);
MLX5_SET(cqc, cqc, cqe_sz, 0);
pas = MLX5_ADDR_OF(create_cq_in, in, pas);
MLX5_SET(cqc, cqc, page_offset, mlx5st_fill_pas(device, dev->cq_buf, pas));
@@ -1391,6 +1486,30 @@ static void mlx5st_destroy_cq(struct mlx5st_device *dev)
mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
+/*
+ * Arm CQ for event generation. The CQ event delivery state machine is
+ * single-shot: after generating one EQE the CQ enters "Fired" state and
+ * won't generate another until re-armed via ARM_NEXT. Both the CQ doorbell
+ * record and the UAR CQ doorbell register must be written.
+ */
+static void mlx5st_arm_cq(struct mlx5st_device *dev)
+{
+ u32 sn = dev->cq_arm_sn & 3;
+ u32 ci = dev->cq_ci & 0xffffff;
+ u64 doorbell;
+
+ /* Update CQ doorbell record arm word */
+ WRITE_ONCE(dev->cq_dbrec.send_counter,
+ cpu_to_be32(sn << 28 | ci));
+
+ /* Ring the CQ doorbell register; iowrite64be() implies a dma_wmb() */
+ doorbell = ((u64)(sn << 28 | ci) << 32) | dev->cqn;
+ iowrite64be(doorbell,
+ (u8 __iomem *)dev->uar_base + MLX5_CQ_DOORBELL_OFFSET);
+
+ dev->cq_arm_sn++;
+}
+
/*
* QP create/destroy
*/
@@ -1647,6 +1766,7 @@ static void mlx5st_teardown_datapath(struct mlx5st_device *dev)
}
dev->sq_pi = 0;
dev->sq_ci = 0;
+ dev->cq_arm_sn = 0;
memset(&dev->qp_dbrec, 0, sizeof(dev->qp_dbrec));
memset(&dev->cq_dbrec, 0, sizeof(dev->cq_dbrec));
}
@@ -1688,6 +1808,34 @@ static int mlx5st_memcpy_wait(struct vfio_pci_device *device)
return ret;
}
+/*
+ * send_msi callback: trigger CQE -> EQE -> MSI-X via a small RDMA Write.
+ *
+ * Both the CQ and MSI EQ use single-shot arming: the CQ must be armed so the
+ * CQE generates an EQE, and the MSI EQ must be armed so the EQE fires MSI-X.
+ */
+static void mlx5st_send_msi(struct vfio_pci_device *device)
+{
+ struct mlx5st_device *dev = to_mlx5st(device);
+
+ /* Drain accumulated MSI EQ events and re-arm for next interrupt */
+ mlx5st_msi_eq_drain(dev);
+
+ /* Arm CQ so the next CQE generates an EQE on the MSI EQ */
+ mlx5st_arm_cq(dev);
+
+ /* Post a signaled RDMA Write to trigger CQE -> EQE -> MSI-X */
+ mlx5st_post_rdma_write(dev,
+ to_iova(device, &dev->send_msi_src),
+ dev->global_lkey,
+ to_iova(device, &dev->send_msi_dst),
+ dev->global_rkey,
+ sizeof(dev->send_msi_src), true);
+
+ /* Consume the CQE to avoid stale completions */
+ VFIO_ASSERT_EQ(mlx5st_poll_cq(dev, MLX5ST_MEMCPY_TIMEOUT_MS), 0);
+}
+
/*
* Driver ops callbacks
*/
@@ -1716,8 +1864,13 @@ static void mlx5st_init(struct vfio_pci_device *device)
mlx5st_alloc_pd(dev);
mlx5st_create_mkey(dev);
+ /* The MSI EQ must be created before the CQ so the CQ can reference its eqn */
+ mlx5st_create_msi_eq(dev);
mlx5st_setup_datapath(dev);
+ vfio_pci_msix_enable(device, MSI_VECTOR, 1);
+ device->driver.msi = MSI_VECTOR;
+
device->driver.max_memcpy_size = 1 << 20;
device->driver.max_memcpy_count = SQ_WQE_CNT - 1;
@@ -1728,8 +1881,14 @@ static void mlx5st_remove(struct vfio_pci_device *device)
{
struct mlx5st_device *dev = to_mlx5st(device);
+ vfio_pci_msix_disable(device);
mlx5st_teardown_datapath(dev);
+ if (dev->have_msi_eq) {
+ mlx5st_destroy_msi_eq(dev);
+ dev->have_msi_eq = false;
+ }
+
dev_dbg(device, "teardown: destroy_mkey\n");
if (dev->mkey_index) {
mlx5st_destroy_mkey(dev);
@@ -1757,5 +1916,5 @@ struct vfio_pci_driver_ops mlx5st_ops = {
.remove = mlx5st_remove,
.memcpy_start = mlx5st_memcpy_start,
.memcpy_wait = mlx5st_memcpy_wait,
- .send_msi = NULL,
+ .send_msi = mlx5st_send_msi,
};
diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
index a2506ec8a19523..2c451e411ec13f 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
@@ -80,6 +80,9 @@ struct mlx5st_dbrec {
#define MLX5_BF_OFFSET 0x800
#define MLX5_BF_SIZE 0x100
+/* CQ doorbell offset within UAR page */
+#define MLX5_CQ_DOORBELL_OFFSET 0x20
+
/* EQ doorbell offset within UAR page */
#define MLX5_EQ_DOORBELL_OFFSET 0x40
@@ -94,6 +97,9 @@ struct mlx5st_dbrec {
#define LOG_CQ_SIZE 4
#define EQ_NENT 64
#define LOG_EQ_SIZE 6
+#define MSI_EQ_NENT 16
+#define LOG_MSI_EQ_SIZE 4
+#define MSI_VECTOR 0
#define MAX_FW_PAGES 8192
#define MAX_FW_PAGES_PER_CMD 512
--
2.43.0