From: Jason Gunthorpe <jgg@nvidia.com>
To: Alex Williamson <alex@shazbot.org>,
David Matlack <dmatlack@google.com>,
kvm@vger.kernel.org, Leon Romanovsky <leon@kernel.org>,
linux-kselftest@vger.kernel.org, linux-rdma@vger.kernel.org,
Mark Bloch <mbloch@nvidia.com>,
netdev@vger.kernel.org, Saeed Mahameed <saeedm@nvidia.com>,
Shuah Khan <shuah@kernel.org>, Tariq Toukan <tariqt@nvidia.com>
Cc: patches@lists.linux.dev
Subject: [PATCH 10/11] vfio: selftests: Add mlx5 driver - data path and memcpy ops
Date: Thu, 30 Apr 2026 21:08:36 -0300 [thread overview]
Message-ID: <10-v1-dc5fa250ca1d+3213-mlx5st_jgg@nvidia.com> (raw)
In-Reply-To: <0-v1-dc5fa250ca1d+3213-mlx5st_jgg@nvidia.com>
Complete the mlx5 driver by adding CQ/QP creation, QP state
transitions, WQE posting, CQ polling, and the
memcpy_start/memcpy_wait callbacks. After this patch the driver is
functional for DMA tests.
The data path implements RDMA Write self-loopback via an RC QP with
force-loopback. WQEs are posted to a 16-entry send queue with an
NC doorbell, and completions are polled from a 16-entry CQ.
Assisted-by: Claude:claude-opus-4.6
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
.../selftests/vfio/lib/drivers/mlx5/mlx5.c | 359 +++++++++++++++++-
1 file changed, 357 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
index 0ab941bad7a66c..39c5414e2c743c 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
@@ -1340,6 +1340,354 @@ static void mlx5st_destroy_mkey(struct mlx5st_device *dev)
mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
+/*
+ * CQ create/destroy
+ */
+
+/*
+ * Create the CQ that receives send completions for the data path.
+ * mlx5st_poll_cq_batch() consumes entries from dev->cq_buf and mirrors
+ * the consumer index into dev->cq_dbrec.
+ */
+static void mlx5st_create_cq(struct mlx5st_device *dev)
+{
+ struct vfio_pci_device *device = dev->device;
+ /*
+  * u64 elements keep the mailbox 8-byte aligned; the extra qword is
+  * presumably room for the PAS entry written via mlx5st_fill_pas()
+  * below — TODO confirm against mlx5st_fill_pas().
+  */
+ u64 in[MLX5_ST_SZ_QW(create_cq_in) + 1] = {};
+ u32 out[MLX5_ST_SZ_DW(create_cq_out)] = {};
+ struct mlx5_ifc_cqc_bits *cqc;
+ unsigned int i;
+ __be64 *pas;
+
+ /*
+  * Initialize CQEs before CREATE_CQ: opcode=0xF, owner=1. This marks
+  * every entry invalid so mlx5st_poll_cq_batch() cannot mistake stale
+  * memory for a completion on the first pass, where the software
+  * ownership bit (cq_ci >> LOG_CQ_SIZE) is 0.
+  */
+ for (i = 0; i < CQ_CQE_CNT; i++) {
+ struct mlx5st_cqe64 *cqe = &dev->cq_buf[i];
+
+ MLX5_SET(cqe64, cqe, opcode, 0xF);
+ MLX5_SET_ONCE(cqe64, cqe, owner, 1);
+ }
+
+ MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
+
+ cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+ MLX5_SET(cqc, cqc, log_cq_size, LOG_CQ_SIZE);
+ MLX5_SET(cqc, cqc, uar_page, dev->uar_page);
+ /* Completion events are delivered to the EQ created earlier. */
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->eqn);
+ /* cqe_sz 0 presumably selects 64-byte CQEs — confirm against PRM. */
+ MLX5_SET(cqc, cqc, cqe_sz, 0);
+ pas = MLX5_ADDR_OF(create_cq_in, in, pas);
+ MLX5_SET(cqc, cqc, page_offset, mlx5st_fill_pas(device, dev->cq_buf, pas));
+ MLX5_SET(cqc, cqc, log_page_size, 0);
+ /* Doorbell record the consumer counter is mirrored to while polling. */
+ MLX5_SET64(cqc, cqc, dbr_addr, to_iova(device, &dev->cq_dbrec));
+
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+
+ dev->cqn = MLX5_GET(create_cq_out, out, cqn);
+ dev->cq_ci = 0;
+ dev_dbg(device, "Created CQ: cqn=%u, %d entries\n", dev->cqn,
+ CQ_CQE_CNT);
+}
+
+/* Issue DESTROY_CQ for the CQ created by mlx5st_create_cq(). */
+static void mlx5st_destroy_cq(struct mlx5st_device *dev)
+{
+ u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {};
+
+ MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ);
+ MLX5_SET(destroy_cq_in, in, cqn, dev->cqn);
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+/*
+ * QP create/destroy
+ */
+
+/*
+ * Create the RC QP used for RDMA Write self-loopback. Both send and
+ * receive completions point at the single CQ; the PAS covers only the
+ * send queue buffer (dev->sq_buf).
+ */
+static void mlx5st_create_qp(struct mlx5st_device *dev)
+{
+ struct vfio_pci_device *device = dev->device;
+ /* See mlx5st_create_cq(): +1 qword presumably for the PAS entry. */
+ u64 in[MLX5_ST_SZ_QW(create_qp_in) + 1] = {};
+ u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+ struct mlx5_ifc_qpc_bits *qpc;
+ __be64 *pas;
+
+ MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+
+ qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ MLX5_SET(qpc, qpc, st, MLX5_QPC_ST_RC);
+ MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+ MLX5_SET(qpc, qpc, pd, dev->pdn);
+ MLX5_SET(qpc, qpc, uar_page, dev->uar_page);
+ /* Send and receive share the one CQ created earlier. */
+ MLX5_SET(qpc, qpc, cqn_snd, dev->cqn);
+ MLX5_SET(qpc, qpc, cqn_rcv, dev->cqn);
+ MLX5_SET(qpc, qpc, log_sq_size, LOG_SQ_SIZE);
+ /* 2^20 matches the 1 MiB max_memcpy_size advertised in mlx5st_init(). */
+ MLX5_SET(qpc, qpc, log_msg_max, 20);
+ /* 0x3 presumably selects a zero-length RQ (no receives posted) —
+  * TODO confirm against mlx5_ifc rq_type values. */
+ MLX5_SET(qpc, qpc, rq_type, 0x3);
+ /* ts_format 1: timestamp format selection — confirm against PRM. */
+ MLX5_SET(qpc, qpc, ts_format, 1);
+ pas = MLX5_ADDR_OF(create_qp_in, in, pas);
+ MLX5_SET(qpc, qpc, page_offset,
+ mlx5st_fill_pas(device, dev->sq_buf, pas));
+ MLX5_SET(qpc, qpc, log_page_size, 0);
+ /* Doorbell record holding the send counter (see post_rdma_write). */
+ MLX5_SET64(qpc, qpc, dbr_addr, to_iova(device, &dev->qp_dbrec));
+
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+
+ dev->qpn = MLX5_GET(create_qp_out, out, qpn);
+ dev->sq_pi = 0;
+ dev_dbg(device, "Created QP: qpn=%u, RC, sq=%d wqes\n", dev->qpn,
+ SQ_WQE_CNT);
+}
+
+/* Issue DESTROY_QP for the QP created by mlx5st_create_qp(). */
+static void mlx5st_destroy_qp(struct mlx5st_device *dev)
+{
+ u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+ MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+ MLX5_SET(destroy_qp_in, in, qpn, dev->qpn);
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+/*
+ * QP state transitions
+ */
+
+/*
+ * RESET -> INIT: bind the QP to port 1 and enable remote read (rre)
+ * and remote write (rwe) so the self-loopback RDMA Writes are allowed.
+ */
+static void mlx5st_qp_rst2init(struct mlx5st_device *dev)
+{
+ u32 out[MLX5_ST_SZ_DW(rst2init_qp_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
+ struct mlx5_ifc_qpc_bits *qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);
+
+ MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
+ MLX5_SET(rst2init_qp_in, in, qpn, dev->qpn);
+
+ MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
+ MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+ MLX5_SET(qpc, qpc, rre, 1);
+ MLX5_SET(qpc, qpc, rwe, 1);
+
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ dev_dbg(dev->device, "QP RST->INIT\n");
+}
+
+/*
+ * INIT -> RTR: set the remote QPN to the QP's own number so it loops
+ * back to itself, with force-loopback (fl=1) on the primary path.
+ */
+static void mlx5st_qp_init2rtr(struct mlx5st_device *dev)
+{
+ u32 out[MLX5_ST_SZ_DW(init2rtr_qp_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
+ struct mlx5_ifc_qpc_bits *qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);
+
+ MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
+ MLX5_SET(init2rtr_qp_in, in, qpn, dev->qpn);
+
+ /* mtu 3 is presumably the IB MTU enum for 1024 bytes — confirm. */
+ MLX5_SET(qpc, qpc, mtu, 3);
+ MLX5_SET(qpc, qpc, log_msg_max, 20);
+ /* Loop back to ourselves. */
+ MLX5_SET(qpc, qpc, remote_qpn, dev->qpn);
+ MLX5_SET(qpc, qpc, min_rnr_nak, 12);
+ MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
+ MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
+
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ dev_dbg(dev->device, "QP INIT->RTR (fl=1)\n");
+}
+
+/*
+ * RTR -> RTS: enable the send side with maximum retry counts; after
+ * this the QP can post the RDMA Writes used by the memcpy ops.
+ */
+static void mlx5st_qp_rtr2rts(struct mlx5st_device *dev)
+{
+ u32 out[MLX5_ST_SZ_DW(rtr2rts_qp_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
+ struct mlx5_ifc_qpc_bits *qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);
+
+ MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
+ MLX5_SET(rtr2rts_qp_in, in, qpn, dev->qpn);
+
+ MLX5_SET(qpc, qpc, log_ack_req_freq, 0);
+ /* 7 is the maximum retry/rnr_retry encoding. */
+ MLX5_SET(qpc, qpc, retry_count, 7);
+ MLX5_SET(qpc, qpc, rnr_retry, 7);
+ /* ack_timeout 14: IB exponential timeout encoding — confirm value. */
+ MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 14);
+
+ mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ dev_dbg(dev->device, "QP RTR->RTS\n");
+}
+
+/*
+ * Post RDMA Write WQE
+ */
+/*
+ * Build one RDMA Write WQE (ctrl + raddr + data segments), advance the
+ * producer index, publish it in the doorbell record, and ring the UAR
+ * doorbell. When @signaled is set the WQE requests a CQE on completion.
+ *
+ * No SQ-full check is done here; presumably callers are bounded by
+ * max_memcpy_count = SQ_WQE_CNT - 1 set in mlx5st_init() — confirm.
+ */
+static void mlx5st_post_rdma_write(struct mlx5st_device *dev, u64 src_addr,
+ u32 src_lkey, u64 dst_addr, u32 dst_rkey,
+ u32 length, bool signaled)
+{
+ struct mlx5st_send_wqe *wqe;
+ unsigned int idx;
+
+ idx = dev->sq_pi % SQ_WQE_CNT;
+ wqe = &dev->sq_buf[idx];
+
+ memset(wqe, 0, sizeof(*wqe));
+ MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, opcode, MLX5_OPCODE_RDMA_WRITE);
+ MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, wqe_index, dev->sq_pi);
+ MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, qp_or_sq, dev->qpn);
+ MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, ds, MLX5_RDMA_WRITE_DS);
+ if (signaled)
+ MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, ce, MLX5_WQE_CE_CQE_ALWAYS);
+
+ /* Remote (destination) address/key of the write. */
+ MLX5_SET64(wqe_raddr_seg, &wqe->raddr, raddr, dst_addr);
+ MLX5_SET(wqe_raddr_seg, &wqe->raddr, rkey, dst_rkey);
+
+ /* Local (source) scatter entry. */
+ MLX5_SET(wqe_data_seg, &wqe->data, byte_count, length);
+ MLX5_SET(wqe_data_seg, &wqe->data, lkey, src_lkey);
+ MLX5_SET64(wqe_data_seg, &wqe->data, addr, src_addr);
+
+ dev->sq_pi++;
+
+ /* Ensure WQE is visible to device before doorbell record */
+ dma_wmb();
+
+ /* Doorbell record carries the low 16 bits of the producer index. */
+ WRITE_ONCE(dev->qp_dbrec.send_counter,
+ cpu_to_be32(dev->sq_pi & 0xffff));
+
+ /*
+  * Ring doorbell: write first 8 bytes of ctrl to UAR BF register,
+  * iowrite has an internal dma_wmb() so the doorbell record will be
+  * visible. The be64_to_cpu()/iowrite64be() pair cancels out, so the
+  * ctrl bytes reach the device exactly as stored in memory.
+  */
+ iowrite64be(be64_to_cpu(*(__be64 *)wqe),
+ (u8 __iomem *)dev->uar_base + dev->uar_bf_offset);
+ /* Toggle between the two BlueFlame buffer copies for the next post. */
+ dev->uar_bf_offset ^= MLX5_BF_SIZE;
+}
+
+/*
+ * Poll CQ
+ */
+/*
+ * Poll up to @max_cqe completions from the CQ.
+ *
+ * Returns the number of successful REQ completions consumed (0 if none
+ * are ready), or -1 on an error CQE or unexpected opcode.
+ */
+static int mlx5st_poll_cq_batch(struct mlx5st_device *dev,
+ unsigned int max_cqe)
+{
+ unsigned int polled = 0;
+
+ while (polled < max_cqe) {
+ unsigned int idx = dev->cq_ci % CQ_CQE_CNT;
+ struct mlx5st_cqe64 *cqe = &dev->cq_buf[idx];
+ u8 owner, opcode;
+
+ /*
+  * A CQE is valid once its owner bit matches the software pass
+  * bit (cq_ci >> LOG_CQ_SIZE); entries start at owner=1, so
+  * nothing is consumed before HW writes the first pass (bit 0).
+  */
+ owner = MLX5_GET_ONCE(cqe64, cqe, owner);
+ if (owner != ((dev->cq_ci >> LOG_CQ_SIZE) & 1))
+ break;
+
+ /* Read the CQE payload only after ownership is confirmed. */
+ dma_rmb();
+
+ opcode = MLX5_GET(cqe64, cqe, opcode);
+
+ /* Release the entry back to HW via the 24-bit doorbell record. */
+ dev->cq_ci++;
+ WRITE_ONCE(dev->cq_dbrec.recv_counter,
+ cpu_to_be32(dev->cq_ci & 0xffffff));
+
+ if (opcode == MLX5_CQE_REQ) {
+ /* Completion covers all WQEs up to wqe_counter. */
+ dev->sq_ci =
+ (u16)(MLX5_GET(cqe64, cqe, wqe_counter) + 1);
+ polled++;
+ continue;
+ }
+ if (opcode == MLX5_CQE_REQ_ERR ||
+ opcode == MLX5_CQE_RESP_ERR) {
+ dev_dbg(dev->device,
+ "CQE error: opcode=0x%x syndrome=0x%x vendor=0x%x\n",
+ opcode,
+ MLX5_GET(cqe64, cqe, error_syndrome.syndrome),
+ MLX5_GET(cqe64, cqe,
+ error_syndrome.vendor_error_syndrome));
+ return -1;
+ }
+ dev_err(dev->device, "CQE unexpected opcode=0x%x\n", opcode);
+ return -1;
+ }
+
+ return polled;
+}
+
+/*
+ * Busy-poll the CQ until one completion arrives or @timeout_ms elapses
+ * (measured via CLOCK_MONOTONIC). EQ events are drained between polls
+ * when an EQ exists.
+ *
+ * Returns 0 when a completion was consumed, -1 on CQE error or timeout.
+ */
+static int mlx5st_poll_cq(struct mlx5st_device *dev, unsigned int timeout_ms)
+{
+ struct timespec start, now;
+ unsigned int elapsed;
+ int ret;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ for (;;) {
+ ret = mlx5st_poll_cq_batch(dev, 1);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 0;
+
+ if (dev->have_eq)
+ mlx5st_process_events(dev);
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ elapsed = (now.tv_sec - start.tv_sec) * 1000 +
+ (now.tv_nsec - start.tv_nsec) / 1000000;
+ if (elapsed > timeout_ms) {
+ dev_err(dev->device, "CQ poll timeout after %u ms\n",
+ timeout_ms);
+ return -1;
+ }
+ }
+}
+
+/*
+ * Data path setup/teardown helpers
+ */
+
+/*
+ * Bring up the data path: CQ first (the QP references its cqn), then
+ * the QP, then walk it RESET -> INIT -> RTR -> RTS.
+ */
+static void mlx5st_setup_datapath(struct mlx5st_device *dev)
+{
+ mlx5st_create_cq(dev);
+ mlx5st_create_qp(dev);
+ mlx5st_qp_rst2init(dev);
+ mlx5st_qp_init2rtr(dev);
+ mlx5st_qp_rtr2rts(dev);
+}
+
+/*
+ * Destroy the QP and CQ (QP first, since it references the CQ) and
+ * reset all ring indices and doorbell records. Safe to call when the
+ * data path was never created: the qpn/cqn checks make it idempotent.
+ */
+static void mlx5st_teardown_datapath(struct mlx5st_device *dev)
+{
+ if (dev->qpn) {
+ mlx5st_destroy_qp(dev);
+ dev->qpn = 0;
+ }
+ if (dev->cqn) {
+ mlx5st_destroy_cq(dev);
+ dev->cqn = 0;
+ }
+ dev->sq_pi = 0;
+ dev->sq_ci = 0;
+ memset(&dev->qp_dbrec, 0, sizeof(dev->qp_dbrec));
+ memset(&dev->cq_dbrec, 0, sizeof(dev->cq_dbrec));
+}
+
+/*
+ * memcpy callbacks
+ */
+
+#define MLX5ST_MEMCPY_TIMEOUT_MS 60000
+
+/*
+ * memcpy_start driver op: post @count RDMA Writes of @size bytes, each
+ * from @src to @dst (the same IOVA range every iteration), using the
+ * device-wide lkey/rkey. Only the final WQE requests a CQE, so
+ * mlx5st_memcpy_wait() needs to reap exactly one completion.
+ */
+static void mlx5st_memcpy_start(struct vfio_pci_device *device,
+ iova_t src, iova_t dst, u64 size, u64 count)
+{
+ struct mlx5st_device *dev = to_mlx5st(device);
+ u64 i;
+
+ for (i = 0; i < count; i++) {
+ bool signaled = (i == count - 1);
+
+ mlx5st_post_rdma_write(dev, src, dev->global_lkey, dst,
+ dev->global_rkey, size, signaled);
+ }
+}
+
+/*
+ * memcpy_wait driver op: block (busy-poll) until the single signaled
+ * completion from mlx5st_memcpy_start() arrives. Returns 0 on success,
+ * non-zero on CQE error or timeout.
+ */
+static int mlx5st_memcpy_wait(struct vfio_pci_device *device)
+{
+ struct mlx5st_device *dev = to_mlx5st(device);
+ int ret;
+
+ ret = mlx5st_poll_cq(dev, MLX5ST_MEMCPY_TIMEOUT_MS);
+ if (ret) {
+ /*
+  * CQE error puts the QP in error state. Rebuild the data path
+  * so subsequent operations can succeed.
+  */
+ mlx5st_teardown_datapath(dev);
+ mlx5st_setup_datapath(dev);
+ }
+ return ret;
+}
+
/*
* Driver ops callbacks
*/
@@ -1368,6 +1716,11 @@ static void mlx5st_init(struct vfio_pci_device *device)
mlx5st_alloc_pd(dev);
mlx5st_create_mkey(dev);
+ mlx5st_setup_datapath(dev);
+
+ device->driver.max_memcpy_size = 1 << 20;
+ device->driver.max_memcpy_count = SQ_WQE_CNT - 1;
+
dev_dbg(device, "mlx5 driver initialized\n");
}
@@ -1375,6 +1728,8 @@ static void mlx5st_remove(struct vfio_pci_device *device)
{
struct mlx5st_device *dev = to_mlx5st(device);
+ mlx5st_teardown_datapath(dev);
+
dev_dbg(device, "teardown: destroy_mkey\n");
if (dev->mkey_index) {
mlx5st_destroy_mkey(dev);
@@ -1400,7 +1755,7 @@ struct vfio_pci_driver_ops mlx5st_ops = {
.probe = mlx5st_probe,
.init = mlx5st_init,
.remove = mlx5st_remove,
- .memcpy_start = NULL,
- .memcpy_wait = NULL,
+ .memcpy_start = mlx5st_memcpy_start,
+ .memcpy_wait = mlx5st_memcpy_wait,
.send_msi = NULL,
};
--
2.43.0
next prev parent reply other threads:[~2026-05-01 0:08 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-01 0:08 [PATCH 00/11] mlx5 support for VFIO self test Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 01/11] net/mlx5: Add IFC structures for CQE and WQE Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 02/11] net/mlx5: Move HW constant groups from device.h/cq.h to mlx5_ifc.h Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 03/11] net/mlx5: Extract MLX5_SET/GET macros into mlx5_ifc_macros.h Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 04/11] net/mlx5: Add ONCE and MMIO accessor variants to mlx5_ifc_macros.h Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 05/11] selftests: Add additional kernel functions to tools/include/ Jason Gunthorpe
2026-05-04 21:48 ` David Matlack
2026-05-05 15:43 ` Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 06/11] selftests: Fix arm64 IO barriers to match kernel Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 07/11] vfio: selftests: Allow drivers to specify required region size Jason Gunthorpe
2026-05-02 8:33 ` Manuel Ebner
2026-05-04 20:55 ` David Matlack
2026-05-05 15:52 ` Jason Gunthorpe
2026-05-05 16:05 ` David Matlack
2026-05-01 0:08 ` [PATCH 08/11] vfio: selftests: Add dev_dbg Jason Gunthorpe
2026-05-04 21:15 ` David Matlack
2026-05-05 15:53 ` Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 09/11] vfio: selftests: Add mlx5 driver - HW init and command interface Jason Gunthorpe
2026-05-02 9:35 ` Manuel Ebner
2026-05-04 22:35 ` David Matlack
2026-05-05 15:45 ` Jason Gunthorpe
2026-05-05 16:03 ` David Matlack
2026-05-01 0:08 ` Jason Gunthorpe [this message]
2026-05-04 22:41 ` [PATCH 10/11] vfio: selftests: Add mlx5 driver - data path and memcpy ops David Matlack
2026-05-05 15:49 ` Jason Gunthorpe
2026-05-01 0:08 ` [PATCH 11/11] vfio: selftests: mlx5 driver - add send_msi support Jason Gunthorpe
2026-05-01 16:11 ` [PATCH 00/11] mlx5 support for VFIO self test David Matlack
2026-05-01 16:43 ` Jason Gunthorpe
2026-05-04 22:54 ` David Matlack
2026-05-05 15:50 ` Jason Gunthorpe
2026-05-05 15:57 ` David Matlack
2026-05-02 4:31 ` Alex Williamson
2026-05-02 13:40 ` Jason Gunthorpe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=10-v1-dc5fa250ca1d+3213-mlx5st_jgg@nvidia.com \
--to=jgg@nvidia.com \
--cc=alex@shazbot.org \
--cc=dmatlack@google.com \
--cc=kvm@vger.kernel.org \
--cc=leon@kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=patches@lists.linux.dev \
--cc=saeedm@nvidia.com \
--cc=shuah@kernel.org \
--cc=tariqt@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox