From: Chuck Lever <cel@kernel.org>
To: Leon Romanovsky <leon@kernel.org>, Christoph Hellwig <hch@lst.de>,
Jason Gunthorpe <jgg@nvidia.com>
Cc: <linux-rdma@vger.kernel.org>, <linux-nfs@vger.kernel.org>,
Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v5 2/5] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
Date: Tue, 27 Jan 2026 19:53:57 -0500 [thread overview]
Message-ID: <20260128005400.25147-3-cel@kernel.org> (raw)
In-Reply-To: <20260128005400.25147-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
The bvec RDMA API maps each bvec individually via dma_map_phys(),
requiring an IOTLB sync for each mapping. For large I/O operations
with many bvecs, this overhead becomes significant.
The two-step IOVA API (dma_iova_try_alloc / dma_iova_link /
dma_iova_sync) allocates a contiguous IOVA range upfront, links
all physical pages without IOTLB syncs, then performs a single
sync at the end. This reduces IOTLB flushes from O(n) to O(1).
It also requires only a single output dma_addr_t, rather than the extra
per-input-element storage that struct scatterlist needs.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 106 +++++++++++++++++++++++++++++++++++
include/rdma/rw.h | 8 +++
2 files changed, 114 insertions(+)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 39ca21d18d7b..c2fc8cba972e 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -14,6 +14,7 @@ enum {
RDMA_RW_MULTI_WR,
RDMA_RW_MR,
RDMA_RW_SIG_MR,
+ RDMA_RW_IOVA,
};
static bool rdma_rw_force_mr;
@@ -383,6 +384,87 @@ static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return -ENOMEM;
}
+/*
+ * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
+ * This reduces IOTLB sync overhead by doing one sync at the end instead of
+ * one per bvec, and produces a contiguous DMA address range that can be
+ * described by a single SGE.
+ *
+ * @ctx: context to initialize. On success ctx->iova holds the IOVA state,
+ *       the single SGE, the single RDMA WR and the mapped length, and
+ *       ctx->type is set to RDMA_RW_IOVA with ctx->nr_ops == 1.
+ * @qp: queue pair; supplies the device and the local_dma_lkey of its PD
+ * @bvec: bio_vec array to map
+ * @iter: iterator over @bvec; iter->bi_size on entry is the number of bytes
+ *        to map. The iterator is advanced as each bvec is linked, so it is
+ *        modified once the link loop has started.
+ * @remote_addr: remote address stored in the WR
+ * @rkey: remote key stored in the WR
+ * @dir: DMA direction; DMA_TO_DEVICE selects an RDMA WRITE, any other
+ *       value selects an RDMA READ
+ *
+ * The mapped length saved in ctx->iova.mapped_len is what
+ * rdma_rw_ctx_destroy_bvec() later hands to dma_iova_destroy().
+ * rdma_rw_ctx_destroy() does not unmap RDMA_RW_IOVA contexts; it only
+ * warns and returns.
+ *
+ * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
+ * mapping is not available, or another negative error code on failure.
+ *
+ * The two explicit -EOPNOTSUPP returns below happen before @iter is
+ * advanced and before anything is linked, so the caller can fall back to
+ * another mapping strategy with the iterator unchanged.
+ * NOTE(review): the caller falls back on any -EOPNOTSUPP return. Confirm
+ * that dma_iova_link() and dma_iova_sync() never return that code, since
+ * by then @iter has already been advanced.
+ */
+static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
+ struct ib_qp *qp, const struct bio_vec *bvec,
+ struct bvec_iter *iter, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct device *dma_dev = dev->dma_device;
+ size_t total_len = iter->bi_size;
+ struct bio_vec first_bv;
+ size_t mapped_len = 0;
+ int ret;
+
+ /* Virtual DMA devices cannot support IOVA allocators */
+ if (ib_uses_virt_dma(dev))
+ return -EOPNOTSUPP;
+
+ /*
+ * Try to allocate contiguous IOVA space for the whole transfer: the
+ * request is sized by total_len and is given the physical address of
+ * the first bvec. Nothing has been linked yet, so a refusal needs no
+ * cleanup.
+ */
+ first_bv = mp_bvec_iter_bvec(bvec, *iter);
+ if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
+ bvec_phys(&first_bv), total_len))
+ return -EOPNOTSUPP;
+
+ /*
+ * Link all bvecs into the IOVA space. The running mapped_len is
+ * passed as the offset for each bvec, so the bvecs are placed back
+ * to back in iteration order. The loop ends when the iterator has
+ * no bytes left.
+ */
+ while (iter->bi_size) {
+ struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);
+
+ ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
+ mapped_len, bv.bv_len, dir, 0);
+ if (ret)
+ goto out_destroy;
+
+ mapped_len += bv.bv_len;
+ bvec_iter_advance(bvec, iter, bv.bv_len);
+ }
+
+ /* Sync the IOTLB once for all linked pages */
+ ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
+ if (ret)
+ goto out_destroy;
+
+ ctx->iova.mapped_len = mapped_len;
+
+ /* Single SGE covers the entire contiguous IOVA range */
+ ctx->iova.sge.addr = ctx->iova.state.addr;
+ ctx->iova.sge.length = mapped_len;
+ ctx->iova.sge.lkey = qp->pd->local_dma_lkey;
+
+ /*
+ * Single WR for the whole transfer. The WR is zeroed first, so every
+ * field not assigned below is zero.
+ */
+ memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
+ if (dir == DMA_TO_DEVICE)
+ ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
+ ctx->iova.wr.wr.num_sge = 1;
+ ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
+ ctx->iova.wr.remote_addr = remote_addr;
+ ctx->iova.wr.rkey = rkey;
+
+ ctx->type = RDMA_RW_IOVA;
+ ctx->nr_ops = 1;
+ return 1;
+
+out_destroy:
+ /*
+ * dma_iova_destroy() expects the actual mapped length, not the
+ * total allocation size. It unlinks only the successfully linked
+ * range and frees the entire IOVA allocation.
+ *
+ * mapped_len counts only the bvecs whose link succeeded, so it is
+ * 0 when the very first link fails.
+ */
+ dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
+ return ret;
+}
+
+
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@@ -485,6 +567,8 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct bvec_iter iter, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
+ int ret;
+
if (nr_bvec == 0 || iter.bi_size == 0)
return -EINVAL;
@@ -495,6 +579,16 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
if (nr_bvec == 1)
return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
remote_addr, rkey, dir);
+
+ /*
+ * Try IOVA-based mapping first for multi-bvec transfers.
+ * This reduces IOTLB sync overhead by batching all mappings.
+ */
+ ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
+ rkey, dir);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+
return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
remote_addr, rkey, dir);
}
@@ -671,6 +765,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
first_wr = &ctx->reg[0].reg_wr.wr;
last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
break;
+ case RDMA_RW_IOVA:
+ first_wr = &ctx->iova.wr.wr;
+ last_wr = &ctx->iova.wr.wr;
+ break;
case RDMA_RW_MULTI_WR:
first_wr = &ctx->map.wrs[0].wr;
last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@@ -745,6 +843,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
break;
case RDMA_RW_SINGLE_WR:
break;
+ case RDMA_RW_IOVA:
+ /* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
+ WARN_ON_ONCE(1);
+ return;
default:
BUG();
break;
@@ -778,6 +880,10 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 i;
switch (ctx->type) {
+ case RDMA_RW_IOVA:
+ dma_iova_destroy(dev->dma_device, &ctx->iova.state,
+ ctx->iova.mapped_len, dir, 0);
+ break;
case RDMA_RW_MULTI_WR:
for (i = 0; i < nr_bvec; i++)
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index b2fc3e2373d7..205e16ed6cd8 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -32,6 +32,14 @@ struct rdma_rw_ctx {
struct ib_rdma_wr *wrs;
} map;
+ /* for IOVA-based mapping of bvecs into contiguous DMA range: */
+ struct {
+ struct dma_iova_state state;
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ size_t mapped_len;
+ } iova;
+
/* for registering multiple WRs: */
struct rdma_rw_reg_ctx {
struct ib_sge sge;
--
2.49.0
next prev parent reply other threads:[~2026-01-28 0:54 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-28 0:53 [PATCH v5 0/5] Add a bio_vec based API to core/rw.c Chuck Lever
2026-01-28 0:53 ` [PATCH v5 1/5] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
2026-01-28 0:53 ` Chuck Lever [this message]
2026-01-28 0:53 ` [PATCH v5 3/5] RDMA/core: add MR support for bvec-based RDMA operations Chuck Lever
2026-01-28 3:36 ` Christoph Hellwig
2026-01-28 0:53 ` [PATCH v5 4/5] RDMA/core: add rdma_rw_max_sge() helper for SQ sizing Chuck Lever
2026-01-28 0:54 ` [PATCH v5 5/5] svcrdma: use bvec-based RDMA read/write API Chuck Lever
2026-01-28 3:36 ` Christoph Hellwig
2026-01-28 12:18 ` [PATCH v5 0/5] Add a bio_vec based API to core/rw.c Leon Romanovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260128005400.25147-3-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=hch@lst.de \
--cc=jgg@nvidia.com \
--cc=leon@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.