From: Chuck Lever <cel@kernel.org>
To: Jason Gunthorpe <jgg@nvidia.com>,
Leon Romanovsky <leon@kernel.org>, Christoph Hellwig <hch@lst.de>
Cc: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
<linux-rdma@vger.kernel.org>, <linux-nfs@vger.kernel.org>,
Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
Date: Tue, 20 Jan 2026 09:31:22 -0500 [thread overview]
Message-ID: <20260120143124.1822121-3-cel@kernel.org> (raw)
In-Reply-To: <20260120143124.1822121-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
The bvec RDMA API maps each bvec individually via dma_map_phys(),
requiring an IOTLB sync for each mapping. For large I/O operations
with many bvecs, this overhead becomes significant.
The two-step IOVA API (dma_iova_try_alloc / dma_iova_link /
dma_iova_sync) allocates a contiguous IOVA range upfront, links
all physical pages without IOTLB syncs, then performs a single
sync at the end. This reduces IOTLB flushes from O(n) to O(1).
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 116 +++++++++++++++++++++++++++++++++++
include/rdma/rw.h | 8 +++
2 files changed, 124 insertions(+)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 59f32fecf3df..51f650c4fa8c 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -15,6 +15,7 @@ enum {
RDMA_RW_MULTI_WR,
RDMA_RW_MR,
RDMA_RW_SIG_MR,
+ RDMA_RW_IOVA,
};
static bool rdma_rw_force_mr;
@@ -380,6 +381,93 @@ static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return -ENOMEM;
}
+/*
+ * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
+ * This reduces IOTLB sync overhead by doing one sync at the end instead of
+ * one per bvec, and produces a contiguous DMA address range that can be
+ * described by a single SGE.
+ *
+ * @ctx: context to fill in; on success ctx->iova holds the IOVA state, the
+ * single SGE, the single RDMA WR and the mapped length, ctx->type is
+ * RDMA_RW_IOVA and ctx->nr_ops is 1
+ * @qp: queue pair; supplies the device (qp->pd->device) and the lkey
+ * (qp->pd->local_dma_lkey) used for the SGE
+ * @bvec: bio_vec array to map
+ * @nr_bvec: presumably the number of entries in @bvec; it is not referenced
+ * by this function
+ * @iter: iterator over @bvec; iter->bi_size is taken as the total length.
+ * The walk is done on a local copy, so the caller's iterator is not
+ * advanced here.
+ * @remote_addr: remote address stored in the RDMA WR
+ * @rkey: remote key stored in the RDMA WR
+ * @dir: DMA direction; DMA_TO_DEVICE selects IB_WR_RDMA_WRITE, any other
+ * value selects IB_WR_RDMA_READ
+ *
+ * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
+ * mapping is not available, or another negative error code on failure.
+ * Both -EOPNOTSUPP returns happen before anything has been linked; every
+ * later failure goes through dma_iova_destroy() before returning.
+ */
+static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ const struct bio_vec *bvec, u32 nr_bvec,
+ struct bvec_iter *iter,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct device *dma_dev = dev->dma_device;
+ struct bvec_iter link_iter;
+ struct bio_vec first_bv;
+ size_t total_len, mapped_len = 0;
+ int ret;
+
+ /* Virtual DMA devices lack IOVA allocators */
+ if (ib_uses_virt_dma(dev))
+ return -EOPNOTSUPP;
+
+ total_len = iter->bi_size;
+
+ /* Get the first (possibly offset-adjusted) bvec for starting phys addr */
+ first_bv = mp_bvec_iter_bvec(bvec, *iter);
+
+ /*
+ * Try to allocate contiguous IOVA space for the whole transfer
+ * (total_len bytes); report -EOPNOTSUPP if that is not possible.
+ */
+ if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
+ bvec_phys(&first_bv), total_len))
+ return -EOPNOTSUPP;
+
+ /*
+ * Link all bvecs into the IOVA space. The walk uses a copy of *iter.
+ * mapped_len is the offset passed for the next link and, at the same
+ * time, the number of bytes linked so far, which is what the
+ * out_destroy path hands to dma_iova_destroy(). The running sum is
+ * overflow-checked; an overflow is reported as -EOVERFLOW.
+ */
+ link_iter = *iter;
+ while (link_iter.bi_size) {
+ struct bio_vec bv = mp_bvec_iter_bvec(bvec, link_iter);
+
+ ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
+ mapped_len, bv.bv_len, dir, 0);
+ if (ret)
+ goto out_destroy;
+
+ if (check_add_overflow(mapped_len, bv.bv_len, &mapped_len)) {
+ ret = -EOVERFLOW;
+ goto out_destroy;
+ }
+ bvec_iter_advance(bvec, &link_iter, bv.bv_len);
+ }
+
+ /* Sync the IOTLB once for all linked pages */
+ ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
+ if (ret)
+ goto out_destroy;
+
+ /* Remembered so the destroy path can pass it to dma_iova_destroy() */
+ ctx->iova.mapped_len = mapped_len;
+
+ /* Single SGE covers the entire contiguous IOVA range */
+ ctx->iova.sge.addr = ctx->iova.state.addr;
+ ctx->iova.sge.length = mapped_len;
+ ctx->iova.sge.lkey = qp->pd->local_dma_lkey;
+
+ /* Single WR for the whole transfer */
+ memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
+ ctx->iova.wr.wr.opcode = dir == DMA_TO_DEVICE ?
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ ctx->iova.wr.wr.num_sge = 1;
+ ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
+ ctx->iova.wr.remote_addr = remote_addr;
+ ctx->iova.wr.rkey = rkey;
+
+ ctx->type = RDMA_RW_IOVA;
+ ctx->nr_ops = 1;
+ return 1;
+
+out_destroy:
+ /*
+ * dma_iova_destroy() expects the actual mapped length, not the
+ * total allocation size. It unlinks only the successfully linked
+ * range and frees the entire IOVA allocation.
+ */
+ dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
+ return ret;
+}
+
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@@ -484,6 +572,7 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
{
struct bvec_iter iter;
u32 i, total_len = 0;
+ int ret;
if (nr_bvec == 0 || offset >= bvec[0].bv_len)
return -EINVAL;
@@ -507,6 +596,21 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return rdma_rw_init_single_wr_bvec(ctx, qp, bvec, &iter,
remote_addr, rkey, dir);
+ /*
+ * Try IOVA-based mapping first for multi-bvec transfers.
+ * This reduces IOTLB sync overhead by batching all mappings.
+ */
+ ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvec, nr_bvec, &iter,
+ remote_addr, rkey, dir);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+
+ /* Fallback path requires iterator at initial state */
+ iter.bi_sector = 0;
+ iter.bi_size = total_len;
+ iter.bi_idx = 0;
+ iter.bi_bvec_done = offset;
+
return rdma_rw_init_map_wrs_bvec(ctx, qp, bvec, nr_bvec, &iter,
remote_addr, rkey, dir);
}
@@ -683,6 +787,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
first_wr = &ctx->reg[0].reg_wr.wr;
last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
break;
+ case RDMA_RW_IOVA:
+ first_wr = &ctx->iova.wr.wr;
+ last_wr = &ctx->iova.wr.wr;
+ break;
case RDMA_RW_MULTI_WR:
first_wr = &ctx->map.wrs[0].wr;
last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@@ -757,6 +865,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
break;
case RDMA_RW_SINGLE_WR:
break;
+ case RDMA_RW_IOVA:
+ /* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
+ WARN_ON_ONCE(1);
+ break;
default:
BUG();
break;
@@ -790,6 +902,10 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 i;
switch (ctx->type) {
+ case RDMA_RW_IOVA:
+ dma_iova_destroy(dev->dma_device, &ctx->iova.state,
+ ctx->iova.mapped_len, dir, 0);
+ break;
case RDMA_RW_MULTI_WR:
for (i = 0; i < nr_bvec; i++)
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index 046a8eb57125..2a5f33665d52 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -31,6 +31,14 @@ struct rdma_rw_ctx {
struct ib_rdma_wr *wrs;
} map;
+ /* for IOVA-based mapping of bvecs into contiguous DMA range: */
+ struct {
+ struct dma_iova_state state;
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ size_t mapped_len;
+ } iova;
+
/* for registering multiple WRs: */
struct rdma_rw_reg_ctx {
struct ib_sge sge;
--
2.52.0
next prev parent reply other threads:[~2026-01-20 14:31 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-20 14:31 [PATCH v2 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
2026-01-20 14:31 ` [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
2026-01-21 8:42 ` Christoph Hellwig
2026-01-21 8:48 ` Leon Romanovsky
2026-01-21 8:57 ` Christoph Hellwig
2026-01-21 10:16 ` Leon Romanovsky
2026-01-21 8:56 ` Christoph Hellwig
2026-01-21 14:14 ` Chuck Lever
2026-01-21 14:57 ` Christoph Hellwig
2026-01-21 15:10 ` Chuck Lever
2026-01-20 14:31 ` Chuck Lever [this message]
2026-01-21 8:51 ` [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations Christoph Hellwig
2026-01-20 14:31 ` [PATCH v2 3/4] RDMA/core: add MR support for bvec-based " Chuck Lever
2026-01-21 9:05 ` Christoph Hellwig
2026-01-20 14:31 ` [PATCH v2 4/4] svcrdma: use bvec-based RDMA read/write API Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260120143124.1822121-3-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=hch@lst.de \
--cc=jgg@nvidia.com \
--cc=jlayton@kernel.org \
--cc=leon@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=neilb@ownmail.net \
--cc=okorniev@redhat.com \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.