From: Chuck Lever <cel@kernel.org>
To: Jason Gunthorpe <jgg@nvidia.com>,
Leon Romanovsky <leon@kernel.org>, Christoph Hellwig <hch@lst.de>
Cc: <linux-rdma@vger.kernel.org>, <linux-nfs@vger.kernel.org>,
NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v1 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
Date: Wed, 14 Jan 2026 09:39:46 -0500 [thread overview]
Message-ID: <20260114143948.3946615-3-cel@kernel.org> (raw)
In-Reply-To: <20260114143948.3946615-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
The bvec RDMA API maps each bvec individually via dma_map_phys(),
requiring an IOTLB sync for each mapping. For large I/O operations
with many bvecs, this overhead becomes significant.
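The two-step IOVA API (dma_iova_try_alloc/dma_iova_link/
dma_iova_sync) allocates a contiguous IOVA range upfront, links
all physical pages without IOTLB syncs, then performs a single
sync at the end. This reduces IOTLB flushes from O(n) to O(1).
Multi-bvec transfers try this path first; when it is unavailable
(-EOPNOTSUPP), they fall back to the existing per-bvec mapping.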
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 153 +++++++++++++++++++++++++++++++++++
include/rdma/rw.h | 8 ++
2 files changed, 161 insertions(+)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 42215c2ff42b..36038e5f9197 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -14,6 +14,7 @@ enum {
RDMA_RW_MULTI_WR,
RDMA_RW_MR,
RDMA_RW_SIG_MR,
+ RDMA_RW_IOVA,
};
static bool rdma_rw_force_mr;
@@ -392,6 +393,137 @@ static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return -ENOMEM;
}
+/*
+ * Map all bvecs into one contiguous IOVA range with the two-step IOVA API:
+ * link every bvec, then sync the IOTLB once rather than once per bvec. Builds
+ * DIV_ROUND_UP(nr_bvec, max_sge) chained WRs; sets ctx->type to RDMA_RW_IOVA.
+ *
+ * Returns the number of WRs on success, -EOPNOTSUPP if ib_uses_virt_dma() is
+ * true or dma_iova_try_alloc() fails, or another negative errno on failure.
+ */
+static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ const struct bio_vec *bvec, u32 nr_bvec, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct device *dma_dev = dev->dma_device;
+ u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+ qp->max_read_sge;
+ struct ib_sge *sge;
+ size_t total_len = 0, mapped_len = 0;
+ u32 i, j, bvec_idx = 0;
+ int ret;
+
+ /* Virtual DMA devices don't support IOVA mapping; caller falls back */
+ if (ib_uses_virt_dma(dev))
+ return -EOPNOTSUPP;
+
+ if (!max_sge)
+ return -EINVAL;
+
+ /* Total transfer length; @offset trims only the first bvec */
+ for (i = 0; i < nr_bvec; i++) {
+ size_t len = (i == 0 && offset) ?
+ bvec[i].bv_len - offset : bvec[i].bv_len;
+
+ if (check_add_overflow(total_len, len, &total_len))
+ return -EINVAL;
+ }
+
+ /* Reserve one IOVA range for the whole transfer, else fall back */
+ if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
+ bvec_phys(&bvec[0]) + offset, total_len))
+ return -EOPNOTSUPP;
+
+ ctx->nr_ops = DIV_ROUND_UP(nr_bvec, max_sge);
+
+ ctx->iova.sges = sge = kcalloc(nr_bvec, sizeof(*sge), GFP_KERNEL);
+ if (!ctx->iova.sges) {
+ ret = -ENOMEM;
+ goto out_free_iova;
+ }
+
+ ctx->iova.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->iova.wrs), GFP_KERNEL);
+ if (!ctx->iova.wrs) {
+ ret = -ENOMEM;
+ goto out_free_sges;
+ }
+
+ /* Link each bvec at its running byte offset (mapped_len) in the range */
+ for (i = 0; i < nr_bvec; i++) {
+ const struct bio_vec *bv = &bvec[i];
+ phys_addr_t phys = bvec_phys(bv);
+ size_t len = bv->bv_len;
+
+ if (i == 0 && offset) {
+ phys += offset;
+ len -= offset;
+ }
+
+ ret = dma_iova_link(dma_dev, &ctx->iova.state, phys,
+ mapped_len, len, dir, 0);
+ if (ret)
+ goto out_destroy;
+
+ mapped_len += len;
+ }
+
+ /* One IOTLB sync covers everything linked above */
+ ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
+ if (ret)
+ goto out_destroy;
+
+ ctx->iova.mapped_len = mapped_len;
+
+ /* Build up to max_sge SGEs per WR; mapped_len restarts as the byte cursor */
+ mapped_len = 0;
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct ib_rdma_wr *rdma_wr = &ctx->iova.wrs[i];
+ u32 nr_sge = min(nr_bvec - bvec_idx, max_sge);
+
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->remote_addr = remote_addr + mapped_len;
+ rdma_wr->rkey = rkey;
+ rdma_wr->wr.num_sge = nr_sge;
+ rdma_wr->wr.sg_list = sge;
+
+ for (j = 0; j < nr_sge; j++, bvec_idx++) {
+ const struct bio_vec *bv = &bvec[bvec_idx];
+ size_t len = bv->bv_len;
+
+ if (bvec_idx == 0 && offset)
+ len -= offset;
+
+ sge->addr = ctx->iova.state.addr + mapped_len;
+ sge->length = len;
+ sge->lkey = qp->pd->local_dma_lkey;
+
+ mapped_len += len;
+ sge++;
+ }
+
+ rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+ &ctx->iova.wrs[i + 1].wr : NULL;
+ }
+
+ ctx->type = RDMA_RW_IOVA;
+ return ctx->nr_ops;
+
+out_destroy: /* reached only from link/sync failure; mapped_len = bytes linked */
+ dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
+ kfree(ctx->iova.wrs);
+ kfree(ctx->iova.sges);
+ return ret;
+out_free_sges:
+ kfree(ctx->iova.sges);
+out_free_iova:
+ dma_iova_free(dma_dev, &ctx->iova.state);
+ return ret;
+}
+
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@@ -486,6 +618,8 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 offset, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
+ int ret;
+
if (nr_bvec == 0 || offset > bvec[0].bv_len)
return -EINVAL;
@@ -497,6 +631,15 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return rdma_rw_init_single_wr_bvec(ctx, qp, bvec, offset,
remote_addr, rkey, dir);
+ /*
+ * Try IOVA-based mapping first for multi-bvec transfers.
+ * This reduces IOTLB sync overhead by batching all mappings.
+ */
+ ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvec, nr_bvec, offset,
+ remote_addr, rkey, dir);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+
return rdma_rw_init_map_wrs_bvec(ctx, qp, bvec, nr_bvec, offset,
remote_addr, rkey, dir);
}
@@ -673,6 +816,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
first_wr = &ctx->reg[0].reg_wr.wr;
last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
break;
+ case RDMA_RW_IOVA:
+ first_wr = &ctx->iova.wrs[0].wr;
+ last_wr = &ctx->iova.wrs[ctx->nr_ops - 1].wr;
+ break;
case RDMA_RW_MULTI_WR:
first_wr = &ctx->map.wrs[0].wr;
last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@@ -774,6 +921,12 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 i;
switch (ctx->type) {
+ case RDMA_RW_IOVA:
+ dma_iova_destroy(dev->dma_device, &ctx->iova.state,
+ ctx->iova.mapped_len, dir, 0);
+ kfree(ctx->iova.wrs);
+ kfree(ctx->iova.sges);
+ break;
case RDMA_RW_MULTI_WR:
for (i = 0; i < nr_bvec; i++)
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index 046a8eb57125..8a2012f03667 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -31,6 +31,14 @@ struct rdma_rw_ctx {
struct ib_rdma_wr *wrs;
} map;
+ /* for IOVA-based mapping of multiple bvecs: */
+ struct {
+ struct dma_iova_state state;
+ struct ib_sge *sges;
+ struct ib_rdma_wr *wrs;
+ size_t mapped_len;
+ } iova;
+
/* for registering multiple WRs: */
struct rdma_rw_reg_ctx {
struct ib_sge sge;
--
2.52.0
next prev parent reply other threads:[~2026-01-14 14:39 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-14 14:39 [PATCH v1 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
2026-01-14 14:39 ` [PATCH v1 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
2026-01-15 15:53 ` Christoph Hellwig
2026-01-16 11:33 ` Leon Romanovsky
2026-01-16 14:52 ` Christoph Hellwig
2026-01-16 14:57 ` Chuck Lever
2026-01-16 21:14 ` Leon Romanovsky
2026-01-16 21:24 ` Leon Romanovsky
2026-01-16 21:49 ` Chuck Lever
2026-01-17 16:20 ` Leon Romanovsky
2026-01-19 6:52 ` Christoph Hellwig
2026-01-19 10:28 ` Leon Romanovsky
2026-01-19 12:03 ` Christoph Hellwig
2026-01-19 14:37 ` Chuck Lever
2026-01-19 18:34 ` Leon Romanovsky
2026-01-14 14:39 ` Chuck Lever [this message]
2026-01-15 15:58 ` [PATCH v1 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations Christoph Hellwig
2026-01-14 14:39 ` [PATCH v1 3/4] RDMA/core: add MR support for bvec-based " Chuck Lever
2026-01-15 15:58 ` Christoph Hellwig
2026-01-16 11:42 ` Leon Romanovsky
2026-01-16 14:50 ` Christoph Hellwig
2026-01-16 21:16 ` Leon Romanovsky
2026-01-14 14:39 ` [PATCH v1 4/4] svcrdma: use bvec-based RDMA read/write API Chuck Lever
2026-01-15 9:51 ` Leon Romanovsky
2026-01-15 16:29 ` Christoph Hellwig
2026-01-15 18:29 ` Chuck Lever
2026-01-15 21:53 ` Chuck Lever
2026-01-16 9:38 ` Christoph Hellwig
2026-01-15 9:50 ` [PATCH v1 0/4] Add a bio_vec based API to core/rw.c Leon Romanovsky
2026-01-15 15:46 ` Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260114143948.3946615-3-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=hch@lst.de \
--cc=jgg@nvidia.com \
--cc=jlayton@kernel.org \
--cc=leon@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=neilb@ownmail.net \
--cc=okorniev@redhat.com \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.