From: Chuck Lever <cel@kernel.org>
To: Jason Gunthorpe <jgg@nvidia.com>,
Leon Romanovsky <leon@kernel.org>, Christoph Hellwig <hch@lst.de>
Cc: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
<linux-rdma@vger.kernel.org>, <linux-nfs@vger.kernel.org>,
Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v2 3/4] RDMA/core: add MR support for bvec-based RDMA operations
Date: Tue, 20 Jan 2026 09:31:23 -0500 [thread overview]
Message-ID: <20260120143124.1822121-4-cel@kernel.org> (raw)
In-Reply-To: <20260120143124.1822121-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
The bvec-based RDMA API currently returns -EOPNOTSUPP when Memory
Region registration is required. This prevents iWARP devices from
using the bvec path, since iWARP requires MR registration for RDMA
READ operations. The force_mr debug parameter is also unusable with
bvec input.
Add rdma_rw_init_mr_wrs_bvec() to handle MR registration for bvec
arrays. The approach creates a synthetic scatterlist populated with
DMA addresses from the bvecs, then reuses the existing ib_map_mr_sg()
infrastructure. This avoids driver changes while keeping the
implementation small.
The synthetic scatterlist is stored in the rdma_rw_ctx for cleanup.
On destroy, the MRs are returned to the pool and the bvec DMA
mappings are released using the stored addresses.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 159 ++++++++++++++++++++++++++++++++---
include/rdma/rw.h | 8 ++
2 files changed, 156 insertions(+), 11 deletions(-)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 51f650c4fa8c..9181fca8ff3f 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -194,6 +194,135 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return ret;
}
+static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, const struct bio_vec *bvec, u32 nr_bvec,
+ u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct rdma_rw_reg_ctx *prev = NULL;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
+ struct scatterlist *sgl;
+ int i, j, ret = 0, count = 0;
+ u32 sg_idx = 0;
+
+ ctx->nr_ops = DIV_ROUND_UP(nr_bvec, pages_per_mr);
+ ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg)
+ return -ENOMEM;
+
+ /*
+ * Allocate synthetic scatterlist to hold DMA addresses.
+ * ib_map_mr_sg() extracts sg_dma_address/len, so the page
+ * pointer is unused.
+ */
+ sgl = kmalloc_array(nr_bvec, sizeof(*sgl), GFP_KERNEL);
+ if (!sgl) {
+ ret = -ENOMEM;
+ goto out_free_reg;
+ }
+ sg_init_table(sgl, nr_bvec);
+
+ for (i = 0; i < nr_bvec; i++) {
+ const struct bio_vec *bv = &bvec[i];
+ struct bio_vec adjusted;
+ u64 dma_addr;
+ u32 len;
+
+ /*
+ * The offset parameter applies only to the first bvec,
+ * allowing callers to start partway into the array.
+ */
+ if (i == 0 && offset) {
+ adjusted = *bv;
+ adjusted.bv_offset += offset;
+ adjusted.bv_len -= offset;
+ bv = &adjusted;
+ }
+ len = bv->bv_len;
+
+ dma_addr = ib_dma_map_bvec(dev, bv, dir);
+ if (ib_dma_mapping_error(dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ /* sg_set_page() initializes the entry; ib_map_mr_sg() uses
+ * only sg_dma_address/len, ignoring the page pointer.
+ */
+ sg_set_page(&sgl[i], bv->bv_page, len, bv->bv_offset);
+ sg_dma_address(&sgl[i]) = dma_addr;
+ sg_dma_len(&sgl[i]) = len;
+ }
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+ u32 nents = min(nr_bvec - sg_idx, pages_per_mr);
+
+ ret = rdma_rw_init_one_mr(qp, port_num, reg, &sgl[sg_idx],
+ nents, 0);
+ if (ret < 0)
+ goto out_free_mrs;
+ count += ret;
+
+ if (prev) {
+ if (reg->mr->need_inval)
+ prev->wr.wr.next = &reg->inv_wr;
+ else
+ prev->wr.wr.next = &reg->reg_wr.wr;
+ }
+
+ reg->reg_wr.wr.next = &reg->wr.wr;
+
+ reg->wr.wr.sg_list = &reg->sge;
+ reg->wr.wr.num_sge = 1;
+ reg->wr.remote_addr = remote_addr;
+ reg->wr.rkey = rkey;
+
+ if (dir == DMA_TO_DEVICE) {
+ reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+ } else if (!rdma_cap_read_inv(qp->device, port_num)) {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ;
+ } else {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+ }
+ count++;
+
+ remote_addr += reg->sge.length;
+ sg_idx += nents;
+ prev = reg;
+ }
+
+ if (prev)
+ prev->wr.wr.next = NULL;
+
+ ctx->type = RDMA_RW_MR;
+ ctx->mr_sgl = sgl;
+ ctx->mr_sg_cnt = nr_bvec;
+ return count;
+
+out_free_mrs:
+ while (--i >= 0)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ for (j = 0; j < nr_bvec; j++)
+ ib_dma_unmap_bvec(dev, sg_dma_address(&sgl[j]),
+ sg_dma_len(&sgl[j]), dir);
+ kfree(sgl);
+ kfree(ctx->reg);
+ return ret;
+
+out_unmap:
+ /* Unmap bvecs that were successfully mapped (0 through i-1) */
+ for (j = 0; j < i; j++)
+ ib_dma_unmap_bvec(dev, sg_dma_address(&sgl[j]),
+ sg_dma_len(&sgl[j]), dir);
+ kfree(sgl);
+out_free_reg:
+ kfree(ctx->reg);
+ return ret;
+}
+
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 sg_cnt, u32 offset,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@@ -550,19 +679,13 @@ EXPORT_SYMBOL(rdma_rw_ctx_init);
* @rkey: remote key to operate on
* @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
*
- * Accepts bio_vec arrays directly, avoiding scatterlist conversion for
- * callers that already have data in bio_vec form. Prefer this over
- * rdma_rw_ctx_init() when the source data is a bio_vec array.
- *
- * This function does not support devices requiring memory registration.
- * iWARP devices and configurations with force_mr=1 should use
- * rdma_rw_ctx_init() with a scatterlist instead.
+ * Maps the bio_vec array directly, avoiding intermediate scatterlist
+ * conversion. Supports MR registration for iWARP devices and force_mr mode.
*
* Returns the number of WQEs that will be needed on the workqueue if
* successful, or a negative error code:
*
* * -EINVAL - @nr_bvec is zero, @offset exceeds first bvec, or overflow
- * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1)
* * -ENOMEM - DMA mapping or memory allocation failed
*/
int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
@@ -570,6 +693,7 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 offset, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
+ struct ib_device *dev = qp->pd->device;
struct bvec_iter iter;
u32 i, total_len = 0;
int ret;
@@ -577,9 +701,10 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
if (nr_bvec == 0 || offset >= bvec[0].bv_len)
return -EINVAL;
- /* MR path not supported for bvec - reject iWARP and force_mr */
- if (rdma_rw_io_needs_mr(qp->device, port_num, dir, nr_bvec))
- return -EOPNOTSUPP;
+ if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
+ return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvec,
+ nr_bvec, offset, remote_addr,
+ rkey, dir);
for (i = 0; i < nr_bvec; i++) {
if (check_add_overflow(total_len, bvec[i].bv_len, &total_len))
@@ -855,6 +980,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
switch (ctx->type) {
case RDMA_RW_MR:
+ /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
+ WARN_ON_ONCE(ctx->mr_sgl);
for (i = 0; i < ctx->nr_ops; i++)
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
kfree(ctx->reg);
@@ -902,6 +1029,16 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 i;
switch (ctx->type) {
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+ /* DMA addresses were stored in mr_sgl during init */
+ for (i = 0; i < ctx->mr_sg_cnt; i++)
+ ib_dma_unmap_bvec(dev, sg_dma_address(&ctx->mr_sgl[i]),
+ sg_dma_len(&ctx->mr_sgl[i]), dir);
+ kfree(ctx->mr_sgl);
+ break;
case RDMA_RW_IOVA:
dma_iova_destroy(dev->dma_device, &ctx->iova.state,
ctx->iova.mapped_len, dir, 0);
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index 2a5f33665d52..01177fd09eae 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -48,6 +48,14 @@ struct rdma_rw_ctx {
struct ib_mr *mr;
} *reg;
};
+
+ /*
+ * For bvec MR path: store synthetic scatterlist with DMA addresses
+ * for cleanup. Only valid when type == RDMA_RW_MR and initialized
+ * via rdma_rw_ctx_init_bvec().
+ */
+ struct scatterlist *mr_sgl;
+ u32 mr_sg_cnt;
};
int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
--
2.52.0
next prev parent reply other threads:[~2026-01-20 14:31 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-20 14:31 [PATCH v2 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
2026-01-20 14:31 ` [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
2026-01-21 8:42 ` Christoph Hellwig
2026-01-21 8:48 ` Leon Romanovsky
2026-01-21 8:57 ` Christoph Hellwig
2026-01-21 10:16 ` Leon Romanovsky
2026-01-21 8:56 ` Christoph Hellwig
2026-01-21 14:14 ` Chuck Lever
2026-01-21 14:57 ` Christoph Hellwig
2026-01-21 15:10 ` Chuck Lever
2026-01-20 14:31 ` [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations Chuck Lever
2026-01-21 8:51 ` Christoph Hellwig
2026-01-20 14:31 ` Chuck Lever [this message]
2026-01-21 9:05 ` [PATCH v2 3/4] RDMA/core: add MR support for bvec-based " Christoph Hellwig
2026-01-20 14:31 ` [PATCH v2 4/4] svcrdma: use bvec-based RDMA read/write API Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260120143124.1822121-4-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=hch@lst.de \
--cc=jgg@nvidia.com \
--cc=jlayton@kernel.org \
--cc=leon@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=neilb@ownmail.net \
--cc=okorniev@redhat.com \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.