* [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-20 14:31 [PATCH v2 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
@ 2026-01-20 14:31 ` Chuck Lever
2026-01-21 8:42 ` Christoph Hellwig
2026-01-21 8:56 ` Christoph Hellwig
2026-01-20 14:31 ` [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations Chuck Lever
` (2 subsequent siblings)
3 siblings, 2 replies; 15+ messages in thread
From: Chuck Lever @ 2026-01-20 14:31 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-rdma, linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
The existing rdma_rw_ctx_init() API requires callers to construct a
scatterlist, which is then DMA-mapped page by page. Callers that
already have data in bio_vec form (such as the NVMe-oF target) must
first convert to scatterlist, adding overhead and complexity.
Introduce rdma_rw_ctx_init_bvec() and rdma_rw_ctx_destroy_bvec() to
accept bio_vec arrays directly. The new helpers use dma_map_phys()
for hardware RDMA devices and virtual addressing for software RDMA
devices (rxe, siw), avoiding intermediate scatterlist construction.
Memory registration (MR) path support is deferred to a follow-up
series; callers requiring MR-based transfers (iWARP devices or
force_mr=1) receive -EOPNOTSUPP and should use the scatterlist API.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 210 +++++++++++++++++++++++++++++++++++
include/rdma/ib_verbs.h | 42 +++++++
include/rdma/rw.h | 10 ++
3 files changed, 262 insertions(+)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 6354ddf2a274..59f32fecf3df 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -4,6 +4,7 @@
*/
#include <linux/memremap.h>
#include <linux/moduleparam.h>
+#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/pci-p2pdma.h>
#include <rdma/mr_pool.h>
@@ -274,6 +275,111 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return 1;
}
+static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
+ struct ib_qp *qp, const struct bio_vec *bvec,
+ struct bvec_iter *iter,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+ struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);
+ u64 dma_addr;
+
+ ctx->nr_ops = 1;
+
+ dma_addr = ib_dma_map_bvec(dev, &bv, dir);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ return -ENOMEM;
+
+ ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+ ctx->single.sge.addr = dma_addr;
+ ctx->single.sge.length = bv.bv_len;
+
+ memset(rdma_wr, 0, sizeof(*rdma_wr));
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->wr.sg_list = &ctx->single.sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+
+ ctx->type = RDMA_RW_SINGLE_WR;
+ return 1;
+}
+
+static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ const struct bio_vec *bvec, u32 nr_bvec,
+ struct bvec_iter *iter,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+ qp->max_read_sge;
+ struct ib_sge *sge;
+ u32 total_len = 0, i, j;
+ u32 mapped_bvecs = 0;
+ u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge);
+
+ ctx->map.sges = sge = kcalloc(nr_bvec, sizeof(*sge), GFP_KERNEL);
+ if (!ctx->map.sges)
+ return -ENOMEM;
+
+ ctx->map.wrs = kcalloc(nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+ if (!ctx->map.wrs)
+ goto out_free_sges;
+
+ for (i = 0; i < nr_ops; i++) {
+ struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+ u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge);
+
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->remote_addr = remote_addr + total_len;
+ rdma_wr->rkey = rkey;
+ rdma_wr->wr.num_sge = nr_sge;
+ rdma_wr->wr.sg_list = sge;
+
+ for (j = 0; j < nr_sge; j++) {
+ struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);
+ u64 dma_addr;
+
+ dma_addr = ib_dma_map_bvec(dev, &bv, dir);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ goto out_unmap;
+
+ mapped_bvecs++;
+ sge->addr = dma_addr;
+ sge->length = bv.bv_len;
+ sge->lkey = qp->pd->local_dma_lkey;
+
+ total_len += bv.bv_len;
+ sge++;
+
+ bvec_iter_advance(bvec, iter, bv.bv_len);
+ }
+
+ rdma_wr->wr.next = i + 1 < nr_ops ?
+ &ctx->map.wrs[i + 1].wr : NULL;
+ }
+
+ ctx->nr_ops = nr_ops;
+ ctx->type = RDMA_RW_MULTI_WR;
+ return nr_ops;
+
+out_unmap:
+ for (i = 0; i < mapped_bvecs; i++)
+ ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
+ ctx->map.sges[i].length, dir);
+ kfree(ctx->map.wrs);
+out_free_sges:
+ kfree(ctx->map.sges);
+ return -ENOMEM;
+}
+
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@@ -344,6 +450,68 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
}
EXPORT_SYMBOL(rdma_rw_ctx_init);
+/**
+ * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @bvec: bio_vec array to READ/WRITE from/to
+ * @nr_bvec: number of entries in @bvec
+ * @offset: byte offset into first bvec
+ * @remote_addr: remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Accepts bio_vec arrays directly, avoiding scatterlist conversion for
+ * callers that already have data in bio_vec form. Prefer this over
+ * rdma_rw_ctx_init() when the source data is a bio_vec array.
+ *
+ * This function does not support devices requiring memory registration.
+ * iWARP devices and configurations with force_mr=1 should use
+ * rdma_rw_ctx_init() with a scatterlist instead.
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code:
+ *
+ * * -EINVAL - @nr_bvec is zero, @offset exceeds first bvec, or overflow
+ * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1)
+ * * -ENOMEM - DMA mapping or memory allocation failed
+ */
+int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, const struct bio_vec *bvec, u32 nr_bvec,
+ u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct bvec_iter iter;
+ u32 i, total_len = 0;
+
+ if (nr_bvec == 0 || offset >= bvec[0].bv_len)
+ return -EINVAL;
+
+ /* MR path not supported for bvec - reject iWARP and force_mr */
+ if (rdma_rw_io_needs_mr(qp->device, port_num, dir, nr_bvec))
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < nr_bvec; i++) {
+ if (check_add_overflow(total_len, bvec[i].bv_len, &total_len))
+ return -EINVAL;
+ }
+ total_len -= offset;
+
+ iter.bi_sector = 0;
+ iter.bi_size = total_len;
+ iter.bi_idx = 0;
+ iter.bi_bvec_done = offset;
+
+ if (nr_bvec == 1)
+ return rdma_rw_init_single_wr_bvec(ctx, qp, bvec, &iter,
+ remote_addr, rkey, dir);
+
+ return rdma_rw_init_map_wrs_bvec(ctx, qp, bvec, nr_bvec, &iter,
+ remote_addr, rkey, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init_bvec);
+
/**
* rdma_rw_ctx_signature_init - initialize a RW context with signature offload
* @ctx: context to initialize
@@ -598,6 +766,48 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+/**
+ * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound (unused)
+ * @bvec: bio_vec array that was used for the READ/WRITE (unused)
+ * @nr_bvec: number of entries in @bvec
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec()
+ * call. Must not be called if rdma_rw_ctx_init_bvec() returned an error.
+ *
+ * The @port_num and @bvec parameters are unused but present for API
+ * symmetry with rdma_rw_ctx_destroy().
+ */
+void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 __maybe_unused port_num,
+ const struct bio_vec __maybe_unused *bvec,
+ u32 nr_bvec, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 i;
+
+ switch (ctx->type) {
+ case RDMA_RW_MULTI_WR:
+ for (i = 0; i < nr_bvec; i++)
+ ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
+ ctx->map.sges[i].length, dir);
+ kfree(ctx->map.wrs);
+ kfree(ctx->map.sges);
+ break;
+ case RDMA_RW_SINGLE_WR:
+ ib_dma_unmap_bvec(dev, ctx->single.sge.addr,
+ ctx->single.sge.length, dir);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec);
+
/**
* rdma_rw_ctx_destroy_signature - release all resources allocated by
* rdma_rw_ctx_signature_init
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6aad66bc5dd7..82958f5117c3 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -15,6 +15,7 @@
#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/device.h>
+#include <linux/bvec.h>
#include <linux/dma-mapping.h>
#include <linux/kref.h>
#include <linux/list.h>
@@ -4249,6 +4250,47 @@ static inline void ib_dma_unmap_page(struct ib_device *dev,
dma_unmap_page(dev->dma_device, addr, size, direction);
}
+/**
+ * ib_dma_map_bvec - Map a bio_vec to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @bvec: The bio_vec to map
+ * @direction: The direction of the DMA
+ *
+ * Returns a DMA address for the bio_vec. The caller must check the
+ * result with ib_dma_mapping_error() before use; a failed mapping
+ * must not be passed to ib_dma_unmap_bvec().
+ *
+ * For software RDMA devices (rxe, siw), returns a virtual address
+ * and no actual DMA mapping occurs.
+ */
+static inline u64 ib_dma_map_bvec(struct ib_device *dev,
+ const struct bio_vec *bvec,
+ enum dma_data_direction direction)
+{
+ if (ib_uses_virt_dma(dev))
+ return (uintptr_t)(page_address(bvec->bv_page) + bvec->bv_offset);
+ return dma_map_phys(dev->dma_device, bvec_phys(bvec),
+ bvec->bv_len, direction, 0);
+}
+
+/**
+ * ib_dma_unmap_bvec - Unmap a bio_vec DMA mapping
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address returned by ib_dma_map_bvec()
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ *
+ * Releases a DMA mapping created by ib_dma_map_bvec(). For software
+ * RDMA devices this is a no-op since no actual mapping occurred.
+ */
+static inline void ib_dma_unmap_bvec(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_unmap_phys(dev->dma_device, addr, size, direction, 0);
+}
+
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents);
static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
struct scatterlist *sg, int nents,
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index d606cac48233..046a8eb57125 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -49,6 +49,16 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, struct scatterlist *sg, u32 sg_cnt,
enum dma_data_direction dir);
+struct bio_vec;
+
+int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, const struct bio_vec *bvec, u32 nr_bvec,
+ u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir);
+void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, const struct bio_vec *bvec, u32 nr_bvec,
+ enum dma_data_direction dir);
+
int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, struct scatterlist *sg, u32 sg_cnt,
struct scatterlist *prot_sg, u32 prot_sg_cnt,
--
2.52.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-20 14:31 ` [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
@ 2026-01-21 8:42 ` Christoph Hellwig
2026-01-21 8:48 ` Leon Romanovsky
2026-01-21 8:56 ` Christoph Hellwig
1 sibling, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-21 8:42 UTC (permalink / raw)
To: Chuck Lever
Cc: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig, NeilBrown,
Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma,
linux-nfs, Chuck Lever
On Tue, Jan 20, 2026 at 09:31:21AM -0500, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> The existing rdma_rw_ctx_init() API requires callers to construct a
> scatterlist, which is then DMA-mapped page by page. Callers that
> already have data in bio_vec form (such as the NVMe-oF target) must
> first convert to scatterlist, adding overhead and complexity.
>
> Introduce rdma_rw_ctx_init_bvec() and rdma_rw_ctx_destroy_bvec() to
> accept bio_vec arrays directly. The new helpers use dma_map_phys()
> for hardware RDMA devices and virtual addressing for software RDMA
> devices (rxe, siw), avoiding intermediate scatterlist construction.
>
> Memory registration (MR) path support is deferred to a follow-up
> series; callers requiring MR-based transfers (iWARP devices or
> force_mr=1) receive -EOPNOTSUPP and should use the scatterlist API.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> drivers/infiniband/core/rw.c | 210 +++++++++++++++++++++++++++++++++++
> include/rdma/ib_verbs.h | 42 +++++++
> include/rdma/rw.h | 10 ++
> 3 files changed, 262 insertions(+)
>
> diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
> index 6354ddf2a274..59f32fecf3df 100644
> --- a/drivers/infiniband/core/rw.c
> +++ b/drivers/infiniband/core/rw.c
> @@ -4,6 +4,7 @@
> */
> #include <linux/memremap.h>
> #include <linux/moduleparam.h>
> +#include <linux/overflow.h>
> #include <linux/slab.h>
> #include <linux/pci-p2pdma.h>
> #include <rdma/mr_pool.h>
> @@ -274,6 +275,111 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> return 1;
> }
>
> +static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
> + struct ib_qp *qp, const struct bio_vec *bvec,
Nit: maybe rename bvec to bvecs or bvec_table to make it clear this
is the base array that the iter operates on?
> + struct bvec_iter *iter,
> + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
Nit 2: this can be condensed a little:
> + struct bvec_iter *iter, u64 remote_addr, u32 rkey,
> + enum dma_data_direction dir)
> +static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> + const struct bio_vec *bvec, u32 nr_bvec,
> + struct bvec_iter *iter,
> + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
Same here.
Otherwise looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-21 8:42 ` Christoph Hellwig
@ 2026-01-21 8:48 ` Leon Romanovsky
2026-01-21 8:57 ` Christoph Hellwig
0 siblings, 1 reply; 15+ messages in thread
From: Leon Romanovsky @ 2026-01-21 8:48 UTC (permalink / raw)
To: Christoph Hellwig, Chuck Lever
Cc: Jason Gunthorpe, NeilBrown, Jeff Layton, Olga Kornievskaia,
Dai Ngo, Tom Talpey, linux-rdma, linux-nfs, Chuck Lever
On Wed, Jan 21, 2026 at 09:42:17AM +0100, Christoph Hellwig wrote:
> On Tue, Jan 20, 2026 at 09:31:21AM -0500, Chuck Lever wrote:
> > From: Chuck Lever <chuck.lever@oracle.com>
> >
> > The existing rdma_rw_ctx_init() API requires callers to construct a
> > scatterlist, which is then DMA-mapped page by page. Callers that
> > already have data in bio_vec form (such as the NVMe-oF target) must
> > first convert to scatterlist, adding overhead and complexity.
> >
> > Introduce rdma_rw_ctx_init_bvec() and rdma_rw_ctx_destroy_bvec() to
> > accept bio_vec arrays directly. The new helpers use dma_map_phys()
> > for hardware RDMA devices and virtual addressing for software RDMA
> > devices (rxe, siw), avoiding intermediate scatterlist construction.
> >
> > Memory registration (MR) path support is deferred to a follow-up
> > series; callers requiring MR-based transfers (iWARP devices or
> > force_mr=1) receive -EOPNOTSUPP and should use the scatterlist API.
> >
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> > drivers/infiniband/core/rw.c | 210 +++++++++++++++++++++++++++++++++++
> > include/rdma/ib_verbs.h | 42 +++++++
> > include/rdma/rw.h | 10 ++
> > 3 files changed, 262 insertions(+)
> >
> > diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
> > index 6354ddf2a274..59f32fecf3df 100644
> > --- a/drivers/infiniband/core/rw.c
> > +++ b/drivers/infiniband/core/rw.c
> > @@ -4,6 +4,7 @@
> > */
> > #include <linux/memremap.h>
> > #include <linux/moduleparam.h>
> > +#include <linux/overflow.h>
> > #include <linux/slab.h>
> > #include <linux/pci-p2pdma.h>
> > #include <rdma/mr_pool.h>
> > @@ -274,6 +275,111 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> > return 1;
> > }
> >
> > +static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
> > + struct ib_qp *qp, const struct bio_vec *bvec,
>
> Nit: maybe rename bvec to bvecs or bvec_table to make it clear this
> is the base array that the iter operates on?
>
> > + struct bvec_iter *iter,
> > + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
>
> Nit 2: this can be condensed a little:
>
> > + struct bvec_iter *iter, u64 remote_addr, u32 rkey,
> > + enum dma_data_direction dir)
>
> > +static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> > + const struct bio_vec *bvec, u32 nr_bvec,
> > + struct bvec_iter *iter,
> > + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
Don't you both think these functions take too many arguments? It might be
worth introducing something like "struct rdma_rw_init_attrs" and passing
that instead.
At the end, these functions are for you to use.
Thanks
>
> Same here.
>
> Otherwise looks good:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-21 8:48 ` Leon Romanovsky
@ 2026-01-21 8:57 ` Christoph Hellwig
2026-01-21 10:16 ` Leon Romanovsky
0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-21 8:57 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Christoph Hellwig, Chuck Lever, Jason Gunthorpe, NeilBrown,
Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma,
linux-nfs, Chuck Lever
On Wed, Jan 21, 2026 at 10:48:40AM +0200, Leon Romanovsky wrote:
> > > +static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> > > + const struct bio_vec *bvec, u32 nr_bvec,
> > > + struct bvec_iter *iter,
> > > + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
>
> Don't you both think these functions take too many arguments? It might be
> worth introducing something like "struct rdma_rw_init_attrs" and passing
> that instead.
Not sure that buys us much. Having a {bvec_table, bvec_iter} tuple
OTOH might be a nice general data structure. Not really for this
series, though.
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-21 8:57 ` Christoph Hellwig
@ 2026-01-21 10:16 ` Leon Romanovsky
0 siblings, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2026-01-21 10:16 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Chuck Lever, Jason Gunthorpe, NeilBrown, Jeff Layton,
Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma, linux-nfs,
Chuck Lever
On Wed, Jan 21, 2026 at 09:57:27AM +0100, Christoph Hellwig wrote:
> On Wed, Jan 21, 2026 at 10:48:40AM +0200, Leon Romanovsky wrote:
> > > > +static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> > > > + const struct bio_vec *bvec, u32 nr_bvec,
> > > > + struct bvec_iter *iter,
> > > > + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
> >
> > Don't you both think these functions take too many arguments? It might be
> > worth introducing something like "struct rdma_rw_init_attrs" and passing
> > that instead.
>
> Not sure that buys us much.
Readability???
Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-20 14:31 ` [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
2026-01-21 8:42 ` Christoph Hellwig
@ 2026-01-21 8:56 ` Christoph Hellwig
2026-01-21 14:14 ` Chuck Lever
1 sibling, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-21 8:56 UTC (permalink / raw)
To: Chuck Lever
Cc: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig, NeilBrown,
Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma,
linux-nfs, Chuck Lever
Another reply, sorry. I noticed I skipped over the end while reviewing
the 3rd patch.
> + u32 i, total_len = 0;
> +
> + if (nr_bvec == 0 || offset >= bvec[0].bv_len)
> + return -EINVAL;
> +
> + if (check_add_overflow(total_len, bvec[i].bv_len, &total_len))
> + return -EINVAL;
> + }
> + total_len -= offset;
> +
> + iter.bi_sector = 0;
> + iter.bi_size = total_len;
> + iter.bi_idx = 0;
> + iter.bi_bvec_done = offset;
I'd much rather have the callers pass in the bvec_iter, as that's
more useful. We can probably look into factoring the quoted code
into a helper if that's useful.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-21 8:56 ` Christoph Hellwig
@ 2026-01-21 14:14 ` Chuck Lever
2026-01-21 14:57 ` Christoph Hellwig
0 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2026-01-21 14:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jason Gunthorpe, Leon Romanovsky, NeilBrown, Jeff Layton,
Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma, linux-nfs,
Chuck Lever
On 1/21/26 3:56 AM, Christoph Hellwig wrote:
> Another reply, sorry. I noticed I skipped over the end while reviewing
> the 3rd patch.
>
>> + u32 i, total_len = 0;
>> +
>> + if (nr_bvec == 0 || offset >= bvec[0].bv_len)
>> + return -EINVAL;
>> +
>> + if (check_add_overflow(total_len, bvec[i].bv_len, &total_len))
>> + return -EINVAL;
>> + }
>> + total_len -= offset;
>> +
>> + iter.bi_sector = 0;
>> + iter.bi_size = total_len;
>> + iter.bi_idx = 0;
>> + iter.bi_bvec_done = offset;
>
> I'd much rather have the callers pass in the bvec_iter, as that's
> more useful.
"The callers" -- Can you clarify whether you mean that the API
consumers would pass in a bvec_iter, or whether the iter is
entirely internal to rw.c ?
> We can probably look into factoring the quoted code
> into a helper if that's useful.
>
--
Chuck Lever
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-21 14:14 ` Chuck Lever
@ 2026-01-21 14:57 ` Christoph Hellwig
2026-01-21 15:10 ` Chuck Lever
0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-21 14:57 UTC (permalink / raw)
To: Chuck Lever
Cc: Christoph Hellwig, Jason Gunthorpe, Leon Romanovsky, NeilBrown,
Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma,
linux-nfs, Chuck Lever
On Wed, Jan 21, 2026 at 09:14:36AM -0500, Chuck Lever wrote:
> >
> > I'd much rather have the callers pass in the bvec_iter, as that's
> > more useful.
>
> "The callers" -- Can you clarify whether you mean that the API
> consumers would pass in a bvec_iter, or whether the iter is
> entirely internal to rw.c ?
I mean passing the iter from the API consumer into rw.c. In general
that is the sanest way to deal with a collection of bvecs.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API
2026-01-21 14:57 ` Christoph Hellwig
@ 2026-01-21 15:10 ` Chuck Lever
0 siblings, 0 replies; 15+ messages in thread
From: Chuck Lever @ 2026-01-21 15:10 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jason Gunthorpe, Leon Romanovsky, NeilBrown, Jeff Layton,
Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma, linux-nfs,
Chuck Lever
On 1/21/26 9:57 AM, Christoph Hellwig wrote:
> On Wed, Jan 21, 2026 at 09:14:36AM -0500, Chuck Lever wrote:
>>>
>>> I'd much rather have the callers pass in the bvec_iter, as that's
>>> more useful.
>>
>> "The callers" -- Can you clarify whether you mean that the API
>> consumers would pass in a bvec_iter, or whether the iter is
>> entirely internal to rw.c ?
>
> I mean passing the iter from the API consumer into rw.c. In general
> that is the sanes way to deal with a collection of bvecs.
OK. It wasn't clear from your earlier review comments, and I took a
guess. I'll get that done.
--
Chuck Lever
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
2026-01-20 14:31 [PATCH v2 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
2026-01-20 14:31 ` [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
@ 2026-01-20 14:31 ` Chuck Lever
2026-01-21 8:51 ` Christoph Hellwig
2026-01-20 14:31 ` [PATCH v2 3/4] RDMA/core: add MR support for bvec-based " Chuck Lever
2026-01-20 14:31 ` [PATCH v2 4/4] svcrdma: use bvec-based RDMA read/write API Chuck Lever
3 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2026-01-20 14:31 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-rdma, linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
The bvec RDMA API maps each bvec individually via dma_map_phys(),
requiring an IOTLB sync for each mapping. For large I/O operations
with many bvecs, this overhead becomes significant.
The two-step IOVA API (dma_iova_try_alloc / dma_iova_link /
dma_iova_sync) allocates a contiguous IOVA range upfront, links
all physical pages without IOTLB syncs, then performs a single
sync at the end. This reduces IOTLB flushes from O(n) to O(1).
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 116 +++++++++++++++++++++++++++++++++++
include/rdma/rw.h | 8 +++
2 files changed, 124 insertions(+)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 59f32fecf3df..51f650c4fa8c 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -15,6 +15,7 @@ enum {
RDMA_RW_MULTI_WR,
RDMA_RW_MR,
RDMA_RW_SIG_MR,
+ RDMA_RW_IOVA,
};
static bool rdma_rw_force_mr;
@@ -380,6 +381,93 @@ static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return -ENOMEM;
}
+/*
+ * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
+ * This reduces IOTLB sync overhead by doing one sync at the end instead of
+ * one per bvec, and produces a contiguous DMA address range that can be
+ * described by a single SGE.
+ *
+ * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
+ * mapping is not available, or another negative error code on failure.
+ */
+static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ const struct bio_vec *bvec, u32 nr_bvec,
+ struct bvec_iter *iter,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct device *dma_dev = dev->dma_device;
+ struct bvec_iter link_iter;
+ struct bio_vec first_bv;
+ size_t total_len, mapped_len = 0;
+ int ret;
+
+ /* Virtual DMA devices lack IOVA allocators */
+ if (ib_uses_virt_dma(dev))
+ return -EOPNOTSUPP;
+
+ total_len = iter->bi_size;
+
+ /* Get the first (possibly offset-adjusted) bvec for starting phys addr */
+ first_bv = mp_bvec_iter_bvec(bvec, *iter);
+
+ /* Try to allocate contiguous IOVA space */
+ if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
+ bvec_phys(&first_bv), total_len))
+ return -EOPNOTSUPP;
+
+ /* Link all bvecs into the IOVA space */
+ link_iter = *iter;
+ while (link_iter.bi_size) {
+ struct bio_vec bv = mp_bvec_iter_bvec(bvec, link_iter);
+
+ ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
+ mapped_len, bv.bv_len, dir, 0);
+ if (ret)
+ goto out_destroy;
+
+ if (check_add_overflow(mapped_len, bv.bv_len, &mapped_len)) {
+ ret = -EOVERFLOW;
+ goto out_destroy;
+ }
+ bvec_iter_advance(bvec, &link_iter, bv.bv_len);
+ }
+
+ /* Sync the IOTLB once for all linked pages */
+ ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
+ if (ret)
+ goto out_destroy;
+
+ ctx->iova.mapped_len = mapped_len;
+
+ /* Single SGE covers the entire contiguous IOVA range */
+ ctx->iova.sge.addr = ctx->iova.state.addr;
+ ctx->iova.sge.length = mapped_len;
+ ctx->iova.sge.lkey = qp->pd->local_dma_lkey;
+
+ /* Single WR for the whole transfer */
+ memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
+ ctx->iova.wr.wr.opcode = dir == DMA_TO_DEVICE ?
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ ctx->iova.wr.wr.num_sge = 1;
+ ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
+ ctx->iova.wr.remote_addr = remote_addr;
+ ctx->iova.wr.rkey = rkey;
+
+ ctx->type = RDMA_RW_IOVA;
+ ctx->nr_ops = 1;
+ return 1;
+
+out_destroy:
+ /*
+ * dma_iova_destroy() expects the actual mapped length, not the
+ * total allocation size. It unlinks only the successfully linked
+ * range and frees the entire IOVA allocation.
+ */
+ dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
+ return ret;
+}
+
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@@ -484,6 +572,7 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
{
struct bvec_iter iter;
u32 i, total_len = 0;
+ int ret;
if (nr_bvec == 0 || offset >= bvec[0].bv_len)
return -EINVAL;
@@ -507,6 +596,21 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return rdma_rw_init_single_wr_bvec(ctx, qp, bvec, &iter,
remote_addr, rkey, dir);
+ /*
+ * Try IOVA-based mapping first for multi-bvec transfers.
+ * This reduces IOTLB sync overhead by batching all mappings.
+ */
+ ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvec, nr_bvec, &iter,
+ remote_addr, rkey, dir);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+
+ /* Fallback path requires iterator at initial state */
+ iter.bi_sector = 0;
+ iter.bi_size = total_len;
+ iter.bi_idx = 0;
+ iter.bi_bvec_done = offset;
+
return rdma_rw_init_map_wrs_bvec(ctx, qp, bvec, nr_bvec, &iter,
remote_addr, rkey, dir);
}
@@ -683,6 +787,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
first_wr = &ctx->reg[0].reg_wr.wr;
last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
break;
+ case RDMA_RW_IOVA:
+ first_wr = &ctx->iova.wr.wr;
+ last_wr = &ctx->iova.wr.wr;
+ break;
case RDMA_RW_MULTI_WR:
first_wr = &ctx->map.wrs[0].wr;
last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@@ -757,6 +865,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
break;
case RDMA_RW_SINGLE_WR:
break;
+ case RDMA_RW_IOVA:
+ /* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
+ WARN_ON_ONCE(1);
+ break;
default:
BUG();
break;
@@ -790,6 +902,10 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 i;
switch (ctx->type) {
+ case RDMA_RW_IOVA:
+ dma_iova_destroy(dev->dma_device, &ctx->iova.state,
+ ctx->iova.mapped_len, dir, 0);
+ break;
case RDMA_RW_MULTI_WR:
for (i = 0; i < nr_bvec; i++)
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index 046a8eb57125..2a5f33665d52 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -31,6 +31,14 @@ struct rdma_rw_ctx {
struct ib_rdma_wr *wrs;
} map;
+ /* for IOVA-based mapping of bvecs into contiguous DMA range: */
+ struct {
+ struct dma_iova_state state;
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ size_t mapped_len;
+ } iova;
+
/* for registering multiple WRs: */
struct rdma_rw_reg_ctx {
struct ib_sge sge;
--
2.52.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
2026-01-20 14:31 ` [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations Chuck Lever
@ 2026-01-21 8:51 ` Christoph Hellwig
0 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-21 8:51 UTC (permalink / raw)
To: Chuck Lever
Cc: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig, NeilBrown,
Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma,
linux-nfs, Chuck Lever
On Tue, Jan 20, 2026 at 09:31:22AM -0500, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> The bvec RDMA API maps each bvec individually via dma_map_phys(),
> requiring an IOTLB sync for each mapping. For large I/O operations
> with many bvecs, this overhead becomes significant.
>
> The two-step IOVA API (dma_iova_try_alloc / dma_iova_link /
> dma_iova_sync) allocates a contiguous IOVA range upfront, links
> all physical pages without IOTLB syncs, then performs a single
> sync at the end. This reduces IOTLB flushes from O(n) to O(1).
... and requires only a single output dma_addr_t compared to extra
per-input element storage in struct scatterlist.
> + const struct bio_vec *bvec, u32 nr_bvec,
> + struct bvec_iter *iter,
> + u64 remote_addr, u32 rkey, enum dma_data_direction dir)
Same minor nits as for the previous patch here as well.
> + struct ib_device *dev = qp->pd->device;
> + struct device *dma_dev = dev->dma_device;
> + struct bvec_iter link_iter;
> + struct bio_vec first_bv;
> + size_t total_len, mapped_len = 0;
> + int ret;
> +
> + /* Virtual DMA devices lack IOVA allocators */
> + if (ib_uses_virt_dma(dev))
> + return -EOPNOTSUPP;
Not only lacks it, but fundamentally can't support it.
> + total_len = iter->bi_size;
I'd just initialize this at declaration time.
> + /* Get the first (possibly offset-adjusted) bvec for starting phys addr */
I think this comment is kinda out of date now, as the offset adjustment
is transparently done by the bvec helpers, and there's no visible concept
of a start phys addr.
> + first_bv = mp_bvec_iter_bvec(bvec, *iter);
I'd also initialize first_bv at declaration time. The compilers are
smart enough to defer the work past the virtual dma check.
> + struct bio_vec bv = mp_bvec_iter_bvec(bvec, link_iter);
> +
> + ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
> + mapped_len, bv.bv_len, dir, 0);
> + if (ret)
> + goto out_destroy;
> +
> + if (check_add_overflow(mapped_len, bv.bv_len, &mapped_len)) {
> + ret = -EOVERFLOW;
> + goto out_destroy;
> + }
Do the overflow check before calling dma_iova_link as it's kinda
pointless to continue with that operation. But then again, I don't
really think we need the overflow check at all. The length is known
beforehand in bi_size, which is a u32, while mapped_len is a size_t,
so we can't really overflow here at all.
> + /*
> + * Try IOVA-based mapping first for multi-bvec transfers.
> + * This reduces IOTLB sync overhead by batching all mappings.
> + */
> + ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvec, nr_bvec, &iter,
> + remote_addr, rkey, dir);
> + if (ret != -EOPNOTSUPP)
> + return ret;
> +
> + /* Fallback path requires iterator at initial state */
> + iter.bi_sector = 0;
> + iter.bi_size = total_len;
> + iter.bi_idx = 0;
> + iter.bi_bvec_done = offset;
rdma_rw_init_iova_wrs_bvec already avoids advancing the passed in
iter, and this rebuilds it. In addition, rdma_rw_init_iova_wrs_bvec
only returns -EOPNOTSUPP before advancing even the local iter.
So I think both the local iter copy in rdma_rw_init_iova_wrs_bvec
and this can go away. But it would be useful to capture that
rdma_rw_init_iova_wrs_bvec must leave the iter unmodified when
returning -EOPNOTSUPP in a comment.
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 3/4] RDMA/core: add MR support for bvec-based RDMA operations
2026-01-20 14:31 [PATCH v2 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
2026-01-20 14:31 ` [PATCH v2 1/4] RDMA/core: add bio_vec based RDMA read/write API Chuck Lever
2026-01-20 14:31 ` [PATCH v2 2/4] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations Chuck Lever
@ 2026-01-20 14:31 ` Chuck Lever
2026-01-21 9:05 ` Christoph Hellwig
2026-01-20 14:31 ` [PATCH v2 4/4] svcrdma: use bvec-based RDMA read/write API Chuck Lever
3 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2026-01-20 14:31 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-rdma, linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
The bvec-based RDMA API currently returns -EOPNOTSUPP when Memory
Region registration is required. This prevents iWARP devices from
using the bvec path, since iWARP requires MR registration for RDMA
READ operations. The force_mr debug parameter is also unusable with
bvec input.
Add rdma_rw_init_mr_wrs_bvec() to handle MR registration for bvec
arrays. The approach creates a synthetic scatterlist populated with
DMA addresses from the bvecs, then reuses the existing ib_map_mr_sg()
infrastructure. This avoids driver changes while keeping the
implementation small.
The synthetic scatterlist is stored in the rdma_rw_ctx for cleanup.
On destroy, the MRs are returned to the pool and the bvec DMA
mappings are released using the stored addresses.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
drivers/infiniband/core/rw.c | 159 ++++++++++++++++++++++++++++++++---
include/rdma/rw.h | 8 ++
2 files changed, 156 insertions(+), 11 deletions(-)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 51f650c4fa8c..9181fca8ff3f 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -194,6 +194,135 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return ret;
}
+static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, const struct bio_vec *bvec, u32 nr_bvec,
+ u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct rdma_rw_reg_ctx *prev = NULL;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
+ struct scatterlist *sgl;
+ int i, j, ret = 0, count = 0;
+ u32 sg_idx = 0;
+
+ ctx->nr_ops = DIV_ROUND_UP(nr_bvec, pages_per_mr);
+ ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg)
+ return -ENOMEM;
+
+ /*
+ * Allocate synthetic scatterlist to hold DMA addresses.
+ * ib_map_mr_sg() extracts sg_dma_address/len, so the page
+ * pointer is unused.
+ */
+ sgl = kmalloc_array(nr_bvec, sizeof(*sgl), GFP_KERNEL);
+ if (!sgl) {
+ ret = -ENOMEM;
+ goto out_free_reg;
+ }
+ sg_init_table(sgl, nr_bvec);
+
+ for (i = 0; i < nr_bvec; i++) {
+ const struct bio_vec *bv = &bvec[i];
+ struct bio_vec adjusted;
+ u64 dma_addr;
+ u32 len;
+
+ /*
+ * The offset parameter applies only to the first bvec,
+ * allowing callers to start partway into the array.
+ */
+ if (i == 0 && offset) {
+ adjusted = *bv;
+ adjusted.bv_offset += offset;
+ adjusted.bv_len -= offset;
+ bv = &adjusted;
+ }
+ len = bv->bv_len;
+
+ dma_addr = ib_dma_map_bvec(dev, bv, dir);
+ if (ib_dma_mapping_error(dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ /* sg_set_page() initializes the entry; ib_map_mr_sg() uses
+ * only sg_dma_address/len, ignoring the page pointer.
+ */
+ sg_set_page(&sgl[i], bv->bv_page, len, bv->bv_offset);
+ sg_dma_address(&sgl[i]) = dma_addr;
+ sg_dma_len(&sgl[i]) = len;
+ }
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+ u32 nents = min(nr_bvec - sg_idx, pages_per_mr);
+
+ ret = rdma_rw_init_one_mr(qp, port_num, reg, &sgl[sg_idx],
+ nents, 0);
+ if (ret < 0)
+ goto out_free_mrs;
+ count += ret;
+
+ if (prev) {
+ if (reg->mr->need_inval)
+ prev->wr.wr.next = &reg->inv_wr;
+ else
+ prev->wr.wr.next = &reg->reg_wr.wr;
+ }
+
+ reg->reg_wr.wr.next = &reg->wr.wr;
+
+ reg->wr.wr.sg_list = &reg->sge;
+ reg->wr.wr.num_sge = 1;
+ reg->wr.remote_addr = remote_addr;
+ reg->wr.rkey = rkey;
+
+ if (dir == DMA_TO_DEVICE) {
+ reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+ } else if (!rdma_cap_read_inv(qp->device, port_num)) {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ;
+ } else {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+ }
+ count++;
+
+ remote_addr += reg->sge.length;
+ sg_idx += nents;
+ prev = reg;
+ }
+
+ if (prev)
+ prev->wr.wr.next = NULL;
+
+ ctx->type = RDMA_RW_MR;
+ ctx->mr_sgl = sgl;
+ ctx->mr_sg_cnt = nr_bvec;
+ return count;
+
+out_free_mrs:
+ while (--i >= 0)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ for (j = 0; j < nr_bvec; j++)
+ ib_dma_unmap_bvec(dev, sg_dma_address(&sgl[j]),
+ sg_dma_len(&sgl[j]), dir);
+ kfree(sgl);
+ kfree(ctx->reg);
+ return ret;
+
+out_unmap:
+ /* Unmap bvecs that were successfully mapped (0 through i-1) */
+ for (j = 0; j < i; j++)
+ ib_dma_unmap_bvec(dev, sg_dma_address(&sgl[j]),
+ sg_dma_len(&sgl[j]), dir);
+ kfree(sgl);
+out_free_reg:
+ kfree(ctx->reg);
+ return ret;
+}
+
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 sg_cnt, u32 offset,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@@ -550,19 +679,13 @@ EXPORT_SYMBOL(rdma_rw_ctx_init);
* @rkey: remote key to operate on
* @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
*
- * Accepts bio_vec arrays directly, avoiding scatterlist conversion for
- * callers that already have data in bio_vec form. Prefer this over
- * rdma_rw_ctx_init() when the source data is a bio_vec array.
- *
- * This function does not support devices requiring memory registration.
- * iWARP devices and configurations with force_mr=1 should use
- * rdma_rw_ctx_init() with a scatterlist instead.
+ * Maps the bio_vec array directly, avoiding intermediate scatterlist
+ * conversion. Supports MR registration for iWARP devices and force_mr mode.
*
* Returns the number of WQEs that will be needed on the workqueue if
* successful, or a negative error code:
*
* * -EINVAL - @nr_bvec is zero, @offset exceeds first bvec, or overflow
- * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1)
* * -ENOMEM - DMA mapping or memory allocation failed
*/
int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
@@ -570,6 +693,7 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 offset, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
+ struct ib_device *dev = qp->pd->device;
struct bvec_iter iter;
u32 i, total_len = 0;
int ret;
@@ -577,9 +701,10 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
if (nr_bvec == 0 || offset >= bvec[0].bv_len)
return -EINVAL;
- /* MR path not supported for bvec - reject iWARP and force_mr */
- if (rdma_rw_io_needs_mr(qp->device, port_num, dir, nr_bvec))
- return -EOPNOTSUPP;
+ if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
+ return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvec,
+ nr_bvec, offset, remote_addr,
+ rkey, dir);
for (i = 0; i < nr_bvec; i++) {
if (check_add_overflow(total_len, bvec[i].bv_len, &total_len))
@@ -855,6 +980,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
switch (ctx->type) {
case RDMA_RW_MR:
+ /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
+ WARN_ON_ONCE(ctx->mr_sgl);
for (i = 0; i < ctx->nr_ops; i++)
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
kfree(ctx->reg);
@@ -902,6 +1029,16 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 i;
switch (ctx->type) {
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+ /* DMA addresses were stored in mr_sgl during init */
+ for (i = 0; i < ctx->mr_sg_cnt; i++)
+ ib_dma_unmap_bvec(dev, sg_dma_address(&ctx->mr_sgl[i]),
+ sg_dma_len(&ctx->mr_sgl[i]), dir);
+ kfree(ctx->mr_sgl);
+ break;
case RDMA_RW_IOVA:
dma_iova_destroy(dev->dma_device, &ctx->iova.state,
ctx->iova.mapped_len, dir, 0);
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index 2a5f33665d52..01177fd09eae 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -48,6 +48,14 @@ struct rdma_rw_ctx {
struct ib_mr *mr;
} *reg;
};
+
+ /*
+ * For bvec MR path: store synthetic scatterlist with DMA addresses
+ * for cleanup. Only valid when type == RDMA_RW_MR and initialized
+ * via rdma_rw_ctx_init_bvec().
+ */
+ struct scatterlist *mr_sgl;
+ u32 mr_sg_cnt;
};
int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
--
2.52.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v2 3/4] RDMA/core: add MR support for bvec-based RDMA operations
2026-01-20 14:31 ` [PATCH v2 3/4] RDMA/core: add MR support for bvec-based " Chuck Lever
@ 2026-01-21 9:05 ` Christoph Hellwig
0 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-21 9:05 UTC (permalink / raw)
To: Chuck Lever
Cc: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig, NeilBrown,
Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-rdma,
linux-nfs, Chuck Lever
> +static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
> + u32 port_num, const struct bio_vec *bvec, u32 nr_bvec,
> + u32 offset, u64 remote_addr, u32 rkey,
> + enum dma_data_direction dir)
Any reason this is not using the bvec_iter like the other paths?
The mapping to the scatterlist would then basically be a much
simplified version of __blk_rq_map_sg.
> + dma_addr = ib_dma_map_bvec(dev, bv, dir);
> + if (ib_dma_mapping_error(dev, dma_addr)) {
> + ret = -ENOMEM;
> + goto out_unmap;
> + }
> +
> + /* sg_set_page() initializes the entry; ib_map_mr_sg() uses
> + * only sg_dma_address/len, ignoring the page pointer.
> + */
> + sg_set_page(&sgl[i], bv->bv_page, len, bv->bv_offset);
> + sg_dma_address(&sgl[i]) = dma_addr;
> + sg_dma_len(&sgl[i]) = len;
And once we have a scatterlist, this should probably just use
ib_dma_map_sg* ? And maybe rdma_rw_init_one_mr?
> + /*
> + * For bvec MR path: store synthetic scatterlist with DMA addresses
> + * for cleanup. Only valid when type == RDMA_RW_MR and initialized
> + * via rdma_rw_ctx_init_bvec().
> + */
> + struct scatterlist *mr_sgl;
> + u32 mr_sg_cnt;
> };
This probably should be in the reg union arm, and thus avoid the
separate allocation?
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 4/4] svcrdma: use bvec-based RDMA read/write API
2026-01-20 14:31 [PATCH v2 0/4] Add a bio_vec based API to core/rw.c Chuck Lever
` (2 preceding siblings ...)
2026-01-20 14:31 ` [PATCH v2 3/4] RDMA/core: add MR support for bvec-based " Chuck Lever
@ 2026-01-20 14:31 ` Chuck Lever
3 siblings, 0 replies; 15+ messages in thread
From: Chuck Lever @ 2026-01-20 14:31 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Christoph Hellwig
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-rdma, linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Convert svcrdma to the bvec-based RDMA API introduced earlier in
this series.
The bvec-based RDMA API eliminates the intermediate scatterlist
conversion step, allowing direct DMA mapping from bio_vec arrays.
This simplifies the svc_rdma_rw_ctxt structure by removing the
chained SG table management.
The structure retains an inline array approach similar to the
previous scatterlist implementation: an inline bvec array sized
to max_send_sge handles most I/O operations without additional
allocation. Larger requests fall back to dynamic allocation.
This preserves the allocation-free fast path for typical NFS
operations while supporting arbitrarily large transfers.
The bvec API handles all device types internally, including iWARP
devices which require memory registration. No explicit fallback
path is needed.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_rw.c | 136 ++++++++++++++++--------------
1 file changed, 74 insertions(+), 62 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 310de7a80be5..33c1c0ac4e79 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -5,6 +5,8 @@
* Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
*/
+#include <linux/bvec.h>
+#include <linux/overflow.h>
#include <rdma/rw.h>
#include <linux/sunrpc/xdr.h>
@@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
/* Each R/W context contains state for one chain of RDMA Read or
* Write Work Requests.
*
- * Each WR chain handles a single contiguous server-side buffer,
- * because scatterlist entries after the first have to start on
- * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ * Each WR chain handles a single contiguous server-side buffer.
+ * - each xdr_buf iovec is a single contiguous buffer
+ * - the xdr_buf pages array is a single contiguous buffer because the
+ * second through the last element always start on a page boundary
*
* Each WR chain handles only one R_key. Each RPC-over-RDMA segment
* from a client may contain a unique R_key, so each WR chain moves
* up to one segment at a time.
*
- * The scatterlist makes this data structure over 4KB in size. To
- * make it less likely to fail, and to handle the allocation for
- * smaller I/O requests without disabling bottom-halves, these
- * contexts are created on demand, but cached and reused until the
- * controlling svcxprt_rdma is destroyed.
+ * The inline bvec array is sized to handle most I/O requests without
+ * additional allocation. Larger requests fall back to dynamic allocation.
+ * These contexts are created on demand, but cached and reused until
+ * the controlling svcxprt_rdma is destroyed.
*/
struct svc_rdma_rw_ctxt {
struct llist_node rw_node;
struct list_head rw_list;
struct rdma_rw_ctx rw_ctx;
unsigned int rw_nents;
- unsigned int rw_first_sgl_nents;
- struct sg_table rw_sg_table;
- struct scatterlist rw_first_sgl[];
+ unsigned int rw_first_bvec_nents;
+ struct bio_vec *rw_bvec;
+ struct bio_vec rw_first_bvec[];
};
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt);
+
static inline struct svc_rdma_rw_ctxt *
svc_rdma_next_ctxt(struct list_head *list)
{
@@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list)
}
static struct svc_rdma_rw_ctxt *
-svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec)
{
struct ib_device *dev = rdma->sc_cm_id->device;
- unsigned int first_sgl_nents = dev->attrs.max_send_sge;
+ unsigned int first_bvec_nents = dev->attrs.max_send_sge;
struct svc_rdma_rw_ctxt *ctxt;
struct llist_node *node;
@@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
if (node) {
ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
} else {
- ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents),
+ ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec,
+ first_bvec_nents),
GFP_KERNEL, ibdev_to_node(dev));
if (!ctxt)
goto out_noctx;
INIT_LIST_HEAD(&ctxt->rw_list);
- ctxt->rw_first_sgl_nents = first_sgl_nents;
+ ctxt->rw_first_bvec_nents = first_bvec_nents;
}
- ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
- if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
- ctxt->rw_sg_table.sgl,
- first_sgl_nents))
- goto out_free;
+ if (nr_bvec <= ctxt->rw_first_bvec_nents) {
+ ctxt->rw_bvec = ctxt->rw_first_bvec;
+ } else {
+ ctxt->rw_bvec = kmalloc_array_node(nr_bvec,
+ sizeof(*ctxt->rw_bvec),
+ GFP_KERNEL,
+ ibdev_to_node(dev));
+ if (!ctxt->rw_bvec)
+ goto out_free;
+ }
return ctxt;
out_free:
- kfree(ctxt);
+ /* Return cached contexts to cache; free freshly allocated ones */
+ if (node)
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ else
+ kfree(ctxt);
out_noctx:
- trace_svcrdma_rwctx_empty(rdma, sges);
+ trace_svcrdma_rwctx_empty(rdma, nr_bvec);
return NULL;
}
static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
struct llist_head *list)
{
- sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
+ if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+ kfree(ctxt->rw_bvec);
llist_add(&ctxt->rw_node, list);
}
@@ -135,9 +151,10 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
{
int ret;
- ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
- ctxt->rw_sg_table.sgl, ctxt->rw_nents,
- 0, offset, handle, direction);
+ ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num,
+ ctxt->rw_bvec, ctxt->rw_nents,
+ 0, offset, handle, direction);
if (unlikely(ret < 0)) {
trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
ctxt->rw_nents, ret);
@@ -183,9 +200,9 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
list_del(&ctxt->rw_list);
- rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, ctxt->rw_sg_table.sgl,
- ctxt->rw_nents, dir);
+ rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num,
+ ctxt->rw_bvec, ctxt->rw_nents, dir);
__svc_rdma_put_rw_ctxt(ctxt, &free);
ctxt->rw_node.next = first;
@@ -414,29 +431,25 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
return -ENOTCONN;
}
-/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+/* Build a bvec that covers one kvec in an xdr_buf.
*/
-static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
- unsigned int len,
- struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt)
{
- struct scatterlist *sg = ctxt->rw_sg_table.sgl;
-
- sg_set_buf(&sg[0], info->wi_base, len);
+ bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len);
info->wi_base += len;
-
ctxt->rw_nents = 1;
}
-/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+/* Build a bvec array that covers part of an xdr_buf's pagelist.
*/
-static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
- unsigned int remaining,
- struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info,
+ unsigned int remaining,
+ struct svc_rdma_rw_ctxt *ctxt)
{
- unsigned int sge_no, sge_bytes, page_off, page_no;
+ unsigned int bvec_idx, sge_bytes, page_off, page_no;
const struct xdr_buf *xdr = info->wi_xdr;
- struct scatterlist *sg;
struct page **page;
page_off = info->wi_next_off + xdr->page_base;
@@ -444,21 +457,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
page_off = offset_in_page(page_off);
page = xdr->pages + page_no;
info->wi_next_off += remaining;
- sg = ctxt->rw_sg_table.sgl;
- sge_no = 0;
+ bvec_idx = 0;
do {
sge_bytes = min_t(unsigned int, remaining,
PAGE_SIZE - page_off);
- sg_set_page(sg, *page, sge_bytes, page_off);
-
+ bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, sge_bytes,
+ page_off);
remaining -= sge_bytes;
- sg = sg_next(sg);
page_off = 0;
- sge_no++;
+ bvec_idx++;
page++;
} while (remaining);
- ctxt->rw_nents = sge_no;
+ ctxt->rw_nents = bvec_idx;
}
/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
@@ -535,7 +546,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
const struct kvec *iov)
{
info->wi_base = iov->iov_base;
- return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+ return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec,
iov->iov_len);
}
@@ -559,7 +570,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
{
info->wi_xdr = xdr;
info->wi_next_off = offset - xdr->head[0].iov_len;
- return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+ return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec,
length);
}
@@ -734,29 +745,30 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
{
struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
- unsigned int sge_no, seg_len, len;
+ unsigned int bvec_idx, nr_bvec, seg_len, len;
struct svc_rdma_rw_ctxt *ctxt;
- struct scatterlist *sg;
int ret;
len = segment->rs_length;
- sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
- ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
+ if (check_add_overflow(head->rc_pageoff, len, &len))
+ return -EINVAL;
+ nr_bvec = PAGE_ALIGN(len) >> PAGE_SHIFT;
+ len = segment->rs_length;
+ ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
if (!ctxt)
return -ENOMEM;
- ctxt->rw_nents = sge_no;
+ ctxt->rw_nents = nr_bvec;
- sg = ctxt->rw_sg_table.sgl;
- for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
+ for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) {
seg_len = min_t(unsigned int, len,
PAGE_SIZE - head->rc_pageoff);
if (!head->rc_pageoff)
head->rc_page_count++;
- sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
- seg_len, head->rc_pageoff);
- sg = sg_next(sg);
+ bvec_set_page(&ctxt->rw_bvec[bvec_idx],
+ rqstp->rq_pages[head->rc_curpage],
+ seg_len, head->rc_pageoff);
head->rc_pageoff += seg_len;
if (head->rc_pageoff == PAGE_SIZE) {
--
2.52.0
^ permalink raw reply related [flat|nested] 15+ messages in thread