* [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas
@ 2025-08-11 20:35 Chuck Lever
2025-08-19 16:03 ` Tom Talpey
0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2025-08-11 20:35 UTC (permalink / raw)
To: linux-nfs, linux-rdma; +Cc: Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Reduce the per-connection footprint in the host's and RNIC's memory
management TLBs by combining groups of a connection's Receive
buffers into fewer IOVAs.
I don't have a good way to measure whether this approach is
effective.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 3 +
include/trace/events/rpcrdma.h | 99 +++++++++++
net/sunrpc/xprtrdma/Makefile | 2 +-
net/sunrpc/xprtrdma/pool.c | 223 ++++++++++++++++++++++++
net/sunrpc/xprtrdma/pool.h | 25 +++
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 43 ++---
6 files changed, 370 insertions(+), 25 deletions(-)
create mode 100644 net/sunrpc/xprtrdma/pool.c
create mode 100644 net/sunrpc/xprtrdma/pool.h
Changes since v1:
- Rename "chunks" to "shards" -- RPC/RDMA already has chunks
- Replace pool's list of shards with an xarray
- Implement bitmap-based shard free space management
- Implement some naive observability
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 22704c2e5b9b..b4f3c01f1b94 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -73,6 +73,8 @@ extern struct percpu_counter svcrdma_stat_recv;
extern struct percpu_counter svcrdma_stat_sq_starve;
extern struct percpu_counter svcrdma_stat_write;
+struct rpcrdma_pool;
+
struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
@@ -112,6 +114,7 @@ struct svcxprt_rdma {
unsigned long sc_flags;
struct work_struct sc_work;
+ struct rpcrdma_pool *sc_recv_pool;
struct llist_head sc_recv_ctxts;
atomic_t sc_completion_ids;
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index e6a72646c507..8bc713082c1a 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -2336,6 +2336,105 @@ DECLARE_EVENT_CLASS(rpcrdma_client_register_class,
DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_register);
DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_unregister);
+TRACE_EVENT(rpcrdma_pool_create,
+ TP_PROTO(
+ unsigned int poolid,
+ size_t bufsize
+ ),
+
+ TP_ARGS(poolid, bufsize),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, poolid)
+ __field(size_t, bufsize)
+ ),
+
+ TP_fast_assign(
+ __entry->poolid = poolid;
+ __entry->bufsize = bufsize;
+ ),
+
+ TP_printk("poolid=%u bufsize=%zu bytes",
+ __entry->poolid, __entry->bufsize
+ )
+);
+
+TRACE_EVENT(rpcrdma_pool_destroy,
+ TP_PROTO(
+ unsigned int poolid
+ ),
+
+ TP_ARGS(poolid),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, poolid)
+ ),
+
+ TP_fast_assign(
+ __entry->poolid = poolid;),
+
+ TP_printk("poolid=%u",
+ __entry->poolid
+ )
+);
+
+DECLARE_EVENT_CLASS(rpcrdma_pool_shard_class,
+ TP_PROTO(
+ unsigned int poolid,
+ u32 shardid
+ ),
+
+ TP_ARGS(poolid, shardid),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, poolid)
+ __field(u32, shardid)
+ ),
+
+ TP_fast_assign(
+ __entry->poolid = poolid;
+ __entry->shardid = shardid;
+ ),
+
+ TP_printk("poolid=%u shardid=%u",
+ __entry->poolid, __entry->shardid
+ )
+);
+
+#define DEFINE_RPCRDMA_POOL_SHARD_EVENT(name) \
+ DEFINE_EVENT(rpcrdma_pool_shard_class, name, \
+ TP_PROTO( \
+ unsigned int poolid, \
+ u32 shardid \
+ ), \
+ TP_ARGS(poolid, shardid))
+
+DEFINE_RPCRDMA_POOL_SHARD_EVENT(rpcrdma_pool_shard_new);
+DEFINE_RPCRDMA_POOL_SHARD_EVENT(rpcrdma_pool_shard_free);
+
+TRACE_EVENT(rpcrdma_pool_buffer,
+ TP_PROTO(
+ unsigned int poolid,
+ const void *buffer
+ ),
+
+ TP_ARGS(poolid, buffer),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, poolid)
+ __field(const void *, buffer)
+ ),
+
+ TP_fast_assign(
+ __entry->poolid = poolid;
+ __entry->buffer = buffer;
+ ),
+
+ TP_printk("poolid=%u buffer=%p",
+ __entry->poolid, __entry->buffer
+ )
+);
+
#endif /* _TRACE_RPCRDMA_H */
#include <trace/define_trace.h>
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 3232aa23cdb4..f69456dffe87 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o pool.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
svc_rdma_pcl.o module.o
diff --git a/net/sunrpc/xprtrdma/pool.c b/net/sunrpc/xprtrdma/pool.c
new file mode 100644
index 000000000000..e285c3e9c38e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/pool.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ *
+ * Pools for RPC-over-RDMA Receive buffers.
+ *
+ * A buffer pool attempts to conserve both the number of DMA mappings
+ * and the device's IOVA space by collecting small buffers together
+ * into a shard that has a single DMA mapping.
+ *
+ * API Contract:
+ * - Buffers contained in one rpcrdma_pool instance are the same
+ * size (rp_bufsize), no larger than RPCRDMA_MAX_INLINE_THRESH
+ * - Buffers in one rpcrdma_pool instance are mapped using the same
+ * DMA direction
+ * - Buffers in one rpcrdma_pool instance are automatically released
+ * when the instance is destroyed
+ *
+ * Future work:
+ * - Manage pool resources by reference count
+ */
+
+#include <linux/list.h>
+#include <linux/xarray.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "xprt_rdma.h"
+#include "pool.h"
+
+#include <trace/events/rpcrdma.h>
+
+/*
+ * An idr would give near perfect pool ID uniqueness, but for
+ * the moment the pool ID is used only for observability, not
+ * correctness.
+ */
+static atomic_t rpcrdma_pool_id;
+
+struct rpcrdma_pool {
+ struct xarray rp_xa;
+ struct ib_device *rp_device;
+ size_t rp_shardsize; // in bytes
+ size_t rp_bufsize; // in bytes
+ enum dma_data_direction rp_direction;
+ unsigned int rp_bufs_per_shard;
+ unsigned int rp_pool_id;
+};
+
+struct rpcrdma_pool_shard {
+ u8 *pc_cpu_addr;
+ u64 pc_mapped_addr;
+ unsigned long *pc_bitmap;
+};
+
+static struct rpcrdma_pool_shard *
+rpcrdma_pool_shard_alloc(struct rpcrdma_pool *pool, gfp_t flags)
+{
+ struct rpcrdma_pool_shard *shard;
+ size_t bmap_size;
+
+ shard = kmalloc(sizeof(*shard), flags);
+ if (!shard)
+ goto fail;
+
+ bmap_size = BITS_TO_LONGS(pool->rp_bufs_per_shard) * sizeof(unsigned long);
+ shard->pc_bitmap = kzalloc(bmap_size, flags);
+ if (!shard->pc_bitmap)
+ goto free_shard;
+
+ /*
+ * For good NUMA awareness, allocate the shard's I/O buffer
+ * on the NUMA node that the underlying device is affined to.
+ */
+ shard->pc_cpu_addr = kmalloc_node(pool->rp_shardsize, flags,
+ ibdev_to_node(pool->rp_device));
+ if (!shard->pc_cpu_addr)
+ goto free_bitmap;
+ shard->pc_mapped_addr = ib_dma_map_single(pool->rp_device,
+ shard->pc_cpu_addr,
+ pool->rp_shardsize,
+ pool->rp_direction);
+ if (ib_dma_mapping_error(pool->rp_device, shard->pc_mapped_addr))
+ goto free_iobuf;
+
+ return shard;
+
+free_iobuf:
+ kfree(shard->pc_cpu_addr);
+free_bitmap:
+ kfree(shard->pc_bitmap);
+free_shard:
+ kfree(shard);
+fail:
+ return NULL;
+}
+
+static void
+rpcrdma_pool_shard_free(struct rpcrdma_pool *pool,
+ struct rpcrdma_pool_shard *shard)
+{
+ ib_dma_unmap_single(pool->rp_device, shard->pc_mapped_addr,
+ pool->rp_shardsize, pool->rp_direction);
+ kfree(shard->pc_cpu_addr);
+ kfree(shard->pc_bitmap);
+ kfree(shard);
+}
+
+/**
+ * rpcrdma_pool_create - Allocate and initialize an rpcrdma_pool instance
+ * @args: pool creation arguments
+ * @flags: GFP flags used during pool creation
+ *
+ * Returns a pointer to an opaque rpcrdma_pool instance or
+ * NULL. If a pool instance is returned, caller must free the
+ * returned instance using rpcrdma_pool_destroy().
+ */
+struct rpcrdma_pool *
+rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags)
+{
+ struct rpcrdma_pool *pool;
+
+ pool = kmalloc(sizeof(*pool), flags);
+ if (!pool)
+ return NULL;
+
+ xa_init_flags(&pool->rp_xa, XA_FLAGS_ALLOC);
+ pool->rp_device = args->pa_device;
+ pool->rp_shardsize = RPCRDMA_MAX_INLINE_THRESH;
+ pool->rp_bufsize = args->pa_bufsize;
+ pool->rp_direction = args->pa_direction;
+ pool->rp_bufs_per_shard = pool->rp_shardsize / pool->rp_bufsize;
+ pool->rp_pool_id = atomic_inc_return(&rpcrdma_pool_id);
+
+ trace_rpcrdma_pool_create(pool->rp_pool_id, pool->rp_bufsize);
+ return pool;
+}
+
+/**
+ * rpcrdma_pool_destroy - Release resources owned by @pool
+ * @pool: buffer pool instance that will no longer be used
+ *
+ * This call releases all buffers in @pool that were allocated
+ * via rpcrdma_pool_buffer_alloc().
+ */
+void
+rpcrdma_pool_destroy(struct rpcrdma_pool *pool)
+{
+ struct rpcrdma_pool_shard *shard;
+ unsigned long index;
+
+ trace_rpcrdma_pool_destroy(pool->rp_pool_id);
+
+ xa_for_each(&pool->rp_xa, index, shard) {
+ trace_rpcrdma_pool_shard_free(pool->rp_pool_id, index);
+ xa_erase(&pool->rp_xa, index);
+ rpcrdma_pool_shard_free(pool, shard);
+ }
+
+ xa_destroy(&pool->rp_xa);
+ kfree(pool);
+}
+
+/**
+ * rpcrdma_pool_buffer_alloc - Allocate a buffer from @pool
+ * @pool: buffer pool from which to allocate the buffer
+ * @flags: GFP flags used during this allocation
+ * @cpu_addr: CPU address of the buffer
+ * @mapped_addr: mapped address of the buffer
+ *
+ * Return values:
+ * %true: @cpu_addr and @mapped_addr are filled in with a DMA-mapped buffer
+ * %false: No buffer is available
+ *
+ * When rpcrdma_pool_buffer_alloc() is successful, the returned
+ * buffer is freed automatically when the buffer pool is released
+ * by rpcrdma_pool_destroy().
+ */
+bool
+rpcrdma_pool_buffer_alloc(struct rpcrdma_pool *pool, gfp_t flags,
+ void **cpu_addr, u64 *mapped_addr)
+{
+ struct rpcrdma_pool_shard *shard;
+ u64 returned_mapped_addr;
+ void *returned_cpu_addr;
+ unsigned long index;
+ u32 id;
+
+ xa_for_each(&pool->rp_xa, index, shard) {
+ unsigned int i;
+
+ returned_cpu_addr = shard->pc_cpu_addr;
+ returned_mapped_addr = shard->pc_mapped_addr;
+ for (i = 0; i < pool->rp_bufs_per_shard; i++) {
+ if (!test_and_set_bit(i, shard->pc_bitmap)) {
+ returned_cpu_addr += i * pool->rp_bufsize;
+ returned_mapped_addr += i * pool->rp_bufsize;
+ goto out;
+ }
+ }
+ }
+
+ shard = rpcrdma_pool_shard_alloc(pool, flags);
+ if (!shard)
+ return false;
+ set_bit(0, shard->pc_bitmap);
+ returned_cpu_addr = shard->pc_cpu_addr;
+ returned_mapped_addr = shard->pc_mapped_addr;
+
+ if (xa_alloc(&pool->rp_xa, &id, shard, xa_limit_16b, flags) != 0) {
+ rpcrdma_pool_shard_free(pool, shard);
+ return false;
+ }
+ trace_rpcrdma_pool_shard_new(pool->rp_pool_id, id);
+
+out:
+ *cpu_addr = returned_cpu_addr;
+ *mapped_addr = returned_mapped_addr;
+
+ trace_rpcrdma_pool_buffer(pool->rp_pool_id, returned_cpu_addr);
+ return true;
+}
diff --git a/net/sunrpc/xprtrdma/pool.h b/net/sunrpc/xprtrdma/pool.h
new file mode 100644
index 000000000000..214f8fe78b9a
--- /dev/null
+++ b/net/sunrpc/xprtrdma/pool.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ *
+ * Pools for Send and Receive buffers.
+ */
+
+#ifndef RPCRDMA_POOL_H
+#define RPCRDMA_POOL_H
+
+struct rpcrdma_pool_args {
+ struct ib_device *pa_device;
+ size_t pa_bufsize;
+ enum dma_data_direction pa_direction;
+};
+
+struct rpcrdma_pool;
+
+struct rpcrdma_pool *
+rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags);
+void rpcrdma_pool_destroy(struct rpcrdma_pool *pool);
+bool rpcrdma_pool_buffer_alloc(struct rpcrdma_pool *pool, gfp_t flags,
+ void **cpu_addr, u64 *mapped_addr);
+
+#endif /* RPCRDMA_POOL_H */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e7e4a39ca6c6..f625f1ede434 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -104,9 +104,9 @@
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
-#include <trace/events/rpcrdma.h>
+#include "pool.h"
-static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+#include <trace/events/rpcrdma.h>
static inline struct svc_rdma_recv_ctxt *
svc_rdma_next_recv_ctxt(struct list_head *list)
@@ -115,14 +115,14 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
rc_list);
}
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_recv_ctxt *ctxt;
unsigned long pages;
- dma_addr_t addr;
- void *buffer;
pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages),
@@ -130,13 +130,11 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
if (!ctxt)
goto fail0;
ctxt->rc_maxpages = pages;
- buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
- if (!buffer)
+
+ if (!rpcrdma_pool_buffer_alloc(rdma->sc_recv_pool, GFP_KERNEL,
+ &ctxt->rc_recv_buf,
+ &ctxt->rc_recv_sge.addr))
goto fail1;
- addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
- rdma->sc_max_req_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
- goto fail2;
svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
pcl_init(&ctxt->rc_call_pcl);
@@ -149,30 +147,17 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
ctxt->rc_recv_wr.num_sge = 1;
ctxt->rc_cqe.done = svc_rdma_wc_receive;
- ctxt->rc_recv_sge.addr = addr;
ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->rc_recv_buf = buffer;
svc_rdma_cc_init(rdma, &ctxt->rc_cc);
return ctxt;
-fail2:
- kfree(buffer);
fail1:
kfree(ctxt);
fail0:
return NULL;
}
-static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
- struct svc_rdma_recv_ctxt *ctxt)
-{
- ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
- ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
- kfree(ctxt->rc_recv_buf);
- kfree(ctxt);
-}
-
/**
* svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
* @rdma: svcxprt_rdma being torn down
@@ -185,8 +170,9 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
- svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+ kfree(ctxt);
}
+ rpcrdma_pool_destroy(rdma->sc_recv_pool);
}
/**
@@ -305,8 +291,17 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
*/
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
+ struct rpcrdma_pool_args args = {
+ .pa_device = rdma->sc_cm_id->device,
+ .pa_bufsize = rdma->sc_max_req_size,
+ .pa_direction = DMA_FROM_DEVICE,
+ };
unsigned int total;
+ rdma->sc_recv_pool = rpcrdma_pool_create(&args, GFP_KERNEL);
+ if (!rdma->sc_recv_pool)
+ return false;
+
/* For each credit, allocate enough recv_ctxts for one
* posted Receive and one RPC in process.
*/
--
2.50.0
* Re: [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas
2025-08-11 20:35 [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas Chuck Lever
@ 2025-08-19 16:03 ` Tom Talpey
2025-08-19 16:08 ` Chuck Lever
0 siblings, 1 reply; 6+ messages in thread
From: Tom Talpey @ 2025-08-19 16:03 UTC (permalink / raw)
To: Chuck Lever, linux-nfs, linux-rdma; +Cc: Chuck Lever
On 8/11/2025 4:35 PM, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> Reduce the per-connection footprint in the host's and RNIC's memory
> management TLBs by combining groups of a connection's Receive
> buffers into fewer IOVAs.
This is an interesting and potentially useful approach. Keeping
the iova count (==1) reduces the size of work requests and greatly
simplifies processing.
But how large are the iova's currently? RPCRDMA_DEF_INLINE_THRESH
is just 4096, which would mean typically <= 2 iova's. The max is
arbitrarily but consistently 64KB; is this complexity worth it?
And, allocating large contiguous buffers would seem to shift the
burden to kmalloc and/or the IOMMU, so it's not free, right?
> I don't have a good way to measure whether this approach is
> effective.
I guess I'd need to see this data to be more convinced. But it does
seem potentially promising, at least on some RDMA provider hardware.
Tom.
* Re: [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas
2025-08-19 16:03 ` Tom Talpey
@ 2025-08-19 16:08 ` Chuck Lever
2025-08-19 16:58 ` Tom Talpey
0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2025-08-19 16:08 UTC (permalink / raw)
To: Tom Talpey, linux-nfs, linux-rdma; +Cc: Chuck Lever
On 8/19/25 12:03 PM, Tom Talpey wrote:
> On 8/11/2025 4:35 PM, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>>
>> Reduce the per-connection footprint in the host's and RNIC's memory
>> management TLBs by combining groups of a connection's Receive
>> buffers into fewer IOVAs.
>
> This is an interesting and potentially useful approach. Keeping
> the iova count (==1) reduces the size of work requests and greatly
> simplifies processing.
>
> But how large are the iova's currently? RPCRDMA_DEF_INLINE_THRESH
> is just 4096, which would mean typically <= 2 iova's. The max is
>> arbitrarily but consistently 64KB; is this complexity worth it?
The pool's shard size is RPCRDMA_MAX_INLINE_THRESH, or 64KB. That's the
largest inline threshold this implementation allows.
The default inline threshold is 4KB, so one shard can hold up to
sixteen 4KB Receive buffers. The default credit limit is 64, plus 8
batch overflow, so 72 Receive buffers total per connection.
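Back of the envelope, and only an estimate: at 16 buffers per
shard, 72 Receive buffers round up to ceil(72 / 16) = 5 shards, so
each connection ends up with 5 Receive DMA mappings and IOVA ranges
instead of 72.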
> And, allocating large contiguous buffers would seem to shift the
> burden to kmalloc and/or the IOMMU, so it's not free, right?
Can you elaborate on what you mean by "burden"?
>> I don't have a good way to measure whether this approach is
>> effective.
>
> I guess I'd need to see this data to be more convinced. But it does
> seem potentially promising, at least on some RDMA provider hardware.
>
> Tom.
>
--
Chuck Lever
* Re: [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas
2025-08-19 16:08 ` Chuck Lever
@ 2025-08-19 16:58 ` Tom Talpey
2025-08-19 17:16 ` Chuck Lever
0 siblings, 1 reply; 6+ messages in thread
From: Tom Talpey @ 2025-08-19 16:58 UTC (permalink / raw)
To: Chuck Lever, linux-nfs, linux-rdma; +Cc: Chuck Lever
On 8/19/2025 12:08 PM, Chuck Lever wrote:
> On 8/19/25 12:03 PM, Tom Talpey wrote:
>> On 8/11/2025 4:35 PM, Chuck Lever wrote:
>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>
>>> Reduce the per-connection footprint in the host's and RNIC's memory
>>> management TLBs by combining groups of a connection's Receive
>>> buffers into fewer IOVAs.
>>
>> This is an interesting and potentially useful approach. Keeping
>> the iova count (==1) reduces the size of work requests and greatly
>> simplifies processing.
>>
>> But how large are the iova's currently? RPCRDMA_DEF_INLINE_THRESH
>> is just 4096, which would mean typically <= 2 iova's. The max is
>> arbitrarily but consistently 64KB; is this complexity worth it?
>
> The pool's shard size is RPCRDMA_MAX_INLINE_THRESH, or 64KB. That's the
> largest inline threshold this implementation allows.
>
> The default inline threshold is 4KB, so one shard can hold up to
> sixteen 4KB Receive buffers. The default credit limit is 64, plus 8
> batch overflow, so 72 Receive buffers total per connection.
>
>
>> And, allocating large contiguous buffers would seem to shift the
>> burden to kmalloc and/or the IOMMU, so it's not free, right?
>
> Can you elaborate on what you mean by "burden"?
Sure, it's that somebody has to manage the iova scatter/gather
segments.
Using kmalloc or its moral equivalent offers a contract that the
memory returned is physically contiguous, 1 segment. That's
gonna scale badly.
Using the IOMMU, when available, stuffs the s/g list into its
hardware. Simple at the verb layer (again 1 segment) but uses
the shared hardware resource to provide it.
Another approach might be to use fast-register for the receive
buffers, instead of ib_register_mr on the privileged lmr. This
would be a page list with first-byte-offset and length, which
would put it in the adapter's TPT instead of the PCI-facing IOMMU.
The fmr's would be registered only once, unlike the fmr's used for
remote transfers, so the cost would remain low. And fmr's typically
support 16 segments minimum, so no restriction there.
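For illustration only, an untested sketch of roughly that shape
(pd, buffer and length are placeholders here, and error unwinding
is omitted):

  struct scatterlist sg;
  struct ib_reg_wr reg_wr = { };
  struct ib_mr *mr;
  int nents;

  /* One registration per Receive buffer, set up once */
  mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
  if (IS_ERR(mr))
          return PTR_ERR(mr);

  sg_init_one(&sg, buffer, length);
  nents = ib_dma_map_sg(pd->device, &sg, 1, DMA_FROM_DEVICE);
  if (nents != 1 ||
      ib_map_mr_sg(mr, &sg, nents, NULL, PAGE_SIZE) != nents)
          return -EIO;    /* unwind omitted in this sketch */

  reg_wr.wr.opcode = IB_WR_REG_MR;
  reg_wr.wr.send_flags = IB_SEND_SIGNALED;
  reg_wr.mr = mr;
  reg_wr.key = mr->rkey;
  reg_wr.access = IB_ACCESS_LOCAL_WRITE;
  /* ib_post_send(qp, &reg_wr.wr, NULL); once the registration
   * completes, the Receive SGE can use mr->lkey and mr->iova
   * instead of the local DMA lkey. */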
My point is that it seems unnecessary somehow in the RPCRDMA
layer. But, that's just my intuition. Finding some way to measure
any benefit (performance, setup overhead, scalability, ...) would
certainly be useful.
Tom.
>>> I don't have a good way to measure whether this approach is
>>> effective.
>>
>> I guess I'd need to see this data to be more convinced. But it does
>> seem potentially promising, at least on some RDMA provider hardware.
>>
>> Tom.
>>
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>> include/linux/sunrpc/svc_rdma.h | 3 +
>>> include/trace/events/rpcrdma.h | 99 +++++++++++
>>> net/sunrpc/xprtrdma/Makefile | 2 +-
>>> net/sunrpc/xprtrdma/pool.c | 223 ++++++++++++++++++++++++
>>> net/sunrpc/xprtrdma/pool.h | 25 +++
>>> net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 43 ++---
>>> 6 files changed, 370 insertions(+), 25 deletions(-)
>>> create mode 100644 net/sunrpc/xprtrdma/pool.c
>>> create mode 100644 net/sunrpc/xprtrdma/pool.h
>>>
>>> Changes since v1:
>>> - Rename "chunks" to "shards" -- RPC/RDMA already has chunks
>>> - Replace pool's list of shards with an xarray
>>> - Implement bitmap-based shard free space management
>>> - Implement some naive observability
>>>
>>> diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/
>>> svc_rdma.h
>>> index 22704c2e5b9b..b4f3c01f1b94 100644
>>> --- a/include/linux/sunrpc/svc_rdma.h
>>> +++ b/include/linux/sunrpc/svc_rdma.h
>>> @@ -73,6 +73,8 @@ extern struct percpu_counter svcrdma_stat_recv;
>>> extern struct percpu_counter svcrdma_stat_sq_starve;
>>> extern struct percpu_counter svcrdma_stat_write;
>>> +struct rpcrdma_pool;
>>> +
>>> struct svcxprt_rdma {
>>> struct svc_xprt sc_xprt; /* SVC transport structure */
>>> struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
>>> @@ -112,6 +114,7 @@ struct svcxprt_rdma {
>>> unsigned long sc_flags;
>>> struct work_struct sc_work;
>>> + struct rpcrdma_pool *sc_recv_pool;
>>> struct llist_head sc_recv_ctxts;
>>> atomic_t sc_completion_ids;
>>> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/
>>> rpcrdma.h
>>> index e6a72646c507..8bc713082c1a 100644
>>> --- a/include/trace/events/rpcrdma.h
>>> +++ b/include/trace/events/rpcrdma.h
>>> @@ -2336,6 +2336,105 @@
>>> DECLARE_EVENT_CLASS(rpcrdma_client_register_class,
>>> DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_register);
>>> DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_unregister);
>>> +TRACE_EVENT(rpcrdma_pool_create,
>>> + TP_PROTO(
>>> + unsigned int poolid,
>>> + size_t bufsize
>>> + ),
>>> +
>>> + TP_ARGS(poolid, bufsize),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(unsigned int, poolid)
>>> + __field(size_t, bufsize)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->poolid = poolid;
>>> + __entry->bufsize = bufsize;
>>> + ),
>>> +
>>> + TP_printk("poolid=%u bufsize=%zu bytes",
>>> + __entry->poolid, __entry->bufsize
>>> + )
>>> +);
>>> +
>>> +TRACE_EVENT(rpcrdma_pool_destroy,
>>> + TP_PROTO(
>>> + unsigned int poolid
>>> + ),
>>> +
>>> + TP_ARGS(poolid),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(unsigned int, poolid)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->poolid = poolid;),
>>> +
>>> + TP_printk("poolid=%u",
>>> + __entry->poolid
>>> + )
>>> +);
>>> +
>>> +DECLARE_EVENT_CLASS(rpcrdma_pool_shard_class,
>>> + TP_PROTO(
>>> + unsigned int poolid,
>>> + u32 shardid
>>> + ),
>>> +
>>> + TP_ARGS(poolid, shardid),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(unsigned int, poolid)
>>> + __field(u32, shardid)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->poolid = poolid;
>>> + __entry->shardid = shardid;
>>> + ),
>>> +
>>> + TP_printk("poolid=%u shardid=%u",
>>> + __entry->poolid, __entry->shardid
>>> + )
>>> +);
>>> +
>>> +#define DEFINE_RPCRDMA_POOL_SHARD_EVENT(name) \
>>> + DEFINE_EVENT(rpcrdma_pool_shard_class, name, \
>>> + TP_PROTO( \
>>> + unsigned int poolid, \
>>> + u32 shardid \
>>> + ), \
>>> + TP_ARGS(poolid, shardid))
>>> +
>>> +DEFINE_RPCRDMA_POOL_SHARD_EVENT(rpcrdma_pool_shard_new);
>>> +DEFINE_RPCRDMA_POOL_SHARD_EVENT(rpcrdma_pool_shard_free);
>>> +
>>> +TRACE_EVENT(rpcrdma_pool_buffer,
>>> + TP_PROTO(
>>> + unsigned int poolid,
>>> + const void *buffer
>>> + ),
>>> +
>>> + TP_ARGS(poolid, buffer),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(unsigned int, poolid)
>>> + __field(const void *, buffer)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->poolid = poolid;
>>> + __entry->buffer = buffer;
>>> + ),
>>> +
>>> + TP_printk("poolid=%u buffer=%p",
>>> + __entry->poolid, __entry->buffer
>>> + )
>>> +);
>>> +
>>> #endif /* _TRACE_RPCRDMA_H */
>>> #include <trace/define_trace.h>
>>> diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
>>> index 3232aa23cdb4..f69456dffe87 100644
>>> --- a/net/sunrpc/xprtrdma/Makefile
>>> +++ b/net/sunrpc/xprtrdma/Makefile
>>> @@ -1,7 +1,7 @@
>>> # SPDX-License-Identifier: GPL-2.0
>>> obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
>>> -rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \
>>> +rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o pool.o \
>>> svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
>>> svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
>>> svc_rdma_pcl.o module.o
>>> diff --git a/net/sunrpc/xprtrdma/pool.c b/net/sunrpc/xprtrdma/pool.c
>>> new file mode 100644
>>> index 000000000000..e285c3e9c38e
>>> --- /dev/null
>>> +++ b/net/sunrpc/xprtrdma/pool.c
>>> @@ -0,0 +1,223 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Copyright (c) 2025, Oracle and/or its affiliates.
>>> + *
>>> + * Pools for RPC-over-RDMA Receive buffers.
>>> + *
>>> + * A buffer pool attempts to conserve both the number of DMA mappings
>>> + * and the device's IOVA space by collecting small buffers together
>>> + * into a shard that has a single DMA mapping.
>>> + *
>>> + * API Contract:
>>> + * - Buffers contained in one rpcrdma_pool instance are the same
>>> + * size (rp_bufsize), no larger than RPCRDMA_MAX_INLINE_THRESH
>>> + * - Buffers in one rpcrdma_pool instance are mapped using the same
>>> + * DMA direction
>>> + * - Buffers in one rpcrdma_pool instance are automatically released
>>> + * when the instance is destroyed
>>> + *
>>> + * Future work:
>>> + * - Manage pool resources by reference count
>>> + */
>>> +
>>> +#include <linux/list.h>
>>> +#include <linux/xarray.h>
>>> +#include <linux/sunrpc/svc_rdma.h>
>>> +
>>> +#include <rdma/ib_verbs.h>
>>> +
>>> +#include "xprt_rdma.h"
>>> +#include "pool.h"
>>> +
>>> +#include <trace/events/rpcrdma.h>
>>> +
>>> +/*
>>> + * An idr would give near perfect pool ID uniqueness, but for
>>> + * the moment the pool ID is used only for observability, not
>>> + * correctness.
>>> + */
>>> +static atomic_t rpcrdma_pool_id;
>>> +
>>> +struct rpcrdma_pool {
>>> + struct xarray rp_xa;
>>> + struct ib_device *rp_device;
>>> + size_t rp_shardsize; // in bytes
>>> + size_t rp_bufsize; // in bytes
>>> + enum dma_data_direction rp_direction;
>>> + unsigned int rp_bufs_per_shard;
>>> + unsigned int rp_pool_id;
>>> +};
>>> +
>>> +struct rpcrdma_pool_shard {
>>> + u8 *pc_cpu_addr;
>>> + u64 pc_mapped_addr;
>>> + unsigned long *pc_bitmap;
>>> +};
>>> +
>>> +static struct rpcrdma_pool_shard *
>>> +rpcrdma_pool_shard_alloc(struct rpcrdma_pool *pool, gfp_t flags)
>>> +{
>>> + struct rpcrdma_pool_shard *shard;
>>> + size_t bmap_size;
>>> +
>>> + shard = kmalloc(sizeof(*shard), flags);
>>> + if (!shard)
>>> + goto fail;
>>> +
>>> + bmap_size = BITS_TO_LONGS(pool->rp_bufs_per_shard) * sizeof(unsigned long);
>>> + shard->pc_bitmap = kzalloc(bmap_size, flags);
>>> + if (!shard->pc_bitmap)
>>> + goto free_shard;
>>> +
>>> + /*
>>> + * For good NUMA awareness, allocate the shard's I/O buffer
>>> + * on the NUMA node that the underlying device is affined to.
>>> + */
>>> + shard->pc_cpu_addr = kmalloc_node(pool->rp_shardsize, flags,
>>> + ibdev_to_node(pool->rp_device));
>>> + if (!shard->pc_cpu_addr)
>>> + goto free_bitmap;
>>> + shard->pc_mapped_addr = ib_dma_map_single(pool->rp_device,
>>> + shard->pc_cpu_addr,
>>> + pool->rp_shardsize,
>>> + pool->rp_direction);
>>> + if (ib_dma_mapping_error(pool->rp_device, shard->pc_mapped_addr))
>>> + goto free_iobuf;
>>> +
>>> + return shard;
>>> +
>>> +free_iobuf:
>>> + kfree(shard->pc_cpu_addr);
>>> +free_bitmap:
>>> + kfree(shard->pc_bitmap);
>>> +free_shard:
>>> + kfree(shard);
>>> +fail:
>>> + return NULL;
>>> +}
>>> +
>>> +static void
>>> +rpcrdma_pool_shard_free(struct rpcrdma_pool *pool,
>>> + struct rpcrdma_pool_shard *shard)
>>> +{
>>> + ib_dma_unmap_single(pool->rp_device, shard->pc_mapped_addr,
>>> + pool->rp_shardsize, pool->rp_direction);
>>> + kfree(shard->pc_cpu_addr);
>>> + kfree(shard->pc_bitmap);
>>> + kfree(shard);
>>> +}
>>> +
>>> +/**
>>> + * rpcrdma_pool_create - Allocate and initialize an rpcrdma_pool instance
>>> + * @args: pool creation arguments
>>> + * @flags: GFP flags used during pool creation
>>> + *
>>> + * Returns a pointer to an opaque rpcrdma_pool instance or
>>> + * NULL. If a pool instance is returned, caller must free the
>>> + * returned instance using rpcrdma_pool_destroy().
>>> + */
>>> +struct rpcrdma_pool *
>>> +rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags)
>>> +{
>>> + struct rpcrdma_pool *pool;
>>> +
>>> + pool = kmalloc(sizeof(*pool), flags);
>>> + if (!pool)
>>> + return NULL;
>>> +
>>> + xa_init_flags(&pool->rp_xa, XA_FLAGS_ALLOC);
>>> + pool->rp_device = args->pa_device;
>>> + pool->rp_shardsize = RPCRDMA_MAX_INLINE_THRESH;
>>> + pool->rp_bufsize = args->pa_bufsize;
>>> + pool->rp_direction = args->pa_direction;
>>> + pool->rp_bufs_per_shard = pool->rp_shardsize / pool->rp_bufsize;
>>> + pool->rp_pool_id = atomic_inc_return(&rpcrdma_pool_id);
>>> +
>>> + trace_rpcrdma_pool_create(pool->rp_pool_id, pool->rp_bufsize);
>>> + return pool;
>>> +}
>>> +
>>> +/**
>>> + * rpcrdma_pool_destroy - Release resources owned by @pool
>>> + * @pool: buffer pool instance that will no longer be used
>>> + *
>>> + * This call releases all buffers in @pool that were allocated
>>> + * via rpcrdma_pool_buffer_alloc().
>>> + */
>>> +void
>>> +rpcrdma_pool_destroy(struct rpcrdma_pool *pool)
>>> +{
>>> + struct rpcrdma_pool_shard *shard;
>>> + unsigned long index;
>>> +
>>> + trace_rpcrdma_pool_destroy(pool->rp_pool_id);
>>> +
>>> + xa_for_each(&pool->rp_xa, index, shard) {
>>> + trace_rpcrdma_pool_shard_free(pool->rp_pool_id, index);
>>> + xa_erase(&pool->rp_xa, index);
>>> + rpcrdma_pool_shard_free(pool, shard);
>>> + }
>>> +
>>> + xa_destroy(&pool->rp_xa);
>>> + kfree(pool);
>>> +}
>>> +
>>> +/**
>>> + * rpcrdma_pool_buffer_alloc - Allocate a buffer from @pool
>>> + * @pool: buffer pool from which to allocate the buffer
>>> + * @flags: GFP flags used during this allocation
>>> + * @cpu_addr: CPU address of the buffer
>>> + * @mapped_addr: mapped address of the buffer
>>> + *
>>> + * Return values:
>>> + * %true: @cpu_addr and @mapped_addr are filled in with a DMA-mapped buffer
>>> + * %false: No buffer is available
>>> + *
>>> + * When rpcrdma_pool_buffer_alloc() is successful, the returned
>>> + * buffer is freed automatically when the buffer pool is released
>>> + * by rpcrdma_pool_destroy().
>>> + */
>>> +bool
>>> +rpcrdma_pool_buffer_alloc(struct rpcrdma_pool *pool, gfp_t flags,
>>> + void **cpu_addr, u64 *mapped_addr)
>>> +{
>>> + struct rpcrdma_pool_shard *shard;
>>> + u64 returned_mapped_addr;
>>> + void *returned_cpu_addr;
>>> + unsigned long index;
>>> + u32 id;
>>> +
>>> + xa_for_each(&pool->rp_xa, index, shard) {
>>> + unsigned int i;
>>> +
>>> + returned_cpu_addr = shard->pc_cpu_addr;
>>> + returned_mapped_addr = shard->pc_mapped_addr;
>>> + for (i = 0; i < pool->rp_bufs_per_shard; i++) {
>>> + if (!test_and_set_bit(i, shard->pc_bitmap)) {
>>> + returned_cpu_addr += i * pool->rp_bufsize;
>>> + returned_mapped_addr += i * pool->rp_bufsize;
>>> + goto out;
>>> + }
>>> + }
>>> + }
>>> +
>>> + shard = rpcrdma_pool_shard_alloc(pool, flags);
>>> + if (!shard)
>>> + return false;
>>> + set_bit(0, shard->pc_bitmap);
>>> + returned_cpu_addr = shard->pc_cpu_addr;
>>> + returned_mapped_addr = shard->pc_mapped_addr;
>>> +
>>> + if (xa_alloc(&pool->rp_xa, &id, shard, xa_limit_16b, flags) != 0) {
>>> + rpcrdma_pool_shard_free(pool, shard);
>>> + return false;
>>> + }
>>> + trace_rpcrdma_pool_shard_new(pool->rp_pool_id, id);
>>> +
>>> +out:
>>> + *cpu_addr = returned_cpu_addr;
>>> + *mapped_addr = returned_mapped_addr;
>>> +
>>> + trace_rpcrdma_pool_buffer(pool->rp_pool_id, returned_cpu_addr);
>>> + return true;
>>> +}
>>> diff --git a/net/sunrpc/xprtrdma/pool.h b/net/sunrpc/xprtrdma/pool.h
>>> new file mode 100644
>>> index 000000000000..214f8fe78b9a
>>> --- /dev/null
>>> +++ b/net/sunrpc/xprtrdma/pool.h
>>> @@ -0,0 +1,25 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +/*
>>> + * Copyright (c) 2025, Oracle and/or its affiliates.
>>> + *
>>> + * Pools for Send and Receive buffers.
>>> + */
>>> +
>>> +#ifndef RPCRDMA_POOL_H
>>> +#define RPCRDMA_POOL_H
>>> +
>>> +struct rpcrdma_pool_args {
>>> + struct ib_device *pa_device;
>>> + size_t pa_bufsize;
>>> + enum dma_data_direction pa_direction;
>>> +};
>>> +
>>> +struct rpcrdma_pool;
>>> +
>>> +struct rpcrdma_pool *
>>> +rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags);
>>> +void rpcrdma_pool_destroy(struct rpcrdma_pool *pool);
>>> +bool rpcrdma_pool_buffer_alloc(struct rpcrdma_pool *pool, gfp_t flags,
>>> + void **cpu_addr, u64 *mapped_addr);
>>> +
>>> +#endif /* RPCRDMA_POOL_H */
>>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>>> index e7e4a39ca6c6..f625f1ede434 100644
>>> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>>> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>>> @@ -104,9 +104,9 @@
>>> #include <linux/sunrpc/svc_rdma.h>
>>> #include "xprt_rdma.h"
>>> -#include <trace/events/rpcrdma.h>
>>> +#include "pool.h"
>>> -static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
>>> +#include <trace/events/rpcrdma.h>
>>> static inline struct svc_rdma_recv_ctxt *
>>> svc_rdma_next_recv_ctxt(struct list_head *list)
>>> @@ -115,14 +115,14 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
>>> rc_list);
>>> }
>>> +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
>>> +
>>> static struct svc_rdma_recv_ctxt *
>>> svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
>>> {
>>> int node = ibdev_to_node(rdma->sc_cm_id->device);
>>> struct svc_rdma_recv_ctxt *ctxt;
>>> unsigned long pages;
>>> - dma_addr_t addr;
>>> - void *buffer;
>>> pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
>>> ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages),
>>> @@ -130,13 +130,11 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
>>> if (!ctxt)
>>> goto fail0;
>>> ctxt->rc_maxpages = pages;
>>> - buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
>>> - if (!buffer)
>>> +
>>> + if (!rpcrdma_pool_buffer_alloc(rdma->sc_recv_pool, GFP_KERNEL,
>>> + &ctxt->rc_recv_buf,
>>> + &ctxt->rc_recv_sge.addr))
>>> goto fail1;
>>> - addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
>>> - rdma->sc_max_req_size, DMA_FROM_DEVICE);
>>> - if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
>>> - goto fail2;
>>> svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
>>> pcl_init(&ctxt->rc_call_pcl);
>>> @@ -149,30 +147,17 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
>>> ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
>>> ctxt->rc_recv_wr.num_sge = 1;
>>> ctxt->rc_cqe.done = svc_rdma_wc_receive;
>>> - ctxt->rc_recv_sge.addr = addr;
>>> ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
>>> ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
>>> - ctxt->rc_recv_buf = buffer;
>>> svc_rdma_cc_init(rdma, &ctxt->rc_cc);
>>> return ctxt;
>>> -fail2:
>>> - kfree(buffer);
>>> fail1:
>>> kfree(ctxt);
>>> fail0:
>>> return NULL;
>>> }
>>> -static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
>>> - struct svc_rdma_recv_ctxt *ctxt)
>>> -{
>>> - ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
>>> - ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
>>> - kfree(ctxt->rc_recv_buf);
>>> - kfree(ctxt);
>>> -}
>>> -
>>> /**
>>> * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
>>> * @rdma: svcxprt_rdma being torn down
>>> @@ -185,8 +170,9 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
>>> while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
>>> ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
>>> - svc_rdma_recv_ctxt_destroy(rdma, ctxt);
>>> + kfree(ctxt);
>>> }
>>> + rpcrdma_pool_destroy(rdma->sc_recv_pool);
>>> }
>>> /**
>>> @@ -305,8 +291,17 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
>>> */
>>> bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
>>> {
>>> + struct rpcrdma_pool_args args = {
>>> + .pa_device = rdma->sc_cm_id->device,
>>> + .pa_bufsize = rdma->sc_max_req_size,
>>> + .pa_direction = DMA_FROM_DEVICE,
>>> + };
>>> unsigned int total;
>>> + rdma->sc_recv_pool = rpcrdma_pool_create(&args, GFP_KERNEL);
>>> + if (!rdma->sc_recv_pool)
>>> + return false;
>>> +
>>> /* For each credit, allocate enough recv_ctxts for one
>>> * posted Receive and one RPC in process.
>>> */
>>
>
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas
2025-08-19 16:58 ` Tom Talpey
@ 2025-08-19 17:16 ` Chuck Lever
2025-08-19 17:39 ` Tom Talpey
0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2025-08-19 17:16 UTC (permalink / raw)
To: Tom Talpey, linux-nfs, linux-rdma; +Cc: Chuck Lever
On 8/19/25 12:58 PM, Tom Talpey wrote:
> On 8/19/2025 12:08 PM, Chuck Lever wrote:
>> On 8/19/25 12:03 PM, Tom Talpey wrote:
>>> On 8/11/2025 4:35 PM, Chuck Lever wrote:
>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>
>>>> Reduce the per-connection footprint in the host's and RNIC's memory
>>>> management TLBs by combining groups of a connection's Receive
>>>> buffers into fewer IOVAs.
>>>
>>> This is an interesting and potentially useful approach. Keeping
>>> the iova count (==1) reduces the size of work requests and greatly
>>> simplifies processing.
>>>
>>> But how large are the iova's currently? RPCRDMA_DEF_INLINE_THRESH
>>> is just 4096, which would mean typically <= 2 iova's. The max is
>>> arbitrarily but consistently 64KB, is this complexity worth it?
>>
>> The pool's shard size is RPCRDMA_MAX_INLINE_THRESH, or 64KB. That's the
>> largest inline threshold this implementation allows.
>>
>> The default inline threshold is 4KB, so one shard can hold up to
>> sixteen 4KB Receive buffers. The default credit limit is 64, plus 8
>> batch overflow, so 72 Receive buffers total per connection.
>>
>>
>>> And, allocating large contiguous buffers would seem to shift the
>>> burden to kmalloc and/or the IOMMU, so it's not free, right?
>>
>> Can you elaborate on what you mean by "burden" ?
>
> Sure, it's that somebody has to manage the iova scatter/gather
> segments.
>
> Using kmalloc or its moral equivalent offers a contract that the
> memory returned is physically contiguous, 1 segment. That's
> gonna scale badly.
I'm still not sure what's not going to scale. We're already using
kmalloc today, one per Receive buffer. I'm making it one kmalloc per
shard (which can contain more than a dozen Receive buffers).
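
As a back-of-the-envelope illustration (arithmetic only, nothing
measured), using the defaults quoted above (64KB shards, a 4KB
inline threshold, and 64 credits plus 8 batch overflow), the
per-connection numbers work out as below. The helper names are
mine, not part of the patch:

/* Illustrative only; the constants mirror the defaults discussed
 * in this thread, they are not new tunables.
 */
#define EX_SHARD_SIZE	(64 * 1024)	/* RPCRDMA_MAX_INLINE_THRESH */
#define EX_BUF_SIZE	(4 * 1024)	/* default inline threshold */
#define EX_RECV_BUFS	(64 + 8)	/* credits + batch overflow */

static unsigned int ex_bufs_per_shard(void)
{
	return EX_SHARD_SIZE / EX_BUF_SIZE;	/* 16 buffers per shard */
}

static unsigned int ex_mappings_today(void)
{
	return EX_RECV_BUFS;			/* 72: one mapping per buffer */
}

static unsigned int ex_mappings_with_shards(void)
{
	/* 72 buffers / 16 per shard, rounded up: 5 mappings */
	return (EX_RECV_BUFS + ex_bufs_per_shard() - 1) /
	       ex_bufs_per_shard();
}

At those defaults, that is roughly 5 shard allocations and 5 DMA
mappings per connection instead of 72 of each, plus a little
per-shard metadata.
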
> Using the IOMMU, when available, stuffs the s/g list into its
> hardware. Simple at the verb layer (again 1 segment) but uses
> the shared hardware resource to provide it.
>
> Another approach might be to use fast-register for the receive
> buffers, instead of ib_register_mr on the privileged lmr. This
> would be a page list with first-byte-offset and length, which
> would put it in the adapter's TPT instead of the PCI-facing IOMMU.
> The fmr's would registerd only once, unlike the fmr's used for
> remote transfers, so the cost would remain low. And fmr's typically
> support 16 segments minimum, so no restriction there.
I can experiment with fast registration. The goal of this work is to
reduce the per-connection hardware footprint.
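
For what it's worth, a rough and untested sketch of the kind of
one-time fast-registration you're describing might look something
like the function below. The function and parameter names are
illustrative (not from the patch), and completion handling and
invalidation are elided:

/*
 * Hypothetical sketch only: register one Receive arena once with a
 * fast-register WR so the page list lives in the adapter's TPT
 * rather than the PCI-facing IOMMU.  @sgl/@nr_pages describe the
 * arena's pages and are assumed to be DMA-mapped already; the
 * Receive SGEs would then carry mr->lkey.  Error paths simplified.
 */
static int example_reg_recv_arena(struct svcxprt_rdma *rdma,
				  struct scatterlist *sgl, int nr_pages,
				  struct ib_mr **out_mr)
{
	struct ib_reg_wr reg_wr = { };
	struct ib_mr *mr;
	int n, ret;

	mr = ib_alloc_mr(rdma->sc_pd, IB_MR_TYPE_MEM_REG, nr_pages);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	n = ib_map_mr_sg(mr, sgl, nr_pages, NULL, PAGE_SIZE);
	if (n != nr_pages) {
		ib_dereg_mr(mr);
		return -EINVAL;
	}

	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.mr = mr;
	reg_wr.key = mr->rkey;
	reg_wr.access = IB_ACCESS_LOCAL_WRITE;	/* Receives need only local write */

	ret = ib_post_send(rdma->sc_qp, &reg_wr.wr, NULL);
	if (ret) {
		ib_dereg_mr(mr);
		return ret;
	}

	*out_mr = mr;
	return 0;
}
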
> My point is that it seems unnecessary somehow in the RPCRDMA
> layer.
Well, if this effort is intriguing to others, it can certainly be moved
into the RDMA core. I already intend to convert the RPC/RDMA client
Receive code to use it too.
> But, that's just my intuition. Finding some way to measure
> any benefit (performance, setup overhead, scalability, ...) would
> certainly be useful.
That is a primary purpose of me posting this RFC. As stated in the patch
description, I would like some help quantifying the improvement (if
there is any).
--
Chuck Lever
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas
2025-08-19 17:16 ` Chuck Lever
@ 2025-08-19 17:39 ` Tom Talpey
0 siblings, 0 replies; 6+ messages in thread
From: Tom Talpey @ 2025-08-19 17:39 UTC (permalink / raw)
To: Chuck Lever, linux-nfs, linux-rdma; +Cc: Chuck Lever
On 8/19/2025 1:16 PM, Chuck Lever wrote:
> On 8/19/25 12:58 PM, Tom Talpey wrote:
>> On 8/19/2025 12:08 PM, Chuck Lever wrote:
>>> On 8/19/25 12:03 PM, Tom Talpey wrote:
>>>> On 8/11/2025 4:35 PM, Chuck Lever wrote:
>>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>>
>>>>> Reduce the per-connection footprint in the host's and RNIC's memory
>>>>> management TLBs by combining groups of a connection's Receive
>>>>> buffers into fewer IOVAs.
>>>>
>>>> This is an interesting and potentially useful approach. Keeping
>>>> the iova count (==1) reduces the size of work requests and greatly
>>>> simplifies processing.
>>>>
>>>> But how large are the iova's currently? RPCRDMA_DEF_INLINE_THRESH
>>>> is just 4096, which would mean typically <= 2 iova's. The max is
>>>> arbitrarily but consistently 64KB, is this complexity worth it?
>>>
>>> The pool's shard size is RPCRDMA_MAX_INLINE_THRESH, or 64KB. That's the
>>> largest inline threshold this implementation allows.
>>>
>>> The default inline threshold is 4KB, so one shard can hold up to
>>> sixteen 4KB Receive buffers. The default credit limit is 64, plus 8
>>> batch overflow, so 72 Receive buffers total per connection.
>>>
>>>
>>>> And, allocating large contiguous buffers would seem to shift the
>>>> burden to kmalloc and/or the IOMMU, so it's not free, right?
>>>
>>> Can you elaborate on what you mean by "burden" ?
>>
>> Sure, it's that somebody has to manage the iova scatter/gather
>> segments.
>>
>> Using kmalloc or its moral equivalent offers a contract that the
>> memory returned is physically contiguous, 1 segment. That's
>> gonna scale badly.
>
> I'm still not sure what's not going to scale. We're already using
> kmalloc today, one per Receive buffer. I'm making it one kmalloc per
> shard (which can contain more than a dozen Receive buffers).
Sorry - availability of free contiguous memory (pages). Over
time, fragmentation and demand may limit this or at least make
it more costly/precious.
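
To put a number on that: assuming 4KB base pages, each 64KB shard
is an order-4 contiguous allocation. A throwaway illustration, not
code from the patch:

#include <linux/mm.h>		/* for get_order() */
#include <linux/sizes.h>	/* for SZ_64K */

/* A 64KB shard spans 16 physically contiguous 4KB pages. */
static unsigned int ex_shard_alloc_order(void)
{
	return get_order(SZ_64K);	/* 4 on 4KB-page systems */
}
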
>> Using the IOMMU, when available, stuffs the s/g list into its
>> hardware. Simple at the verb layer (again 1 segment) but uses
>> the shared hardware resource to provide it.
>>
>> Another approach might be to use fast-register for the receive
>> buffers, instead of ib_register_mr on the privileged lmr. This
>> would be a page list with first-byte-offset and length, which
>> would put it in the adapter's TPT instead of the PCI-facing IOMMU.
>> The fmr's would be registered only once, unlike the fmr's used for
>> remote transfers, so the cost would remain low. And fmr's typically
>> support 16 segments minimum, so no restriction there.
>
> I can experiment with fast registration. The goal of this work is to
> reduce the per-connection hardware footprint.
>
>
>> My point is that it seems unnecessary somehow in the RPCRDMA
>> layer.
>
> Well, if this effort is intriguing to others, it can certainly be moved
> into the RDMA core. I already intend to convert the RPC/RDMA client
> Receive code to use it too.
Not sure. The SMBdirect code doesn't need it, because its receive
buffers are much smaller than 4KB. Block protocols probably don't
need it either.
I sort-of think this is a special requirement of the rpcrdma
protocol, which was architected in part to preserve older xdr
code by hiding RDMA from the RPC consumer and therefore segmenting
almost everything. But there may be other reasons to consider this;
I need to ponder that a bit.
>> But, that's just my intuition. Finding some way to measure
>> any benefit (performance, setup overhead, scalability, ...) would
>> certainly be useful.
>
> That is a primary purpose of me posting this RFC. As stated in the patch
> description, I would like some help quantifying the improvement (if
> there is any).
I'll ponder that too. There are potential benefits at several
layers, which makes things tricky.
Tom.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-08-19 17:39 UTC | newest]
Thread overview: 6+ messages (links below jump to the message on this page):
2025-08-11 20:35 [RFC PATCH v2] svcrdma: Introduce Receive buffer arenas Chuck Lever
2025-08-19 16:03 ` Tom Talpey
2025-08-19 16:08 ` Chuck Lever
2025-08-19 16:58 ` Tom Talpey
2025-08-19 17:16 ` Chuck Lever
2025-08-19 17:39 ` Tom Talpey