* [RFC PATCH] svcrdma: Introduce Receive buffer arenas
From: Chuck Lever @ 2025-08-08 18:46 UTC
To: linux-nfs, linux-rdma; +Cc: Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Reduce the per-connection footprint in the host's and the RNIC's
memory-management TLBs by combining each connection's Receive
buffers into a single DMA mapping, so that they share one IOVA
range instead of consuming one mapping per buffer.
I don't have a good way to measure whether this approach is
effective.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
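Note for reviewers: the fragment below is an illustrative sketch of the
intended calling sequence for the new pool API, mirroring how
svc_rdma_recvfrom.c uses it in the hunks further down. It is not part of
the applied patch; error unwinding is elided and a connection's
"struct svcxprt_rdma *rdma" is assumed to be in scope.

	struct rpcrdma_pool_args args = {
		.pa_device    = rdma->sc_cm_id->device,
		.pa_bufsize   = rdma->sc_max_req_size,
		.pa_direction = DMA_FROM_DEVICE,
	};
	void *cpu_addr;
	dma_addr_t dma_addr;

	/* One pool per connection, created before Receives are posted */
	rdma->sc_recv_pool = rpcrdma_pool_create(&args, GFP_KERNEL);
	if (!rdma->sc_recv_pool)
		return false;

	/* Each Receive buffer is carved out of a chunk that was DMA-mapped
	 * once, so no per-buffer ib_dma_map_single() call is needed.
	 */
	if (!rpcrdma_pool_alloc_buffer(rdma->sc_recv_pool, GFP_KERNEL,
				       &cpu_addr, &dma_addr))
		return false;

	/* At transport teardown the pool unmaps and frees every chunk in
	 * one pass; individual buffers are never released separately.
	 */
	rpcrdma_pool_destroy(rdma->sc_recv_pool);

The net effect is one ib_dma_map_single() per chunk rather than one per
Receive buffer, which is where the expected savings in DMA mappings and
IOVA space come from.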
include/linux/sunrpc/svc_rdma.h | 3 +
net/sunrpc/xprtrdma/Makefile | 2 +-
net/sunrpc/xprtrdma/pool.c | 162 ++++++++++++++++++++++++
net/sunrpc/xprtrdma/pool.h | 25 ++++
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 43 +++----
5 files changed, 210 insertions(+), 25 deletions(-)
create mode 100644 net/sunrpc/xprtrdma/pool.c
create mode 100644 net/sunrpc/xprtrdma/pool.h
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 22704c2e5b9b..b4f3c01f1b94 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -73,6 +73,8 @@ extern struct percpu_counter svcrdma_stat_recv;
extern struct percpu_counter svcrdma_stat_sq_starve;
extern struct percpu_counter svcrdma_stat_write;
+struct rpcrdma_pool;
+
struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
@@ -112,6 +114,7 @@ struct svcxprt_rdma {
unsigned long sc_flags;
struct work_struct sc_work;
+ struct rpcrdma_pool *sc_recv_pool;
struct llist_head sc_recv_ctxts;
atomic_t sc_completion_ids;
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 3232aa23cdb4..f69456dffe87 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o pool.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
svc_rdma_pcl.o module.o
diff --git a/net/sunrpc/xprtrdma/pool.c b/net/sunrpc/xprtrdma/pool.c
new file mode 100644
index 000000000000..ef338528a594
--- /dev/null
+++ b/net/sunrpc/xprtrdma/pool.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ *
+ * Pools for Send and Receive buffers.
+ *
+ * A buffer pool attempts to conserve both the number of DMA mappings
+ * and the device's IOVA space by collecting small buffers together
+ * into a chunk that has a single DMA mapping.
+ *
+ * Future work:
+ * - Manage pool resources by reference count
+ * - Manage chunk free space via a bitmap
+ */
+
+#include <linux/list.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "pool.h"
+
+struct rpcrdma_pool {
+ struct list_head rp_chunk_list;
+
+ struct ib_device *rp_device;
+ size_t rp_chunksize;
+ size_t rp_bufsize;
+ enum dma_data_direction rp_direction;
+};
+
+struct rpcrdma_pool_chunk {
+ struct list_head pc_next_chunk;
+
+ u8 *pc_cpu_addr;
+ dma_addr_t pc_dma_addr;
+ size_t pc_free_start;
+};
+
+static struct rpcrdma_pool_chunk *
+rpcrdma_pool_chunk_create(struct rpcrdma_pool *pool, gfp_t flags)
+{
+ struct rpcrdma_pool_chunk *chunk;
+
+ chunk = kmalloc(sizeof(*chunk), flags);
+ if (!chunk)
+ return NULL;
+ chunk->pc_cpu_addr = kmalloc_node(pool->rp_chunksize, flags,
+ ibdev_to_node(pool->rp_device));
+ if (!chunk->pc_cpu_addr) {
+ kfree(chunk);
+ return NULL;
+ }
+ chunk->pc_dma_addr = ib_dma_map_single(pool->rp_device,
+ chunk->pc_cpu_addr,
+ pool->rp_chunksize,
+ pool->rp_direction);
+ if (ib_dma_mapping_error(pool->rp_device, chunk->pc_dma_addr)) {
+ kfree(chunk->pc_cpu_addr);
+ kfree(chunk);
+ return NULL;
+ }
+
+ chunk->pc_free_start = 0;
+ return chunk;
+}
+
+/**
+ * rpcrdma_pool_create - Initialize a buffer pool
+ * @args: pool creation arguments
+ * @flags: GFP flags for pool creation
+ *
+ * Returns a pointer to an opaque rpcrdma_pool object, or NULL on
+ * failure. The caller must release the returned pool using
+ * rpcrdma_pool_destroy().
+ */
+struct rpcrdma_pool *
+rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags)
+{
+ struct rpcrdma_pool *pool;
+
+ pool = kmalloc(sizeof(*pool), flags);
+ if (!pool)
+ return NULL;
+
+ INIT_LIST_HEAD(&pool->rp_chunk_list);
+ pool->rp_device = args->pa_device;
+ pool->rp_chunksize = RPCRDMA_MAX_INLINE_THRESH;
+ pool->rp_bufsize = args->pa_bufsize;
+ pool->rp_direction = args->pa_direction;
+ return pool;
+}
+
+/**
+ * rpcrdma_pool_destroy - Release resources owned by a buffer pool
+ * @pool: buffer pool object that will no longer be used
+ */
+void
+rpcrdma_pool_destroy(struct rpcrdma_pool *pool)
+{
+ struct rpcrdma_pool_chunk *chunk;
+
+ while (!list_empty(&pool->rp_chunk_list)) {
+ chunk = list_first_entry(&pool->rp_chunk_list,
+ struct rpcrdma_pool_chunk,
+ pc_next_chunk);
+ list_del(&chunk->pc_next_chunk);
+ ib_dma_unmap_single(pool->rp_device, chunk->pc_dma_addr,
+ pool->rp_chunksize, pool->rp_direction);
+ kfree(chunk->pc_cpu_addr);
+ kfree(chunk);
+ }
+ kfree(pool);
+}
+
+static struct rpcrdma_pool_chunk *
+rpcrdma_pool_find_chunk(struct rpcrdma_pool *pool, gfp_t flags)
+{
+ struct rpcrdma_pool_chunk *chunk;
+
+ list_for_each_entry(chunk, &pool->rp_chunk_list, pc_next_chunk) {
+ size_t remaining = pool->rp_chunksize - chunk->pc_free_start;
+
+ if (pool->rp_bufsize <= remaining)
+ return chunk;
+ }
+
+ chunk = rpcrdma_pool_chunk_create(pool, flags);
+ if (chunk)
+ list_add(&chunk->pc_next_chunk, &pool->rp_chunk_list);
+ return chunk;
+}
+
+/**
+ * rpcrdma_pool_alloc_buffer - Allocate a buffer from a pool
+ * @pool: buffer pool from which to allocate the buffer
+ * @flags: GFP flags for the allocation
+ * @cpu_addr: CPU address of the buffer
+ * @dma_addr: mapped DMA address of the buffer
+ *
+ * Return values:
+ * %true: @cpu_addr and @dma_addr are filled in with a DMA-mapped buffer
+ * %false: No buffer is available
+ *
+ * When successful, the returned buffer is freed automatically when the
+ * buffer pool is released by rpcrdma_pool_destroy().
+ */
+bool
+rpcrdma_pool_alloc_buffer(struct rpcrdma_pool *pool, gfp_t flags,
+ void **cpu_addr, dma_addr_t *dma_addr)
+{
+ struct rpcrdma_pool_chunk *chunk;
+
+ chunk = rpcrdma_pool_find_chunk(pool, flags);
+ if (!chunk)
+ return false;
+
+ *cpu_addr = chunk->pc_cpu_addr + chunk->pc_free_start;
+ *dma_addr = chunk->pc_dma_addr + chunk->pc_free_start;
+ chunk->pc_free_start += pool->rp_bufsize;
+ return true;
+}
diff --git a/net/sunrpc/xprtrdma/pool.h b/net/sunrpc/xprtrdma/pool.h
new file mode 100644
index 000000000000..666543e22b5b
--- /dev/null
+++ b/net/sunrpc/xprtrdma/pool.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ *
+ * Pools for Send and Receive buffers.
+ */
+
+#ifndef RPCRDMA_POOL_H
+#define RPCRDMA_POOL_H
+
+struct rpcrdma_pool_args {
+ struct ib_device *pa_device;
+ size_t pa_bufsize;
+ enum dma_data_direction pa_direction;
+};
+
+struct rpcrdma_pool;
+
+struct rpcrdma_pool *
+rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags);
+void rpcrdma_pool_destroy(struct rpcrdma_pool *pool);
+bool rpcrdma_pool_alloc_buffer(struct rpcrdma_pool *pool, gfp_t flags,
+ void **cpu_addr, dma_addr_t *dma_addr);
+
+#endif /* RPCRDMA_POOL_H */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e7e4a39ca6c6..8f0328d899d6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -104,9 +104,9 @@
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
-#include <trace/events/rpcrdma.h>
+#include "pool.h"
-static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+#include <trace/events/rpcrdma.h>
static inline struct svc_rdma_recv_ctxt *
svc_rdma_next_recv_ctxt(struct list_head *list)
@@ -115,14 +115,14 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
rc_list);
}
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_recv_ctxt *ctxt;
unsigned long pages;
- dma_addr_t addr;
- void *buffer;
pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages),
@@ -130,13 +130,11 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
if (!ctxt)
goto fail0;
ctxt->rc_maxpages = pages;
- buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
- if (!buffer)
+
+ if (!rpcrdma_pool_alloc_buffer(rdma->sc_recv_pool, GFP_KERNEL,
+ &ctxt->rc_recv_buf,
+ &ctxt->rc_recv_sge.addr))
goto fail1;
- addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
- rdma->sc_max_req_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
- goto fail2;
svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
pcl_init(&ctxt->rc_call_pcl);
@@ -149,30 +147,17 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
ctxt->rc_recv_wr.num_sge = 1;
ctxt->rc_cqe.done = svc_rdma_wc_receive;
- ctxt->rc_recv_sge.addr = addr;
ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->rc_recv_buf = buffer;
svc_rdma_cc_init(rdma, &ctxt->rc_cc);
return ctxt;
-fail2:
- kfree(buffer);
fail1:
kfree(ctxt);
fail0:
return NULL;
}
-static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
- struct svc_rdma_recv_ctxt *ctxt)
-{
- ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
- ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
- kfree(ctxt->rc_recv_buf);
- kfree(ctxt);
-}
-
/**
* svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
* @rdma: svcxprt_rdma being torn down
@@ -185,8 +170,9 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
- svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+ kfree(ctxt);
}
+ rpcrdma_pool_destroy(rdma->sc_recv_pool);
}
/**
@@ -305,8 +291,17 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
*/
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
+ struct rpcrdma_pool_args args = {
+ .pa_device = rdma->sc_cm_id->device,
+ .pa_bufsize = rdma->sc_max_req_size,
+ .pa_direction = DMA_FROM_DEVICE,
+ };
unsigned int total;
+ rdma->sc_recv_pool = rpcrdma_pool_create(&args, GFP_KERNEL);
+ if (!rdma->sc_recv_pool)
+ return false;
+
/* For each credit, allocate enough recv_ctxts for one
* posted Receive and one RPC in process.
*/
--
2.50.0