All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: <linux-nfs@vger.kernel.org>, <linux-rdma@vger.kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH] svcrdma: Introduce Receive buffer arenas
Date: Fri,  8 Aug 2025 14:46:48 -0400	[thread overview]
Message-ID: <20250808184648.120866-1-cel@kernel.org> (raw)

From: Chuck Lever <chuck.lever@oracle.com>

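Reduce the per-connection footprint in the host's and RNIC's memory
management TLBs by carving each connection's Receive buffers out of
larger chunks. Each chunk is covered by a single DMA mapping (one
contiguous IOVA range) rather than one mapping per Receive buffer.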

I don't have a good way to measure whether this approach is
effective.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |   3 +
 net/sunrpc/xprtrdma/Makefile            |   2 +-
 net/sunrpc/xprtrdma/pool.c              | 162 ++++++++++++++++++++++++
 net/sunrpc/xprtrdma/pool.h              |  25 ++++
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  43 +++----
 5 files changed, 210 insertions(+), 25 deletions(-)
 create mode 100644 net/sunrpc/xprtrdma/pool.c
 create mode 100644 net/sunrpc/xprtrdma/pool.h

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 22704c2e5b9b..b4f3c01f1b94 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -73,6 +73,8 @@ extern struct percpu_counter svcrdma_stat_recv;
 extern struct percpu_counter svcrdma_stat_sq_starve;
 extern struct percpu_counter svcrdma_stat_write;
 
+struct rpcrdma_pool;
+
 struct svcxprt_rdma {
 	struct svc_xprt      sc_xprt;		/* SVC transport structure */
 	struct rdma_cm_id    *sc_cm_id;		/* RDMA connection id */
@@ -112,6 +114,7 @@ struct svcxprt_rdma {
 	unsigned long	     sc_flags;
 	struct work_struct   sc_work;
 
+	struct rpcrdma_pool  *sc_recv_pool;
 	struct llist_head    sc_recv_ctxts;
 
 	atomic_t	     sc_completion_ids;
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 3232aa23cdb4..f69456dffe87 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
 
-rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o pool.o \
 	svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
 	svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
 	svc_rdma_pcl.o module.o
diff --git a/net/sunrpc/xprtrdma/pool.c b/net/sunrpc/xprtrdma/pool.c
new file mode 100644
index 000000000000..ef338528a594
--- /dev/null
+++ b/net/sunrpc/xprtrdma/pool.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ *
+ * Pools for Send and Receive buffers.
+ *
+ * A buffer pool attempts to conserve both the number of DMA mappings
+ * and the device's IOVA space by collecting small buffers together
+ * into a chunk that has a single DMA mapping.
+ *
+ * Future work:
+ *   - Manage pool resources by reference count
+ *   - Manage chunk free space via a bitmap
+ */
+
+#include <linux/list.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "pool.h"
+
+struct rpcrdma_pool {			/* one pool of equal-sized DMA-mapped buffers */
+	struct list_head	rp_chunk_list;	/* chunks, linked via pc_next_chunk */
+
+	struct ib_device	*rp_device;	/* device used to map and unmap chunks */
+	size_t			rp_chunksize;	/* bytes allocated and mapped per chunk */
+	size_t			rp_bufsize;	/* bytes handed out per buffer */
+	enum dma_data_direction	rp_direction;	/* direction passed to DMA map/unmap */
+};
+
+struct rpcrdma_pool_chunk {		/* one allocated region with a single DMA mapping */
+	struct list_head	pc_next_chunk;	/* links this chunk into rp_chunk_list */
+
+	u8			*pc_cpu_addr;	/* CPU address of the start of the chunk */
+	dma_addr_t		pc_dma_addr;	/* DMA address of the start of the chunk */
+	size_t			pc_free_start;	/* offset of the first byte not yet handed out */
+};
+
+/*
+ * Allocate one chunk of pool->rp_chunksize bytes and DMA-map it for
+ * pool->rp_device in pool->rp_direction. The backing memory is
+ * requested from the node reported by ibdev_to_node() for that device.
+ *
+ * Returns the new chunk with its free-space offset set to zero, or
+ * NULL if either allocation or the DMA mapping fails. The chunk is
+ * not added to the pool's chunk list here.
+ */
+static struct rpcrdma_pool_chunk *
+rpcrdma_pool_chunk_create(struct rpcrdma_pool *pool, gfp_t flags)
+{
+	struct ib_device *device = pool->rp_device;
+	struct rpcrdma_pool_chunk *new;
+
+	new = kmalloc(sizeof(*new), flags);
+	if (!new)
+		goto out_fail;
+
+	new->pc_cpu_addr = kmalloc_node(pool->rp_chunksize, flags,
+					ibdev_to_node(device));
+	if (!new->pc_cpu_addr)
+		goto out_free_chunk;
+
+	new->pc_dma_addr = ib_dma_map_single(device, new->pc_cpu_addr,
+					     pool->rp_chunksize,
+					     pool->rp_direction);
+	if (ib_dma_mapping_error(device, new->pc_dma_addr))
+		goto out_free_mem;
+
+	new->pc_free_start = 0;
+	return new;
+
+out_free_mem:
+	kfree(new->pc_cpu_addr);
+out_free_chunk:
+	kfree(new);
+out_fail:
+	return NULL;
+}
+
+/**
+ * rpcrdma_pool_create - Initialize a buffer pool
+ * @args: pool creation arguments
+ * @flags: GFP flags for pool creation
+ *
+ * Every buffer handed out by the pool is @args->pa_bufsize bytes long
+ * and is carved out of a chunk of RPCRDMA_MAX_INLINE_THRESH bytes. A
+ * buffer size of zero, or one that does not fit in a single chunk, is
+ * rejected because no valid buffer could be carved for it.
+ *
+ * No chunk is allocated or mapped here; chunks are created on demand
+ * by rpcrdma_pool_alloc_buffer().
+ *
+ * Returns a pointer to an opaque rpcrdma_pool object or
+ * NULL. If a pool object is returned, caller must free the
+ * returned object using rpcrdma_pool_destroy().
+ */
+struct rpcrdma_pool *
+rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags)
+{
+	struct rpcrdma_pool *pool;
+
+	/* A buffer must be non-empty and must fit within one chunk. */
+	if (!args->pa_bufsize || args->pa_bufsize > RPCRDMA_MAX_INLINE_THRESH)
+		return NULL;
+
+	pool = kmalloc(sizeof(*pool), flags);
+	if (!pool)
+		return NULL;
+
+	INIT_LIST_HEAD(&pool->rp_chunk_list);
+	pool->rp_device = args->pa_device;
+	pool->rp_chunksize = RPCRDMA_MAX_INLINE_THRESH;
+	pool->rp_bufsize = args->pa_bufsize;
+	pool->rp_direction = args->pa_direction;
+	return pool;
+}
+
+/**
+ * rpcrdma_pool_destroy - Release resources owned by a buffer pool
+ * @pool: buffer pool object that will no longer be used; may be NULL
+ *
+ * Unmaps and frees every chunk on the pool's chunk list, then frees
+ * the pool itself. Buffers handed out by rpcrdma_pool_alloc_buffer()
+ * point into those chunks, so they must not be used after this call.
+ *
+ * A NULL @pool is ignored so that teardown paths can call this
+ * unconditionally, even if rpcrdma_pool_create() failed or never ran.
+ */
+void
+rpcrdma_pool_destroy(struct rpcrdma_pool *pool)
+{
+	struct rpcrdma_pool_chunk *chunk;
+
+	if (!pool)
+		return;
+
+	while (!list_empty(&pool->rp_chunk_list)) {
+		chunk = list_first_entry(&pool->rp_chunk_list,
+					 struct rpcrdma_pool_chunk,
+					 pc_next_chunk);
+		list_del(&chunk->pc_next_chunk);
+		/* Unmap before freeing the memory the mapping covers. */
+		ib_dma_unmap_single(pool->rp_device, chunk->pc_dma_addr,
+				    pool->rp_chunksize, pool->rp_direction);
+		kfree(chunk->pc_cpu_addr);
+		kfree(chunk);
+	}
+	kfree(pool);
+}
+
+/*
+ * Find a chunk that can supply one more buffer of pool->rp_bufsize
+ * bytes, allocating and mapping a new chunk when none of the existing
+ * chunks has room.
+ *
+ * Returns the chunk to carve the next buffer from, or NULL if a new
+ * chunk was needed but could not be created. The chunk's free-space
+ * offset is not advanced here; that is left to the caller.
+ */
+static struct rpcrdma_pool_chunk *
+rpcrdma_pool_find_chunk(struct rpcrdma_pool *pool, gfp_t flags)
+{
+	struct rpcrdma_pool_chunk *chunk;
+
+	list_for_each_entry(chunk, &pool->rp_chunk_list, pc_next_chunk) {
+		size_t remaining = pool->rp_chunksize - chunk->pc_free_start;
+
+		/* Usable only if a whole buffer still fits in the chunk. */
+		if (remaining >= pool->rp_bufsize)
+			return chunk;
+	}
+
+	/*
+	 * New chunks go at the head of the list so that the next search
+	 * finds the chunk with free space first.
+	 */
+	chunk = rpcrdma_pool_chunk_create(pool, flags);
+	if (chunk)
+		list_add(&chunk->pc_next_chunk, &pool->rp_chunk_list);
+	return chunk;
+}
+
+/**
+ * rpcrdma_pool_alloc_buffer - Allocate a buffer from a pool
+ * @pool: buffer pool from which to allocate the buffer
+ * @flags: GFP flags used if a new chunk has to be created
+ * @cpu_addr: on success, set to the CPU address of the buffer
+ * @dma_addr: on success, set to the mapped DMA address of the buffer
+ *
+ * The buffer is carved from a chunk at that chunk's current free
+ * offset, which is then advanced by the pool's buffer size.
+ *
+ * Return values:
+ *   %true: @cpu_addr and @dma_addr are filled in with a DMA-mapped buffer
+ *   %false: No buffer is available; @cpu_addr and @dma_addr are untouched
+ *
+ * There is no per-buffer free operation. When successful, the returned
+ * buffer is released only when the buffer pool itself is released by
+ * rpcrdma_pool_destroy().
+ */
+bool
+rpcrdma_pool_alloc_buffer(struct rpcrdma_pool *pool, gfp_t flags,
+			  void **cpu_addr, dma_addr_t *dma_addr)
+{
+	struct rpcrdma_pool_chunk *src;
+	size_t offset;
+
+	src = rpcrdma_pool_find_chunk(pool, flags);
+	if (!src)
+		return false;
+
+	/* Claim the next slot, then report both views of its address. */
+	offset = src->pc_free_start;
+	src->pc_free_start = offset + pool->rp_bufsize;
+
+	*cpu_addr = src->pc_cpu_addr + offset;
+	*dma_addr = src->pc_dma_addr + offset;
+	return true;
+}
diff --git a/net/sunrpc/xprtrdma/pool.h b/net/sunrpc/xprtrdma/pool.h
new file mode 100644
index 000000000000..666543e22b5b
--- /dev/null
+++ b/net/sunrpc/xprtrdma/pool.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ *
+ * Pools for Send and Receive buffers.
+ */
+
+#ifndef RPCRDMA_POOL_H
+#define RPCRDMA_POOL_H
+
+struct rpcrdma_pool_args {		/* input to rpcrdma_pool_create() */
+	struct ib_device	*pa_device;	/* device the pool's memory is DMA-mapped for */
+	size_t			pa_bufsize;	/* size in bytes of each buffer the pool hands out */
+	enum dma_data_direction	pa_direction;	/* DMA direction used for the pool's mappings */
+};
+
+struct rpcrdma_pool;
+
+struct rpcrdma_pool *
+rpcrdma_pool_create(struct rpcrdma_pool_args *args, gfp_t flags);
+void rpcrdma_pool_destroy(struct rpcrdma_pool *pool);
+bool rpcrdma_pool_alloc_buffer(struct rpcrdma_pool *pool, gfp_t flags,
+			       void **cpu_addr, dma_addr_t *dma_addr);
+
+#endif /* RPCRDMA_POOL_H */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e7e4a39ca6c6..8f0328d899d6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -104,9 +104,9 @@
 #include <linux/sunrpc/svc_rdma.h>
 
 #include "xprt_rdma.h"
-#include <trace/events/rpcrdma.h>
+#include "pool.h"
 
-static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+#include <trace/events/rpcrdma.h>
 
 static inline struct svc_rdma_recv_ctxt *
 svc_rdma_next_recv_ctxt(struct list_head *list)
@@ -115,14 +115,14 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
 					rc_list);
 }
 
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+
 static struct svc_rdma_recv_ctxt *
 svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 {
 	int node = ibdev_to_node(rdma->sc_cm_id->device);
 	struct svc_rdma_recv_ctxt *ctxt;
 	unsigned long pages;
-	dma_addr_t addr;
-	void *buffer;
 
 	pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
 	ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages),
@@ -130,13 +130,11 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 	if (!ctxt)
 		goto fail0;
 	ctxt->rc_maxpages = pages;
-	buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
-	if (!buffer)
+
+	if (!rpcrdma_pool_alloc_buffer(rdma->sc_recv_pool, GFP_KERNEL,
+				       &ctxt->rc_recv_buf,
+				       &ctxt->rc_recv_sge.addr))
 		goto fail1;
-	addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
-				 rdma->sc_max_req_size, DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
-		goto fail2;
 
 	svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
 	pcl_init(&ctxt->rc_call_pcl);
@@ -149,30 +147,17 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 	ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
 	ctxt->rc_recv_wr.num_sge = 1;
 	ctxt->rc_cqe.done = svc_rdma_wc_receive;
-	ctxt->rc_recv_sge.addr = addr;
 	ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
 	ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
-	ctxt->rc_recv_buf = buffer;
 	svc_rdma_cc_init(rdma, &ctxt->rc_cc);
 	return ctxt;
 
-fail2:
-	kfree(buffer);
 fail1:
 	kfree(ctxt);
 fail0:
 	return NULL;
 }
 
-static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
-				       struct svc_rdma_recv_ctxt *ctxt)
-{
-	ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
-			    ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
-	kfree(ctxt->rc_recv_buf);
-	kfree(ctxt);
-}
-
 /**
  * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
  * @rdma: svcxprt_rdma being torn down
@@ -185,8 +170,9 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
 
 	while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
 		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
-		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+		kfree(ctxt);
 	}
+	rpcrdma_pool_destroy(rdma->sc_recv_pool);
 }
 
 /**
@@ -305,8 +291,17 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
  */
 bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
 {
+	struct rpcrdma_pool_args args = {
+		.pa_device	= rdma->sc_cm_id->device,
+		.pa_bufsize	= rdma->sc_max_req_size,
+		.pa_direction	= DMA_FROM_DEVICE,
+	};
 	unsigned int total;
 
+	rdma->sc_recv_pool = rpcrdma_pool_create(&args, GFP_KERNEL);
+	if (!rdma->sc_recv_pool)
+		return false;
+
 	/* For each credit, allocate enough recv_ctxts for one
 	 * posted Receive and one RPC in process.
 	 */
-- 
2.50.0


             reply	other threads:[~2025-08-08 18:46 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-08 18:46 Chuck Lever [this message]
2025-08-09 20:42 ` [RFC PATCH] svcrdma: Introduce Receive buffer arenas kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250808184648.120866-1-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=linux-nfs@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.