From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: <linux-nfs@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v2 12/18] svcrdma: Add per-recv_ctxt chunk context cache
Date: Fri, 27 Feb 2026 09:03:39 -0500 [thread overview]
Message-ID: <20260227140345.40488-13-cel@kernel.org> (raw)
In-Reply-To: <20260227140345.40488-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
Parsed chunk list (PCL) processing currently allocates a new
svc_rdma_chunk structure via kmalloc for each chunk in every
incoming RPC. These allocations add overhead to the receive path.
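Introduce a per-recv_ctxt single-entry cache. Over 99% of RPC Calls
that specify RPC/RDMA chunks provide only a single chunk, so a
single cached chunk handles the common case. Chunks with up to
SVC_RDMA_CHUNK_SEGMAX (4) segments are eligible for caching; larger
chunks fall back to dynamic allocation. An eligible chunk that
misses the cache is allocated with room for SVC_RDMA_CHUNK_SEGMAX
segments so that it can be cached when it is released. If the cache
slot is already occupied at release time, the chunk is freed
instead. The cached chunk itself is freed when its recv_ctxt is
destroyed.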
Using per-recv_ctxt caching instead of a per-transport pool avoids
the need for locking or atomic operations, since a recv_ctxt is
used by only one thread at a time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 2 +
include/linux/sunrpc/svc_rdma_pcl.h | 12 +++++-
net/sunrpc/xprtrdma/svc_rdma_pcl.c | 52 ++++++++++++++++++++++---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 +++--
4 files changed, 65 insertions(+), 11 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ef52af656581..2233dec2ae7d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -225,6 +225,8 @@ struct svc_rdma_chunk_ctxt {
struct svc_rdma_recv_ctxt {
struct llist_node rc_node;
+ struct svcxprt_rdma *rc_rdma;
+ struct svc_rdma_chunk *rc_chunk_cache;
struct ib_recv_wr rc_recv_wr;
struct ib_cqe rc_cqe;
struct rpc_rdma_cid rc_cid;
diff --git a/include/linux/sunrpc/svc_rdma_pcl.h b/include/linux/sunrpc/svc_rdma_pcl.h
index 7516ad0fae80..8afd98dc4737 100644
--- a/include/linux/sunrpc/svc_rdma_pcl.h
+++ b/include/linux/sunrpc/svc_rdma_pcl.h
@@ -22,6 +22,7 @@ struct svc_rdma_chunk {
u32 ch_payload_length;
u32 ch_segcount;
+ u32 ch_segmax; /* allocated segment capacity */
struct svc_rdma_segment ch_segments[];
};
@@ -114,7 +115,16 @@ pcl_chunk_end_offset(const struct svc_rdma_chunk *chunk)
struct svc_rdma_recv_ctxt;
-extern void pcl_free(struct svc_rdma_pcl *pcl);
+/*
+ * Cached chunks have capacity for this many segments.
+ * Typical clients can register up to 120KB per segment, so 4
+ * segments cover most NFS I/O operations. Larger chunks fall
+ * back to kmalloc.
+ */
+#define SVC_RDMA_CHUNK_SEGMAX 4
+
+extern void pcl_free(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_pcl *pcl);
extern bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p);
extern bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p);
extern bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
index 1f8f7dad8b6f..5c13a74b1f9e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_pcl.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
@@ -9,30 +9,70 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
+static struct svc_rdma_chunk *rctxt_chunk_get(struct svc_rdma_recv_ctxt *rctxt)
+{
+ struct svc_rdma_chunk *chunk = rctxt->rc_chunk_cache;
+
+ if (chunk)
+ rctxt->rc_chunk_cache = NULL;
+ return chunk;
+}
+
+static void rctxt_chunk_put(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk)
+{
+ if (rctxt->rc_chunk_cache) {
+ kfree(chunk);
+ return;
+ }
+ rctxt->rc_chunk_cache = chunk;
+}
+
+static void rctxt_chunk_free(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk)
+{
+ if (chunk->ch_segmax == SVC_RDMA_CHUNK_SEGMAX)
+ rctxt_chunk_put(rctxt, chunk);
+ else
+ kfree(chunk);
+}
+
/**
* pcl_free - Release all memory associated with a parsed chunk list
+ * @rctxt: receive context containing @pcl
* @pcl: parsed chunk list
*
*/
-void pcl_free(struct svc_rdma_pcl *pcl)
+void pcl_free(struct svc_rdma_recv_ctxt *rctxt, struct svc_rdma_pcl *pcl)
{
while (!list_empty(&pcl->cl_chunks)) {
struct svc_rdma_chunk *chunk;
chunk = pcl_first_chunk(pcl);
list_del(&chunk->ch_list);
- kfree(chunk);
+ rctxt_chunk_free(rctxt, chunk);
}
}
-static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position)
+static struct svc_rdma_chunk *pcl_alloc_chunk(struct svc_rdma_recv_ctxt *rctxt,
+ u32 segcount, u32 position)
{
struct svc_rdma_chunk *chunk;
+ if (segcount <= SVC_RDMA_CHUNK_SEGMAX) {
+ chunk = rctxt_chunk_get(rctxt);
+ if (chunk)
+ goto out;
+ /* Round up so every small allocation is cache-eligible */
+ segcount = SVC_RDMA_CHUNK_SEGMAX;
+ }
+
chunk = kmalloc_flex(*chunk, ch_segments, segcount);
if (!chunk)
return NULL;
+ chunk->ch_segmax = segcount;
+out:
chunk->ch_position = position;
chunk->ch_length = 0;
chunk->ch_payload_length = 0;
@@ -117,7 +157,7 @@ bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
continue;
if (pcl_is_empty(pcl)) {
- chunk = pcl_alloc_chunk(segcount, position);
+ chunk = pcl_alloc_chunk(rctxt, segcount, position);
if (!chunk)
return false;
pcl_insert_position(pcl, chunk);
@@ -172,7 +212,7 @@ bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
chunk = pcl_lookup_position(pcl, position);
if (!chunk) {
- chunk = pcl_alloc_chunk(segcount, position);
+ chunk = pcl_alloc_chunk(rctxt, segcount, position);
if (!chunk)
return false;
pcl_insert_position(pcl, chunk);
@@ -210,7 +250,7 @@ bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
p++; /* skip the list discriminator */
segcount = be32_to_cpup(p++);
- chunk = pcl_alloc_chunk(segcount, 0);
+ chunk = pcl_alloc_chunk(rctxt, segcount, 0);
if (!chunk)
return false;
list_add_tail(&chunk->ch_list, &pcl->cl_chunks);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index a11e845a7113..45edf57c7285 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -123,6 +123,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
GFP_KERNEL, node);
if (!ctxt)
goto fail0;
+ ctxt->rc_rdma = rdma;
ctxt->rc_maxpages = pages;
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
@@ -161,6 +162,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
+ kfree(ctxt->rc_chunk_cache);
ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr,
ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
kfree(ctxt->rc_recv_buf);
@@ -219,10 +221,10 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
*/
release_pages(ctxt->rc_pages, ctxt->rc_page_count);
- pcl_free(&ctxt->rc_call_pcl);
- pcl_free(&ctxt->rc_read_pcl);
- pcl_free(&ctxt->rc_write_pcl);
- pcl_free(&ctxt->rc_reply_pcl);
+ pcl_free(ctxt, &ctxt->rc_call_pcl);
+ pcl_free(ctxt, &ctxt->rc_read_pcl);
+ pcl_free(ctxt, &ctxt->rc_write_pcl);
+ pcl_free(ctxt, &ctxt->rc_reply_pcl);
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
}
--
2.53.0
next prev parent reply other threads:[~2026-02-27 14:03 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
2026-02-27 14:03 ` [PATCH v2 01/18] svcrdma: Add fair queuing for Send Queue access Chuck Lever
2026-02-27 14:03 ` [PATCH v2 02/18] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
2026-02-27 14:03 ` [PATCH v2 03/18] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
2026-02-27 14:03 ` [PATCH v2 04/18] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
2026-02-27 14:03 ` [PATCH v2 05/18] svcrdma: Factor out WR chain linking into helper Chuck Lever
2026-02-27 14:03 ` [PATCH v2 06/18] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
2026-02-27 14:03 ` [PATCH v2 07/18] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
2026-02-27 14:03 ` [PATCH v2 08/18] svcrdma: Convert Read completion queue to use lock-free list Chuck Lever
2026-02-27 14:03 ` [PATCH v2 09/18] svcrdma: Release write chunk resources without re-queuing Chuck Lever
2026-02-27 14:03 ` [PATCH v2 10/18] svcrdma: Defer send context release to xpo_release_ctxt Chuck Lever
2026-02-27 14:03 ` [PATCH v2 11/18] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
2026-02-27 14:03 ` Chuck Lever [this message]
2026-02-27 14:03 ` [PATCH v2 13/18] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
2026-02-27 14:03 ` [PATCH v2 14/18] svcrdma: retry when receive queues drain transiently Chuck Lever
2026-02-27 14:03 ` [PATCH v2 15/18] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever
2026-02-27 14:03 ` [PATCH v2 16/18] sunrpc: skip svc_xprt_enqueue when no work is pending Chuck Lever
2026-02-27 14:03 ` [PATCH v2 17/18] sunrpc: skip svc_xprt_enqueue in svc_xprt_received when idle Chuck Lever
2026-02-27 14:03 ` [PATCH v2 18/18] sunrpc: Skip xpt_reserved accounting for non-UDP transports Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260227140345.40488-13-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=neilb@ownmail.net \
--cc=okorniev@redhat.com \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox