From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
daire@dneg.com, Mike Snitzer <snitzer@kernel.org>
Cc: <linux-nfs@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 3/7] sunrpc: add per-transport page recycling pool
Date: Thu, 5 Feb 2026 10:57:25 -0500 [thread overview]
Message-ID: <20260205155729.6841-4-cel@kernel.org> (raw)
In-Reply-To: <20260205155729.6841-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
RPC server transports allocate pages for receiving incoming data on
every request. Under high load, this repeated allocation and freeing
creates unnecessary overhead in the page allocator hot path.
Introduce svc_page_pool, a lock-free page recycling mechanism that
enables efficient page reuse between receive operations. A follow-up
commit wires this into the TCP transport's receive path; svcrdma's
RDMA Read path might also make use of this mechanism some day.
The pool uses llist for lock-free producer-consumer handoff: worker
threads returning pages after RPC processing act as producers, while
receiver threads allocating pages for incoming data act as
consumers. Pages are linked via page->pcp_llist, which is safe
because these pages are owned exclusively by the transport.
Each pool tracks its NUMA node affinity, allowing page allocations
to target the same node as the transport's receiver thread. Provide
svc_pool_node() to enable transports to determine the NUMA node
associated with a service pool for NUMA-aware resource allocation.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc.h | 1 +
include/linux/sunrpc/svc_xprt.h | 32 +++++++
net/sunrpc/svc.c | 13 +++
net/sunrpc/svc_xprt.c | 151 ++++++++++++++++++++++++++++++++
4 files changed, 197 insertions(+)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 5506d20857c3..f4efe60f4dad 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -457,6 +457,7 @@ void svc_wake_up(struct svc_serv *);
void svc_reserve(struct svc_rqst *rqstp, int space);
void svc_pool_wake_idle_thread(struct svc_pool *pool);
struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv);
+int svc_pool_node(struct svc_pool *pool);
char * svc_print_addr(struct svc_rqst *, char *, size_t);
const char * svc_proc_name(const struct svc_rqst *rqstp);
int svc_encode_result_payload(struct svc_rqst *rqstp,
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index da2a2531e110..e60c2936b1ce 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -9,9 +9,33 @@
#define SUNRPC_SVC_XPRT_H
#include <linux/sunrpc/svc.h>
+#include <linux/llist.h>
struct module;
+/**
+ * struct svc_page_pool - per-transport page recycling pool
+ * @pp_pages: lock-free list of recycled pages
+ * @pp_count: number of pages currently in pool (may transiently
+ *	exceed @pp_max when concurrent producers race)
+ * @pp_numa_node: NUMA node for page allocations
+ * @pp_max: maximum pages to retain in pool
+ *
+ * Lock-free page recycling between producers (svc threads returning
+ * pages) and a single consumer (the thread allocating pages for
+ * receives). Uses llist for efficient producer-consumer handoff
+ * without spinlocks.
+ *
+ * Callers must serialize calls to svc_page_pool_get(); multiple
+ * concurrent consumers are not supported.
+ * Allocate with svc_page_pool_alloc(); free with svc_page_pool_free().
+ */
+struct svc_page_pool {
+	struct llist_head pp_pages;
+	atomic_t pp_count;
+	int pp_numa_node;
+	unsigned int pp_max;
+};
+
struct svc_xprt_ops {
struct svc_xprt *(*xpo_create)(struct svc_serv *,
struct net *net,
@@ -187,6 +211,14 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *xprt);
void svc_age_temp_xprts_now(struct svc_serv *, struct sockaddr *);
void svc_xprt_deferred_close(struct svc_xprt *xprt);
+/* Page pool helpers */
+struct svc_page_pool *svc_page_pool_alloc(int numa_node, unsigned int max);
+void svc_page_pool_free(struct svc_page_pool *pool);
+void svc_page_pool_put(struct svc_page_pool *pool, struct page *page);
+void svc_page_pool_put_bulk(struct svc_page_pool *pool,
+ struct page **pages, unsigned int count);
+struct page *svc_page_pool_get(struct svc_page_pool *pool);
+
static inline void svc_xprt_get(struct svc_xprt *xprt)
{
kref_get(&xprt->xpt_ref);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4704dce7284e..6b350cb7d539 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -418,6 +418,19 @@ struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv)
return &serv->sv_pools[pidx % serv->sv_nrpools];
}
+/**
+ * svc_pool_node - Return the NUMA node affinity of a service pool
+ * @pool: the service pool
+ *
+ * Thin wrapper that looks up the pool-to-node mapping by the pool's
+ * ID via svc_pool_map_get_node().
+ *
+ * Return:
+ *   The NUMA node the pool is associated with, or the local node
+ *   if no explicit mapping exists
+ */
+int svc_pool_node(struct svc_pool *pool)
+{
+	return svc_pool_map_get_node(pool->sp_id);
+}
+
static int svc_rpcb_setup(struct svc_serv *serv, struct net *net)
{
int err;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 6973184ff667..fe31cf6a9c5d 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1497,4 +1497,155 @@ int svc_pool_stats_open(struct svc_info *info, struct file *file)
}
EXPORT_SYMBOL(svc_pool_stats_open);
+/* Map a pool-owned page to its embedded llist linkage. */
+static struct llist_node *svc_page_to_llist(struct page *page)
+{
+	return &page->pcp_llist;
+}
+
+/* Recover the page that embeds the given llist linkage. */
+static struct page *svc_llist_to_page(struct llist_node *node)
+{
+	return container_of(node, struct page, pcp_llist);
+}
+
+/**
+ * svc_page_pool_alloc - Allocate a page pool
+ * @numa_node: NUMA node for page allocations
+ * @max: maximum pages to retain in pool
+ *
+ * Pages held by an svc_page_pool are chained through page->pcp_llist;
+ * this is safe because the transport owns these pages exclusively.
+ *
+ * When the transport is destroyed, the caller must release the pool
+ * with svc_page_pool_free().
+ *
+ * Returns a new page pool, or NULL on allocation failure.
+ */
+struct svc_page_pool *svc_page_pool_alloc(int numa_node, unsigned int max)
+{
+	struct svc_page_pool *pool;
+
+	/* Place the pool descriptor itself on the receiver's node. */
+	pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, numa_node);
+	if (pool) {
+		init_llist_head(&pool->pp_pages);
+		atomic_set(&pool->pp_count, 0);
+		pool->pp_numa_node = numa_node;
+		pool->pp_max = max;
+	}
+	return pool;
+}
+
+/**
+ * svc_page_pool_free - Free a page pool and all pages in it
+ * @pool: pool to free (may be NULL)
+ */
+void svc_page_pool_free(struct svc_page_pool *pool)
+{
+	struct llist_node *first;
+
+	if (!pool)
+		return;
+
+	/* Detach the whole list once, then release each page. */
+	first = llist_del_all(&pool->pp_pages);
+	while (first) {
+		struct llist_node *next = first->next;
+
+		put_page(svc_llist_to_page(first));
+		first = next;
+	}
+	kfree(pool);
+}
+
+/**
+ * svc_page_pool_put - Return a page to the pool
+ * @pool: pool to return page to (may be NULL)
+ * @page: page to return (may be NULL)
+ *
+ * Transfers ownership of @page to the pool. The caller's reference
+ * is consumed: either the pool retains the page, or put_page() is
+ * called if @pool is NULL or full.
+ */
+void svc_page_pool_put(struct svc_page_pool *pool, struct page *page)
+{
+	if (!page)
+		return;
+	if (!pool) {
+		put_page(page);
+		return;
+	}
+
+	/*
+	 * Claim a slot before publishing the page. A plain
+	 * atomic_read() check followed by llist_add()/atomic_inc()
+	 * would let concurrent producers race past the pp_max limit;
+	 * atomic_inc_return() with rollback keeps pp_count bounded.
+	 */
+	if (atomic_inc_return(&pool->pp_count) > (int)pool->pp_max) {
+		atomic_dec(&pool->pp_count);
+		put_page(page);
+		return;
+	}
+	llist_add(svc_page_to_llist(page), &pool->pp_pages);
+}
+
+/**
+ * svc_page_pool_put_bulk - Return multiple pages to the pool
+ * @pool: pool to return pages to (may be NULL)
+ * @pages: array of pages to return
+ * @count: number of pages in @pages array
+ *
+ * Batch version of svc_page_pool_put() that reduces atomic operations
+ * when returning many pages at once. Transfers ownership of all pages
+ * in @pages to the pool. Uses release_pages() for efficient bulk
+ * freeing when the pool is full.
+ *
+ * Unlike svc_page_pool_put(), this function does not handle NULL
+ * entries in @pages. All @count entries must be valid page pointers.
+ */
+void svc_page_pool_put_bulk(struct svc_page_pool *pool,
+			    struct page **pages, unsigned int count)
+{
+	struct llist_node *head, *last, *node;
+	unsigned int i, to_add;
+	int avail;
+
+	if (!count)
+		return;
+	if (!pool) {
+		release_pages(pages, count);
+		return;
+	}
+
+	/*
+	 * pp_count can transiently exceed pp_max when producers race,
+	 * so this subtraction may go negative. Compute it in a signed
+	 * type: performing it in unsigned arithmetic would underflow
+	 * to a huge "available" count and grow the pool without bound.
+	 */
+	avail = (int)pool->pp_max - atomic_read(&pool->pp_count);
+	if (avail <= 0) {
+		release_pages(pages, count);
+		return;
+	}
+	to_add = min_t(unsigned int, count, (unsigned int)avail);
+
+	/* Chain the retained pages into one batch; pages[0] ends up
+	 * last, so it becomes the batch tail passed to llist_add_batch().
+	 */
+	head = NULL;
+	last = NULL;
+	for (i = 0; i < to_add; i++) {
+		node = svc_page_to_llist(pages[i]);
+		node->next = head;
+		head = node;
+		if (!last)
+			last = node;
+	}
+	llist_add_batch(head, last, &pool->pp_pages);
+	atomic_add(to_add, &pool->pp_count);
+
+	/* Free overflow pages that didn't fit in the pool */
+	if (to_add < count)
+		release_pages(pages + to_add, count - to_add);
+}
+EXPORT_SYMBOL_GPL(svc_page_pool_put_bulk);
+
+/**
+ * svc_page_pool_get - Get a page from the pool
+ * @pool: pool to take from (may be NULL)
+ *
+ * Returns a recycled page carrying one reference, or NULL when @pool
+ * is NULL or empty. Ownership passes to the caller, who later hands
+ * the page back via svc_page_pool_put() or drops it with put_page().
+ *
+ * Caller must serialize; concurrent calls for the same pool are
+ * not supported.
+ */
+struct page *svc_page_pool_get(struct svc_page_pool *pool)
+{
+	struct llist_node *first;
+
+	first = pool ? llist_del_first(&pool->pp_pages) : NULL;
+	if (!first)
+		return NULL;
+
+	atomic_dec(&pool->pp_count);
+	return svc_llist_to_page(first);
+}
+
/*----------------------------------------------------------------------------*/
--
2.52.0
next prev parent reply other threads:[~2026-02-05 15:57 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-05 15:57 [RFC PATCH 0/7] sunrpc: Reduce lock contention for NFSD TCP sockets Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 1/7] workqueue: Automatic affinity scope fallback for single-pod topologies Chuck Lever
2026-02-06 14:57 ` Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 2/7] sunrpc: split svc_data_ready into protocol-specific callbacks Chuck Lever
2026-02-05 15:57 ` Chuck Lever [this message]
2026-02-05 15:57 ` [RFC PATCH 4/7] sunrpc: add dedicated TCP receiver thread Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 5/7] sunrpc: implement flat combining for TCP socket sends Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 6/7] sunrpc: unify fore and backchannel server TCP send paths Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 7/7] SUNRPC: Set explicit TCP socket buffer sizes for NFSD Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260205155729.6841-4-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=daire@dneg.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=neilb@ownmail.net \
--cc=okorniev@redhat.com \
--cc=snitzer@kernel.org \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox