From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: <linux-nfs@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 10/15] svcrdma: Use per-transport kthread for send context release
Date: Tue, 10 Feb 2026 11:32:17 -0500 [thread overview]
Message-ID: <20260210163222.2356793-11-cel@kernel.org> (raw)
In-Reply-To: <20260210163222.2356793-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
Each RDMA Send completion queues a separate work item to the
global svcrdma_wq (an unbound workqueue) to handle DMA
unmapping and page release. Under load, many worker threads
contend on the shared workqueue pool lock -- profiling an
NFSv3 8KB read+write workload over RDMA shows ~2.6% of
total CPU cycles spent in native_queued_spin_lock_slowpath
on this lock.
The contention arises from three directions: CQ completion
handlers acquiring the pool lock to enqueue work, a dozen
unbound workers re-acquiring it after each work item
completes, and XFS CIL flush callers hitting the same
unbound pool lock.
Replace the workqueue with a per-transport kthread that
drains a lock-free list. The CQ handler appends completed
send contexts via llist_add() (a short cmpxchg loop) and
wakes the kthread. The kthread collects all pending items
with llist_del_all() (a single xchg) and releases them in
a batch. Both operations are lock-free, eliminating the pool
lock entirely.
This also removes the global svcrdma_wq workqueue, which
has no remaining users.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 9 ++--
net/sunrpc/xprtrdma/svc_rdma.c | 18 +------
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 62 +++++++++++++++++++++---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 ++-
4 files changed, 69 insertions(+), 28 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 9691238df47f..874941b22485 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,8 +66,6 @@ extern unsigned int svcrdma_ord;
extern unsigned int svcrdma_max_requests;
extern unsigned int svcrdma_max_bc_requests;
extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
-
extern struct percpu_counter svcrdma_stat_read;
extern struct percpu_counter svcrdma_stat_recv;
extern struct percpu_counter svcrdma_stat_sq_starve;
@@ -120,6 +118,10 @@ struct svcxprt_rdma {
struct llist_head sc_recv_ctxts;
+ struct llist_head sc_send_release_list;
+ wait_queue_head_t sc_release_wait;
+ struct task_struct *sc_release_task;
+
atomic_t sc_completion_ids;
};
/* sc_flags */
@@ -237,7 +239,6 @@ struct svc_rdma_write_info {
struct svc_rdma_send_ctxt {
struct llist_node sc_node;
struct rpc_rdma_cid sc_cid;
- struct work_struct sc_work;
struct svcxprt_rdma *sc_rdma;
struct ib_send_wr sc_send_wr;
@@ -301,6 +302,8 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
/* svc_rdma_sendto.c */
extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern int svc_rdma_start_release_thread(struct svcxprt_rdma *rdma);
+extern void svc_rdma_stop_release_thread(struct svcxprt_rdma *rdma);
extern struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 415c0310101f..f67f0612b1a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -264,38 +264,22 @@ static int svc_rdma_proc_init(void)
return rc;
}
-struct workqueue_struct *svcrdma_wq;
-
void svc_rdma_cleanup(void)
{
svc_unreg_xprt_class(&svc_rdma_class);
svc_rdma_proc_cleanup();
- if (svcrdma_wq) {
- struct workqueue_struct *wq = svcrdma_wq;
-
- svcrdma_wq = NULL;
- destroy_workqueue(wq);
- }
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
}
int svc_rdma_init(void)
{
- struct workqueue_struct *wq;
int rc;
- wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0);
- if (!wq)
- return -ENOMEM;
-
rc = svc_rdma_proc_init();
- if (rc) {
- destroy_workqueue(wq);
+ if (rc)
return rc;
- }
- svcrdma_wq = wq;
svc_reg_xprt_class(&svc_rdma_class);
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index e9056039c118..1ff39c88b3cb 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -99,6 +99,7 @@
* where two different Write segments send portions of the same page.
*/
+#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/unaligned.h>
@@ -260,12 +261,57 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
}
-static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
+static int svc_rdma_release_fn(void *data)
{
- struct svc_rdma_send_ctxt *ctxt;
+ struct svcxprt_rdma *rdma = data;
+ struct svc_rdma_send_ctxt *ctxt, *next;
+ struct llist_node *node;
- ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work);
- svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt);
+ while (!kthread_should_stop()) {
+ wait_event(rdma->sc_release_wait,
+ !llist_empty(&rdma->sc_send_release_list) ||
+ kthread_should_stop());
+
+ node = llist_del_all(&rdma->sc_send_release_list);
+ llist_for_each_entry_safe(ctxt, next, node, sc_node)
+ svc_rdma_send_ctxt_release(rdma, ctxt);
+ }
+
+ /* Defensive: the list is usually empty here. */
+ node = llist_del_all(&rdma->sc_send_release_list);
+ llist_for_each_entry_safe(ctxt, next, node, sc_node)
+ svc_rdma_send_ctxt_release(rdma, ctxt);
+ return 0;
+}
+
+/**
+ * svc_rdma_start_release_thread - Launch release kthread
+ * @rdma: controlling transport
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int svc_rdma_start_release_thread(struct svcxprt_rdma *rdma)
+{
+ struct task_struct *task;
+
+ task = kthread_run(svc_rdma_release_fn, rdma,
+ "svcrdma-rel");
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rdma->sc_release_task = task;
+ return 0;
+}
+
+/**
+ * svc_rdma_stop_release_thread - Stop release kthread
+ * @rdma: controlling transport
+ *
+ * Waits for the kthread to drain and exit.
+ */
+void svc_rdma_stop_release_thread(struct svcxprt_rdma *rdma)
+{
+ if (rdma->sc_release_task)
+ kthread_stop(rdma->sc_release_task);
}
/**
@@ -273,13 +319,15 @@ static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
* @rdma: controlling svcxprt_rdma
* @ctxt: object to return to the free list
*
- * Pages left in sc_pages are DMA unmapped and released.
+ * DMA unmapping and page release are deferred to a
+ * per-transport kthread to keep these costs off the
+ * completion handler's critical path.
*/
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt)
{
- INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async);
- queue_work(svcrdma_wq, &ctxt->sc_work);
+ llist_add(&ctxt->sc_node, &rdma->sc_send_release_list);
+ wake_up(&rdma->sc_release_wait);
}
/**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 286806ac0739..0a3969d36a80 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -177,7 +177,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
init_llist_head(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
init_llist_head(&cma_xprt->sc_rw_ctxts);
+ init_llist_head(&cma_xprt->sc_send_release_list);
init_waitqueue_head(&cma_xprt->sc_send_wait);
+ init_waitqueue_head(&cma_xprt->sc_release_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_send_lock);
@@ -526,6 +528,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (!svc_rdma_post_recvs(newxprt))
goto errout;
+ ret = svc_rdma_start_release_thread(newxprt);
+ if (ret)
+ goto errout;
+
/* Construct RDMA-CM private message */
pmsg.cp_magic = rpcrdma_cmp_magic;
pmsg.cp_version = RPCRDMA_CMP_VERSION;
@@ -605,7 +611,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
/* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
- flush_workqueue(svcrdma_wq);
+ svc_rdma_stop_release_thread(rdma);
svc_rdma_flush_recv_queues(rdma);
--
2.52.0
next prev parent reply other threads:[~2026-02-10 16:32 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-10 16:32 [RFC PATCH 00/15] svcrdma performance scalability enhancements Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 01/15] svcrdma: Add fair queuing for Send Queue access Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 02/15] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 03/15] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 04/15] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 05/15] svcrdma: Factor out WR chain linking into helper Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 06/15] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 07/15] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 08/15] svcrdma: Convert Read completion queue to use lock-free list Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 09/15] svcrdma: Release write chunk resources without re-queuing Chuck Lever
2026-02-10 16:32 ` Chuck Lever [this message]
2026-02-10 16:32 ` [RFC PATCH 11/15] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 12/15] svcrdma: Add per-recv_ctxt chunk context cache Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 13/15] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 14/15] svcrdma: retry when receive queues drain transiently Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 15/15] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260210163222.2356793-11-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=neilb@ownmail.net \
--cc=okorniev@redhat.com \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox