All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: <linux-nfs@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 10/15] svcrdma: Use per-transport kthread for send context release
Date: Tue, 10 Feb 2026 11:32:17 -0500	[thread overview]
Message-ID: <20260210163222.2356793-11-cel@kernel.org> (raw)
In-Reply-To: <20260210163222.2356793-1-cel@kernel.org>

From: Chuck Lever <chuck.lever@oracle.com>

Each RDMA Send completion queues a separate work item to the
global svcrdma_wq (an unbound workqueue) to handle DMA
unmapping and page release. Under load, many worker threads
contend on the shared workqueue pool lock -- profiling an
NFSv3 8KB read+write workload over RDMA shows ~2.6% of
total CPU cycles spent in native_queued_spin_lock_slowpath
on this lock.

The contention arises from three directions: CQ completion
handlers acquiring the pool lock to enqueue work, a dozen
unbound workers re-acquiring it after each work item
completes, and XFS CIL flush callers hitting the same
unbound pool lock.

Replace the workqueue with a per-transport kthread that
drains a lock-free list. The CQ handler appends completed
send contexts via llist_add() (a cmpxchg retry loop) and
wakes the kthread. The kthread collects all pending items
with llist_del_all() (a single xchg) and releases them in a
batch. Both list operations are lock-free (llist_del_all()
is also wait-free), eliminating the pool lock entirely.

This also removes the global svcrdma_wq workqueue, which
has no remaining users.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          |  9 ++--
 net/sunrpc/xprtrdma/svc_rdma.c           | 18 +------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 62 +++++++++++++++++++++---
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  8 ++-
 4 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 9691238df47f..874941b22485 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,8 +66,6 @@ extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
-
 extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
 extern struct percpu_counter svcrdma_stat_sq_starve;
@@ -120,6 +118,10 @@ struct svcxprt_rdma {
 
 	struct llist_head    sc_recv_ctxts;
 
+	struct llist_head    sc_send_release_list;
+	wait_queue_head_t    sc_release_wait;
+	struct task_struct   *sc_release_task;
+
 	atomic_t	     sc_completion_ids;
 };
 /* sc_flags */
@@ -237,7 +239,6 @@ struct svc_rdma_write_info {
 struct svc_rdma_send_ctxt {
 	struct llist_node	sc_node;
 	struct rpc_rdma_cid	sc_cid;
-	struct work_struct	sc_work;
 
 	struct svcxprt_rdma	*sc_rdma;
 	struct ib_send_wr	sc_send_wr;
@@ -301,6 +302,8 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 
 /* svc_rdma_sendto.c */
 extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern int svc_rdma_start_release_thread(struct svcxprt_rdma *rdma);
+extern void svc_rdma_stop_release_thread(struct svcxprt_rdma *rdma);
 extern struct svc_rdma_send_ctxt *
 		svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
 extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 415c0310101f..f67f0612b1a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -264,38 +264,22 @@ static int svc_rdma_proc_init(void)
 	return rc;
 }
 
-struct workqueue_struct *svcrdma_wq;
-
 void svc_rdma_cleanup(void)
 {
 	svc_unreg_xprt_class(&svc_rdma_class);
 	svc_rdma_proc_cleanup();
-	if (svcrdma_wq) {
-		struct workqueue_struct *wq = svcrdma_wq;
-
-		svcrdma_wq = NULL;
-		destroy_workqueue(wq);
-	}
 
 	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
 }
 
 int svc_rdma_init(void)
 {
-	struct workqueue_struct *wq;
 	int rc;
 
-	wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0);
-	if (!wq)
-		return -ENOMEM;
-
 	rc = svc_rdma_proc_init();
-	if (rc) {
-		destroy_workqueue(wq);
+	if (rc)
 		return rc;
-	}
 
-	svcrdma_wq = wq;
 	svc_reg_xprt_class(&svc_rdma_class);
 
 	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index e9056039c118..1ff39c88b3cb 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -99,6 +99,7 @@
  * where two different Write segments send portions of the same page.
  */
 
+#include <linux/kthread.h>
 #include <linux/spinlock.h>
 #include <linux/unaligned.h>
 
@@ -260,12 +261,57 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
 	llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
 }
 
-static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
+static int svc_rdma_release_fn(void *data)
 {
-	struct svc_rdma_send_ctxt *ctxt;
+	struct svcxprt_rdma *rdma = data;
+	struct svc_rdma_send_ctxt *ctxt, *next;
+	struct llist_node *node;
 
-	ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work);
-	svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt);
+	while (!kthread_should_stop()) {
+		wait_event_idle(rdma->sc_release_wait,
+				!llist_empty(&rdma->sc_send_release_list) ||
+				kthread_should_stop());
+
+		node = llist_del_all(&rdma->sc_send_release_list);
+		llist_for_each_entry_safe(ctxt, next, node, sc_node)
+			svc_rdma_send_ctxt_release(rdma, ctxt);
+	}
+
+	/* Final drain: release anything queued as the thread was stopped. */
+	node = llist_del_all(&rdma->sc_send_release_list);
+	llist_for_each_entry_safe(ctxt, next, node, sc_node)
+		svc_rdma_send_ctxt_release(rdma, ctxt);
+	return 0;
+}
+
+/**
+ * svc_rdma_start_release_thread - Launch send context release kthread
+ * @rdma: controlling transport; the new task is saved in sc_release_task
+ *
+ * Returns zero on success, or the negative errno from kthread_run().
+ */
+int svc_rdma_start_release_thread(struct svcxprt_rdma *rdma)
+{
+	struct task_struct *task;
+
+	task = kthread_run(svc_rdma_release_fn, rdma,
+			   "svcrdma-rel");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rdma->sc_release_task = task;
+	return 0;
+}
+
+/**
+ * svc_rdma_stop_release_thread - Stop send context release kthread
+ * @rdma: controlling transport
+ *
+ * No-op if sc_release_task is NULL; else waits for it to drain and exit.
+ */
+void svc_rdma_stop_release_thread(struct svcxprt_rdma *rdma)
+{
+	if (rdma->sc_release_task)
+		kthread_stop(rdma->sc_release_task);
 }
 
 /**
@@ -273,13 +319,15 @@ static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
  * @rdma: controlling svcxprt_rdma
  * @ctxt: object to return to the free list
  *
- * Pages left in sc_pages are DMA unmapped and released.
+ * DMA unmapping and page release are deferred to a
+ * per-transport kthread to keep these costs off the
+ * completion handler's critical path.
  */
 void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
 			    struct svc_rdma_send_ctxt *ctxt)
 {
-	INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async);
-	queue_work(svcrdma_wq, &ctxt->sc_work);
+	llist_add(&ctxt->sc_node, &rdma->sc_send_release_list);
+	wake_up(&rdma->sc_release_wait);
 }
 
 /**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 286806ac0739..0a3969d36a80 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -177,7 +177,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	init_llist_head(&cma_xprt->sc_send_ctxts);
 	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	init_llist_head(&cma_xprt->sc_rw_ctxts);
+	init_llist_head(&cma_xprt->sc_send_release_list);
 	init_waitqueue_head(&cma_xprt->sc_send_wait);
+	init_waitqueue_head(&cma_xprt->sc_release_wait);
 
 	spin_lock_init(&cma_xprt->sc_lock);
 	spin_lock_init(&cma_xprt->sc_send_lock);
@@ -526,6 +528,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	if (!svc_rdma_post_recvs(newxprt))
 		goto errout;
 
+	ret = svc_rdma_start_release_thread(newxprt);
+	if (ret)
+		goto errout;
+
 	/* Construct RDMA-CM private message */
 	pmsg.cp_magic = rpcrdma_cmp_magic;
 	pmsg.cp_version = RPCRDMA_CMP_VERSION;
@@ -605,7 +611,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
 	/* This blocks until the Completion Queues are empty */
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
 		ib_drain_qp(rdma->sc_qp);
-	flush_workqueue(svcrdma_wq);
+	svc_rdma_stop_release_thread(rdma);
 
 	svc_rdma_flush_recv_queues(rdma);
 
-- 
2.52.0


  parent reply	other threads:[~2026-02-10 16:32 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-10 16:32 [RFC PATCH 00/15] svcrdma performance scalability enhancements Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 01/15] svcrdma: Add fair queuing for Send Queue access Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 02/15] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 03/15] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 04/15] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 05/15] svcrdma: Factor out WR chain linking into helper Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 06/15] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 07/15] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 08/15] svcrdma: Convert Read completion queue to use lock-free list Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 09/15] svcrdma: Release write chunk resources without re-queuing Chuck Lever
2026-02-10 16:32 ` Chuck Lever [this message]
2026-02-10 16:32 ` [RFC PATCH 11/15] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 12/15] svcrdma: Add per-recv_ctxt chunk context cache Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 13/15] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 14/15] svcrdma: retry when receive queues drain transiently Chuck Lever
2026-02-10 16:32 ` [RFC PATCH 15/15] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260210163222.2356793-11-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=dai.ngo@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=neilb@ownmail.net \
    --cc=okorniev@redhat.com \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.