From: Chuck Lever <cel@kernel.org>
To: Mike Snitzer <snitzer@kernel.org>,
Jeff Layton <jlayton@kernel.org>, NeilBrown <neil@brown.name>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <Dai.Ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: linux-nfs@vger.kernel.org, linux-rdma@vger.kernel.org,
Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH 2/2] svcrdma: Defer send context release to xpo_release_ctxt
Date: Wed, 06 May 2026 11:26:51 -0400
Message-ID: <20260506-svcrdma-next-v1-2-915fce8c4fbb@oracle.com>
In-Reply-To: <20260506-svcrdma-next-v1-0-915fce8c4fbb@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Send completion currently queues a work item to an unbound
workqueue for each completed send context. Under load, the
Send Completion handlers contend for the shared workqueue
pool lock.

Replace the workqueue with a per-transport lock-free list
(llist). The Send completion handler appends the send_ctxt
to sc_send_release_list and does no further teardown. The
nfsd thread drains the list in xpo_release_ctxt between
RPCs, performing DMA unmapping, chunk I/O resource release,
and page release in a batch.
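
In outline, the consumer side added by this patch looks like
the following (condensed from the sendto.c hunks below; the
producer's self-enqueue branch is described separately):

  /* nfsd thread, via xpo_release_ctxt: batch teardown off the
   * Send completion path.
   */
  void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma)
  {
          struct svc_rdma_send_ctxt *ctxt, *next;
          struct llist_node *node;

          /* Atomically detach the whole list; Send completions
           * may keep appending to the now-empty list head.
           */
          node = llist_del_all(&rdma->sc_send_release_list);
          llist_for_each_entry_safe(ctxt, next, node, sc_node)
                  svc_rdma_send_ctxt_release(rdma, ctxt);
  }
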
This eliminates both the workqueue pool lock and the DMA
unmap cost from the Send completion path. DMA unmapping can
be expensive when an IOMMU is present in strict mode, as
each unmap triggers a synchronous hardware IOTLB
invalidation. Moving it to the nfsd thread, where that
latency is harmless, avoids penalizing completion handler
throughput.

The nfsd threads absorb the release cost at a point where
the client is no longer waiting on a reply, and natural
batching amortizes the overhead when completions arrive
faster than RPCs complete.

A self-enqueue backstops the drain on a quiescing transport.
When svc_rdma_send_ctxt_put() observes that its llist_add()
transitions sc_send_release_list from empty to non-empty,
it sets XPT_DATA and calls svc_xprt_enqueue() so that
svc_xprt_ready() schedules an nfsd thread. The thread
enters svc_rdma_recvfrom(), finds no pending receive,
clears XPT_DATA, and returns 0; svc_xprt_release() then
runs xpo_release_ctxt and drains the list. Under steady
load, most completions find the list already non-empty
between foreground drains, so no enqueue fires; only the
trailing edge of a burst pays for a wakeup. Without this
path, a Send
completion arriving after the last xpo_release_ctxt on an
idle connection would leave the send_ctxt's DMA mappings
and reply pages pinned until the next RPC, send-context
exhaustion, or transport close.
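
The trigger itself is a single branch in
svc_rdma_send_ctxt_put(), as in the sendto.c hunk below;
llist_add() returns true only when it installs the first node
on a previously empty list:

  void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
                              struct svc_rdma_send_ctxt *ctxt)
  {
          /* True only on the empty-to-non-empty transition */
          if (llist_add(&ctxt->sc_node, &rdma->sc_send_release_list)) {
                  set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
                  svc_xprt_enqueue(&rdma->sc_xprt);
          }
  }
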
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 5 +-
net/sunrpc/xprtrdma/svc_rdma.c | 18 +------
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 9 ++++
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 91 +++++++++++++++++++++++---------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +-
5 files changed, 82 insertions(+), 44 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 14eb9d52742e..4ba39f07371d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,7 +66,6 @@ extern unsigned int svcrdma_ord;
extern unsigned int svcrdma_max_requests;
extern unsigned int svcrdma_max_bc_requests;
extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
extern struct percpu_counter svcrdma_stat_read;
extern struct percpu_counter svcrdma_stat_recv;
@@ -117,6 +116,8 @@ struct svcxprt_rdma {
struct llist_head sc_recv_ctxts;
+ struct llist_head sc_send_release_list;
+
atomic_t sc_completion_ids;
};
/* sc_flags */
@@ -235,7 +236,6 @@ struct svc_rdma_write_info {
struct svc_rdma_send_ctxt {
struct llist_node sc_node;
struct rpc_rdma_cid sc_cid;
- struct work_struct sc_work;
struct svcxprt_rdma *sc_rdma;
struct ib_send_wr sc_send_wr;
@@ -299,6 +299,7 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
/* svc_rdma_sendto.c */
extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma);
extern struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 415c0310101f..f67f0612b1a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -264,38 +264,22 @@ static int svc_rdma_proc_init(void)
return rc;
}
-struct workqueue_struct *svcrdma_wq;
-
void svc_rdma_cleanup(void)
{
svc_unreg_xprt_class(&svc_rdma_class);
svc_rdma_proc_cleanup();
- if (svcrdma_wq) {
- struct workqueue_struct *wq = svcrdma_wq;
-
- svcrdma_wq = NULL;
- destroy_workqueue(wq);
- }
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
}
int svc_rdma_init(void)
{
- struct workqueue_struct *wq;
int rc;
- wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0);
- if (!wq)
- return -ENOMEM;
-
rc = svc_rdma_proc_init();
- if (rc) {
- destroy_workqueue(wq);
+ if (rc)
return rc;
- }
- svcrdma_wq = wq;
svc_reg_xprt_class(&svc_rdma_class);
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f8a0638eb095..19503a12d0a2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -242,6 +242,10 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
* Ensure that the recv_ctxt is released whether or not a Reply
* was sent. For example, the client could close the connection,
* or svc_process could drop an RPC, before the Reply is sent.
+ *
+ * Also drain any send_ctxts queued for deferred release so that
+ * DMA unmap and page release run in nfsd thread context between
+ * RPCs rather than on the Send completion path.
*/
void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
{
@@ -251,6 +255,8 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
if (ctxt)
svc_rdma_recv_ctxt_put(rdma, ctxt);
+
+ svc_rdma_send_ctxts_drain(rdma);
}
static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
@@ -384,6 +390,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
* svc_rdma_flush_recv_queues - Drain pending Receive work
* @rdma: svcxprt_rdma being shut down
*
+ * Caller must guarantee that @rdma's Send and Recv Completion
+ * Queues are empty (e.g., via ib_drain_qp()), so that no completion
+ * handlers can still produce work on the queues being drained.
*/
void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 8b3f0c8c14b2..eceefd21bec8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -79,21 +79,21 @@
* The ownership of all of the Reply's pages are transferred into that
* ctxt, the Send WR is posted, and sendto returns.
*
- * The svc_rdma_send_ctxt is presented when the Send WR completes. The
- * Send completion handler finally releases the Reply's pages.
- *
- * This mechanism also assumes that completions on the transport's Send
- * Completion Queue do not run in parallel. Otherwise a Write completion
- * and Send completion running at the same time could release pages that
- * are still DMA-mapped.
+ * The svc_rdma_send_ctxt is presented when the Send WR completes.
+ * The Send completion handler queues the send_ctxt onto the
+ * per-transport sc_send_release_list (a lock-free llist). The
+ * nfsd thread drains sc_send_release_list in xpo_release_ctxt
+ * between RPCs, DMA-unmapping SGEs, releasing chunk I/O
+ * resources and pages, and returning send_ctxts to the free
+ * list in a batch.
*
* Error Handling
*
* - If the Send WR is posted successfully, it will either complete
* successfully, or get flushed. Either way, the Send completion
- * handler releases the Reply's pages.
- * - If the Send WR cannot be not posted, the forward path releases
- * the Reply's pages.
+ * handler queues the send_ctxt for deferred release.
+ * - If the Send WR cannot be posted, the forward path releases the
+ * Reply's pages.
*
* This handles the case, without the use of page reference counting,
* where two different Write segments send portions of the same page.
@@ -226,14 +226,25 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
return ctxt;
out_empty:
+ svc_rdma_send_ctxts_drain(rdma);
+
+ spin_lock(&rdma->sc_send_lock);
+ node = llist_del_first(&rdma->sc_send_ctxts);
+ spin_unlock(&rdma->sc_send_lock);
+ if (node) {
+ ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node);
+ goto out;
+ }
+
ctxt = svc_rdma_send_ctxt_alloc(rdma);
if (!ctxt)
return NULL;
goto out;
}
-static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt)
+/* Release chunk I/O resources and DMA-unmap SGEs. */
+static void svc_rdma_send_ctxt_unmap(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
{
struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i;
@@ -241,9 +252,6 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
svc_rdma_write_chunk_release(rdma, ctxt);
svc_rdma_reply_chunk_release(rdma, ctxt);
- if (ctxt->sc_page_count)
- release_pages(ctxt->sc_pages, ctxt->sc_page_count);
-
/* The first SGE contains the transport header, which
* remains mapped until @ctxt is destroyed.
*/
@@ -256,30 +264,56 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
ctxt->sc_sges[i].length,
DMA_TO_DEVICE);
}
+}
+
+/* Unmap, release pages, and return send_ctxt to the free list. */
+static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
+{
+ svc_rdma_send_ctxt_unmap(rdma, ctxt);
+
+ if (ctxt->sc_page_count)
+ release_pages(ctxt->sc_pages, ctxt->sc_page_count);
llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
}
-static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
+/**
+ * svc_rdma_send_ctxts_drain - Release completed send_ctxts
+ * @rdma: controlling svcxprt_rdma
+ */
+void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma)
{
- struct svc_rdma_send_ctxt *ctxt;
+ struct svc_rdma_send_ctxt *ctxt, *next;
+ struct llist_node *node;
- ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work);
- svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt);
+ node = llist_del_all(&rdma->sc_send_release_list);
+ llist_for_each_entry_safe(ctxt, next, node, sc_node)
+ svc_rdma_send_ctxt_release(rdma, ctxt);
}
/**
- * svc_rdma_send_ctxt_put - Return send_ctxt to free list
+ * svc_rdma_send_ctxt_put - Queue send_ctxt for deferred release
* @rdma: controlling svcxprt_rdma
- * @ctxt: object to return to the free list
+ * @ctxt: send_ctxt to queue for deferred release
*
- * Pages left in sc_pages are DMA unmapped and released.
+ * Queues @ctxt onto sc_send_release_list. DMA unmap and
+ * page release run later in svc_rdma_send_ctxts_drain(),
+ * typically from xpo_release_ctxt.
+ *
+ * On the empty-to-non-empty transition, set XPT_DATA and
+ * enqueue the transport. Without this self-trigger, a Send
+ * completion arriving after the last xpo_release_ctxt on an
+ * idle connection would leave the send_ctxt's DMA mappings
+ * and reply pages pinned until another drain occurred.
*/
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt)
{
- INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async);
- queue_work(svcrdma_wq, &ctxt->sc_work);
+ if (llist_add(&ctxt->sc_node, &rdma->sc_send_release_list)) {
+ set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
+ svc_xprt_enqueue(&rdma->sc_xprt);
+ }
}
/**
@@ -367,6 +401,15 @@ int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
atomic_inc(&rdma->sc_sq_ticket_tail);
wake_up(&rdma->sc_sq_ticket_wait);
trace_svcrdma_sq_retry(rdma, cid);
+
+ /*
+ * While this thread sat on sc_send_wait or sc_sq_ticket_wait,
+ * Send completions that tried to enqueue this transport for a
+ * release-list drain were rejected: svc_rdma_has_wspace returns
+ * 0 while either waitqueue is active, and svc_xprt_ready
+ * rejects the enqueue. Drain the release list now.
+ */
+ svc_rdma_send_ctxts_drain(rdma);
return 0;
out_close:
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f18bc60d9f4f..f99cd6177504 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -178,6 +178,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
init_llist_head(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
init_llist_head(&cma_xprt->sc_rw_ctxts);
+ init_llist_head(&cma_xprt->sc_send_release_list);
init_waitqueue_head(&cma_xprt->sc_send_wait);
init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
@@ -614,7 +615,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
/* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
- flush_workqueue(svcrdma_wq);
+ svc_rdma_send_ctxts_drain(rdma);
svc_rdma_flush_recv_queues(rdma);
--
2.53.0