public inbox for linux-nfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: <linux-nfs@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v2 10/18] svcrdma: Defer send context release to xpo_release_ctxt
Date: Fri, 27 Feb 2026 09:03:37 -0500	[thread overview]
Message-ID: <20260227140345.40488-11-cel@kernel.org> (raw)
In-Reply-To: <20260227140345.40488-1-cel@kernel.org>

From: Chuck Lever <chuck.lever@oracle.com>

Send completion currently queues a work item to an unbound
workqueue for each completed send context. Under load, the
Send Completion handlers contend for the shared workqueue
pool lock.

Replace the workqueue with a per-transport lock-free list
(llist). The Send completion handler appends the send_ctxt
to sc_send_release_list and does no further teardown. The
nfsd thread drains the list in xpo_release_ctxt between
RPCs, performing DMA unmapping, chunk I/O resource release,
and page release in a batch.

This eliminates both the workqueue pool lock and the DMA
unmap cost from the Send completion path. DMA unmapping can
be expensive when an IOMMU is present in strict mode, as
each unmap triggers a synchronous hardware IOTLB
invalidation. Moving it to the nfsd thread, where that
latency is harmless, avoids penalizing completion handler
throughput.

The nfsd threads absorb the release cost at a point where
the client is no longer waiting on a reply, and natural
batching amortizes the overhead when completions arrive
faster than RPCs complete.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          | 10 ++--
 net/sunrpc/xprtrdma/svc_rdma.c           | 18 +------
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 10 ++--
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 68 ++++++++++++++++--------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  3 +-
 5 files changed, 59 insertions(+), 50 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index a2d3232593a2..562a5f78cd3f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,7 +66,6 @@ extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
 
 extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
@@ -89,8 +88,9 @@ extern struct percpu_counter svcrdma_stat_write;
  *
  * When adding a field, place it in the zone whose code path modifies the
  * field under load. Read-only fields can fill padding in any zone that
- * accesses them. Fields modified by multiple paths remain at the end,
- * outside any aligned zone.
+ * accesses them. Fields modified by multiple paths (e.g.
+ * sc_recv_ctxts, sc_send_release_list) remain at the end, outside
+ * any aligned zone.
  */
 struct svcxprt_rdma {
 	struct svc_xprt      sc_xprt;		/* SVC transport structure */
@@ -140,6 +140,8 @@ struct svcxprt_rdma {
 
 	struct llist_head    sc_recv_ctxts;
 
+	struct llist_head    sc_send_release_list;
+
 	atomic_t	     sc_completion_ids;
 };
 /* sc_flags */
@@ -257,7 +259,6 @@ struct svc_rdma_write_info {
 struct svc_rdma_send_ctxt {
 	struct llist_node	sc_node;
 	struct rpc_rdma_cid	sc_cid;
-	struct work_struct	sc_work;
 
 	struct svcxprt_rdma	*sc_rdma;
 	struct ib_send_wr	sc_send_wr;
@@ -321,6 +322,7 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 
 /* svc_rdma_sendto.c */
 extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma);
 extern struct svc_rdma_send_ctxt *
 		svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
 extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 415c0310101f..f67f0612b1a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -264,38 +264,22 @@ static int svc_rdma_proc_init(void)
 	return rc;
 }
 
-struct workqueue_struct *svcrdma_wq;
-
 void svc_rdma_cleanup(void)
 {
 	svc_unreg_xprt_class(&svc_rdma_class);
 	svc_rdma_proc_cleanup();
-	if (svcrdma_wq) {
-		struct workqueue_struct *wq = svcrdma_wq;
-
-		svcrdma_wq = NULL;
-		destroy_workqueue(wq);
-	}
 
 	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
 }
 
 int svc_rdma_init(void)
 {
-	struct workqueue_struct *wq;
 	int rc;
 
-	wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0);
-	if (!wq)
-		return -ENOMEM;
-
 	rc = svc_rdma_proc_init();
-	if (rc) {
-		destroy_workqueue(wq);
+	if (rc)
 		return rc;
-	}
 
-	svcrdma_wq = wq;
 	svc_reg_xprt_class(&svc_rdma_class);
 
 	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 1bd6b0da002f..2281f9adc9f3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -244,6 +244,8 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
 
 	if (ctxt)
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
+
+	svc_rdma_send_ctxts_drain(rdma);
 }
 
 static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
@@ -379,11 +381,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
  * svc_rdma_flush_recv_queues - Drain pending Receive work
  * @rdma: svcxprt_rdma being shut down
  *
- * Called from svc_rdma_free() after ib_drain_qp() has blocked until
- * completion queues are empty and flush_workqueue() has waited for
- * pending work items. These preceding calls guarantee no concurrent
- * producers (completion handlers) or consumers (svc_rdma_recvfrom)
- * can be active, making unsynchronized llist_del_all() safe here.
+ * Caller must guarantee that @rdma's Send Completion queue is empty and
+ * all send contexts have been released. This guarantees concurrent
+ * producers and consumers are no longer active.
  */
 void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
 {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 263da6f76267..c8686fdfe788 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -79,21 +79,26 @@
  * The ownership of all of the Reply's pages are transferred into that
  * ctxt, the Send WR is posted, and sendto returns.
  *
- * The svc_rdma_send_ctxt is presented when the Send WR completes. The
- * Send completion handler finally releases the Reply's pages.
+ * The svc_rdma_send_ctxt is presented when the Send WR completes.
+ * The Send completion handler queues the send_ctxt onto the
+ * per-transport sc_send_release_list (a lock-free llist). The
+ * nfsd thread drains sc_send_release_list in xpo_release_ctxt
+ * between RPCs, DMA-unmapping SGEs, releasing chunk I/O
+ * resources and pages, and returning send_ctxts to the free
+ * list in a batch.
  *
- * This mechanism also assumes that completions on the transport's Send
- * Completion Queue do not run in parallel. Otherwise a Write completion
- * and Send completion running at the same time could release pages that
- * are still DMA-mapped.
+ * Correctness depends on completions on the transport's Send
+ * Completion Queue being serialized. Otherwise a Write
+ * completion and Send completion running at the same time could
+ * queue a send_ctxt whose pages are still DMA-mapped.
  *
  * Error Handling
  *
  * - If the Send WR is posted successfully, it will either complete
  *   successfully, or get flushed. Either way, the Send completion
- *   handler releases the Reply's pages.
- * - If the Send WR cannot be not posted, the forward path releases
- *   the Reply's pages.
+ *   handler queues the send_ctxt for deferred release.
+ * - If the Send WR cannot be posted, the forward path releases the
+ *   Reply's pages.
  *
  * This handles the case, without the use of page reference counting,
  * where two different Write segments send portions of the same page.
@@ -232,8 +237,9 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
 	goto out;
 }
 
-static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
-				       struct svc_rdma_send_ctxt *ctxt)
+/* DMA-unmap SGEs and release chunk I/O resources. */
+static void svc_rdma_send_ctxt_unmap(struct svcxprt_rdma *rdma,
+				     struct svc_rdma_send_ctxt *ctxt)
 {
 	struct ib_device *device = rdma->sc_cm_id->device;
 	unsigned int i;
@@ -241,9 +247,6 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
 	svc_rdma_write_chunk_release(rdma, ctxt);
 	svc_rdma_reply_chunk_release(rdma, ctxt);
 
-	if (ctxt->sc_page_count)
-		release_pages(ctxt->sc_pages, ctxt->sc_page_count);
-
 	/* The first SGE contains the transport header, which
 	 * remains mapped until @ctxt is destroyed.
 	 */
@@ -256,30 +259,49 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
 				  ctxt->sc_sges[i].length,
 				  DMA_TO_DEVICE);
 	}
+}
+
+/* Unmap, release pages, and return send_ctxt to the free list. */
+static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
+				       struct svc_rdma_send_ctxt *ctxt)
+{
+	svc_rdma_send_ctxt_unmap(rdma, ctxt);
+
+	if (ctxt->sc_page_count)
+		release_pages(ctxt->sc_pages, ctxt->sc_page_count);
 
 	llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
 }
 
-static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
+/**
+ * svc_rdma_send_ctxts_drain - Release completed send_ctxts
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Called from xpo_release_ctxt and during transport teardown.
+ */
+void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma)
 {
-	struct svc_rdma_send_ctxt *ctxt;
+	struct svc_rdma_send_ctxt *ctxt, *next;
+	struct llist_node *node;
 
-	ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work);
-	svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt);
+	node = llist_del_all(&rdma->sc_send_release_list);
+	llist_for_each_entry_safe(ctxt, next, node, sc_node)
+		svc_rdma_send_ctxt_release(rdma, ctxt);
 }
 
 /**
- * svc_rdma_send_ctxt_put - Return send_ctxt to free list
+ * svc_rdma_send_ctxt_put - Queue send_ctxt for deferred release
  * @rdma: controlling svcxprt_rdma
- * @ctxt: object to return to the free list
+ * @ctxt: send_ctxt to queue for deferred release
  *
- * Pages left in sc_pages are DMA unmapped and released.
+ * Queues @ctxt for deferred release via the per-transport
+ * sc_send_release_list. DMA unmapping and page release run
+ * later in svc_rdma_send_ctxts_drain().
  */
 void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
 			    struct svc_rdma_send_ctxt *ctxt)
 {
-	INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async);
-	queue_work(svcrdma_wq, &ctxt->sc_work);
+	llist_add(&ctxt->sc_node, &rdma->sc_send_release_list);
 }
 
 /**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9f52d2c6666d..719566234277 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -177,6 +177,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	init_llist_head(&cma_xprt->sc_send_ctxts);
 	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	init_llist_head(&cma_xprt->sc_rw_ctxts);
+	init_llist_head(&cma_xprt->sc_send_release_list);
 	init_waitqueue_head(&cma_xprt->sc_send_wait);
 	init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
 
@@ -610,7 +611,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
 	/* This blocks until the Completion Queues are empty */
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
 		ib_drain_qp(rdma->sc_qp);
-	flush_workqueue(svcrdma_wq);
+	svc_rdma_send_ctxts_drain(rdma);
 
 	svc_rdma_flush_recv_queues(rdma);
 
-- 
2.53.0


  parent reply	other threads:[~2026-02-27 14:03 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
2026-02-27 14:03 ` [PATCH v2 01/18] svcrdma: Add fair queuing for Send Queue access Chuck Lever
2026-02-27 14:03 ` [PATCH v2 02/18] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
2026-02-27 14:03 ` [PATCH v2 03/18] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
2026-02-27 14:03 ` [PATCH v2 04/18] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
2026-02-27 14:03 ` [PATCH v2 05/18] svcrdma: Factor out WR chain linking into helper Chuck Lever
2026-02-27 14:03 ` [PATCH v2 06/18] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
2026-02-27 14:03 ` [PATCH v2 07/18] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
2026-02-27 14:03 ` [PATCH v2 08/18] svcrdma: Convert Read completion queue to use lock-free list Chuck Lever
2026-02-27 14:03 ` [PATCH v2 09/18] svcrdma: Release write chunk resources without re-queuing Chuck Lever
2026-02-27 14:03 ` Chuck Lever [this message]
2026-02-27 14:03 ` [PATCH v2 11/18] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
2026-02-27 14:03 ` [PATCH v2 12/18] svcrdma: Add per-recv_ctxt chunk context cache Chuck Lever
2026-02-27 14:03 ` [PATCH v2 13/18] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
2026-02-27 14:03 ` [PATCH v2 14/18] svcrdma: retry when receive queues drain transiently Chuck Lever
2026-02-27 14:03 ` [PATCH v2 15/18] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever
2026-02-27 14:03 ` [PATCH v2 16/18] sunrpc: skip svc_xprt_enqueue when no work is pending Chuck Lever
2026-02-27 14:03 ` [PATCH v2 17/18] sunrpc: skip svc_xprt_enqueue in svc_xprt_received when idle Chuck Lever
2026-02-27 14:03 ` [PATCH v2 18/18] sunrpc: Skip xpt_reserved accounting for non-UDP transports Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260227140345.40488-11-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=dai.ngo@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neilb@ownmail.net \
    --cc=okorniev@redhat.com \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox