public inbox for linux-nfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: <linux-nfs@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH v2 08/18] svcrdma: Convert Read completion queue to use lock-free list
Date: Fri, 27 Feb 2026 09:03:35 -0500	[thread overview]
Message-ID: <20260227140345.40488-9-cel@kernel.org> (raw)
In-Reply-To: <20260227140345.40488-1-cel@kernel.org>

From: Chuck Lever <chuck.lever@oracle.com>

Extend the lock-free list conversion to sc_read_complete_q. This
queue tracks receive contexts that have completed RDMA Read
operations for handling Read chunks.

With both sc_rq_dto_q and sc_read_complete_q now using llist,
the sc_rq_dto_lock spinlock is no longer needed and is removed.
This eliminates all locking from the receive and Read completion
paths.

Note that llist provides LIFO ordering rather than FIFO. Because each
queued receive context represents an independent RPC request, and the
RPC layer makes no ordering guarantee across separate requests, the
change in dequeue order has no semantic impact.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          |  4 +--
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 44 +++++++++---------------
 net/sunrpc/xprtrdma/svc_rdma_rw.c        | 10 +++---
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  5 +--
 4 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 884a29cecfa0..8f6483ed9e5f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -130,8 +130,7 @@ struct svcxprt_rdma {
 	u32		     sc_pending_recvs ____cacheline_aligned_in_smp;
 	u32		     sc_recv_batch;
 	struct llist_head    sc_rq_dto_q;
-	struct list_head     sc_read_complete_q;
-	spinlock_t	     sc_rq_dto_lock;
+	struct llist_head    sc_read_complete_q;
 
 	spinlock_t	     sc_lock;		/* transport lock */
 
@@ -203,7 +202,6 @@ struct svc_rdma_chunk_ctxt {
 
 struct svc_rdma_recv_ctxt {
 	struct llist_node	rc_node;
-	struct list_head	rc_list;
 	struct ib_recv_wr	rc_recv_wr;
 	struct ib_cqe		rc_cqe;
 	struct rpc_rdma_cid	rc_cid;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e99adf14fa9b..1bd6b0da002f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -108,13 +108,6 @@
 
 static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
 
-static inline struct svc_rdma_recv_ctxt *
-svc_rdma_next_recv_ctxt(struct list_head *list)
-{
-	return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt,
-					rc_list);
-}
-
 static struct svc_rdma_recv_ctxt *
 svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 {
@@ -386,14 +379,21 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
  * svc_rdma_flush_recv_queues - Drain pending Receive work
  * @rdma: svcxprt_rdma being shut down
  *
+ * Called from svc_rdma_free() after ib_drain_qp() has blocked until
+ * completion queues are empty and flush_workqueue() has waited for
+ * pending work items. These preceding calls guarantee no concurrent
+ * producers (completion handlers) or consumers (svc_rdma_recvfrom)
+ * can be active, making unsynchronized llist_del_all() safe here.
  */
 void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
 	struct llist_node *node;
 
-	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
-		list_del(&ctxt->rc_list);
+	node = llist_del_all(&rdma->sc_read_complete_q);
+	while (node) {
+		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+		node = node->next;
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
 	}
 	node = llist_del_all(&rdma->sc_rq_dto_q);
@@ -946,17 +946,13 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	rqstp->rq_xprt_ctxt = NULL;
 
-	spin_lock(&rdma_xprt->sc_rq_dto_lock);
-	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
-	if (ctxt) {
-		list_del(&ctxt->rc_list);
-		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+	node = llist_del_first(&rdma_xprt->sc_read_complete_q);
+	if (node) {
+		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
 		svc_xprt_received(xprt);
 		svc_rdma_read_complete(rqstp, ctxt);
 		goto complete;
 	}
-	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
-
 	node = llist_del_first(&rdma_xprt->sc_rq_dto_q);
 	if (node) {
 		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
@@ -968,20 +964,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		/*
 		 * If a completion arrived after llist_del_first but
 		 * before clear_bit, the producer's set_bit would be
-		 * cleared above. Recheck to close this race window.
+		 * cleared above. Recheck both queues to close this
+		 * race window.
 		 */
-		if (!llist_empty(&rdma_xprt->sc_rq_dto_q))
+		if (!llist_empty(&rdma_xprt->sc_rq_dto_q) ||
+		    !llist_empty(&rdma_xprt->sc_read_complete_q))
 			set_bit(XPT_DATA, &xprt->xpt_flags);
-
-		/* Recheck sc_read_complete_q under lock for the same
-		 * reason -- svc_rdma_wc_read_done() may have added an
-		 * entry and set XPT_DATA between our earlier unlock
-		 * and the clear_bit above.
-		 */
-		spin_lock(&rdma_xprt->sc_rq_dto_lock);
-		if (!list_empty(&rdma_xprt->sc_read_complete_q))
-			set_bit(XPT_DATA, &xprt->xpt_flags);
-		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
 	}
 
 	/* Unblock the transport for the next receive */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index b1237d81075b..554463c72f1f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -370,11 +370,13 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 		trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes,
 				      cc->cc_posttime);
 
-		spin_lock(&rdma->sc_rq_dto_lock);
-		list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q);
-		/* the unlock pairs with the smp_rmb in svc_xprt_ready */
+		llist_add(&ctxt->rc_node, &rdma->sc_read_complete_q);
+		/*
+		 * The implicit barrier of llist_add's cmpxchg pairs with
+		 * the smp_rmb in svc_xprt_ready, ensuring the list update
+		 * is visible before XPT_DATA is observed.
+		 */
 		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
-		spin_unlock(&rdma->sc_rq_dto_lock);
 		svc_xprt_enqueue(&rdma->sc_xprt);
 		return;
 	case IB_WC_WR_FLUSH_ERR:
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ff9bae18a1aa..9f52d2c6666d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -164,7 +164,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 {
 	static struct lock_class_key svcrdma_rwctx_lock;
 	static struct lock_class_key svcrdma_sctx_lock;
-	static struct lock_class_key svcrdma_dto_lock;
 	struct svcxprt_rdma *cma_xprt;
 
 	cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
@@ -174,7 +173,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
 	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
 	init_llist_head(&cma_xprt->sc_rq_dto_q);
-	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+	init_llist_head(&cma_xprt->sc_read_complete_q);
 	init_llist_head(&cma_xprt->sc_send_ctxts);
 	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	init_llist_head(&cma_xprt->sc_rw_ctxts);
@@ -182,8 +181,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
 
 	spin_lock_init(&cma_xprt->sc_lock);
-	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
-	lockdep_set_class(&cma_xprt->sc_rq_dto_lock, &svcrdma_dto_lock);
 	spin_lock_init(&cma_xprt->sc_send_lock);
 	lockdep_set_class(&cma_xprt->sc_send_lock, &svcrdma_sctx_lock);
 	spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
-- 
2.53.0


  parent reply	other threads:[~2026-02-27 14:03 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
2026-02-27 14:03 ` [PATCH v2 01/18] svcrdma: Add fair queuing for Send Queue access Chuck Lever
2026-02-27 14:03 ` [PATCH v2 02/18] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
2026-02-27 14:03 ` [PATCH v2 03/18] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
2026-02-27 14:03 ` [PATCH v2 04/18] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
2026-02-27 14:03 ` [PATCH v2 05/18] svcrdma: Factor out WR chain linking into helper Chuck Lever
2026-02-27 14:03 ` [PATCH v2 06/18] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
2026-02-27 14:03 ` [PATCH v2 07/18] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
2026-02-27 14:03 ` Chuck Lever [this message]
2026-02-27 14:03 ` [PATCH v2 09/18] svcrdma: Release write chunk resources without re-queuing Chuck Lever
2026-02-27 14:03 ` [PATCH v2 10/18] svcrdma: Defer send context release to xpo_release_ctxt Chuck Lever
2026-02-27 14:03 ` [PATCH v2 11/18] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
2026-02-27 14:03 ` [PATCH v2 12/18] svcrdma: Add per-recv_ctxt chunk context cache Chuck Lever
2026-02-27 14:03 ` [PATCH v2 13/18] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
2026-02-27 14:03 ` [PATCH v2 14/18] svcrdma: retry when receive queues drain transiently Chuck Lever
2026-02-27 14:03 ` [PATCH v2 15/18] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever
2026-02-27 14:03 ` [PATCH v2 16/18] sunrpc: skip svc_xprt_enqueue when no work is pending Chuck Lever
2026-02-27 14:03 ` [PATCH v2 17/18] sunrpc: skip svc_xprt_enqueue in svc_xprt_received when idle Chuck Lever
2026-02-27 14:03 ` [PATCH v2 18/18] sunrpc: Skip xpt_reserved accounting for non-UDP transports Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260227140345.40488-9-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=dai.ngo@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neilb@ownmail.net \
    --cc=okorniev@redhat.com \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox