* [PATCH v2 01/18] svcrdma: Add fair queuing for Send Queue access
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 02/18] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
` (16 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
When the Send Queue fills, multiple threads may wait for SQ slots.
The previous implementation had no ordering guarantee, allowing
starvation when one thread repeatedly acquires slots while others
wait indefinitely.
Introduce a ticket-based fair queuing system. Each waiter takes a
ticket number and is served in FIFO order. This ensures forward
progress for all waiters when SQ capacity is constrained.
The implementation has two phases:
1. Fast path: attempt to reserve SQ slots without waiting
2. Slow path: take a ticket, wait for turn, then wait for slots
The ticket system adds two atomic counters to the transport:
- sc_sq_ticket_head: next ticket to issue
- sc_sq_ticket_tail: ticket currently being served
A dedicated wait queue (sc_sq_ticket_wait) handles ticket
ordering, separate from sc_send_wait which handles SQ capacity.
This separation ensures that send completions (the high-frequency
wake source) wake only the current ticket holder rather than all
queued waiters. Ticket handoff wakes only the ticket wait queue,
and each ticket holder that exits via connection close propagates
the wake to the next waiter in line.
When a waiter successfully reserves slots, it advances the tail
counter and wakes the next waiter. This creates an orderly handoff
that prevents starvation while maintaining good throughput on the
fast path when contention is low.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 10 ++
net/sunrpc/xprtrdma/svc_rdma_rw.c | 37 ++----
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 160 +++++++++++++++++------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +-
4 files changed, 145 insertions(+), 68 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 57f4fd94166a..658b8498177e 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -84,6 +84,9 @@ struct svcxprt_rdma {
atomic_t sc_sq_avail; /* SQEs ready to be consumed */
unsigned int sc_sq_depth; /* Depth of SQ */
+ atomic_t sc_sq_ticket_head; /* Next ticket to issue */
+ atomic_t sc_sq_ticket_tail; /* Ticket currently serving */
+ wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */
__be32 sc_fc_credits; /* Forward credits */
u32 sc_max_requests; /* Max requests */
u32 sc_max_bc_requests;/* Backward credits */
@@ -306,6 +309,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *rctxt,
int status);
extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail);
+extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
+ const struct rpc_rdma_cid *cid, int sqecount);
+extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
+ const struct rpc_rdma_cid *cid,
+ const struct ib_send_wr *bad_wr,
+ const struct ib_send_wr *first_wr,
+ int sqecount, int ret);
extern int svc_rdma_sendto(struct svc_rqst *);
extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
unsigned int length);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 4ec2f9ae06aa..6626f18de55e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -405,34 +405,17 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
cqe = NULL;
}
- do {
- if (atomic_sub_return(cc->cc_sqecount,
- &rdma->sc_sq_avail) > 0) {
- cc->cc_posttime = ktime_get();
- ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
- if (ret)
- break;
- return 0;
- }
+ ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount);
+ if (ret < 0)
+ return ret;
- percpu_counter_inc(&svcrdma_stat_sq_starve);
- trace_svcrdma_sq_full(rdma, &cc->cc_cid);
- atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
- wait_event(rdma->sc_send_wait,
- atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
- trace_svcrdma_sq_retry(rdma, &cc->cc_cid);
- } while (1);
-
- trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret);
- svc_xprt_deferred_close(&rdma->sc_xprt);
-
- /* If even one was posted, there will be a completion. */
- if (bad_wr != first_wr)
- return 0;
-
- atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
- wake_up(&rdma->sc_send_wait);
- return -ENOTCONN;
+ cc->cc_posttime = ktime_get();
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr,
+ first_wr, cc->cc_sqecount,
+ ret);
+ return 0;
}
/* Build a bvec that covers one kvec in an xdr_buf.
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 914cd263c2f1..22354e12d390 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -294,6 +294,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail)
wake_up(&rdma->sc_send_wait);
}
+/**
+ * svc_rdma_sq_wait - Wait for SQ slots using fair queuing
+ * @rdma: controlling transport
+ * @cid: completion ID for tracing
+ * @sqecount: number of SQ entries needed
+ *
+ * A ticket-based system ensures fair ordering when multiple threads
+ * wait for Send Queue capacity. Each waiter takes a ticket and is
+ * served in order, preventing starvation.
+ *
+ * Protocol invariant: every ticket holder must increment
+ * sc_sq_ticket_tail exactly once, whether the reservation
+ * succeeds or the connection closes. Failing to advance the
+ * tail stalls all subsequent waiters.
+ *
+ * The ticket counters are signed 32-bit atomics. After
+ * wrapping through INT_MAX, the equality check
+ * (tail == ticket) remains correct because both counters
+ * advance monotonically and the comparison uses exact
+ * equality rather than relational operators.
+ *
+ * Return values:
+ * %0: SQ slots were reserved successfully
+ * %-ENOTCONN: The connection was lost
+ */
+int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
+ const struct rpc_rdma_cid *cid, int sqecount)
+{
+ int ticket;
+
+ /* Fast path: try to reserve SQ slots without waiting.
+ *
+ * A failed reservation temporarily understates sc_sq_avail
+ * until the compensating atomic_add restores it. A Send
+ * completion arriving in that window sees a lower count
+ * than reality, but the value self-corrects once the add
+ * completes. No ordering guarantee is needed here because
+ * the slow path serializes all contended waiters.
+ */
+ if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0))
+ return 0;
+ atomic_add(sqecount, &rdma->sc_sq_avail);
+
+ /* Slow path: take a ticket and wait in line */
+ ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head);
+
+ percpu_counter_inc(&svcrdma_stat_sq_starve);
+ trace_svcrdma_sq_full(rdma, cid);
+
+ /* Wait until all earlier tickets have been served */
+ wait_event(rdma->sc_sq_ticket_wait,
+ test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
+ atomic_read(&rdma->sc_sq_ticket_tail) == ticket);
+ if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+ goto out_close;
+
+ /* It's our turn. Wait for enough SQ slots to be available. */
+ while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
+ atomic_add(sqecount, &rdma->sc_sq_avail);
+
+ wait_event(rdma->sc_send_wait,
+ test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
+ atomic_read(&rdma->sc_sq_avail) >= sqecount);
+ if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+ goto out_close;
+ }
+
+ /* Slots reserved successfully. Let the next waiter proceed. */
+ atomic_inc(&rdma->sc_sq_ticket_tail);
+ wake_up(&rdma->sc_sq_ticket_wait);
+ trace_svcrdma_sq_retry(rdma, cid);
+ return 0;
+
+out_close:
+ atomic_inc(&rdma->sc_sq_ticket_tail);
+ wake_up(&rdma->sc_sq_ticket_wait);
+ return -ENOTCONN;
+}
+
+/**
+ * svc_rdma_post_send_err - Handle ib_post_send failure
+ * @rdma: controlling transport
+ * @cid: completion ID for tracing
+ * @bad_wr: first WR that was not posted
+ * @first_wr: first WR in the chain
+ * @sqecount: number of SQ entries that were reserved
+ * @ret: error code from ib_post_send
+ *
+ * Return values:
+ * %0: At least one WR was posted; a completion handles cleanup
+ * %-ENOTCONN: No WRs were posted; SQ slots are released
+ */
+int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
+ const struct rpc_rdma_cid *cid,
+ const struct ib_send_wr *bad_wr,
+ const struct ib_send_wr *first_wr,
+ int sqecount, int ret)
+{
+ trace_svcrdma_sq_post_err(rdma, cid, ret);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
+
+ /* If even one WR was posted, a Send completion will
+ * return the reserved SQ slots.
+ */
+ if (bad_wr != first_wr)
+ return 0;
+
+ svc_rdma_wake_send_waiters(rdma, sqecount);
+ return -ENOTCONN;
+}
+
/**
* svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
* @cq: Completion Queue context
@@ -336,11 +447,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
* that these values remain available after the ib_post_send() call.
* In some error flow cases, svc_rdma_wc_send() releases @ctxt.
*
- * Note there is potential for starvation when the Send Queue is
- * full because there is no order to when waiting threads are
- * awoken. The transport is typically provisioned with a deep
- * enough Send Queue that SQ exhaustion should be a rare event.
- *
* Return values:
* %0: @ctxt's WR chain was posted successfully
* %-ENOTCONN: The connection was lost
@@ -362,42 +468,16 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma,
send_wr->sg_list[0].length,
DMA_TO_DEVICE);
- /* If the SQ is full, wait until an SQ entry is available */
- while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) {
- if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
- svc_rdma_wake_send_waiters(rdma, sqecount);
+ ret = svc_rdma_sq_wait(rdma, &cid, sqecount);
+ if (ret < 0)
+ return ret;
- /* When the transport is torn down, assume
- * ib_drain_sq() will trigger enough Send
- * completions to wake us. The XPT_CLOSE test
- * above should then cause the while loop to
- * exit.
- */
- percpu_counter_inc(&svcrdma_stat_sq_starve);
- trace_svcrdma_sq_full(rdma, &cid);
- wait_event(rdma->sc_send_wait,
- atomic_read(&rdma->sc_sq_avail) > 0);
- trace_svcrdma_sq_retry(rdma, &cid);
- continue;
- }
-
- trace_svcrdma_post_send(ctxt);
- ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
- if (ret) {
- trace_svcrdma_sq_post_err(rdma, &cid, ret);
- svc_xprt_deferred_close(&rdma->sc_xprt);
-
- /* If even one WR was posted, there will be a
- * Send completion that bumps sc_sq_avail.
- */
- if (bad_wr == first_wr) {
- svc_rdma_wake_send_waiters(rdma, sqecount);
- break;
- }
- }
- return 0;
- }
- return -ENOTCONN;
+ trace_svcrdma_post_send(ctxt);
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ return svc_rdma_post_send_err(rdma, &cid, bad_wr,
+ first_wr, sqecount, ret);
+ return 0;
}
/**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9b623849723e..b62d0a0ea816 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
init_llist_head(&cma_xprt->sc_recv_ctxts);
init_llist_head(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
+ init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
@@ -478,6 +479,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
+ atomic_set(&newxprt->sc_sq_ticket_head, 0);
+ atomic_set(&newxprt->sc_sq_ticket_tail, 0);
newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
@@ -648,7 +651,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
* If there are already waiters on the SQ,
* return false.
*/
- if (waitqueue_active(&rdma->sc_send_wait))
+ if (waitqueue_active(&rdma->sc_send_wait) ||
+ waitqueue_active(&rdma->sc_sq_ticket_wait))
return 0;
/* Otherwise return true. */
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread

* [PATCH v2 02/18] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
2026-02-27 14:03 ` [PATCH v2 01/18] svcrdma: Add fair queuing for Send Queue access Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 03/18] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
` (15 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
I can't think of a reason why svcrdma is using the PD's device. Most
other consumers of the IB DMA API use the ib_device pointer from the
connection's rdma_cm_id.
I don't believe there's any functional difference between the two,
but it is a little confusing to see some uses of rdma_cm_id->device
and some of ib_pd->device.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e7e4a39ca6c6..0b388d324c4f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -118,7 +118,8 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
- int node = ibdev_to_node(rdma->sc_cm_id->device);
+ struct ib_device *device = rdma->sc_cm_id->device;
+ int node = ibdev_to_node(device);
struct svc_rdma_recv_ctxt *ctxt;
unsigned long pages;
dma_addr_t addr;
@@ -133,9 +134,9 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
- addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
- rdma->sc_max_req_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
+ addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(device, addr))
goto fail2;
svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
@@ -167,7 +168,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
+ ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr,
ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
kfree(ctxt->rc_recv_buf);
kfree(ctxt);
@@ -962,7 +963,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
return 0;
percpu_counter_inc(&svcrdma_stat_recv);
- ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
+ ib_dma_sync_single_for_cpu(rdma_xprt->sc_cm_id->device,
ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
DMA_FROM_DEVICE);
svc_rdma_build_arg_xdr(rqstp, ctxt);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread

* [PATCH v2 03/18] svcrdma: Clean up use of rdma->sc_pd->device
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
2026-02-27 14:03 ` [PATCH v2 01/18] svcrdma: Add fair queuing for Send Queue access Chuck Lever
2026-02-27 14:03 ` [PATCH v2 02/18] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 04/18] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
` (14 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
I can't think of a reason why svcrdma is using the PD's device. Most
other consumers of the IB DMA API use the ib_device pointer from the
connection's rdma_cm_id.
I don't think there's any functional difference between the two, but
it is a little confusing to see some uses of rdma_cm_id and some of
ib_pd.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 22354e12d390..4fff03b96b84 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -116,7 +116,8 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
- int node = ibdev_to_node(rdma->sc_cm_id->device);
+ struct ib_device *device = rdma->sc_cm_id->device;
+ int node = ibdev_to_node(device);
struct svc_rdma_send_ctxt *ctxt;
unsigned long pages;
dma_addr_t addr;
@@ -136,9 +137,9 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail2;
- addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
- rdma->sc_max_req_size, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
+ addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device, addr))
goto fail3;
svc_rdma_send_cid_init(rdma, &ctxt->sc_cid);
@@ -175,15 +176,14 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
*/
void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
{
+ struct ib_device *device = rdma->sc_cm_id->device;
struct svc_rdma_send_ctxt *ctxt;
struct llist_node *node;
while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) {
ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node);
- ib_dma_unmap_single(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr,
- rdma->sc_max_req_size,
- DMA_TO_DEVICE);
+ ib_dma_unmap_single(device, ctxt->sc_sges[0].addr,
+ rdma->sc_max_req_size, DMA_TO_DEVICE);
kfree(ctxt->sc_xprt_buf);
kfree(ctxt->sc_pages);
kfree(ctxt);
@@ -463,7 +463,7 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma,
might_sleep();
/* Sync the transport header buffer */
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
+ ib_dma_sync_single_for_device(rdma->sc_cm_id->device,
send_wr->sg_list[0].addr,
send_wr->sg_list[0].length,
DMA_TO_DEVICE);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread

* [PATCH v2 04/18] svcrdma: Add Write chunk WRs to the RPC's Send WR chain
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (2 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 03/18] svcrdma: Clean up use of rdma->sc_pd->device Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 05/18] svcrdma: Factor out WR chain linking into helper Chuck Lever
` (13 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Previously, Write chunk RDMA Writes were posted via a separate
ib_post_send() call with their own completion handler. Each Write
chunk incurred a doorbell and generated a completion event.
Link Write chunk WRs onto the RPC Reply's Send WR chain so that a
single ib_post_send() call posts both the RDMA Writes and the Send
WR. A single completion event signals that all operations have
finished. This reduces both doorbell rate and completion rate, as
well as eliminating the latency of a round-trip between the Write
chunk completion and the subsequent Send WR posting.
The lifecycle of Write chunk resources changes: previously, the
svc_rdma_write_done() completion handler released Write chunk
resources when RDMA Writes completed. With WR chaining, resources
remain live until the Send completion. A new sc_write_info_list
tracks Write chunk metadata attached to each Send context, and
svc_rdma_write_chunk_release() frees these resources when the
Send context is released.
The svc_rdma_write_done() handler now handles only error cases.
On success it returns immediately since the Send completion handles
resource release. On failure (WR flush), it closes the connection
to signal to the client that the RPC Reply is incomplete.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 13 +++-
net/sunrpc/xprtrdma/svc_rdma_rw.c | 94 ++++++++++++++++++++-------
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 10 ++-
3 files changed, 91 insertions(+), 26 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 658b8498177e..df6e08aaad57 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -216,6 +216,7 @@ struct svc_rdma_recv_ctxt {
*/
struct svc_rdma_write_info {
struct svcxprt_rdma *wi_rdma;
+ struct list_head wi_list;
const struct svc_rdma_chunk *wi_chunk;
@@ -244,7 +245,10 @@ struct svc_rdma_send_ctxt {
struct ib_cqe sc_cqe;
struct xdr_buf sc_hdrbuf;
struct xdr_stream sc_stream;
+
+ struct list_head sc_write_info_list;
struct svc_rdma_write_info sc_reply_info;
+
void *sc_xprt_buf;
int sc_page_count;
int sc_cur_sge_no;
@@ -277,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc,
enum dma_data_direction dir);
+extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt);
extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt);
-extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
- const struct svc_rdma_recv_ctxt *rctxt,
- const struct xdr_buf *xdr);
+extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct xdr_buf *xdr);
extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
const struct svc_rdma_pcl *write_pcl,
const struct svc_rdma_pcl *reply_pcl,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 6626f18de55e..c5d65164eae2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -251,6 +251,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
queue_work(svcrdma_wq, &info->wi_work);
}
+/**
+ * svc_rdma_write_chunk_release - Release Write chunk I/O resources
+ * @rdma: controlling transport
+ * @ctxt: Send context that is being released
+ *
+ * Write chunk resources remain live until Send completion because
+ * Write WRs are chained to the Send WR. This function releases all
+ * write_info structures accumulated on @ctxt->sc_write_info_list.
+ */
+void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
+{
+ struct svc_rdma_write_info *info;
+
+ while (!list_empty(&ctxt->sc_write_info_list)) {
+ info = list_first_entry(&ctxt->sc_write_info_list,
+ struct svc_rdma_write_info, wi_list);
+ list_del(&info->wi_list);
+ svc_rdma_write_info_free(info);
+ }
+}
+
/**
* svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
* @rdma: controlling transport
@@ -307,13 +329,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_chunk_ctxt *cc =
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
- struct svc_rdma_write_info *info =
- container_of(cc, struct svc_rdma_write_info, wi_cc);
switch (wc->status) {
case IB_WC_SUCCESS:
trace_svcrdma_wc_write(&cc->cc_cid);
- break;
+ return;
case IB_WC_WR_FLUSH_ERR:
trace_svcrdma_wc_write_flush(wc, &cc->cc_cid);
break;
@@ -321,12 +341,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
trace_svcrdma_wc_write_err(wc, &cc->cc_cid);
}
- svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
-
- if (unlikely(wc->status != IB_WC_SUCCESS))
- svc_xprt_deferred_close(&rdma->sc_xprt);
-
- svc_rdma_write_info_free(info);
+ /* The RDMA Write has flushed, so the client won't get
+ * some of the outgoing RPC message. Signal the loss
+ * to the client by closing the connection.
+ */
+ svc_xprt_deferred_close(&rdma->sc_xprt);
}
/**
@@ -600,13 +619,27 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
return xdr->len;
}
-static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
- const struct svc_rdma_chunk *chunk,
- const struct xdr_buf *xdr)
+/*
+ * svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain
+ *
+ * Write WRs are prepended to the Send WR chain so that a single
+ * ib_post_send() posts both RDMA Writes and the final Send. Only
+ * the first WR in each chunk gets a CQE for error detection;
+ * subsequent WRs complete without individual completion events.
+ * The Send WR's signaled completion indicates all chained
+ * operations have finished.
+ */
+static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
struct svc_rdma_chunk_ctxt *cc;
+ struct ib_send_wr *first_wr;
struct xdr_buf payload;
+ struct list_head *pos;
+ struct ib_cqe *cqe;
int ret;
if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position,
@@ -622,10 +655,25 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
if (ret != payload.len)
goto out_err;
- trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
- ret = svc_rdma_post_chunk_ctxt(rdma, cc);
- if (ret < 0)
+ ret = -EINVAL;
+ if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth))
goto out_err;
+
+ first_wr = sctxt->sc_wr_chain;
+ cqe = &cc->cc_cqe;
+ list_for_each(pos, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *rwc;
+
+ rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+ sctxt->sc_wr_chain = first_wr;
+ sctxt->sc_sqecount += cc->cc_sqecount;
+ list_add(&info->wi_list, &sctxt->sc_write_info_list);
+
+ trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
return 0;
out_err:
@@ -634,17 +682,19 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
}
/**
- * svc_rdma_send_write_list - Send all chunks on the Write list
+ * svc_rdma_prepare_write_list - Construct WR chain for sending Write list
* @rdma: controlling RDMA transport
* @rctxt: Write list provisioned by the client
+ * @sctxt: Send WR resources
* @xdr: xdr_buf containing an RPC Reply message
*
- * Returns zero on success, or a negative errno if one or more
- * Write chunks could not be sent.
+ * Returns zero on success, or a negative errno if WR chain
+ * construction fails for one or more Write chunks.
*/
-int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
- const struct svc_rdma_recv_ctxt *rctxt,
- const struct xdr_buf *xdr)
+int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_chunk *chunk;
int ret;
@@ -652,7 +702,7 @@ int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
if (!chunk->ch_payload_length)
break;
- ret = svc_rdma_send_write_chunk(rdma, chunk, xdr);
+ ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr);
if (ret < 0)
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4fff03b96b84..263da6f76267 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -150,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
ctxt->sc_cqe.done = svc_rdma_wc_send;
+ INIT_LIST_HEAD(&ctxt->sc_write_info_list);
ctxt->sc_xprt_buf = buffer;
xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
rdma->sc_max_req_size);
@@ -237,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i;
+ svc_rdma_write_chunk_release(rdma, ctxt);
svc_rdma_reply_chunk_release(rdma, ctxt);
if (ctxt->sc_page_count)
@@ -1056,6 +1058,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
sctxt->sc_send_wr.num_sge = 1;
sctxt->sc_send_wr.opcode = IB_WR_SEND;
sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+
+ /* Ensure only the error message is posted, not any previously
+ * prepared Write chunk WRs.
+ */
+ sctxt->sc_wr_chain = &sctxt->sc_send_wr;
+ sctxt->sc_sqecount = 1;
if (svc_rdma_post_send(rdma, sctxt))
goto put_ctxt;
return;
@@ -1103,7 +1111,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (!p)
goto put_ctxt;
- ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res);
+ ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res);
if (ret < 0)
goto put_ctxt;
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread

* [PATCH v2 05/18] svcrdma: Factor out WR chain linking into helper
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (3 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 04/18] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 06/18] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
` (12 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
svc_rdma_prepare_write_chunk() and svc_rdma_prepare_reply_chunk()
contain identical code for linking RDMA R/W work requests onto a
Send context's WR chain. This duplication increases maintenance
burden and risks divergent bug fixes.
Introduce svc_rdma_cc_link_wrs() to consolidate the WR chain
linking logic. The helper walks the chunk context's rwctxts list,
chains each WR via rdma_rw_ctx_wrs(), and updates the Send
context's chain head and SQE count. Completion signaling is
requested only for the tail WR (posted first).
No functional change.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_rw.c | 67 +++++++++++++------------------
1 file changed, 28 insertions(+), 39 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index c5d65164eae2..b1237d81075b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -619,15 +619,32 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
return xdr->len;
}
-/*
- * svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain
- *
- * Write WRs are prepended to the Send WR chain so that a single
- * ib_post_send() posts both RDMA Writes and the final Send. Only
- * the first WR in each chunk gets a CQE for error detection;
- * subsequent WRs complete without individual completion events.
- * The Send WR's signaled completion indicates all chained
- * operations have finished.
+/* Link chunk WRs onto @sctxt's WR chain. Completion is requested
+ * for the tail WR, which is posted first.
+ */
+static void svc_rdma_cc_link_wrs(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ struct svc_rdma_chunk_ctxt *cc)
+{
+ struct ib_send_wr *first_wr;
+ struct list_head *pos;
+ struct ib_cqe *cqe;
+
+ first_wr = sctxt->sc_wr_chain;
+ cqe = &cc->cc_cqe;
+ list_for_each(pos, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *rwc;
+
+ rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+ sctxt->sc_wr_chain = first_wr;
+ sctxt->sc_sqecount += cc->cc_sqecount;
+}
+
+/* Link Write WRs for @chunk onto @sctxt's WR chain.
*/
static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
@@ -636,10 +653,7 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
{
struct svc_rdma_write_info *info;
struct svc_rdma_chunk_ctxt *cc;
- struct ib_send_wr *first_wr;
struct xdr_buf payload;
- struct list_head *pos;
- struct ib_cqe *cqe;
int ret;
if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position,
@@ -659,18 +673,7 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth))
goto out_err;
- first_wr = sctxt->sc_wr_chain;
- cqe = &cc->cc_cqe;
- list_for_each(pos, &cc->cc_rwctxts) {
- struct svc_rdma_rw_ctxt *rwc;
-
- rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
- first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, cqe, first_wr);
- cqe = NULL;
- }
- sctxt->sc_wr_chain = first_wr;
- sctxt->sc_sqecount += cc->cc_sqecount;
+ svc_rdma_cc_link_wrs(rdma, sctxt, cc);
list_add(&info->wi_list, &sctxt->sc_write_info_list);
trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
@@ -732,9 +735,6 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
{
struct svc_rdma_write_info *info = &sctxt->sc_reply_info;
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
- struct ib_send_wr *first_wr;
- struct list_head *pos;
- struct ib_cqe *cqe;
int ret;
info->wi_rdma = rdma;
@@ -748,18 +748,7 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
if (ret < 0)
return ret;
- first_wr = sctxt->sc_wr_chain;
- cqe = &cc->cc_cqe;
- list_for_each(pos, &cc->cc_rwctxts) {
- struct svc_rdma_rw_ctxt *rwc;
-
- rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
- first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, cqe, first_wr);
- cqe = NULL;
- }
- sctxt->sc_wr_chain = first_wr;
- sctxt->sc_sqecount += cc->cc_sqecount;
+ svc_rdma_cc_link_wrs(rdma, sctxt, cc);
trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
return xdr->len;
--
2.53.0
^ permalink raw reply	related	[flat|nested] 19+ messages in thread

* [PATCH v2 06/18] svcrdma: Reduce false sharing in struct svcxprt_rdma
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (4 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 05/18] svcrdma: Factor out WR chain linking into helper Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 07/18] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
` (11 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Several frequently-modified fields in struct svcxprt_rdma reside
in the same cache line, causing false sharing between independent
code paths:
- sc_sq_avail: atomic, modified on every ib_post_send and
completion
- sc_send_lock/sc_send_ctxts: Send context cache, accessed during
reply construction
- sc_rw_ctxt_lock/sc_rw_ctxts: R/W context cache, accessed during
Read/Write chunk processing
Insert ____cacheline_aligned_in_smp annotations to place the Send
context cache, R/W context cache, and receive-path fields into
separate cache lines.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 48 ++++++++++++++++++++++++---------
1 file changed, 36 insertions(+), 12 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index df6e08aaad57..3cc4408831a3 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -73,13 +73,30 @@ extern struct percpu_counter svcrdma_stat_recv;
extern struct percpu_counter svcrdma_stat_sq_starve;
extern struct percpu_counter svcrdma_stat_write;
+/*
+ * struct svcxprt_rdma - server-side RDMA transport
+ *
+ * Fields are grouped into cache-line-aligned zones to avoid false
+ * sharing between concurrent code paths. Each zone is marked with
+ * ____cacheline_aligned_in_smp on its first field.
+ *
+ * SQ reservation sc_sq_avail, ticket ordering, and connection
+ * state -- no alignment constraint (struct head).
+ * Send context cache sc_send_lock, sc_send_ctxts, sc_pd, and
+ * related
+ * R/W context cache sc_rw_ctxt_lock, sc_rw_ctxts, sc_qp, etc.
+ * Receive path sc_pending_recvs, sc_rq_dto_q, etc.
+ *
+ * When adding a field, place it in the zone whose code path modifies the
+ * field under load. Read-only fields can fill padding in any zone that
+ * accesses them. Fields modified by multiple paths remain at the end,
+ * outside any aligned zone.
+ */
struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
struct list_head sc_accept_q; /* Conn. waiting accept */
struct rpcrdma_notification sc_rn; /* removal notification */
- int sc_ord; /* RDMA read limit */
- int sc_max_send_sges;
bool sc_snd_w_inv; /* OK to use Send With Invalidate */
atomic_t sc_sq_avail; /* SQEs ready to be consumed */
@@ -91,23 +108,30 @@ struct svcxprt_rdma {
u32 sc_max_requests; /* Max requests */
u32 sc_max_bc_requests;/* Backward credits */
int sc_max_req_size; /* Size of each RQ WR buf */
- u8 sc_port_num;
- struct ib_pd *sc_pd;
-
- spinlock_t sc_send_lock;
+ /* Send context cache */
+ spinlock_t sc_send_lock ____cacheline_aligned_in_smp;
struct llist_head sc_send_ctxts;
- spinlock_t sc_rw_ctxt_lock;
- struct llist_head sc_rw_ctxts;
+ /* sc_pd accessed during send context alloc */
+ struct ib_pd *sc_pd;
+ int sc_ord; /* RDMA read limit */
+ int sc_max_send_sges;
- u32 sc_pending_recvs;
+ /* R/W context cache */
+ spinlock_t sc_rw_ctxt_lock ____cacheline_aligned_in_smp;
+ struct llist_head sc_rw_ctxts;
+ /* sc_qp and sc_port_num accessed together */
+ struct ib_qp *sc_qp;
+ u8 sc_port_num;
+ struct ib_cq *sc_rq_cq;
+ struct ib_cq *sc_sq_cq;
+
+ /* Receive path */
+ u32 sc_pending_recvs ____cacheline_aligned_in_smp;
u32 sc_recv_batch;
struct list_head sc_rq_dto_q;
struct list_head sc_read_complete_q;
spinlock_t sc_rq_dto_lock;
- struct ib_qp *sc_qp;
- struct ib_cq *sc_rq_cq;
- struct ib_cq *sc_sq_cq;
spinlock_t sc_lock; /* transport lock */
--
2.53.0
^ permalink raw reply	related	[flat|nested] 19+ messages in thread

* [PATCH v2 07/18] svcrdma: Use lock-free list for Receive Queue tracking
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (5 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 06/18] svcrdma: Reduce false sharing in struct svcxprt_rdma Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 08/18] svcrdma: Convert Read completion queue to use lock-free list Chuck Lever
` (10 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
The sc_rq_dto_lock spinlock is acquired on every receive completion
to add the completed receive context to the sc_rq_dto_q list. Under
high message rates this creates contention between softirq contexts
processing completions.
Replace sc_rq_dto_q with a lock-free llist. Receive completions now
use llist_add() which requires no locking. The consumer uses
llist_del_first() to retrieve one item at a time.
The lock remains for sc_read_complete_q, but the primary hot path
(receive completion and consumption) no longer requires it. This
eliminates producer-side contention entirely.
Note that llist provides LIFO ordering rather than FIFO. For
independent RPC requests this has no semantic impact and avoids
the overhead of reversing the list on the consumer side.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 2 +-
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 49 ++++++++++++++++++------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +-
3 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 3cc4408831a3..884a29cecfa0 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -129,7 +129,7 @@ struct svcxprt_rdma {
/* Receive path */
u32 sc_pending_recvs ____cacheline_aligned_in_smp;
u32 sc_recv_batch;
- struct list_head sc_rq_dto_q;
+ struct llist_head sc_rq_dto_q;
struct list_head sc_read_complete_q;
spinlock_t sc_rq_dto_lock;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0b388d324c4f..e99adf14fa9b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -361,11 +361,13 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* All wc fields are now known to be valid */
ctxt->rc_byte_len = wc->byte_len;
- spin_lock(&rdma->sc_rq_dto_lock);
- list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
- /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
+ llist_add(&ctxt->rc_node, &rdma->sc_rq_dto_q);
+ /*
+ * llist_add's cmpxchg provides full memory ordering,
+ * pairing with the smp_rmb in svc_xprt_ready to ensure
+ * the list update is visible before XPT_DATA is observed.
+ */
set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
- spin_unlock(&rdma->sc_rq_dto_lock);
if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
svc_xprt_enqueue(&rdma->sc_xprt);
return;
@@ -388,13 +390,16 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
+ struct llist_node *node;
while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
list_del(&ctxt->rc_list);
svc_rdma_recv_ctxt_put(rdma, ctxt);
}
- while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
- list_del(&ctxt->rc_list);
+ node = llist_del_all(&rdma->sc_rq_dto_q);
+ while (node) {
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+ node = node->next;
svc_rdma_recv_ctxt_put(rdma, ctxt);
}
}
@@ -930,6 +935,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svcxprt_rdma *rdma_xprt =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *ctxt;
+ struct llist_node *node;
int ret;
/* Prevent svc_xprt_release() from releasing pages in rq_pages
@@ -949,13 +955,34 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
svc_rdma_read_complete(rqstp, ctxt);
goto complete;
}
- ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
- if (ctxt)
- list_del(&ctxt->rc_list);
- else
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+
+ node = llist_del_first(&rdma_xprt->sc_rq_dto_q);
+ if (node) {
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+ } else {
+ ctxt = NULL;
/* No new incoming requests, terminate the loop */
clear_bit(XPT_DATA, &xprt->xpt_flags);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+
+ /*
+ * If a completion arrived after llist_del_first but
+ * before clear_bit, the producer's set_bit would be
+ * cleared above. Recheck to close this race window.
+ */
+ if (!llist_empty(&rdma_xprt->sc_rq_dto_q))
+ set_bit(XPT_DATA, &xprt->xpt_flags);
+
+ /* Recheck sc_read_complete_q under lock for the same
+ * reason -- svc_rdma_wc_read_done() may have added an
+ * entry and set XPT_DATA between our earlier unlock
+ * and the clear_bit above.
+ */
+ spin_lock(&rdma_xprt->sc_rq_dto_lock);
+ if (!list_empty(&rdma_xprt->sc_read_complete_q))
+ set_bit(XPT_DATA, &xprt->xpt_flags);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+ }
/* Unblock the transport for the next receive */
svc_xprt_received(xprt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index b62d0a0ea816..ff9bae18a1aa 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -173,7 +173,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
- INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
+ init_llist_head(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
init_llist_head(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
--
2.53.0
^ permalink raw reply	related	[flat|nested] 19+ messages in thread

* [PATCH v2 08/18] svcrdma: Convert Read completion queue to use lock-free list
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (6 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 07/18] svcrdma: Use lock-free list for Receive Queue tracking Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 09/18] svcrdma: Release write chunk resources without re-queuing Chuck Lever
` (9 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Extend the lock-free list conversion to sc_read_complete_q. This
queue tracks receive contexts that have completed RDMA Read
operations for handling Read chunks.
With both sc_rq_dto_q and sc_read_complete_q now using llist,
the sc_rq_dto_lock spinlock is no longer needed and is removed.
This eliminates all locking from the receive and Read completion
paths.
Note that llist provides LIFO ordering rather than FIFO. For
independent RPC requests this has no semantic impact.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 4 +--
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 44 +++++++++---------------
net/sunrpc/xprtrdma/svc_rdma_rw.c | 10 +++---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 5 +--
4 files changed, 24 insertions(+), 39 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 884a29cecfa0..8f6483ed9e5f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -130,8 +130,7 @@ struct svcxprt_rdma {
u32 sc_pending_recvs ____cacheline_aligned_in_smp;
u32 sc_recv_batch;
struct llist_head sc_rq_dto_q;
- struct list_head sc_read_complete_q;
- spinlock_t sc_rq_dto_lock;
+ struct llist_head sc_read_complete_q;
spinlock_t sc_lock; /* transport lock */
@@ -203,7 +202,6 @@ struct svc_rdma_chunk_ctxt {
struct svc_rdma_recv_ctxt {
struct llist_node rc_node;
- struct list_head rc_list;
struct ib_recv_wr rc_recv_wr;
struct ib_cqe rc_cqe;
struct rpc_rdma_cid rc_cid;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e99adf14fa9b..1bd6b0da002f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -108,13 +108,6 @@
static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
-static inline struct svc_rdma_recv_ctxt *
-svc_rdma_next_recv_ctxt(struct list_head *list)
-{
- return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt,
- rc_list);
-}
-
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
@@ -386,14 +379,21 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
* svc_rdma_flush_recv_queues - Drain pending Receive work
* @rdma: svcxprt_rdma being shut down
*
+ * Called from svc_rdma_free() after ib_drain_qp() has blocked until
+ * completion queues are empty and flush_workqueue() has waited for
+ * pending work items. These preceding calls guarantee no concurrent
+ * producers (completion handlers) or consumers (svc_rdma_recvfrom)
+ * can be active, making unsynchronized llist_del_all() safe here.
*/
void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
struct llist_node *node;
- while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
- list_del(&ctxt->rc_list);
+ node = llist_del_all(&rdma->sc_read_complete_q);
+ while (node) {
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+ node = node->next;
svc_rdma_recv_ctxt_put(rdma, ctxt);
}
node = llist_del_all(&rdma->sc_rq_dto_q);
@@ -946,17 +946,13 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = NULL;
- spin_lock(&rdma_xprt->sc_rq_dto_lock);
- ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
- if (ctxt) {
- list_del(&ctxt->rc_list);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+ node = llist_del_first(&rdma_xprt->sc_read_complete_q);
+ if (node) {
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
svc_xprt_received(xprt);
svc_rdma_read_complete(rqstp, ctxt);
goto complete;
}
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
-
node = llist_del_first(&rdma_xprt->sc_rq_dto_q);
if (node) {
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
@@ -968,20 +964,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
/*
* If a completion arrived after llist_del_first but
* before clear_bit, the producer's set_bit would be
- * cleared above. Recheck to close this race window.
+ * cleared above. Recheck both queues to close this
+ * race window.
*/
- if (!llist_empty(&rdma_xprt->sc_rq_dto_q))
+ if (!llist_empty(&rdma_xprt->sc_rq_dto_q) ||
+ !llist_empty(&rdma_xprt->sc_read_complete_q))
set_bit(XPT_DATA, &xprt->xpt_flags);
-
- /* Recheck sc_read_complete_q under lock for the same
- * reason -- svc_rdma_wc_read_done() may have added an
- * entry and set XPT_DATA between our earlier unlock
- * and the clear_bit above.
- */
- spin_lock(&rdma_xprt->sc_rq_dto_lock);
- if (!list_empty(&rdma_xprt->sc_read_complete_q))
- set_bit(XPT_DATA, &xprt->xpt_flags);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
}
/* Unblock the transport for the next receive */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index b1237d81075b..554463c72f1f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -370,11 +370,13 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes,
cc->cc_posttime);
- spin_lock(&rdma->sc_rq_dto_lock);
- list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q);
- /* the unlock pairs with the smp_rmb in svc_xprt_ready */
+ llist_add(&ctxt->rc_node, &rdma->sc_read_complete_q);
+ /*
+ * The implicit barrier of llist_add's cmpxchg pairs with
+ * the smp_rmb in svc_xprt_ready, ensuring the list update
+ * is visible before XPT_DATA is observed.
+ */
set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
- spin_unlock(&rdma->sc_rq_dto_lock);
svc_xprt_enqueue(&rdma->sc_xprt);
return;
case IB_WC_WR_FLUSH_ERR:
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ff9bae18a1aa..9f52d2c6666d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -164,7 +164,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
{
static struct lock_class_key svcrdma_rwctx_lock;
static struct lock_class_key svcrdma_sctx_lock;
- static struct lock_class_key svcrdma_dto_lock;
struct svcxprt_rdma *cma_xprt;
cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
@@ -174,7 +173,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
init_llist_head(&cma_xprt->sc_rq_dto_q);
- INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+ init_llist_head(&cma_xprt->sc_read_complete_q);
init_llist_head(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
init_llist_head(&cma_xprt->sc_rw_ctxts);
@@ -182,8 +181,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
spin_lock_init(&cma_xprt->sc_lock);
- spin_lock_init(&cma_xprt->sc_rq_dto_lock);
- lockdep_set_class(&cma_xprt->sc_rq_dto_lock, &svcrdma_dto_lock);
spin_lock_init(&cma_xprt->sc_send_lock);
lockdep_set_class(&cma_xprt->sc_send_lock, &svcrdma_sctx_lock);
spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
--
2.53.0
^ permalink raw reply	related	[flat|nested] 19+ messages in thread

* [PATCH v2 09/18] svcrdma: Release write chunk resources without re-queuing
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (7 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 08/18] svcrdma: Convert Read completion queue to use lock-free list Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 10/18] svcrdma: Defer send context release to xpo_release_ctxt Chuck Lever
` (8 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Each RDMA Send completion triggers a cascade of work items on the
svcrdma_wq unbound workqueue:
ib_cq_poll_work (on ib_comp_wq, per-CPU)
-> svc_rdma_send_ctxt_put -> queue_work [work item 1]
-> svc_rdma_write_info_free -> queue_work [work item 2]
Every transition through queue_work contends on the unbound
pool's spinlock. Profiling an 8KB NFSv3 read/write workload
over RDMA shows about 4% of total CPU cycles spent on this
lock, with the cascading re-queue of write_info release
contributing roughly 1%.
The initial queue_work in svc_rdma_send_ctxt_put is needed to
move release work off the CQ completion context (which runs on
a per-CPU bound workqueue). However, once executing on
svcrdma_wq, there is no need to re-queue for each write_info
structure. svc_rdma_reply_chunk_release already calls
svc_rdma_cc_release inline, confirming these operations are
safe in workqueue and nfsd thread context alike.
Release write chunk resources inline in
svc_rdma_write_info_free, removing the intermediate
svc_rdma_write_info_free_async work item and the wi_work
field from struct svc_rdma_write_info.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 1 -
net/sunrpc/xprtrdma/svc_rdma_rw.c | 13 ++-----------
2 files changed, 2 insertions(+), 12 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 8f6483ed9e5f..a2d3232593a2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -252,7 +252,6 @@ struct svc_rdma_write_info {
unsigned int wi_next_off;
struct svc_rdma_chunk_ctxt wi_cc;
- struct work_struct wi_work;
};
struct svc_rdma_send_ctxt {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 554463c72f1f..3c18b1ab1d35 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -236,19 +236,10 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
return info;
}
-static void svc_rdma_write_info_free_async(struct work_struct *work)
-{
- struct svc_rdma_write_info *info;
-
- info = container_of(work, struct svc_rdma_write_info, wi_work);
- svc_rdma_cc_release(info->wi_rdma, &info->wi_cc, DMA_TO_DEVICE);
- kfree(info);
-}
-
static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
{
- INIT_WORK(&info->wi_work, svc_rdma_write_info_free_async);
- queue_work(svcrdma_wq, &info->wi_work);
+ svc_rdma_cc_release(info->wi_rdma, &info->wi_cc, DMA_TO_DEVICE);
+ kfree(info);
}
/**
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 10/18] svcrdma: Defer send context release to xpo_release_ctxt
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (8 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 09/18] svcrdma: Release write chunk resources without re-queuing Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 11/18] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
` (7 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Send completion currently queues a work item to an unbound
workqueue for each completed send context. Under load, the
Send Completion handlers contend for the shared workqueue
pool lock.
Replace the workqueue with a per-transport lock-free list
(llist). The Send completion handler appends the send_ctxt
to sc_send_release_list and does no further teardown. The
nfsd thread drains the list in xpo_release_ctxt between
RPCs, performing DMA unmapping, chunk I/O resource release,
and page release in a batch.
This eliminates both the workqueue pool lock and the DMA
unmap cost from the Send completion path. DMA unmapping can
be expensive when an IOMMU is present in strict mode, as
each unmap triggers a synchronous hardware IOTLB
invalidation. Moving it to the nfsd thread, where that
latency is harmless, avoids penalizing completion handler
throughput.
The nfsd threads absorb the release cost at a point where
the client is no longer waiting on a reply, and natural
batching amortizes the overhead when completions arrive
faster than RPCs complete.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 10 ++--
net/sunrpc/xprtrdma/svc_rdma.c | 18 +------
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 ++--
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 68 ++++++++++++++++--------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +-
5 files changed, 59 insertions(+), 50 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index a2d3232593a2..562a5f78cd3f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,7 +66,6 @@ extern unsigned int svcrdma_ord;
extern unsigned int svcrdma_max_requests;
extern unsigned int svcrdma_max_bc_requests;
extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
extern struct percpu_counter svcrdma_stat_read;
extern struct percpu_counter svcrdma_stat_recv;
@@ -89,8 +88,9 @@ extern struct percpu_counter svcrdma_stat_write;
*
* When adding a field, place it in the zone whose code path modifies the
* field under load. Read-only fields can fill padding in any zone that
- * accesses them. Fields modified by multiple paths remain at the end,
- * outside any aligned zone.
+ * accesses them. Fields modified by multiple paths (e.g.
+ * sc_recv_ctxts, sc_send_release_list) remain at the end, outside
+ * any aligned zone.
*/
struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
@@ -140,6 +140,8 @@ struct svcxprt_rdma {
struct llist_head sc_recv_ctxts;
+ struct llist_head sc_send_release_list;
+
atomic_t sc_completion_ids;
};
/* sc_flags */
@@ -257,7 +259,6 @@ struct svc_rdma_write_info {
struct svc_rdma_send_ctxt {
struct llist_node sc_node;
struct rpc_rdma_cid sc_cid;
- struct work_struct sc_work;
struct svcxprt_rdma *sc_rdma;
struct ib_send_wr sc_send_wr;
@@ -321,6 +322,7 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
/* svc_rdma_sendto.c */
extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma);
extern struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 415c0310101f..f67f0612b1a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -264,38 +264,22 @@ static int svc_rdma_proc_init(void)
return rc;
}
-struct workqueue_struct *svcrdma_wq;
-
void svc_rdma_cleanup(void)
{
svc_unreg_xprt_class(&svc_rdma_class);
svc_rdma_proc_cleanup();
- if (svcrdma_wq) {
- struct workqueue_struct *wq = svcrdma_wq;
-
- svcrdma_wq = NULL;
- destroy_workqueue(wq);
- }
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
}
int svc_rdma_init(void)
{
- struct workqueue_struct *wq;
int rc;
- wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0);
- if (!wq)
- return -ENOMEM;
-
rc = svc_rdma_proc_init();
- if (rc) {
- destroy_workqueue(wq);
+ if (rc)
return rc;
- }
- svcrdma_wq = wq;
svc_reg_xprt_class(&svc_rdma_class);
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 1bd6b0da002f..2281f9adc9f3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -244,6 +244,8 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
if (ctxt)
svc_rdma_recv_ctxt_put(rdma, ctxt);
+
+ svc_rdma_send_ctxts_drain(rdma);
}
static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
@@ -379,11 +381,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
* svc_rdma_flush_recv_queues - Drain pending Receive work
* @rdma: svcxprt_rdma being shut down
*
- * Called from svc_rdma_free() after ib_drain_qp() has blocked until
- * completion queues are empty and flush_workqueue() has waited for
- * pending work items. These preceding calls guarantee no concurrent
- * producers (completion handlers) or consumers (svc_rdma_recvfrom)
- * can be active, making unsynchronized llist_del_all() safe here.
+ * Caller must guarantee that @rdma's Send Completion queue is empty and
+ * all send contexts have been released. This guarantees concurrent
+ * producers and consumers are no longer active.
*/
void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 263da6f76267..c8686fdfe788 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -79,21 +79,26 @@
* The ownership of all of the Reply's pages are transferred into that
* ctxt, the Send WR is posted, and sendto returns.
*
- * The svc_rdma_send_ctxt is presented when the Send WR completes. The
- * Send completion handler finally releases the Reply's pages.
+ * The svc_rdma_send_ctxt is presented when the Send WR completes.
+ * The Send completion handler queues the send_ctxt onto the
+ * per-transport sc_send_release_list (a lock-free llist). The
+ * nfsd thread drains sc_send_release_list in xpo_release_ctxt
+ * between RPCs, DMA-unmapping SGEs, releasing chunk I/O
+ * resources and pages, and returning send_ctxts to the free
+ * list in a batch.
*
- * This mechanism also assumes that completions on the transport's Send
- * Completion Queue do not run in parallel. Otherwise a Write completion
- * and Send completion running at the same time could release pages that
- * are still DMA-mapped.
+ * Correctness depends on completions on the transport's Send
+ * Completion Queue being serialized. Otherwise a Write
+ * completion and Send completion running at the same time could
+ * queue a send_ctxt whose pages are still DMA-mapped.
*
* Error Handling
*
* - If the Send WR is posted successfully, it will either complete
* successfully, or get flushed. Either way, the Send completion
- * handler releases the Reply's pages.
- * - If the Send WR cannot be not posted, the forward path releases
- * the Reply's pages.
+ * handler queues the send_ctxt for deferred release.
+ * - If the Send WR cannot be posted, the forward path releases the
+ * Reply's pages.
*
* This handles the case, without the use of page reference counting,
* where two different Write segments send portions of the same page.
@@ -232,8 +237,9 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
goto out;
}
-static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt)
+/* DMA-unmap SGEs and release chunk I/O resources. */
+static void svc_rdma_send_ctxt_unmap(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
{
struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i;
@@ -241,9 +247,6 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
svc_rdma_write_chunk_release(rdma, ctxt);
svc_rdma_reply_chunk_release(rdma, ctxt);
- if (ctxt->sc_page_count)
- release_pages(ctxt->sc_pages, ctxt->sc_page_count);
-
/* The first SGE contains the transport header, which
* remains mapped until @ctxt is destroyed.
*/
@@ -256,30 +259,49 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
ctxt->sc_sges[i].length,
DMA_TO_DEVICE);
}
+}
+
+/* Unmap, release pages, and return send_ctxt to the free list. */
+static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
+{
+ svc_rdma_send_ctxt_unmap(rdma, ctxt);
+
+ if (ctxt->sc_page_count)
+ release_pages(ctxt->sc_pages, ctxt->sc_page_count);
llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
}
-static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
+/**
+ * svc_rdma_send_ctxts_drain - Release completed send_ctxts
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Called from xpo_release_ctxt and during transport teardown.
+ */
+void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma)
{
- struct svc_rdma_send_ctxt *ctxt;
+ struct svc_rdma_send_ctxt *ctxt, *next;
+ struct llist_node *node;
- ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work);
- svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt);
+ node = llist_del_all(&rdma->sc_send_release_list);
+ llist_for_each_entry_safe(ctxt, next, node, sc_node)
+ svc_rdma_send_ctxt_release(rdma, ctxt);
}
/**
- * svc_rdma_send_ctxt_put - Return send_ctxt to free list
+ * svc_rdma_send_ctxt_put - Queue send_ctxt for deferred release
* @rdma: controlling svcxprt_rdma
- * @ctxt: object to return to the free list
+ * @ctxt: send_ctxt to queue for deferred release
*
- * Pages left in sc_pages are DMA unmapped and released.
+ * Queues @ctxt for deferred release via the per-transport
+ * sc_send_release_list. DMA unmapping and page release run
+ * later in svc_rdma_send_ctxts_drain().
*/
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt)
{
- INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async);
- queue_work(svcrdma_wq, &ctxt->sc_work);
+ llist_add(&ctxt->sc_node, &rdma->sc_send_release_list);
}
/**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9f52d2c6666d..719566234277 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -177,6 +177,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
init_llist_head(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
init_llist_head(&cma_xprt->sc_rw_ctxts);
+ init_llist_head(&cma_xprt->sc_send_release_list);
init_waitqueue_head(&cma_xprt->sc_send_wait);
init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
@@ -610,7 +611,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
/* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
- flush_workqueue(svcrdma_wq);
+ svc_rdma_send_ctxts_drain(rdma);
svc_rdma_flush_recv_queues(rdma);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 11/18] svcrdma: Use watermark-based Receive Queue replenishment
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (9 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 10/18] svcrdma: Defer send context release to xpo_release_ctxt Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 12/18] svcrdma: Add per-recv_ctxt chunk context cache Chuck Lever
` (6 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
The current Receive posting strategy posts a small fixed batch of
Receives on every completion when the queue depth drops below the
maximum. At high message rates this results in frequent
ib_post_recv() calls, each incurring doorbell overhead.
The Receive Queue is now provisioned with twice the negotiated
credit limit (sc_max_requests). Replenishment is triggered when the
number of posted Receives drops below the credit limit (the low
watermark), posting enough Receives to refill the queue to capacity.
For a typical configuration with a credit limit of 128:
- Receive Queue depth: 256
- Low watermark: 128 (replenish when half consumed)
- Batch size: ~128 Receives per posting
Tying the watermark to the credit limit rather than a percentage of
queue capacity ensures adequate buffering regardless of the
configured credit limit. Even with a small credit limit, at least
one full credit window remains posted, guaranteeing forward
progress.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 23 +++++++++++++-
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 40 ++++++++++++++++--------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 13 ++++----
3 files changed, 56 insertions(+), 20 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 562a5f78cd3f..ef52af656581 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -128,7 +128,6 @@ struct svcxprt_rdma {
/* Receive path */
u32 sc_pending_recvs ____cacheline_aligned_in_smp;
- u32 sc_recv_batch;
struct llist_head sc_rq_dto_q;
struct llist_head sc_read_complete_q;
@@ -163,6 +162,28 @@ enum {
RPCRDMA_MAX_BC_REQUESTS = 2,
};
+/*
+ * Receive Queue provisioning constants for watermark-based replenishment.
+ *
+ * Queue depth is twice the credit limit to support batched
+ * posting that reduces doorbell overhead. When posted receives
+ * drop below the credit limit (the low watermark),
+ * svc_rdma_wc_receive() posts enough Receives to refill the
+ * queue to capacity.
+ */
+enum {
+ /* Queue depth = sc_max_requests * multiplier */
+ SVCRDMA_RQ_DEPTH_MULT = 2,
+
+ /* Total recv_ctxt pool = sc_max_requests * multiplier
+ * (RQ_DEPTH_MULT for posted receives + 1 for RPCs in process)
+ */
+ SVCRDMA_RECV_CTXT_MULT = SVCRDMA_RQ_DEPTH_MULT + 1,
+
+ /* rdma_rw contexts per request: Read + Write + Reply chunks */
+ SVCRDMA_RW_CTXT_MULT = 3,
+};
+
#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
/**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2281f9adc9f3..a11e845a7113 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -303,10 +303,11 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
unsigned int total;
- /* For each credit, allocate enough recv_ctxts for one
- * posted Receive and one RPC in process.
+ /* Allocate enough recv_ctxts for:
+ * - SVCRDMA_RQ_DEPTH_MULT * sc_max_requests posted on the RQ
+ * - sc_max_requests RPCs in process
*/
- total = (rdma->sc_max_requests * 2) + rdma->sc_recv_batch;
+ total = rdma->sc_max_requests * SVCRDMA_RECV_CTXT_MULT;
while (total--) {
struct svc_rdma_recv_ctxt *ctxt;
@@ -316,7 +317,8 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
}
- return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests);
+ return svc_rdma_refresh_recvs(rdma,
+ rdma->sc_max_requests * SVCRDMA_RQ_DEPTH_MULT);
}
/**
@@ -340,18 +342,30 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
goto flushed;
trace_svcrdma_wc_recv(wc, &ctxt->rc_cid);
- /* If receive posting fails, the connection is about to be
- * lost anyway. The server will not be able to send a reply
- * for this RPC, and the client will retransmit this RPC
- * anyway when it reconnects.
+ /* Watermark-based receive posting: The Receive Queue is
+ * provisioned at SVCRDMA_RQ_DEPTH_MULT times the credit
+ * count (sc_max_requests). When posted Receives drop below
+ * sc_max_requests (the low watermark), this handler posts
+ * enough Receives to refill the queue to capacity.
*
- * Therefore we drop the Receive, even if status was SUCCESS
- * to reduce the likelihood of replayed requests once the
- * client reconnects.
+ * Batched posting reduces doorbell rate compared to posting
+ * a fixed small batch on every completion, while keeping
+ * the Receive Queue populated.
+ *
+ * If posting fails, connection teardown is imminent. No
+ * reply can be sent for this RPC, and the client will
+ * retransmit after reconnecting. Drop the Receive, even
+ * if status was SUCCESS, to reduce replay likelihood after
+ * reconnection.
*/
- if (rdma->sc_pending_recvs < rdma->sc_max_requests)
- if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch))
+ if (rdma->sc_pending_recvs < rdma->sc_max_requests) {
+ unsigned int target =
+ (rdma->sc_max_requests * SVCRDMA_RQ_DEPTH_MULT) -
+ rdma->sc_pending_recvs;
+
+ if (!svc_rdma_refresh_recvs(rdma, target))
goto dropped;
+ }
/* All wc fields are now known to be valid */
ctxt->rc_byte_len = wc->byte_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 719566234277..772f02317895 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -439,7 +439,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = svcrdma_max_requests;
newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
- newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH;
newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
/* Qualify the transport's resource defaults with the
@@ -452,12 +451,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
- rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
- newxprt->sc_recv_batch + 1 /* drain */;
+ rq_depth = (newxprt->sc_max_requests * SVCRDMA_RQ_DEPTH_MULT) +
+ newxprt->sc_max_bc_requests + 1 /* drain */;
if (rq_depth > dev->attrs.max_qp_wr) {
+ unsigned int overhead = newxprt->sc_max_bc_requests + 1;
+
rq_depth = dev->attrs.max_qp_wr;
- newxprt->sc_recv_batch = 1;
- newxprt->sc_max_requests = rq_depth - 2;
+ newxprt->sc_max_requests =
+ (rq_depth - overhead) / SVCRDMA_RQ_DEPTH_MULT;
newxprt->sc_max_bc_requests = 2;
}
@@ -468,7 +469,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
*/
maxpayload = min(xprt->xpt_server->sv_max_payload,
RPCSVC_MAXPAYLOAD_RDMA);
- ctxts = newxprt->sc_max_requests * 3 *
+ ctxts = newxprt->sc_max_requests * SVCRDMA_RW_CTXT_MULT *
rdma_rw_mr_factor(dev, newxprt->sc_port_num,
maxpayload >> PAGE_SHIFT);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 12/18] svcrdma: Add per-recv_ctxt chunk context cache
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (10 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 11/18] svcrdma: Use watermark-based Receive Queue replenishment Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 13/18] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
` (5 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Parsed chunk list (PCL) processing currently allocates a new
svc_rdma_chunk structure via kmalloc for each chunk in every
incoming RPC. These allocations add overhead to the receive path.
Introduce a per-recv_ctxt single-entry cache. Over 99% of RPC Calls
that specify RPC/RDMA chunks provide only a single chunk, so a
single cached chunk handles the common case. Chunks with up to
SVC_RDMA_CHUNK_SEGMAX (4) segments are eligible for caching; larger
chunks fall back to dynamic allocation.
Using per-recv_ctxt caching instead of a per-transport pool avoids
the need for locking or atomic operations, since a recv_ctxt is
used by only one thread at a time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_rdma.h | 2 +
include/linux/sunrpc/svc_rdma_pcl.h | 12 +++++-
net/sunrpc/xprtrdma/svc_rdma_pcl.c | 52 ++++++++++++++++++++++---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 +++--
4 files changed, 65 insertions(+), 11 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ef52af656581..2233dec2ae7d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -225,6 +225,8 @@ struct svc_rdma_chunk_ctxt {
struct svc_rdma_recv_ctxt {
struct llist_node rc_node;
+ struct svcxprt_rdma *rc_rdma;
+ struct svc_rdma_chunk *rc_chunk_cache;
struct ib_recv_wr rc_recv_wr;
struct ib_cqe rc_cqe;
struct rpc_rdma_cid rc_cid;
diff --git a/include/linux/sunrpc/svc_rdma_pcl.h b/include/linux/sunrpc/svc_rdma_pcl.h
index 7516ad0fae80..8afd98dc4737 100644
--- a/include/linux/sunrpc/svc_rdma_pcl.h
+++ b/include/linux/sunrpc/svc_rdma_pcl.h
@@ -22,6 +22,7 @@ struct svc_rdma_chunk {
u32 ch_payload_length;
u32 ch_segcount;
+ u32 ch_segmax; /* allocated segment capacity */
struct svc_rdma_segment ch_segments[];
};
@@ -114,7 +115,16 @@ pcl_chunk_end_offset(const struct svc_rdma_chunk *chunk)
struct svc_rdma_recv_ctxt;
-extern void pcl_free(struct svc_rdma_pcl *pcl);
+/*
+ * Cached chunks have capacity for this many segments.
+ * Typical clients can register up to 120KB per segment, so 4
+ * segments covers most NFS I/O operations. Larger chunks fall
+ * back to kmalloc.
+ */
+#define SVC_RDMA_CHUNK_SEGMAX 4
+
+extern void pcl_free(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_pcl *pcl);
extern bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p);
extern bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p);
extern bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
index 1f8f7dad8b6f..5c13a74b1f9e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_pcl.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
@@ -9,30 +9,70 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
+static struct svc_rdma_chunk *rctxt_chunk_get(struct svc_rdma_recv_ctxt *rctxt)
+{
+ struct svc_rdma_chunk *chunk = rctxt->rc_chunk_cache;
+
+ if (chunk)
+ rctxt->rc_chunk_cache = NULL;
+ return chunk;
+}
+
+static void rctxt_chunk_put(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk)
+{
+ if (rctxt->rc_chunk_cache) {
+ kfree(chunk);
+ return;
+ }
+ rctxt->rc_chunk_cache = chunk;
+}
+
+static void rctxt_chunk_free(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk)
+{
+ if (chunk->ch_segmax == SVC_RDMA_CHUNK_SEGMAX)
+ rctxt_chunk_put(rctxt, chunk);
+ else
+ kfree(chunk);
+}
+
/**
* pcl_free - Release all memory associated with a parsed chunk list
+ * @rctxt: receive context containing @pcl
* @pcl: parsed chunk list
*
*/
-void pcl_free(struct svc_rdma_pcl *pcl)
+void pcl_free(struct svc_rdma_recv_ctxt *rctxt, struct svc_rdma_pcl *pcl)
{
while (!list_empty(&pcl->cl_chunks)) {
struct svc_rdma_chunk *chunk;
chunk = pcl_first_chunk(pcl);
list_del(&chunk->ch_list);
- kfree(chunk);
+ rctxt_chunk_free(rctxt, chunk);
}
}
-static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position)
+static struct svc_rdma_chunk *pcl_alloc_chunk(struct svc_rdma_recv_ctxt *rctxt,
+ u32 segcount, u32 position)
{
struct svc_rdma_chunk *chunk;
+ if (segcount <= SVC_RDMA_CHUNK_SEGMAX) {
+ chunk = rctxt_chunk_get(rctxt);
+ if (chunk)
+ goto out;
+ /* Round up so all fresh allocations are cache-eligible */
+ segcount = SVC_RDMA_CHUNK_SEGMAX;
+ }
+
chunk = kmalloc_flex(*chunk, ch_segments, segcount);
if (!chunk)
return NULL;
+ chunk->ch_segmax = segcount;
+out:
chunk->ch_position = position;
chunk->ch_length = 0;
chunk->ch_payload_length = 0;
@@ -117,7 +157,7 @@ bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
continue;
if (pcl_is_empty(pcl)) {
- chunk = pcl_alloc_chunk(segcount, position);
+ chunk = pcl_alloc_chunk(rctxt, segcount, position);
if (!chunk)
return false;
pcl_insert_position(pcl, chunk);
@@ -172,7 +212,7 @@ bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
chunk = pcl_lookup_position(pcl, position);
if (!chunk) {
- chunk = pcl_alloc_chunk(segcount, position);
+ chunk = pcl_alloc_chunk(rctxt, segcount, position);
if (!chunk)
return false;
pcl_insert_position(pcl, chunk);
@@ -210,7 +250,7 @@ bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
p++; /* skip the list discriminator */
segcount = be32_to_cpup(p++);
- chunk = pcl_alloc_chunk(segcount, 0);
+ chunk = pcl_alloc_chunk(rctxt, segcount, 0);
if (!chunk)
return false;
list_add_tail(&chunk->ch_list, &pcl->cl_chunks);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index a11e845a7113..45edf57c7285 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -123,6 +123,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
GFP_KERNEL, node);
if (!ctxt)
goto fail0;
+ ctxt->rc_rdma = rdma;
ctxt->rc_maxpages = pages;
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
@@ -161,6 +162,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
+ kfree(ctxt->rc_chunk_cache);
ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr,
ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
kfree(ctxt->rc_recv_buf);
@@ -219,10 +221,10 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
*/
release_pages(ctxt->rc_pages, ctxt->rc_page_count);
- pcl_free(&ctxt->rc_call_pcl);
- pcl_free(&ctxt->rc_read_pcl);
- pcl_free(&ctxt->rc_write_pcl);
- pcl_free(&ctxt->rc_reply_pcl);
+ pcl_free(ctxt, &ctxt->rc_call_pcl);
+ pcl_free(ctxt, &ctxt->rc_read_pcl);
+ pcl_free(ctxt, &ctxt->rc_write_pcl);
+ pcl_free(ctxt, &ctxt->rc_reply_pcl);
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
}
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 13/18] svcrdma: clear XPT_DATA on sc_read_complete_q consumption
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (11 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 12/18] svcrdma: Add per-recv_ctxt chunk context cache Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 14/18] svcrdma: retry when receive queues drain transiently Chuck Lever
` (4 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
svc_rdma_wc_read_done() sets XPT_DATA when adding a
completed RDMA Read context to sc_read_complete_q. The
consumer in svc_rdma_recvfrom() takes the context but
leaves XPT_DATA set. The subsequent svc_xprt_received()
clears XPT_BUSY and re-enqueues the transport; because
XPT_DATA remains set, a second thread awakens. That thread
finds both queues empty, accomplishes nothing, and releases
its slot and reservation.
Trace data from a 256KB NFSv3 WRITE workload over RDMA
shows approximately 14 enqueue attempts per RPC, with 62%
returning immediately due to no pending data. The majority
originate from this spurious dispatch path.
After clearing XPT_DATA to acknowledge consumption, the
flag must be recomputed from the state of both queues.
A concurrent producer may call llist_add and then
set_bit(XPT_DATA) between this consumer's llist_del_first
and the clear_bit, causing clear_bit to erase the producer's
signal. An smp_mb__after_atomic() barrier after clear_bit
pairs with the implicit barrier in each producer's llist_add
cmpxchg, ensuring llist_empty rechecks observe any add whose
set_bit was erased. This barrier requirement applies at both
call sites: the new sc_read_complete_q path and the
pre-existing sc_rq_dto_q "both queues empty" path.
A new helper svc_rdma_update_xpt_data() centralizes this
clear/barrier/recheck/set pattern to ensure both locations
maintain the required memory ordering.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 33 ++++++++++++++++---------
1 file changed, 22 insertions(+), 11 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 45edf57c7285..54545fcd8762 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -917,6 +917,25 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp,
trace_svcrdma_read_finished(&ctxt->rc_cid);
}
+/*
+ * Recompute XPT_DATA from queue state after consuming a completion. A
+ * concurrent producer may have called llist_add and then set_bit(XPT_DATA)
+ * between this consumer's llist_del_first and the clear_bit below, causing
+ * clear_bit to erase the producer's signal. The barrier pairs with the
+ * implicit barrier in each producer's llist_add so that the llist_empty
+ * rechecks observe any add whose set_bit was erased.
+ */
+static void svc_rdma_update_xpt_data(struct svcxprt_rdma *rdma)
+{
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+
+ clear_bit(XPT_DATA, &xprt->xpt_flags);
+ smp_mb__after_atomic();
+ if (!llist_empty(&rdma->sc_rq_dto_q) ||
+ !llist_empty(&rdma->sc_read_complete_q))
+ set_bit(XPT_DATA, &xprt->xpt_flags);
+}
+
/**
* svc_rdma_recvfrom - Receive an RPC call
* @rqstp: request structure into which to receive an RPC Call
@@ -965,6 +984,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
node = llist_del_first(&rdma_xprt->sc_read_complete_q);
if (node) {
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+
+ svc_rdma_update_xpt_data(rdma_xprt);
svc_xprt_received(xprt);
svc_rdma_read_complete(rqstp, ctxt);
goto complete;
@@ -975,17 +996,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
} else {
ctxt = NULL;
/* No new incoming requests, terminate the loop */
- clear_bit(XPT_DATA, &xprt->xpt_flags);
-
- /*
- * If a completion arrived after llist_del_first but
- * before clear_bit, the producer's set_bit would be
- * cleared above. Recheck both queues to close this
- * race window.
- */
- if (!llist_empty(&rdma_xprt->sc_rq_dto_q) ||
- !llist_empty(&rdma_xprt->sc_read_complete_q))
- set_bit(XPT_DATA, &xprt->xpt_flags);
+ svc_rdma_update_xpt_data(rdma_xprt);
}
/* Unblock the transport for the next receive */
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 14/18] svcrdma: retry when receive queues drain transiently
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (12 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 13/18] svcrdma: clear XPT_DATA on sc_read_complete_q consumption Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 15/18] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever
` (3 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
When svc_rdma_recvfrom finds both sc_read_complete_q
and sc_rq_dto_q empty, svc_rdma_update_xpt_data clears
XPT_DATA, executes a barrier, and rechecks the queues.
If a completion arrived between the llist_del_first and
the recheck, XPT_DATA is re-set, but recvfrom returns
zero regardless. The thread then traverses the full
svc_recv cycle -- page allocation, dequeue, recvfrom,
release -- only to find the item that was already
available at the time of the recheck.
Trace data from a 256KB NFSv3 workload over RDMA shows
267,848 of 464,355 transport dequeues (57.7%) are these
empty bounces. Each bounce costs roughly 37 us. During
the READ phase, empty bounces consume 8.6% of thread
capacity and inflate inter-RPC gaps by an average of
87 us.
The calling thread holds XPT_BUSY for the duration, so
no other consumer can drain the queue between the
recheck and the retry. A retry is therefore guaranteed
to find data on its first iteration.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 54545fcd8762..d274b03b1958 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -981,6 +981,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = NULL;
+retry:
node = llist_del_first(&rdma_xprt->sc_read_complete_q);
if (node) {
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
@@ -995,8 +996,19 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
} else {
ctxt = NULL;
- /* No new incoming requests, terminate the loop */
svc_rdma_update_xpt_data(rdma_xprt);
+ /*
+ * A completion may have arrived between the
+ * llist_del_first above and the queue recheck
+ * inside svc_rdma_update_xpt_data. This thread
+ * holds XPT_BUSY, preventing any other consumer
+ * from draining the queue in the meantime.
+ * Retry at most once to avoid a full svc_recv
+ * round-trip: the second iteration is guaranteed
+ * to find data or clear XPT_DATA.
+ */
+ if (test_bit(XPT_DATA, &xprt->xpt_flags))
+ goto retry;
}
/* Unblock the transport for the next receive */
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 15/18] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (13 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 14/18] svcrdma: retry when receive queues drain transiently Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 16/18] sunrpc: skip svc_xprt_enqueue when no work is pending Chuck Lever
` (2 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
svc_rdma_wc_receive() sets XPT_DATA when adding a
completed Receive to sc_rq_dto_q. When
svc_rdma_recvfrom() consumes the item from sc_rq_dto_q,
XPT_DATA is left set. The subsequent svc_xprt_received()
clears XPT_BUSY and re-enqueues the transport; because
stale XPT_DATA remains set, svc_xprt_enqueue() dispatches
a second thread. That thread finds both queues empty,
accomplishes nothing, and returns zero.
Trace data from a 256KB NFSv3 workload over RDMA shows
172,280 of 467,171 transport dequeues (36.9%) are these
spurious dispatches. The READ phase averages 1.99
dequeues per RPC (expected 1.0) and the WRITE phase
averages 2.77 (expected 2.0). Each wasted cycle traverses
svc_alloc_arg, svc_thread_wait_for_work,
svc_rdma_recvfrom, and svc_xprt_release before the
thread can accept new work.
Add svc_rdma_update_xpt_data() on the sc_rq_dto_q
success path, matching the existing call on the
sc_read_complete_q path added by commit 6807f36a39b7
("svcrdma: clear XPT_DATA on sc_read_complete_q
consumption"). The same barrier semantics apply: the
clear/recheck pattern in svc_rdma_update_xpt_data()
ensures a concurrent producer's llist_add + set_bit
is not lost.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index d274b03b1958..79e9ca9f44dc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -994,6 +994,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
node = llist_del_first(&rdma_xprt->sc_rq_dto_q);
if (node) {
ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+ svc_rdma_update_xpt_data(rdma_xprt);
} else {
ctxt = NULL;
svc_rdma_update_xpt_data(rdma_xprt);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v2 16/18] sunrpc: skip svc_xprt_enqueue when no work is pending
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (14 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 15/18] svcrdma: clear XPT_DATA on sc_rq_dto_q consumption Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 17/18] sunrpc: skip svc_xprt_enqueue in svc_xprt_received when idle Chuck Lever
2026-02-27 14:03 ` [PATCH v2 18/18] sunrpc: Skip xpt_reserved accounting for non-UDP transports Chuck Lever
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
svc_reserve() and svc_xprt_release_slot() call
svc_xprt_enqueue() after modifying xpt_reserved or
xpt_nr_rqsts. The purpose is to re-dispatch the
transport when write-space or a slot becomes available.
However, when neither XPT_DATA nor XPT_DEFERRED is
set, no thread can make progress on the transport and
the enqueue accomplishes nothing.
Trace data from a 256KB NFSv3 WRITE workload over RDMA
shows 11.2 svc_xprt_enqueue() calls per RPC. Of these,
6.9 per RPC lack XPT_DATA and exit svc_xprt_ready()
immediately, having performed only the smp_rmb(), the
READ_ONCE(), and a tracepoint before accomplishing
nothing. svc_reserve() and svc_xprt_release_slot()
A new helper, svc_xprt_resource_released(), checks
XPT_DATA | XPT_DEFERRED before calling
svc_xprt_enqueue(). The existing smp_wmb() barriers
are upgraded to smp_mb() to ensure the flags check
observes a concurrent producer's set_bit(XPT_DATA).
Each producer (svc_rdma_wc_receive, etc.) both sets
XPT_DATA and calls svc_xprt_enqueue(), so even if the
check reads a stale value, the producer's own enqueue
provides a fallback path.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/svc_xprt.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 56a663b8939f..73149280167c 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -425,13 +425,28 @@ static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt)
return true;
}
+/*
+ * After a caller releases write-space or a request slot,
+ * re-enqueue the transport only when there is pending
+ * work that a thread could act on. The smp_mb() pairs
+ * with the smp_rmb() in svc_xprt_ready() and orders the
+ * preceding counter update before the flags read so a
+ * concurrent set_bit(XPT_DATA) is visible here.
+ */
+static void svc_xprt_resource_released(struct svc_xprt *xprt)
+{
+ smp_mb();
+ if (READ_ONCE(xprt->xpt_flags) &
+ (BIT(XPT_DATA) | BIT(XPT_DEFERRED)))
+ svc_xprt_enqueue(xprt);
+}
+
static void svc_xprt_release_slot(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = rqstp->rq_xprt;
if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
atomic_dec(&xprt->xpt_nr_rqsts);
- smp_wmb(); /* See smp_rmb() in svc_xprt_ready() */
- svc_xprt_enqueue(xprt);
+ svc_xprt_resource_released(xprt);
}
}
@@ -525,10 +540,10 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
space += rqstp->rq_res.head[0].iov_len;
if (xprt && space < rqstp->rq_reserved) {
- atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
+ atomic_sub((rqstp->rq_reserved - space),
+ &xprt->xpt_reserved);
rqstp->rq_reserved = space;
- smp_wmb(); /* See smp_rmb() in svc_xprt_ready() */
- svc_xprt_enqueue(xprt);
+ svc_xprt_resource_released(xprt);
}
}
EXPORT_SYMBOL_GPL(svc_reserve);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread

* [PATCH v2 17/18] sunrpc: skip svc_xprt_enqueue in svc_xprt_received when idle
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (15 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 16/18] sunrpc: skip svc_xprt_enqueue when no work is pending Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
2026-02-27 14:03 ` [PATCH v2 18/18] sunrpc: Skip xpt_reserved accounting for non-UDP transports Chuck Lever
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
svc_xprt_received() unconditionally calls
svc_xprt_enqueue() after clearing XPT_BUSY. When no
work flags are pending, the enqueue traverses
svc_xprt_ready() -- executing an smp_rmb(), READ_ONCE(),
and tracepoint -- before returning false.
Trace data from a 256KB NFSv3 workload over RDMA shows
85% of svc_xprt_received() invocations reach
svc_xprt_enqueue() with no pending work flags. In the
WRITE phase, 167,335 of 196,420 calls find no work; in
the READ phase, 97,165 of 98,276. Each unnecessary call
executes a memory barrier, a flags read, and (when
tracing is active) fires the svc_xprt_enqueue
tracepoint.
Add a flags pre-check between clear_bit(XPT_BUSY) and
svc_xprt_enqueue(). Both the clear and the subsequent
READ_ONCE operate on the same xpt_flags word, so
cache-line serialization of the atomic bitops ensures
the read observes any flag set by a concurrent producer
before the line was acquired for the clear. If a
producer's set_bit occurs after the clear_bit, that
producer's own svc_xprt_enqueue() call observes
!XPT_BUSY and dispatches the transport.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/svc_xprt.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 73149280167c..36c8437cfd8d 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -234,7 +234,19 @@ void svc_xprt_received(struct svc_xprt *xprt)
svc_xprt_get(xprt);
smp_mb__before_atomic();
clear_bit(XPT_BUSY, &xprt->xpt_flags);
- svc_xprt_enqueue(xprt);
+
+ /*
+ * Skip the enqueue when no actionable flags are set.
+ * Each producer both sets its flag (XPT_DATA, XPT_CLOSE,
+ * etc.) and calls svc_xprt_enqueue(); if a set_bit races
+ * with this check, the producer's own enqueue observes
+ * !XPT_BUSY and dispatches the transport.
+ */
+ if (READ_ONCE(xprt->xpt_flags) &
+ (BIT(XPT_CONN) | BIT(XPT_CLOSE) | BIT(XPT_HANDSHAKE) |
+ BIT(XPT_DATA) | BIT(XPT_DEFERRED)))
+ svc_xprt_enqueue(xprt);
+
svc_xprt_put(xprt);
}
EXPORT_SYMBOL_GPL(svc_xprt_received);
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread

* [PATCH v2 18/18] sunrpc: Skip xpt_reserved accounting for non-UDP transports
2026-02-27 14:03 [PATCH v2 00/18] svcrdma performance scalability enhancements Chuck Lever
` (16 preceding siblings ...)
2026-02-27 14:03 ` [PATCH v2 17/18] sunrpc: skip svc_xprt_enqueue in svc_xprt_received when idle Chuck Lever
@ 2026-02-27 14:03 ` Chuck Lever
17 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2026-02-27 14:03 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
The xpt_reserved counter exists for UDP socket-buffer back-pressure:
svc_udp_has_wspace() is the only has_wspace implementation that
consults it. Neither svc_tcp_has_wspace() nor svc_rdma_has_wspace()
read this counter.
On TCP and RDMA transports, svc_reserve() fires twice per RPC, each
time executing an atomic_sub on xpt_reserved, smp_mb(), and a
svc_xprt_enqueue() attempt that bails on XPT_BUSY. At 257K ops/sec
over NFS/RDMA, this is measurable dead overhead.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/sunrpc/svc_xprt.h | 2 ++
net/sunrpc/svc_xprt.c | 22 +++++++++++++---------
net/sunrpc/svcsock.c | 1 +
3 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index da2a2531e110..077cec38ed8d 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -37,6 +37,8 @@ struct svc_xprt_class {
struct list_head xcl_list;
u32 xcl_max_payload;
int xcl_ident;
+ u32 xcl_flags;
+#define SVC_XPRT_FLAG_WSPACE_RESERVE BIT(0)
};
/*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 36c8437cfd8d..94d21b68c1f8 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -468,11 +468,11 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
/*
* If another cpu has recently updated xpt_flags,
- * sk_sock->flags, xpt_reserved, or xpt_nr_rqsts, we need to
- * know about it; otherwise it's possible that both that cpu and
- * this one could call svc_xprt_enqueue() without either
- * svc_xprt_enqueue() recognizing that the conditions below
- * are satisfied, and we could stall indefinitely:
+ * sk_sock->flags, xpt_reserved (UDP only), or xpt_nr_rqsts,
+ * we need to know about it; otherwise it's possible that both
+ * that cpu and this one could call svc_xprt_enqueue() without
+ * either svc_xprt_enqueue() recognizing that the conditions
+ * below are satisfied, and we could stall indefinitely:
*/
smp_rmb();
xpt_flags = READ_ONCE(xprt->xpt_flags);
@@ -552,10 +552,13 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
space += rqstp->rq_res.head[0].iov_len;
if (xprt && space < rqstp->rq_reserved) {
- atomic_sub((rqstp->rq_reserved - space),
- &xprt->xpt_reserved);
+ if (xprt->xpt_class->xcl_flags & SVC_XPRT_FLAG_WSPACE_RESERVE) {
+ atomic_sub((rqstp->rq_reserved - space),
+ &xprt->xpt_reserved);
+ }
rqstp->rq_reserved = space;
- svc_xprt_resource_released(xprt);
+ if (xprt->xpt_class->xcl_flags & SVC_XPRT_FLAG_WSPACE_RESERVE)
+ svc_xprt_resource_released(xprt);
}
}
EXPORT_SYMBOL_GPL(svc_reserve);
@@ -834,7 +837,8 @@ static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
rqstp->rq_reserved = serv->sv_max_mesg;
- atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+ if (xprt->xpt_class->xcl_flags & SVC_XPRT_FLAG_WSPACE_RESERVE)
+ atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
if (len <= 0)
goto out;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f28c6076f7e8..ce840f8e86c6 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -828,6 +828,7 @@ static struct svc_xprt_class svc_udp_class = {
.xcl_ops = &svc_udp_ops,
.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
.xcl_ident = XPRT_TRANSPORT_UDP,
+ .xcl_flags = SVC_XPRT_FLAG_WSPACE_RESERVE,
};
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
--
2.53.0
^ permalink raw reply related [flat|nested] 19+ messages in thread