[PATCH net-next v5 1/2] net/rds: Add per cp work queue
From: Allison Henderson @ 2026-01-09 22:48 UTC
To: netdev
Cc: pabeni, edumazet, rds-devel, kuba, horms, linux-rdma,
allison.henderson
From: Allison Henderson <allison.henderson@oracle.com>
This patch adds a per-connection workqueue which can be initialized
and used independently of the globally shared rds_wq.

This patch is the first in a series that aims to address TCP ACK
timeouts during the TCP socket shutdown sequence.

This initial refactoring lays the groundwork needed to alleviate
queue congestion during heavy reads and writes. The independently
managed queues will allow shutdowns and reconnects to respond more
quickly, before the peer(s) time out waiting for the expected ACKs.
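
As a minimal sketch of the pattern (the *_example names are
illustrative, not part of this patch; cp_wq, cp_send_w and rds_wq
are the real identifiers):

	#include <linux/workqueue.h>

	struct rds_conn_path_example {
		struct workqueue_struct *cp_wq;	/* per-path queue (this patch) */
		struct delayed_work cp_send_w;
	};

	/* Before: queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
	 * After: the queue is taken from the path itself, so paths no
	 * longer have to share a queue once 2/2 allocates one per path.
	 */
	static void example_kick_send(struct rds_conn_path_example *cp)
	{
		queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
	}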
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/cong.c | 2 +-
net/rds/connection.c | 5 +++--
net/rds/ib_recv.c | 2 +-
net/rds/ib_send.c | 4 ++--
net/rds/rds.h | 1 +
net/rds/send.c | 9 +++++----
net/rds/tcp_recv.c | 2 +-
net/rds/tcp_send.c | 2 +-
net/rds/threads.c | 16 ++++++++--------
9 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 8b689ebbd5b5..ac1f120c10f9 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -242,7 +242,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* therefore trigger warnings.
* Defer the xmit to rds_send_worker() instead.
*/
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
}
rcu_read_unlock();
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 68bc88cce84e..dc7323707f45 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -269,6 +269,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
__rds_conn_path_init(conn, &conn->c_path[i],
is_outgoing);
conn->c_path[i].cp_index = i;
+ conn->c_path[i].cp_wq = rds_wq;
}
rcu_read_lock();
if (rds_destroy_pending(conn))
@@ -884,7 +885,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
rcu_read_unlock();
return;
}
- queue_work(rds_wq, &cp->cp_down_w);
+ queue_work(cp->cp_wq, &cp->cp_down_w);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_drop);
@@ -909,7 +910,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
}
if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
- queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 4248dfa816eb..357128d34a54 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -457,7 +457,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
(must_wake ||
(can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
rds_ib_ring_empty(&ic->i_recv_ring))) {
- queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+ queue_delayed_work(conn->c_path->cp_wq, &conn->c_recv_w, 1);
}
if (can_wait)
cond_resched();
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 4190b90ff3b1..f9d28ddd168d 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -297,7 +297,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
@@ -419,7 +419,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0);
WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index a029e5fcdea7..b35afa2658cc 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -118,6 +118,7 @@ struct rds_conn_path {
void *cp_transport_data;
+ struct workqueue_struct *cp_wq;
atomic_t cp_state;
unsigned long cp_send_gen;
unsigned long cp_flags;
diff --git a/net/rds/send.c b/net/rds/send.c
index 0b3d0ef2f008..3e3d028bc21e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -458,7 +458,8 @@ int rds_send_xmit(struct rds_conn_path *cp)
if (rds_destroy_pending(cp->cp_conn))
ret = -ENETUNREACH;
else
- queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ queue_delayed_work(cp->cp_wq,
+ &cp->cp_send_w, 1);
rcu_read_unlock();
} else if (raced) {
rds_stats_inc(s_send_lock_queue_raced);
@@ -1380,7 +1381,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rds_destroy_pending(cpath->cp_conn))
ret = -ENETUNREACH;
else
- queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
+ queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1);
rcu_read_unlock();
}
if (ret)
@@ -1470,10 +1471,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
- /* schedule the send work on rds_wq */
+ /* schedule the send work on cp_wq */
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1);
rcu_read_unlock();
rds_message_put(rm);
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 7997a19d1da3..b7cf7f451430 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -327,7 +327,7 @@ void rds_tcp_data_ready(struct sock *sk)
if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
rcu_read_unlock();
}
out:
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7d284ac7e81a..4e82c9644aa6 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -201,7 +201,7 @@ void rds_tcp_write_space(struct sock *sk)
rcu_read_lock();
if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
rcu_read_unlock();
out:
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 1f424cbfcbb4..639302bab51e 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -89,8 +89,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
set_bit(0, &cp->cp_conn->c_map_queued);
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn)) {
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
}
rcu_read_unlock();
cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
@@ -140,7 +140,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0);
rcu_read_unlock();
return;
}
@@ -151,7 +151,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
conn, &conn->c_laddr, &conn->c_faddr);
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_conn_w,
+ queue_delayed_work(cp->cp_wq, &cp->cp_conn_w,
rand % cp->cp_reconnect_jiffies);
rcu_read_unlock();
@@ -203,11 +203,11 @@ void rds_send_worker(struct work_struct *work)
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_send_immediate_retry);
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
- queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 2);
break;
default:
break;
@@ -228,11 +228,11 @@ void rds_recv_worker(struct work_struct *work)
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 2);
break;
default:
break;
--
2.43.0
[PATCH net-next v5 2/2] net/rds: Give each connection path its own workqueue
From: Allison Henderson @ 2026-01-09 22:48 UTC
To: netdev
Cc: pabeni, edumazet, rds-devel, kuba, horms, linux-rdma,
allison.henderson
From: Allison Henderson <allison.henderson@oracle.com>
RDS was written to require ordered workqueues for "cp->cp_wq":
Work is executed in the order scheduled, one item at a time.
If these workqueues are shared across connections,
then work executed on behalf of one connection blocks work
scheduled for a different and unrelated connection.
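
A hypothetical sketch of that head-of-line blocking (example names
only; an ordered workqueue has max_active == 1):

	#include <linux/errno.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *shared_wq;

	static void conn_a_shutdown(struct work_struct *w)
	{
		/* blocks, e.g. waiting out a peer during shutdown... */
	}

	static void conn_b_reconnect(struct work_struct *w)
	{
		/* ...unrelated work, yet it cannot start until A returns */
	}

	static DECLARE_WORK(conn_a_w, conn_a_shutdown);
	static DECLARE_WORK(conn_b_w, conn_b_reconnect);

	static int example_queue_both(void)
	{
		shared_wq = alloc_ordered_workqueue("example_shared", 0);
		if (!shared_wq)
			return -ENOMEM;
		queue_work(shared_wq, &conn_a_w);
		queue_work(shared_wq, &conn_b_w);	/* stuck behind conn_a_w */
		return 0;
	}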
Luckily we don't need to share these workqueues.
While it obviously makes sense to limit the number of
workers (processes) that ought to be allocated on a system,
a workqueue that doesn't have a rescue worker attached
has a tiny footprint compared to the connection as a whole:
a workqueue costs ~900 bytes, including the workqueue_struct,
pool_workqueue, workqueue_attrs, wq_node_nr_active and the
node_nr_active flex array. Each connection can have up to 8
(RDS_MPATH_WORKERS) paths, for a worst case of ~7 KBytes per
connection, while an RDS/IB connection totals ~5 MBytes.

So we're getting a significant performance gain
(90% of connections fail over in under 3 seconds vs. 40%)
for a per-workqueue overhead of less than 0.02%
(~900 bytes of ~5 MBytes; even the 8-path worst case is
only ~0.14%).
RDS doesn't even benefit from the additional rescue workers:
of all the reasons that RDS blocks workers, allocation under
memory pressure is the least of our concerns. And even if RDS
were stalling due to the memory-reclaim process, the work
executed by the rescue workers is highly unlikely to free up
any memory. If anything, it might try to allocate even more.
By giving each connection path its own workqueue, we allow
RDS to better utilize the unbound workers that the system
has available.
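
Condensed from the hunks below, the per-path lifecycle pairs the
allocation with a guarded teardown (falling back to the shared
rds_wq preserves the old behavior if allocation fails):

	cp->cp_wq = alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0,
					    rds_conn_count, i);
	if (!cp->cp_wq)
		cp->cp_wq = rds_wq;		/* degrade to the shared queue */

	/* ...and in rds_conn_path_destroy(): */
	if (cp->cp_wq != rds_wq) {
		destroy_workqueue(cp->cp_wq);	/* drains remaining work */
		cp->cp_wq = NULL;
	}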
Signed-off-by: Somasundaram Krishnasamy <somasundaram.krishnasamy@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/connection.c | 25 +++++++++++++++++++++----
1 file changed, 21 insertions(+), 4 deletions(-)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index dc7323707f45..e920c685e4f2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -169,6 +169,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
struct rds_transport *loop_trans;
+ struct rds_conn_path *free_cp = NULL;
unsigned long flags;
int ret, i;
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
@@ -269,7 +270,11 @@ static struct rds_connection *__rds_conn_create(struct net *net,
__rds_conn_path_init(conn, &conn->c_path[i],
is_outgoing);
conn->c_path[i].cp_index = i;
- conn->c_path[i].cp_wq = rds_wq;
+ conn->c_path[i].cp_wq =
+ alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0,
+ rds_conn_count, i);
+ if (!conn->c_path[i].cp_wq)
+ conn->c_path[i].cp_wq = rds_wq;
}
rcu_read_lock();
if (rds_destroy_pending(conn))
@@ -278,7 +283,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
ret = trans->conn_alloc(conn, GFP_ATOMIC);
if (ret) {
rcu_read_unlock();
- kfree(conn->c_path);
+ free_cp = conn->c_path;
kmem_cache_free(rds_conn_slab, conn);
conn = ERR_PTR(ret);
goto out;
@@ -301,7 +306,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
/* Creating passive conn */
if (parent->c_passive) {
trans->conn_free(conn->c_path[0].cp_transport_data);
- kfree(conn->c_path);
+ free_cp = conn->c_path;
kmem_cache_free(rds_conn_slab, conn);
conn = parent->c_passive;
} else {
@@ -328,7 +333,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
if (cp->cp_transport_data)
trans->conn_free(cp->cp_transport_data);
}
- kfree(conn->c_path);
+ free_cp = conn->c_path;
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
@@ -343,6 +348,13 @@ static struct rds_connection *__rds_conn_create(struct net *net,
rcu_read_unlock();
out:
+ if (free_cp) {
+ for (i = 0; i < npaths; i++)
+ if (free_cp[i].cp_wq != rds_wq)
+ destroy_workqueue(free_cp[i].cp_wq);
+ kfree(free_cp);
+ }
+
return conn;
}
@@ -470,6 +482,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
WARN_ON(delayed_work_pending(&cp->cp_conn_w));
WARN_ON(work_pending(&cp->cp_down_w));
+ if (cp->cp_wq != rds_wq) {
+ destroy_workqueue(cp->cp_wq);
+ cp->cp_wq = NULL;
+ }
+
cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
}
--
2.43.0