* [PATCH net-next v1 1/3] net/rds: Delegate fan-out to a background worker
2026-02-06 2:24 [PATCH net-next v1 0/3] net/rds: RDS-TCP reconnect and fanout improvements Allison Henderson
@ 2026-02-06 2:24 ` Allison Henderson
2026-02-06 2:24 ` [PATCH net-next v1 2/3] net/rds: Use proper peer port number even when not connected Allison Henderson
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Allison Henderson @ 2026-02-06 2:24 UTC (permalink / raw)
To: netdev
Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
linux-rdma, allison.henderson
From: Gerd Rausch <gerd.rausch@oracle.com>
Delegate fan-out to a background worker in order to allow
kernel_getpeername() to acquire a lock on the socket.
This has become necessary since the introduction of
commit 9dfc685e0262d ("inet: remove races in inet{6}_getname()").
The socket is already locked in the context in which
"kernel_getpeername" used to get called, by either
"rds_tcp_recv_path" or "tcp_v{4,6}_rcv",
therefore causing a deadlock.
Luckily, the fan-out need not happen in-context nor fast,
so we can easily just do the same in a background worker.
Also, while we're doing this, we get rid of the unused
struct members "t_conn_w", "t_send_w", "t_down_w" & "t_recv_w".
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/tcp.c | 3 +++
net/rds/tcp.h | 7 ++----
net/rds/tcp_connect.c | 2 ++
net/rds/tcp_listen.c | 54 +++++++++++++++++++++++++++++++------------
4 files changed, 46 insertions(+), 20 deletions(-)
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 45484a93d75f..02f8f928c20b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -358,6 +358,8 @@ static void rds_tcp_conn_free(void *arg)
rdsdebug("freeing tc %p\n", tc);
+ cancel_work_sync(&tc->t_fan_out_w);
+
spin_lock_irqsave(&rds_tcp_conn_lock, flags);
if (!tc->t_tcp_node_detached)
list_del(&tc->t_tcp_node);
@@ -384,6 +386,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
tc->t_tinc = NULL;
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
+ INIT_WORK(&tc->t_fan_out_w, rds_tcp_fan_out_w);
init_waitqueue_head(&tc->t_recv_done_waitq);
conn->c_path[i].cp_transport_data = tc;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 39c86347188c..9ecb0b6b658a 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -44,11 +44,7 @@ struct rds_tcp_connection {
size_t t_tinc_hdr_rem;
size_t t_tinc_data_rem;
- /* XXX error report? */
- struct work_struct t_conn_w;
- struct work_struct t_send_w;
- struct work_struct t_down_w;
- struct work_struct t_recv_w;
+ struct work_struct t_fan_out_w;
/* for info exporting only */
struct list_head t_list_item;
@@ -90,6 +86,7 @@ void rds_tcp_state_change(struct sock *sk);
struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
+void rds_tcp_fan_out_w(struct work_struct *work);
void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out);
int rds_tcp_accept_one(struct rds_tcp_net *rtn);
void rds_tcp_keepalive(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index b77c88ffb199..6954b8c479f1 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -115,6 +115,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
return -EAGAIN;
+ cancel_work_sync(&tc->t_fan_out_w);
+
mutex_lock(&tc->t_conn_path_lock);
if (rds_conn_path_up(cp)) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 6fb5c928b8fd..8fb8f7d26683 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -123,27 +123,20 @@ rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
return NULL;
}
-void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+void rds_tcp_fan_out_w(struct work_struct *work)
{
- struct rds_tcp_connection *tc;
- struct rds_tcp_net *rtn;
- struct socket *sock;
+ struct rds_tcp_connection *tc = container_of(work,
+ struct rds_tcp_connection,
+ t_fan_out_w);
+ struct rds_connection *conn = tc->t_cpath->cp_conn;
+ struct rds_tcp_net *rtn = tc->t_rtn;
+ struct socket *sock = tc->t_sock;
int sport, npaths;
- if (rds_destroy_pending(conn))
- return;
-
- tc = conn->c_path->cp_transport_data;
- rtn = tc->t_rtn;
- if (!rtn)
- return;
-
- sock = tc->t_sock;
-
/* During fan-out, check that the connection we already
* accepted in slot#0 carried the proper source port modulo.
*/
- if (fan_out && conn->c_with_sport_idx && sock &&
+ if (conn->c_with_sport_idx && sock &&
rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) > 0) {
/* cp->cp_index is encoded in lowest bits of source-port */
sport = rds_tcp_get_peer_sport(sock);
@@ -167,6 +160,37 @@ void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
rds_tcp_accept_work(rtn);
}
+void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+{
+ struct rds_conn_path *cp0;
+ struct rds_tcp_connection *tc;
+ struct rds_tcp_net *rtn;
+
+ if (rds_destroy_pending(conn))
+ return;
+
+ cp0 = conn->c_path;
+ tc = cp0->cp_transport_data;
+ rtn = tc->t_rtn;
+ if (!rtn)
+ return;
+
+ if (fan_out)
+ /* Delegate fan-out to a background worker in order
+ * to allow "kernel_getpeername" to acquire a lock
+ * on the socket.
+ * The socket is already locked in this context
+ * by either "rds_tcp_recv_path" or "tcp_v{4,6}_rcv",
+ * depending on the origin of the dequeue-request.
+ */
+ queue_work(cp0->cp_wq, &tc->t_fan_out_w);
+ else
+ /* Fan-out either already happened or is unnecessary.
+ * Just go ahead and attempt to accept more connections
+ */
+ rds_tcp_accept_work(rtn);
+}
+
int rds_tcp_accept_one(struct rds_tcp_net *rtn)
{
struct socket *listen_sock = rtn->rds_tcp_listen_sock;
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH net-next v1 2/3] net/rds: Use proper peer port number even when not connected
2026-02-06 2:24 [PATCH net-next v1 0/3] net/rds: RDS-TCP reconnect and fanout improvements Allison Henderson
2026-02-06 2:24 ` [PATCH net-next v1 1/3] net/rds: Delegate fan-out to a background worker Allison Henderson
@ 2026-02-06 2:24 ` Allison Henderson
2026-02-06 2:24 ` [PATCH net-next v1 3/3] net/rds: rds_sendmsg should not discard payload_len Allison Henderson
2026-02-06 8:32 ` [syzbot ci] Re: net/rds: RDS-TCP reconnect and fanout improvements syzbot ci
3 siblings, 0 replies; 5+ messages in thread
From: Allison Henderson @ 2026-02-06 2:24 UTC (permalink / raw)
To: netdev
Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
linux-rdma, allison.henderson
From: Greg Jumper <greg.jumper@oracle.com>
The function rds_tcp_get_peer_sport() should return the peer port of a
socket, even when the socket is not currently connected, so that RDS
can reliably determine the MPRDS "lane" corresponding to the port.
rds_tcp_get_peer_sport() calls kernel_getpeername() to get the port
number; however, when paths between endpoints frequently drop and
reconnect, kernel_getpeername() can return -ENOTCONN, causing
rds_tcp_get_peer_sport() to return an error, and ultimately causing
RDS to use the wrong lane for a port when reconnecting to a peer.
This patch modifies rds_tcp_get_peer_sport() to directly call the
socket-specific get-name function (inet_getname() in this case) that
kernel_getpeername() also calls. The socket-specific function offers
an additional argument which, when set to a value greater than 1,
causes the function to return the socket's peer name even when the
socket is not connected, which in turn allows rds_tcp_get_peer_sport()
to return the correct port number.
Signed-off-by: Greg Jumper <greg.jumper@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/tcp_listen.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 8fb8f7d26683..db4938fd1672 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -67,7 +67,14 @@ rds_tcp_get_peer_sport(struct socket *sock)
} saddr;
int sport;
- if (kernel_getpeername(sock, &saddr.addr) >= 0) {
+ /* Call the socket's getname() function (inet_getname() in this case)
+ * with a final argument greater than 1 to get the peer's port
+ * regardless of whether the socket is currently connected.
+ * Using peer=2 will get the peer port even during reconnection states
+ * (TCPF_CLOSE, TCPF_SYN_SENT). This avoids -ENOTCONN while
+ * inet_dport still contains the correct peer port.
+ */
+ if (sock->ops->getname(sock, &saddr.addr, 2) >= 0) {
switch (saddr.addr.sa_family) {
case AF_INET:
sport = ntohs(saddr.sin.sin_port);
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH net-next v1 3/3] net/rds: rds_sendmsg should not discard payload_len
2026-02-06 2:24 [PATCH net-next v1 0/3] net/rds: RDS-TCP reconnect and fanout improvements Allison Henderson
2026-02-06 2:24 ` [PATCH net-next v1 1/3] net/rds: Delegate fan-out to a background worker Allison Henderson
2026-02-06 2:24 ` [PATCH net-next v1 2/3] net/rds: Use proper peer port number even when not connected Allison Henderson
@ 2026-02-06 2:24 ` Allison Henderson
2026-02-06 8:32 ` [syzbot ci] Re: net/rds: RDS-TCP reconnect and fanout improvements syzbot ci
3 siblings, 0 replies; 5+ messages in thread
From: Allison Henderson @ 2026-02-06 2:24 UTC (permalink / raw)
To: netdev
Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
linux-rdma, allison.henderson
From: Allison Henderson <allison.henderson@oracle.com>
Commit 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with
connection teardown") modifies rds_sendmsg to avoid enqueueing work
while a tear down is in progress. However, it also changed the return
value of rds_sendmsg to that of rds_send_xmit instead of the
payload_len. This means the user may incorrectly receive errno values
when it should have simply received a payload of 0 while the peer
attempts a reconnection. So this patch corrects the teardown handling
code to only use the out error path in that case, thus restoring the
original payload_len return value.
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/send.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/rds/send.c b/net/rds/send.c
index 6e96f108473e..a1039e422a38 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1431,9 +1431,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
else
queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1);
rcu_read_unlock();
+
+ if (ret)
+ goto out;
}
- if (ret)
- goto out;
+
rds_message_put(rm);
for (ind = 0; ind < vct.indx; ind++)
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [syzbot ci] Re: net/rds: RDS-TCP reconnect and fanout improvements
2026-02-06 2:24 [PATCH net-next v1 0/3] net/rds: RDS-TCP reconnect and fanout improvements Allison Henderson
` (2 preceding siblings ...)
2026-02-06 2:24 ` [PATCH net-next v1 3/3] net/rds: rds_sendmsg should not discard payload_len Allison Henderson
@ 2026-02-06 8:32 ` syzbot ci
3 siblings, 0 replies; 5+ messages in thread
From: syzbot ci @ 2026-02-06 8:32 UTC (permalink / raw)
To: achender, allison.henderson, edumazet, horms, kuba,
linux-kselftest, linux-rdma, netdev, pabeni, rds-devel
Cc: syzbot, syzkaller-bugs
syzbot ci has tested the following series
[v1] net/rds: RDS-TCP reconnect and fanout improvements
https://lore.kernel.org/all/20260206022419.1357513-1-achender@kernel.org
* [PATCH net-next v1 1/3] net/rds: Delegate fan-out to a background worker
* [PATCH net-next v1 2/3] net/rds: Use proper peer port number even when not connected
* [PATCH net-next v1 3/3] net/rds: rds_sendmsg should not discard payload_len
and found the following issue:
BUG: sleeping function called from invalid context in rds_tcp_conn_free
Full report is available here:
https://ci.syzbot.org/series/1a5ef180-c02c-401d-9df7-670b18570a55
***
BUG: sleeping function called from invalid context in rds_tcp_conn_free
tree: net-next
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/netdev/net-next.git
base: 7a4cd71fa4514cd85df39b3cf99da8142660cdcd
arch: amd64
compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config: https://ci.syzbot.org/builds/77f47047-43cb-4c25-b0b6-73b8746cea2a/config
syz repro: https://ci.syzbot.org/findings/49698f1e-4f36-4446-9dd1-c409366e6296/syz_repro
BUG: sleeping function called from invalid context at kernel/workqueue.c:4390
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6005, name: syz.2.19
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 0
2 locks held by syz.2.19/6005:
#0: ffffffff8e35a360 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:331 [inline]
#0: ffffffff8e35a360 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:867 [inline]
#0: ffffffff8e35a360 (rcu_read_lock){....}-{1:3}, at: __rds_conn_create+0x2e4/0x22d0 net/rds/connection.c:177
#1: ffffffff8fa00a98 (rds_conn_lock){....}-{3:3}, at: __rds_conn_create+0x18e2/0x22d0 net/rds/connection.c:304
irq event stamp: 752
hardirqs last enabled at (751): [<ffffffff8b7cc843>] __raw_spin_unlock_irq include/linux/spinlock_api_smp.h:159 [inline]
hardirqs last enabled at (751): [<ffffffff8b7cc843>] _raw_spin_unlock_irq+0x23/0x50 kernel/locking/spinlock.c:202
hardirqs last disabled at (752): [<ffffffff8b7cc61a>] __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline]
hardirqs last disabled at (752): [<ffffffff8b7cc61a>] _raw_spin_lock_irqsave+0x1a/0x60 kernel/locking/spinlock.c:162
softirqs last enabled at (32): [<ffffffff8ac0cd59>] rds_sendmsg+0x7b9/0x2150 net/rds/send.c:1266
softirqs last disabled at (30): [<ffffffff89463a1f>] spin_lock_bh include/linux/spinlock.h:356 [inline]
softirqs last disabled at (30): [<ffffffff89463a1f>] release_sock+0x2f/0x1f0 net/core/sock.c:3793
Preemption disabled at:
[<0000000000000000>] 0x0
CPU: 0 UID: 0 PID: 6005 Comm: syz.2.19 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
__might_resched+0x378/0x4d0 kernel/sched/core.c:8829
__cancel_work_sync+0x6d/0x110 kernel/workqueue.c:4390
rds_tcp_conn_free+0x2c/0x170 net/rds/tcp.c:361
__rds_conn_create+0x1bfb/0x22d0 net/rds/connection.c:334
rds_conn_create_outgoing+0x43/0x60 net/rds/connection.c:377
rds_sendmsg+0xff5/0x2150 net/rds/send.c:1321
sock_sendmsg_nosec net/socket.c:727 [inline]
__sock_sendmsg+0x21c/0x270 net/socket.c:742
____sys_sendmsg+0x4d7/0x810 net/socket.c:2592
___sys_sendmsg+0x2a5/0x360 net/socket.c:2646
__sys_sendmsg net/socket.c:2678 [inline]
__do_sys_sendmsg net/socket.c:2683 [inline]
__se_sys_sendmsg net/socket.c:2681 [inline]
__x64_sys_sendmsg+0x1bd/0x2a0 net/socket.c:2681
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f5b6bf9acb9
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f5b6cd8b028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007f5b6c216090 RCX: 00007f5b6bf9acb9
RDX: 0000000000000000 RSI: 0000200000000480 RDI: 0000000000000004
RBP: 00007f5b6c008bf7 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f5b6c216128 R14: 00007f5b6c216090 R15: 00007ffc17760058
</TASK>
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
^ permalink raw reply [flat|nested] 5+ messages in thread