* [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-02-27 4:26 [PATCH 0/6] RDS bug fix collection allison.henderson
@ 2025-02-27 4:26 ` allison.henderson
2025-03-01 0:19 ` Jakub Kicinski
2025-02-27 4:26 ` [PATCH 2/6] net/rds: Re-factor and avoid superfluous queuing of reconnect work allison.henderson
` (4 subsequent siblings)
5 siblings, 1 reply; 27+ messages in thread
From: allison.henderson @ 2025-02-27 4:26 UTC (permalink / raw)
To: netdev
From: Håkon Bugge <haakon.bugge@oracle.com>
Queuing work on the connect path's work-queue is done from a plurality
of places and contexts. Two more bits in cp_flags are defined, and
together with some helper functions, the work is only queued if not
already queued.
This is to avoid overloading the work queue and thereby degrading
connection management performance, since connection management uses
the same work queue.
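Schematically, the pattern is (simplified; the actual helpers and call
sites are in the diff below):

	/* Queueing side: only the caller that flips the bit queues the work. */
	static inline void rds_cond_queue_send_work(struct rds_conn_path *cp, unsigned long delay)
	{
		if (!test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
			queue_delayed_work(rds_wq, &cp->cp_send_w, delay);
	}

	/* Worker side (simplified): clear the bit again before transmitting,
	 * so that requests arriving from now on can queue the work once more.
	 */
	void rds_send_worker(struct work_struct *work)
	{
		struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_send_w.work);

		rds_clear_queued_send_work_bit(cp);
		rds_send_xmit(cp);
		/* re-queued via rds_cond_queue_send_work() on -EAGAIN or -ENOMEM */
	}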
Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/connection.c | 11 ++++++++---
net/rds/ib_recv.c | 2 +-
net/rds/ib_send.c | 2 +-
net/rds/rds.h | 33 +++++++++++++++++++++++++++++++++
net/rds/send.c | 6 +++---
net/rds/tcp.c | 12 ++++++++++--
net/rds/tcp_recv.c | 2 +-
net/rds/tcp_send.c | 2 +-
net/rds/threads.c | 25 +++++++++++++++++--------
9 files changed, 75 insertions(+), 20 deletions(-)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index c749c5525b40..1d80586fdda2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -445,9 +445,14 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
if (!cp->cp_transport_data)
return;
- /* make sure lingering queued work won't try to ref the conn */
- cancel_delayed_work_sync(&cp->cp_send_w);
- cancel_delayed_work_sync(&cp->cp_recv_w);
+ /* make sure lingering queued work won't try to ref the
+ * conn. If there is work queued, we cancel it (and set the
+ * bit to avoid any re-queueing)
+ */
+ if (test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ if (test_and_set_bit(RDS_RECV_WORK_QUEUED, &cp->cp_flags))
+ cancel_delayed_work_sync(&cp->cp_recv_w);
rds_conn_path_drop(cp, true);
flush_work(&cp->cp_down_w);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index e53b7f266bd7..4ecee1ff1c26 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -457,7 +457,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
(must_wake ||
(can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
rds_ib_ring_empty(&ic->i_recv_ring))) {
- queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+ rds_cond_queue_recv_work(conn->c_path + 0, 1);
}
if (can_wait)
cond_resched();
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 4190b90ff3b1..95edd4cb8528 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -419,7 +419,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ rds_cond_queue_send_work(conn->c_path + 0, 0);
WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index dc360252c515..c9a22d0e887b 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -90,6 +90,8 @@ enum {
#define RDS_IN_XMIT 2
#define RDS_RECV_REFILL 3
#define RDS_DESTROY_PENDING 4
+#define RDS_SEND_WORK_QUEUED 5
+#define RDS_RECV_WORK_QUEUED 6
/* Max number of multipaths per RDS connection. Must be a power of 2 */
#define RDS_MPATH_WORKERS 8
@@ -791,6 +793,37 @@ void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
#define rds_conn_path_error(cp, fmt...) \
__rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
+extern struct workqueue_struct *rds_wq;
+static inline void rds_cond_queue_send_work(struct rds_conn_path *cp, unsigned long delay)
+{
+ if (!test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
+ queue_delayed_work(rds_wq, &cp->cp_send_w, delay);
+}
+
+static inline void rds_clear_queued_send_work_bit(struct rds_conn_path *cp)
+{
+ /* clear_bit() does not imply a memory barrier */
+ smp_mb__before_atomic();
+ clear_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags);
+ /* clear_bit() does not imply a memory barrier */
+ smp_mb__after_atomic();
+}
+
+static inline void rds_cond_queue_recv_work(struct rds_conn_path *cp, unsigned long delay)
+{
+ if (!test_and_set_bit(RDS_RECV_WORK_QUEUED, &cp->cp_flags))
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, delay);
+}
+
+static inline void rds_clear_queued_recv_work_bit(struct rds_conn_path *cp)
+{
+ /* clear_bit() does not imply a memory barrier */
+ smp_mb__before_atomic();
+ clear_bit(RDS_RECV_WORK_QUEUED, &cp->cp_flags);
+ /* clear_bit() does not imply a memory barrier */
+ smp_mb__after_atomic();
+}
+
static inline int
rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
{
diff --git a/net/rds/send.c b/net/rds/send.c
index 09a280110654..6329cc8ec246 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -458,7 +458,7 @@ int rds_send_xmit(struct rds_conn_path *cp)
if (rds_destroy_pending(cp->cp_conn))
ret = -ENETUNREACH;
else
- queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ rds_cond_queue_send_work(cp, 1);
rcu_read_unlock();
} else if (raced) {
rds_stats_inc(s_send_lock_queue_raced);
@@ -1380,7 +1380,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rds_destroy_pending(cpath->cp_conn))
ret = -ENETUNREACH;
else
- queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
+ rds_cond_queue_send_work(cpath, 1);
rcu_read_unlock();
}
if (ret)
@@ -1473,7 +1473,7 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
/* schedule the send work on rds_wq */
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ rds_cond_queue_send_work(cp, 0);
rcu_read_unlock();
rds_message_put(rm);
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 0581c53e6517..b3f2c6e27b59 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -168,8 +168,16 @@ void rds_tcp_reset_callbacks(struct socket *sock,
atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
/* reset receive side state for rds_tcp_data_recv() for osock */
- cancel_delayed_work_sync(&cp->cp_send_w);
- cancel_delayed_work_sync(&cp->cp_recv_w);
+
+ /* make sure lingering queued work won't try to ref the
+ * conn. If there is work queued, we cancel it (and set the bit
+ * to avoid any re-queueing)
+ */
+
+ if (test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ if (test_and_set_bit(RDS_RECV_WORK_QUEUED, &cp->cp_flags))
+ cancel_delayed_work_sync(&cp->cp_recv_w);
lock_sock(osock->sk);
if (tc->t_tinc) {
rds_inc_put(&tc->t_tinc->ti_inc);
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 7997a19d1da3..ab9fc150f974 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -327,7 +327,7 @@ void rds_tcp_data_ready(struct sock *sk)
if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ rds_cond_queue_recv_work(cp, 0);
rcu_read_unlock();
}
out:
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7d284ac7e81a..cecd3dbde58d 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -201,7 +201,7 @@ void rds_tcp_write_space(struct sock *sk)
rcu_read_lock();
if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ rds_cond_queue_send_work(cp, 0);
rcu_read_unlock();
out:
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 1f424cbfcbb4..eedae5653051 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -89,8 +89,10 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
set_bit(0, &cp->cp_conn->c_map_queued);
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn)) {
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ rds_clear_queued_send_work_bit(cp);
+ rds_cond_queue_send_work(cp, 0);
+ rds_clear_queued_recv_work_bit(cp);
+ rds_cond_queue_recv_work(cp, 0);
}
rcu_read_unlock();
cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
@@ -193,9 +195,11 @@ void rds_send_worker(struct work_struct *work)
struct rds_conn_path *cp = container_of(work,
struct rds_conn_path,
cp_send_w.work);
+ unsigned long delay;
int ret;
if (rds_conn_path_state(cp) == RDS_CONN_UP) {
+ rds_clear_queued_send_work_bit(cp);
clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
ret = rds_send_xmit(cp);
cond_resched();
@@ -203,15 +207,17 @@ void rds_send_worker(struct work_struct *work)
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_send_immediate_retry);
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ delay = 0;
break;
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
- queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
+ delay = 2;
break;
default:
- break;
+ return;
}
+
+ rds_cond_queue_send_work(cp, delay);
}
}
@@ -220,23 +226,26 @@ void rds_recv_worker(struct work_struct *work)
struct rds_conn_path *cp = container_of(work,
struct rds_conn_path,
cp_recv_w.work);
+ unsigned long delay;
int ret;
if (rds_conn_path_state(cp) == RDS_CONN_UP) {
+ rds_clear_queued_recv_work_bit(cp);
ret = cp->cp_conn->c_trans->recv_path(cp);
rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ delay = 0;
break;
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
+ delay = 2;
break;
default:
- break;
+ return;
}
+ rds_cond_queue_recv_work(cp, delay);
}
}
--
2.43.0
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-02-27 4:26 ` [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work allison.henderson
@ 2025-03-01 0:19 ` Jakub Kicinski
2025-03-05 0:38 ` Allison Henderson
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-03-01 0:19 UTC (permalink / raw)
To: allison.henderson; +Cc: netdev
On Wed, 26 Feb 2025 21:26:33 -0700 allison.henderson@oracle.com wrote:
> + /* clear_bit() does not imply a memory barrier */
> + smp_mb__before_atomic();
> + clear_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags);
> + /* clear_bit() does not imply a memory barrier */
> + smp_mb__after_atomic();
I'm guessing the comments were added because checkpatch asked for them.
The comments are supposed to indicate what this barrier pairs with.
I don't see the purpose of these barriers, please document..
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-01 0:19 ` Jakub Kicinski
@ 2025-03-05 0:38 ` Allison Henderson
2025-03-05 0:44 ` Jakub Kicinski
0 siblings, 1 reply; 27+ messages in thread
From: Allison Henderson @ 2025-03-05 0:38 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Fri, 2025-02-28 at 16:19 -0800, Jakub Kicinski wrote:
> On Wed, 26 Feb 2025 21:26:33 -0700 allison.henderson@oracle.com wrote:
> > + /* clear_bit() does not imply a memory barrier */
> > + smp_mb__before_atomic();
> > + clear_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags);
> > + /* clear_bit() does not imply a memory barrier */
> > + smp_mb__after_atomic();
>
> I'm guessing the comments were added because checkpatch asked for them.
> The comments are supposed to indicate what this barrier pairs with.
> I don't see the purpose of these barriers, please document..
Hi Jakob,
I think the comments meant to refer to the implicit memory barrier in "test_and_set_bit". It looks like it has assembly
code to set the barrier if CONFIG_SMP is set. How about we change the comments to: "pairs with implicit memory barrier
in test_and_set_bit()" ? Let me know what you think.
Thanks!
Allison
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-05 0:38 ` Allison Henderson
@ 2025-03-05 0:44 ` Jakub Kicinski
2025-03-06 16:41 ` Allison Henderson
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-03-05 0:44 UTC (permalink / raw)
To: Allison Henderson; +Cc: netdev@vger.kernel.org
On Wed, 5 Mar 2025 00:38:41 +0000 Allison Henderson wrote:
> > I'm guessing the comments were added because checkpatch asked for them.
> > The comments are supposed to indicate what this barrier pairs with.
> > I don't see the purpose of these barriers, please document..
>
> Hi Jakob,
>
> I think the comments meant to refer to the implicit memory barrier in
> "test_and_set_bit". It looks like it has assembly code to set the
> barrier if CONFIG_SMP is set. How about we change the comments to:
> "pairs with implicit memory barrier in test_and_set_bit()" ? Let me
> know what you think.
Okay, but what is the purpose. The commit message does not explain
at all why these barriers are needed.
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-05 0:44 ` Jakub Kicinski
@ 2025-03-06 16:41 ` Allison Henderson
2025-03-06 18:18 ` Jakub Kicinski
0 siblings, 1 reply; 27+ messages in thread
From: Allison Henderson @ 2025-03-06 16:41 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Tue, 2025-03-04 at 16:44 -0800, Jakub Kicinski wrote:
> On Wed, 5 Mar 2025 00:38:41 +0000 Allison Henderson wrote:
> > > I'm guessing the comments were added because checkpatch asked for them.
> > > The comments are supposed to indicate what this barrier pairs with.
> > > I don't see the purpose of these barriers, please document..
> >
> > Hi Jakob,
> >
> > I think the comments meant to refer to the implicit memory barrier in
> > "test_and_set_bit". It looks like it has assembly code to set the
> > barrier if CONFIG_SMP is set. How about we change the comments to:
> > "pairs with implicit memory barrier in test_and_set_bit()" ? Let me
> > know what you think.
>
> Okay, but what is the purpose. The commit message does not explain
> at all why these barriers are needed.
Hi Jakub,
I think it's to make sure the clearing of the bit is the last operation done for the calling function, in this case
rds_queue_reconnect. The purpose of the barrier in test_and_set is to make sure the bit is checked before proceeding to
any further operations (in our case queuing reconnect items). So it would make sense that the clearing of the bit
should happen only after we are done with all such operations. I found some documentation for smp_mb__*_atomic in
Documentation/memory-barriers.txt that mentions these functions are used for atomic RMW bitop functions like clear_bit
and set_bit, since they do not imply a memory barrier themselves. Perhaps the original comment meant to reference that
too.
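For example, the usage those helpers are documented with looks roughly like this (paraphrasing Documentation/memory-barriers.txt):

	obj->dead = 1;
	smp_mb__before_atomic();
	atomic_dec(&obj->ref_count);

i.e. the barrier makes sure the store marking the object dead is visible before the reference count is decremented.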
I hope that helps? If so, maybe we can expand the comment further to something like:
/*
* pairs with implicit memory barrier in calls to test_and_set_bit with RDS_RECONNECT_PENDING
* Used to ensure the bit is on only while reconnect operations are in progress
*/
Let me know what you think, or that's too much or too little detail.
Thanks!
Allison
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-06 16:41 ` Allison Henderson
@ 2025-03-06 18:18 ` Jakub Kicinski
2025-03-07 20:28 ` Allison Henderson
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-03-06 18:18 UTC (permalink / raw)
To: Allison Henderson; +Cc: netdev@vger.kernel.org
On Thu, 6 Mar 2025 16:41:35 +0000 Allison Henderson wrote:
> I think it's to make sure the clearing of the bit is the last
> operation done for the calling function, in this case
> rds_queue_reconnect. The purpose of the barrier in test_and_set is
> to make sure the bit is checked before proceeding to any further
> operations (in our case queuing reconnect items).
Let's be precise, can you give an example of 2 execution threads
and memory accesses which have to be ordered.
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-06 18:18 ` Jakub Kicinski
@ 2025-03-07 20:28 ` Allison Henderson
2025-03-08 2:53 ` Jakub Kicinski
0 siblings, 1 reply; 27+ messages in thread
From: Allison Henderson @ 2025-03-07 20:28 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Thu, 2025-03-06 at 10:18 -0800, Jakub Kicinski wrote:
> On Thu, 6 Mar 2025 16:41:35 +0000 Allison Henderson wrote:
> > I think it's to make sure the clearing of the bit is the last
> > operation done for the calling function, in this case
> > rds_queue_reconnect. The purpose of the barrier in test_and_set is
> > to make sure the bit is checked before proceeding to any further
> > operations (in our case queuing reconnect items).
>
> Let's be precise, can you give an example of 2 execution threads
> and memory accesses which have to be ordered.
Hi Jakub,
I just realized my last response referred to bits and functions in the next patch instead of this one. Apologies for
the confusion! For this thread example though, I think a pair of threads in rds_send_worker and rds_sendmsg would be a
good example? How about this:
Thread A:
Calls rds_send_worker()
calls rds_clear_queued_send_work_bit()
clears RDS_SEND_WORK_QUEUED in cp->cp_flags
calls rds_send_xmit()
calls cond_resched()
Thread B:
Calls rds_sendmsg()
Calls rds_send_xmit
Calls rds_cond_queue_send_work
checks and sets RDS_SEND_WORK_QUEUED in cp->cp_flags
So in this example the memory barriers ensure that the clearing of the bit is properly seen by thread B. Without these
memory barriers in rds_clear_queued_send_work_bit(), rds_cond_queue_send_work() could see stale values in cp->cp_flags
and incorrectly assume that the work is still queued, leading to potential missed work processing.
I hope that helps some? Let me know if so/not or if there is anything else that would help clarify. If it helps at
all, I think there's a similar use case in commit 93093ea1f059, though it's the other way around with the barriers
around the set_bit, and the implicit barriers in the test_and_clear_bit(). And I think on CPUs with strongly ordered
memory, the barriers do not expand to anything in that case.
Let me know if this helps!
Thank you!
Allison
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-07 20:28 ` Allison Henderson
@ 2025-03-08 2:53 ` Jakub Kicinski
2025-03-12 7:50 ` Allison Henderson
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-03-08 2:53 UTC (permalink / raw)
To: Allison Henderson; +Cc: netdev@vger.kernel.org
On Fri, 7 Mar 2025 20:28:57 +0000 Allison Henderson wrote:
> > Let's be precise, can you give an example of 2 execution threads
> > and memory accesses which have to be ordered.
>
> Hi Jakub,
>
> I just realized my last response referred to bits and functions in the next patch instead of this one. Apologies for
> the confusion! For this thread example though, I think a pair of threads in rds_send_worker and rds_sendmsg would be a
> good example? How about this:
>
> Thread A:
> Calls rds_send_worker()
> calls rds_clear_queued_send_work_bit()
> clears RDS_SEND_WORK_QUEUED in cp->cp_flags
> calls rds_send_xmit()
> calls cond_resched()
>
> Thread B:
> Calls rds_sendmsg()
> Calls rds_send_xmit
> Calls rds_cond_queue_send_work
> checks and sets RDS_SEND_WORK_QUEUED in cp->cp_flags
We need at least two memory locations if we want to talk about ordering.
In your example we have cp_flags, but the rest is code.
What's the second memory location.
Take a look at e592b5110b3e9393 for an example of a good side by side
thread execution.. listing(?):
Thread1 (oa_tc6_start_xmit)      Thread2 (oa_tc6_spi_thread_handler)
---------------------------      -----------------------------------
- if waiting_tx_skb is NULL
                                 - if ongoing_tx_skb is NULL
                                 - ongoing_tx_skb = waiting_tx_skb
- waiting_tx_skb = skb
                                 - waiting_tx_skb = NULL
                                 ...
                                 - ongoing_tx_skb = NULL
- if waiting_tx_skb is NULL
- waiting_tx_skb = skb
This makes it pretty clear what fields are at play and how the race
happens.
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-08 2:53 ` Jakub Kicinski
@ 2025-03-12 7:50 ` Allison Henderson
2025-03-26 16:42 ` Jakub Kicinski
0 siblings, 1 reply; 27+ messages in thread
From: Allison Henderson @ 2025-03-12 7:50 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Fri, 2025-03-07 at 18:53 -0800, Jakub Kicinski wrote:
> On Fri, 7 Mar 2025 20:28:57 +0000 Allison Henderson wrote:
> > > Let's be precise, can you give an example of 2 execution threads
> > > and memory accesses which have to be ordered.
> >
> > Hi Jakub,
> >
> > I just realized my last response referred to bits and functions in the next patch instead of this one. Apologies for
> > the confusion! For this thread example though, I think a pair of threads in rds_send_worker and rds_sendmsg would be a
> > good example? How about this:
> >
> > Thread A:
> > Calls rds_send_worker()
> > calls rds_clear_queued_send_work_bit()
> > clears RDS_SEND_WORK_QUEUED in cp->cp_flags
> > calls rds_send_xmit()
> > calls cond_resched()
> >
> > Thread B:
> > Calls rds_sendmsg()
> > Calls rds_send_xmit
> > Calls rds_cond_queue_send_work
> > checks and sets RDS_SEND_WORK_QUEUED in cp->cp_flags
>
> We need at least two memory locations if we want to talk about ordering.
> In your example we have cp_flags, but the rest is code.
> What's the second memory location.
> Take a look at e592b5110b3e9393 for an example of a good side by side
> thread execution.. listing(?):
>
> Thread1 (oa_tc6_start_xmit) Thread2 (oa_tc6_spi_thread_handler)
> --------------------------- -----------------------------------
> - if waiting_tx_skb is NULL
> - if ongoing_tx_skb is NULL
> - ongoing_tx_skb = waiting_tx_skb
> - waiting_tx_skb = skb
> - waiting_tx_skb = NULL
> ...
> - ongoing_tx_skb = NULL
> - if waiting_tx_skb is NULL
> - waiting_tx_skb = skb
>
>
> This makes it pretty clear what fields are at play and how the race
> happens.
Hi Jakub,
I suppose the second address would have to be the queue itself, wouldn't it? We have a flag that's meant to avoid
threads racing to access a queue, so it would make sense that the addresses of interest would be the flag and the
queue, which is cp->cp_send_w in the send example. So if we adjusted our example to include the queue access, then it
would look like this:
Thread A:                                       Thread B:
-----------------------------------             -----------------------------------
                                                Calls rds_sendmsg()
                                                Calls rds_send_xmit()
                                                Calls rds_cond_queue_send_work()
Calls rds_send_worker()
calls rds_clear_queued_send_work_bit()
clears RDS_SEND_WORK_QUEUED in cp->cp_flags
                                                checks RDS_SEND_WORK_QUEUED in cp->cp_flags
                                                but sees stale value
                                                Skips queuing on cp->cp_send_w when it should not
Calls rds_send_xmit()
Calls rds_cond_queue_send_work()
queues work on cp->cp_send_w
And then if we have the barriers, then the example would look like this:
Thread A:                                       Thread B:
-----------------------------------             -----------------------------------
                                                Calls rds_sendmsg()
                                                Calls rds_send_xmit()
                                                Calls rds_cond_queue_send_work()
Calls rds_send_worker()
calls rds_clear_queued_send_work_bit()
clears RDS_SEND_WORK_QUEUED in cp->cp_flags
                                                checks RDS_SEND_WORK_QUEUED in cp->cp_flags
                                                Queues work on cp->cp_send_w
Calls rds_send_xmit()
Calls rds_cond_queue_send_work()
skips queueing work on cp->cp_send_w
I think the barriers also make sure thread A's call to rds_send_xmit() happens after the clear_bit() too. Otherwise it
may be possible that it is reordered, and then we get another missed work item there too. I hope this helps some? Let
me know if that makes sense or if you think there's a better way it could be managed. Thank you!
Allison
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-12 7:50 ` Allison Henderson
@ 2025-03-26 16:42 ` Jakub Kicinski
2025-04-02 1:34 ` Allison Henderson
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-03-26 16:42 UTC (permalink / raw)
To: Allison Henderson; +Cc: netdev@vger.kernel.org
On Wed, 12 Mar 2025 07:50:11 +0000 Allison Henderson wrote:
> Thread A:                                       Thread B:
> -----------------------------------             -----------------------------------
>                                                 Calls rds_sendmsg()
>                                                 Calls rds_send_xmit()
>                                                 Calls rds_cond_queue_send_work()
> Calls rds_send_worker()
> calls rds_clear_queued_send_work_bit()
> clears RDS_SEND_WORK_QUEUED in cp->cp_flags
>                                                 checks RDS_SEND_WORK_QUEUED in cp->cp_flags
But if the two threads run in parallel, what prevents this check
from happening fully before the previous line on the "Thread A" side?
Please take a look at netif_txq_try_stop() for an example of
a memory-barrier based algo.
>                                                 Queues work on cp->cp_send_w
> Calls rds_send_xmit()
> Calls rds_cond_queue_send_work()
> skips queueing work on cp->cp_send_w
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-03-26 16:42 ` Jakub Kicinski
@ 2025-04-02 1:34 ` Allison Henderson
2025-04-02 16:18 ` Jakub Kicinski
0 siblings, 1 reply; 27+ messages in thread
From: Allison Henderson @ 2025-04-02 1:34 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Wed, 2025-03-26 at 09:42 -0700, Jakub Kicinski wrote:
> On Wed, 12 Mar 2025 07:50:11 +0000 Allison Henderson wrote:
> > Thread A:                                       Thread B:
> > -----------------------------------             -----------------------------------
> >                                                 Calls rds_sendmsg()
> >                                                 Calls rds_send_xmit()
> >                                                 Calls rds_cond_queue_send_work()
> > Calls rds_send_worker()
> > calls rds_clear_queued_send_work_bit()
> > clears RDS_SEND_WORK_QUEUED in cp->cp_flags
> >                                                 checks RDS_SEND_WORK_QUEUED in cp->cp_flags
>
> But if the two threads run in parallel, what prevents this check
> from happening fully before the previous line on the "Thread A" side?
>
> Please take a look at netif_txq_try_stop() for an example of
> a memory-barrier based algo.
>
> >                                                 Queues work on cp->cp_send_w
> > Calls rds_send_xmit()
> > Calls rds_cond_queue_send_work()
> > skips queueing work on cp->cp_send_w
Hi Jakub,
I had a look at the example, how about we move the barriers from rds_clear_queued_send_work_bit into
rds_cond_queue_send_work? Then we have something like this:
static inline void rds_cond_queue_send_work(struct rds_conn_path *cp, unsigned long delay)
{
	/* Ensure prior clear_bit operations for RDS_SEND_WORK_QUEUED are observed */
	smp_mb__before_atomic();

	if (!test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
		queue_delayed_work(rds_wq, &cp->cp_send_w, delay);

	/* Ensure the RDS_SEND_WORK_QUEUED bit is observed before proceeding */
	smp_mb__after_atomic();
}
I think that's more like what's in the example, and in line with what this patch is trying to do. Let me know what you
think.
Thank you for the reviews!
Allison
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-04-02 1:34 ` Allison Henderson
@ 2025-04-02 16:18 ` Jakub Kicinski
2025-04-03 1:27 ` Allison Henderson
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-04-02 16:18 UTC (permalink / raw)
To: Allison Henderson; +Cc: netdev@vger.kernel.org
On Wed, 2 Apr 2025 01:34:40 +0000 Allison Henderson wrote:
> I had a look at the example, how about we move the barriers from
> rds_clear_queued_send_work_bit into rds_cond_queue_send_work? Then
> we have something like this:
>
> static inline void rds_cond_queue_send_work(struct rds_conn_path *cp, unsigned long delay)
> {
>         /* Ensure prior clear_bit operations for RDS_SEND_WORK_QUEUED are observed */
>         smp_mb__before_atomic();
>
>         if (!test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
>                 queue_delayed_work(rds_wq, &cp->cp_send_w, delay);
>
>         /* Ensure the RDS_SEND_WORK_QUEUED bit is observed before proceeding */
>         smp_mb__after_atomic();
> }
>
> I think that's more like what's in the example, and in line with what
> this patch is trying to do. Let me know what you think.
Sorry, this still feels like a cargo cult to me.
Let's get a clear explanation of what the barriers order
or just skip the patch.
* Re: [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work
2025-04-02 16:18 ` Jakub Kicinski
@ 2025-04-03 1:27 ` Allison Henderson
0 siblings, 0 replies; 27+ messages in thread
From: Allison Henderson @ 2025-04-03 1:27 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Wed, 2025-04-02 at 09:18 -0700, Jakub Kicinski wrote:
> On Wed, 2 Apr 2025 01:34:40 +0000 Allison Henderson wrote:
> > I had a look at the example, how about we move the barriers from
> > rds_clear_queued_send_work_bit into rds_cond_queue_send_work? Then
> > we have something like this:
> >
> > static inline void rds_cond_queue_send_work(struct rds_conn_path *cp, unsigned long delay)
> > {
> >         /* Ensure prior clear_bit operations for RDS_SEND_WORK_QUEUED are observed */
> >         smp_mb__before_atomic();
> >
> >         if (!test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
> >                 queue_delayed_work(rds_wq, &cp->cp_send_w, delay);
> >
> >         /* Ensure the RDS_SEND_WORK_QUEUED bit is observed before proceeding */
> >         smp_mb__after_atomic();
> > }
> >
> > I think that's more like what's in the example, and in line with what
> > this patch is trying to do. Let me know what you think.
>
> Sorry, this still feels like a cargo cult to me.
> Let's get a clear explanation of what the barriers order
> or just skip the patch.
>
Sure, I'm out of town tomorrow through Monday, but I'll see what I can do when I get back Tues. I'm working on some
other fixes I'd like to add to the set, but they need the performance assist to not time out in the selftests, so I'll
have to find something else.
Allison
* [PATCH 2/6] net/rds: Re-factor and avoid superfluous queuing of reconnect work
2025-02-27 4:26 [PATCH 0/6] RDS bug fix collection allison.henderson
2025-02-27 4:26 ` [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work allison.henderson
@ 2025-02-27 4:26 ` allison.henderson
2025-02-27 4:26 ` [PATCH 3/6] net/rds: RDS/TCP does not initiate a connection allison.henderson
` (3 subsequent siblings)
5 siblings, 0 replies; 27+ messages in thread
From: allison.henderson @ 2025-02-27 4:26 UTC (permalink / raw)
To: netdev
From: Håkon Bugge <haakon.bugge@oracle.com>
rds_conn_path_connect_if_down() queues reconnect work with a zero
delay, without checking whether it is a passive or active connector. Re-factor
by calling rds_queue_reconnect(). The zero re-connect delay on the
passive side may cause a connect race.
Helper functions are added to conditionally queue reconnect and
properly clear the RDS_RECONNECT_PENDING bit.
The clearing of said bit is moved to the end of the
rds_connect_worker() function, which also fixes a bug where the bit
was never cleared when rds_tcp is used.
Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/connection.c | 5 ++---
net/rds/rds.h | 15 +++++++++++++++
net/rds/threads.c | 9 +++++----
3 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 1d80586fdda2..73de221bd7c2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -913,9 +913,8 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
rcu_read_unlock();
return;
}
- if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
- queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+ if (rds_conn_path_state(cp) == RDS_CONN_DOWN)
+ rds_queue_reconnect(cp);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c9a22d0e887b..1fb27e1a2e46 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -794,6 +794,21 @@ void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
__rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
extern struct workqueue_struct *rds_wq;
+static inline void rds_cond_queue_reconnect_work(struct rds_conn_path *cp, unsigned long delay)
+{
+ if (!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
+ queue_delayed_work(rds_wq, &cp->cp_conn_w, delay);
+}
+
+static inline void rds_clear_reconnect_pending_work_bit(struct rds_conn_path *cp)
+{
+ /* clear_bit() does not imply a memory barrier */
+ smp_mb__before_atomic();
+ clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+ /* clear_bit() does not imply a memory barrier */
+ smp_mb__after_atomic();
+}
+
static inline void rds_cond_queue_send_work(struct rds_conn_path *cp, unsigned long delay)
{
if (!test_and_set_bit(RDS_SEND_WORK_QUEUED, &cp->cp_flags))
diff --git a/net/rds/threads.c b/net/rds/threads.c
index eedae5653051..634e9f431fd6 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -153,8 +153,8 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
conn, &conn->c_laddr, &conn->c_faddr);
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_conn_w,
- rand % cp->cp_reconnect_jiffies);
+ rds_cond_queue_reconnect_work(cp,
+ rand % cp->cp_reconnect_jiffies);
rcu_read_unlock();
cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
@@ -171,8 +171,7 @@ void rds_connect_worker(struct work_struct *work)
if (cp->cp_index > 0 &&
rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
- return;
- clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+ goto out;
ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
if (ret) {
ret = conn->c_trans->conn_path_connect(cp);
@@ -188,6 +187,8 @@ void rds_connect_worker(struct work_struct *work)
rds_conn_path_error(cp, "connect failed\n");
}
}
+out:
+ rds_clear_reconnect_pending_work_bit(cp);
}
void rds_send_worker(struct work_struct *work)
--
2.43.0
* [PATCH 3/6] net/rds: RDS/TCP does not initiate a connection
2025-02-27 4:26 [PATCH 0/6] RDS bug fix collection allison.henderson
2025-02-27 4:26 ` [PATCH 1/6] net/rds: Avoid queuing superfluous send and recv work allison.henderson
2025-02-27 4:26 ` [PATCH 2/6] net/rds: Re-factor and avoid superfluous queuing of reconnect work allison.henderson
@ 2025-02-27 4:26 ` allison.henderson
2025-02-27 4:26 ` [PATCH 4/6] net/rds: No shortcut out of RDS_CONN_ERROR allison.henderson
` (2 subsequent siblings)
5 siblings, 0 replies; 27+ messages in thread
From: allison.henderson @ 2025-02-27 4:26 UTC (permalink / raw)
To: netdev
From: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Commit ("rds: Re-factor and avoid superfluous queuing of shutdown
work") changed rds_conn_path_connect_if_down() to call
rds_queue_reconnect() instead of queueing the connection request. In
rds_queue_reconnect(), if the connection's transport is TCP and if the
local address is "bigger" than the peer's, no request is queued.
Because of this, no connection will be initiated to the peer.
This patch keeps the code re-factoring of that commit. But it
initiates a connection request right away to make sure that a
connection is set up to the peer.
Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Signed-off-by: Somasundaram Krishnasamy <somasundaram.krishnasamy@oracle.com>
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/af_rds.c | 1 +
net/rds/connection.c | 3 ++-
net/rds/rds.h | 7 +++++--
net/rds/send.c | 46 +++++++++++++++++++++++++++++++++----------
net/rds/tcp_connect.c | 1 +
net/rds/tcp_listen.c | 1 +
6 files changed, 46 insertions(+), 13 deletions(-)
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 8435a20968ef..d6cba98f3d45 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -685,6 +685,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
rs->rs_rx_traces = 0;
rs->rs_tos = 0;
rs->rs_conn = NULL;
+ rs->rs_conn_path = NULL;
spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 73de221bd7c2..84034a3c69bd 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -147,6 +147,7 @@ static void __rds_conn_path_init(struct rds_connection *conn,
INIT_WORK(&cp->cp_down_w, rds_shutdown_worker);
mutex_init(&cp->cp_cm_lock);
cp->cp_flags = 0;
+ init_waitqueue_head(&cp->cp_up_waitq);
}
/*
@@ -913,7 +914,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
rcu_read_unlock();
return;
}
- if (rds_conn_path_state(cp) == RDS_CONN_DOWN)
+ if (rds_conn_path_down(cp))
rds_queue_reconnect(cp);
rcu_read_unlock();
}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 1fb27e1a2e46..85b47ce52266 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -134,6 +134,8 @@ struct rds_conn_path {
unsigned int cp_unacked_packets;
unsigned int cp_unacked_bytes;
unsigned int cp_index;
+
+ wait_queue_head_t cp_up_waitq; /* start up waitq */
};
/* One rds_connection per RDS address pair */
@@ -607,10 +609,11 @@ struct rds_sock {
struct rds_transport *rs_transport;
/*
- * rds_sendmsg caches the conn it used the last time around.
- * This helps avoid costly lookups.
+ * rds_sendmsg caches the conn and conn_path it used the last time
+ * around. This helps avoid costly lookups.
*/
struct rds_connection *rs_conn;
+ struct rds_conn_path *rs_conn_path;
/* flag indicating we were congested or not */
int rs_congested;
diff --git a/net/rds/send.c b/net/rds/send.c
index 6329cc8ec246..85ab9e32105e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1044,15 +1044,15 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
static int rds_send_mprds_hash(struct rds_sock *rs,
struct rds_connection *conn, int nonblock)
{
+ struct rds_conn_path *cp;
int hash;
if (conn->c_npaths == 0)
hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
else
hash = RDS_MPATH_HASH(rs, conn->c_npaths);
- if (conn->c_npaths == 0 && hash != 0) {
- rds_send_ping(conn, 0);
-
+ cp = &conn->c_path[hash];
+ if (!conn->c_npaths && rds_conn_path_down(cp)) {
/* The underlying connection is not up yet. Need to wait
* until it is up to be sure that the non-zero c_path can be
* used. But if we are interrupted, we have to use the zero
@@ -1066,10 +1066,19 @@ static int rds_send_mprds_hash(struct rds_sock *rs,
return 0;
if (wait_event_interruptible(conn->c_hs_waitq,
conn->c_npaths != 0))
- hash = 0;
+ return 0;
}
if (conn->c_npaths == 1)
hash = 0;
+
+ /* Wait until the chosen path is up. If it is interrupted,
+ * just return as this is an optimization to make sure that
+ * the message is sent.
+ */
+ cp = &conn->c_path[hash];
+ if (rds_conn_path_down(cp))
+ wait_event_interruptible(cp->cp_up_waitq,
+ !rds_conn_path_down(cp));
}
return hash;
}
@@ -1290,6 +1299,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
rs->rs_tos == rs->rs_conn->c_tos) {
conn = rs->rs_conn;
+ cpath = rs->rs_conn_path;
} else {
conn = rds_conn_create_outgoing(sock_net(sock->sk),
&rs->rs_bound_addr, &daddr,
@@ -1300,14 +1310,30 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
ret = PTR_ERR(conn);
goto out;
}
+ if (conn->c_trans->t_mp_capable) {
+ /* c_npaths == 0 if we have not talked to this peer
+ * before. Initiate a connection request to the
+ * peer right away.
+ */
+ if (!conn->c_npaths &&
+ rds_conn_path_down(&conn->c_path[0])) {
+ /* rds_connd_queue_reconnect_work() ensures
+ * that only one request is queued. And
+ * rds_send_ping() ensures that only one ping
+ * is outstanding.
+ */
+ rds_cond_queue_reconnect_work(&conn->c_path[0],
+ 0);
+ rds_send_ping(conn, 0);
+ }
+ cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, 0)];
+ } else {
+ cpath = &conn->c_path[0];
+ }
rs->rs_conn = conn;
+ rs->rs_conn_path = cpath;
}
- if (conn->c_trans->t_mp_capable)
- cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
- else
- cpath = &conn->c_path[0];
-
rm->m_conn_path = cpath;
/* Parse any control messages the user may have included. */
@@ -1335,7 +1361,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
if (rds_conn_path_down(cpath))
- rds_check_all_paths(conn);
+ rds_conn_path_connect_if_down(cpath);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) {
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index a0046e99d6df..97596a3c346a 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -73,6 +73,7 @@ void rds_tcp_state_change(struct sock *sk)
rds_conn_path_drop(cp, false);
} else {
rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ wake_up(&cp->cp_up_waitq);
}
break;
case TCP_CLOSE_WAIT:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index d89bd8d0c354..60c52322b896 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -211,6 +211,7 @@ int rds_tcp_accept_one(struct socket *sock)
} else {
rds_tcp_set_callbacks(new_sock, cp);
rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ wake_up(&cp->cp_up_waitq);
}
new_sock = NULL;
ret = 0;
--
2.43.0
* [PATCH 4/6] net/rds: No shortcut out of RDS_CONN_ERROR
2025-02-27 4:26 [PATCH 0/6] RDS bug fix collection allison.henderson
` (2 preceding siblings ...)
2025-02-27 4:26 ` [PATCH 3/6] net/rds: RDS/TCP does not initiate a connection allison.henderson
@ 2025-02-27 4:26 ` allison.henderson
2025-03-01 0:19 ` Jakub Kicinski
2025-02-27 4:26 ` [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages allison.henderson
2025-02-27 4:26 ` [PATCH 6/6] net/rds: Encode cp_index in TCP source port allison.henderson
5 siblings, 1 reply; 27+ messages in thread
From: allison.henderson @ 2025-02-27 4:26 UTC (permalink / raw)
To: netdev
From: Gerd Rausch <gerd.rausch@oracle.com>
RDS connections carry a state "rds_conn_path::cp_state"
and transitions from one state to another are conditional
upon an expected state: "rds_conn_path_transition"
There is one exception to this conditionality, which is
"RDS_CONN_ERROR" that can be enforced by "rds_conn_path_drop"
regardless of what state the connection is currently in.
But as soon as a connection enters state "RDS_CONN_ERROR",
the connection handling code expects it to go through the
shutdown-path.
The RDS/TCP multipath changes added a shortcut out of
"RDS_CONN_ERROR" straight back to "RDS_CONN_CONNECTING"
via "rds_tcp_accept_one_path" (e.g. after "rds_tcp_state_change").
A subsequent "rds_tcp_reset_callbacks" can then transition
the state to "RDS_CONN_RESETTING" with a shutdown-worker queued.
That'll trip up "rds_conn_shutdown", which was
never adjusted to handle "RDS_CONN_RESETTING" and subsequently
drops the connection and leaves "RDS_SHUTDOWN_WORK_QUEUED"
on forever.
So we do two things here:
a) Don't shortcut "RDS_CONN_ERROR", but take the longer
path through the shutdown code.
b) Add "RDS_CONN_RESETTING" to the expected states in
"rds_conn_init_shutdown" so that we won't get
stuck, if we ever hit weird state transitions like
this again.
Fixes: ("RDS: TCP: fix race windows in send-path quiescence by rds_tcp_accept_one()")
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/connection.c | 2 ++
net/rds/tcp_listen.c | 5 -----
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 84034a3c69bd..b262e6ef6b41 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -382,6 +382,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
if (!rds_conn_path_transition(cp, RDS_CONN_UP,
RDS_CONN_DISCONNECTING) &&
!rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_DISCONNECTING) &&
+ !rds_conn_path_transition(cp, RDS_CONN_RESETTING,
RDS_CONN_DISCONNECTING)) {
rds_conn_path_error(cp,
"shutdown called in state %d\n",
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 60c52322b896..886b5373843e 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -59,9 +59,6 @@ void rds_tcp_keepalive(struct socket *sock)
* socket and force a reconneect from smaller -> larger ip addr. The reason
* we special case cp_index 0 is to allow the rds probe ping itself to itself
* get through efficiently.
- * Since reconnects are only initiated from the node with the numerically
- * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
- * by moving them to CONNECTING in this function.
*/
static
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
@@ -86,8 +83,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
- RDS_CONN_CONNECTING) ||
- rds_conn_path_transition(cp, RDS_CONN_ERROR,
RDS_CONN_CONNECTING)) {
return cp->cp_transport_data;
}
--
2.43.0
* [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-02-27 4:26 [PATCH 0/6] RDS bug fix collection allison.henderson
` (3 preceding siblings ...)
2025-02-27 4:26 ` [PATCH 4/6] net/rds: No shortcut out of RDS_CONN_ERROR allison.henderson
@ 2025-02-27 4:26 ` allison.henderson
2025-03-01 0:21 ` Jakub Kicinski
` (2 more replies)
2025-02-27 4:26 ` [PATCH 6/6] net/rds: Encode cp_index in TCP source port allison.henderson
5 siblings, 3 replies; 27+ messages in thread
From: allison.henderson @ 2025-02-27 4:26 UTC (permalink / raw)
To: netdev
From: Gerd Rausch <gerd.rausch@oracle.com>
RDS/TCP differs from RDS/RDMA in that message acknowledgment
is done based on TCP sequence numbers:
As soon as the last byte of a message has been acknowledged
by the TCP stack of a peer, "rds_tcp_write_space()" goes on
to discard prior messages from the send queue.
Which is fine, for as long as the receiver never throws any messages away.
Unfortunately, that is *not* the case since the introduction of MPRDS:
commit "RDS: TCP: Enable multipath RDS for TCP"
A new function "rds_tcp_accept_one_path" was introduced,
which may return "NULL" if no connection path is currently
available.
Unfortunately, this happens after the "->accept()" call, and the new socket
often already contains messages, since the peer already transitioned
to "RDS_CONN_UP" on behalf of "TCP_ESTABLISHED".
That's also the case after this [1]:
commit "RDS: TCP: Force every connection to be initiated by numerically
smaller IP address"
which tried to address the situation of pending data by only transitioning
connections from a smaller IP address to "RDS_CONN_UP".
But even in those cases, and in particular if the "RDS_EXTHDR_NPATHS"
handshake has not occurred yet and we are therefore working with
"c_npaths <= 1", "c_path[0]" may be in a state distinct from
"RDS_CONN_DOWN", and then all messages on the just-accepted socket
will be tossed away.
This fix changes "rds_tcp_accept_one":
* If connected from a peer with a larger IP address, the new socket
will continue to get closed right away.
With commit [1] above, there should not be any messages
in the socket receive buffer, since the peer never transitioned
to "RDS_CONN_UP".
Therefore it should be okay to not make any efforts to dispatch
the socket receive buffer.
* If connected from a peer with a smaller IP address,
we call "rds_tcp_accept_one_path" to find a free slot/"path".
If found, business goes on as usual.
If none was found, we save/stash the newly accepted socket
into "rds_tcp_accepted_sock", in order to not lose any
messages that may have arrived already.
We then return from "rds_tcp_accept_one" with "-ENOBUFS".
Later on, when a slot/"path" does become available again
(e.g. state transitioned to "RDS_CONN_DOWN",
or HS extension header was received with "c_npaths > 1")
we call "rds_tcp_conn_slots_available" that simply re-issues
a "rds_tcp_accept_one_path" worker-callback and picks
up the new socket from "rds_tcp_accepted_sock", and thereby
continuing where it left with "-ENOBUFS" last time.
Because a new slot has become available, those messages
won't be lost: processing proceeds as if that slot
had been available the first time around (see the sketch below).
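A rough sketch of that flow, with simplified, hypothetical names (demo_*)
standing in for the actual rds_tcp structures and calls (the real changes,
which use sock_create_lite() and ops->accept(), are in the diff below):

	#include <linux/net.h>
	#include <linux/errno.h>
	#include <linux/fcntl.h>

	struct demo_path;

	struct demo_net {
		struct socket *listen_sock;
		struct socket *accepted_sock;	/* stashed by an earlier -ENOBUFS attempt */
	};

	struct demo_path *demo_find_free_path(struct demo_net *dn);	/* DOWN -> CONNECTING */
	void demo_attach_sock(struct demo_path *path, struct socket *sock);

	static int demo_accept_one(struct demo_net *dn)
	{
		struct socket *new_sock = dn->accepted_sock;
		struct demo_path *path;
		int ret;

		dn->accepted_sock = NULL;
		if (!new_sock) {
			/* nothing stashed: accept a fresh socket from the listener */
			ret = kernel_accept(dn->listen_sock, &new_sock, O_NONBLOCK);
			if (ret < 0)
				return ret;
		}

		path = demo_find_free_path(dn);
		if (!path) {
			/* No free slot: keep the socket (and the messages already
			 * queued on it) and retry once a slot frees up or the
			 * handshake raises c_npaths.
			 */
			dn->accepted_sock = new_sock;
			return -ENOBUFS;
		}

		demo_attach_sock(path, new_sock);	/* business as usual */
		return 0;
	}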
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Jack Vogel <jack.vogel@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/rds.h | 1 +
net/rds/recv.c | 4 ++
net/rds/tcp.c | 27 +++-----
net/rds/tcp.h | 22 +++++-
net/rds/tcp_listen.c | 160 ++++++++++++++++++++++++++++++-------------
5 files changed, 148 insertions(+), 66 deletions(-)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 85b47ce52266..422d5e26410e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -548,6 +548,7 @@ struct rds_transport {
__u32 scope_id);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
+ void (*conn_slots_available)(struct rds_connection *conn);
int (*conn_path_connect)(struct rds_conn_path *cp);
void (*conn_path_shutdown)(struct rds_conn_path *conn);
void (*xmit_path_prepare)(struct rds_conn_path *cp);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 5627f80013f8..c0a89af29d1b 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -230,6 +230,10 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
conn->c_npaths = max_t(int, conn->c_npaths, 1);
conn->c_ping_triggered = 0;
rds_conn_peer_gen_update(conn, new_peer_gen_num);
+
+ if (conn->c_npaths > 1 &&
+ conn->c_trans->conn_slots_available)
+ conn->c_trans->conn_slots_available(conn);
}
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index b3f2c6e27b59..915b35136693 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -221,6 +221,8 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
sock->sk->sk_data_ready = sock->sk->sk_user_data;
tc->t_sock = sock;
+ if (!tc->t_rtn)
+ tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid);
tc->t_cpath = cp;
tc->t_orig_data_ready = sock->sk->sk_data_ready;
tc->t_orig_write_space = sock->sk->sk_write_space;
@@ -386,6 +388,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
}
mutex_init(&tc->t_conn_path_lock);
tc->t_sock = NULL;
+ tc->t_rtn = NULL;
tc->t_tinc = NULL;
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
@@ -466,6 +469,7 @@ struct rds_transport rds_tcp_transport = {
.recv_path = rds_tcp_recv_path,
.conn_alloc = rds_tcp_conn_alloc,
.conn_free = rds_tcp_conn_free,
+ .conn_slots_available = rds_tcp_conn_slots_available,
.conn_path_connect = rds_tcp_conn_path_connect,
.conn_path_shutdown = rds_tcp_conn_path_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user,
@@ -481,17 +485,7 @@ struct rds_transport rds_tcp_transport = {
.t_unloading = rds_tcp_is_unloading,
};
-static unsigned int rds_tcp_netid;
-
-/* per-network namespace private data for this module */
-struct rds_tcp_net {
- struct socket *rds_tcp_listen_sock;
- struct work_struct rds_tcp_accept_w;
- struct ctl_table_header *rds_tcp_sysctl;
- struct ctl_table *ctl_table;
- int sndbuf_size;
- int rcvbuf_size;
-};
+int rds_tcp_netid;
/* All module specific customizations to the RDS-TCP socket should be done in
* rds_tcp_tune() and applied after socket creation.
@@ -538,15 +532,12 @@ static void rds_tcp_accept_worker(struct work_struct *work)
struct rds_tcp_net,
rds_tcp_accept_w);
- while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
+ while (rds_tcp_accept_one(rtn) == 0)
cond_resched();
}
-void rds_tcp_accept_work(struct sock *sk)
+void rds_tcp_accept_work(struct rds_tcp_net *rtn)
{
- struct net *net = sock_net(sk);
- struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
-
queue_work(rds_wq, &rtn->rds_tcp_accept_w);
}
@@ -558,6 +549,8 @@ static __net_init int rds_tcp_init_net(struct net *net)
memset(rtn, 0, sizeof(*rtn));
+ mutex_init(&rtn->rds_tcp_accept_lock);
+
/* {snd, rcv}buf_size default to 0, which implies we let the
* stack pick the value, and permit auto-tuning of buffer size.
*/
@@ -621,6 +614,8 @@ static void rds_tcp_kill_sock(struct net *net)
rtn->rds_tcp_listen_sock = NULL;
rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
+ if (rtn->rds_tcp_accepted_sock)
+ sock_release(rtn->rds_tcp_accepted_sock);
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 053aa7da87ef..2000f4acd57a 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -4,6 +4,21 @@
#define RDS_TCP_PORT 16385
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+ /* serialize "rds_tcp_accept_one" with "rds_tcp_accept_lock"
+ * to protect "rds_tcp_accepted_sock"
+ */
+ struct mutex rds_tcp_accept_lock;
+ struct socket *rds_tcp_listen_sock;
+ struct socket *rds_tcp_accepted_sock;
+ struct work_struct rds_tcp_accept_w;
+ struct ctl_table_header *rds_tcp_sysctl;
+ struct ctl_table *ctl_table;
+ int sndbuf_size;
+ int rcvbuf_size;
+};
+
struct rds_tcp_incoming {
struct rds_incoming ti_inc;
struct sk_buff_head ti_skb_list;
@@ -19,6 +34,7 @@ struct rds_tcp_connection {
*/
struct mutex t_conn_path_lock;
struct socket *t_sock;
+ struct rds_tcp_net *t_rtn;
void *t_orig_write_space;
void *t_orig_data_ready;
void *t_orig_state_change;
@@ -49,6 +65,7 @@ struct rds_tcp_statistics {
};
/* tcp.c */
+extern int rds_tcp_netid;
bool rds_tcp_tune(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
@@ -57,7 +74,7 @@ void rds_tcp_restore_callbacks(struct socket *sock,
u32 rds_tcp_write_seq(struct rds_tcp_connection *tc);
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
extern struct rds_transport rds_tcp_transport;
-void rds_tcp_accept_work(struct sock *sk);
+void rds_tcp_accept_work(struct rds_tcp_net *rtn);
int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
__u32 scope_id);
/* tcp_connect.c */
@@ -69,7 +86,8 @@ void rds_tcp_state_change(struct sock *sk);
struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
-int rds_tcp_accept_one(struct socket *sock);
+void rds_tcp_conn_slots_available(struct rds_connection *conn);
+int rds_tcp_accept_one(struct rds_tcp_net *rtn);
void rds_tcp_keepalive(struct socket *sock);
void *rds_tcp_listen_sock_def_readable(struct net *net);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 886b5373843e..e44384f0adf7 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -35,6 +35,8 @@
#include <linux/in.h>
#include <net/tcp.h>
#include <trace/events/sock.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include "rds.h"
#include "tcp.h"
@@ -66,32 +68,47 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
int i;
int npaths = max_t(int, 1, conn->c_npaths);
- /* for mprds, all paths MUST be initiated by the peer
- * with the smaller address.
- */
- if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
- /* Make sure we initiate at least one path if this
- * has not already been done; rds_start_mprds() will
- * take care of additional paths, if necessary.
- */
- if (npaths == 1)
- rds_conn_path_connect_if_down(&conn->c_path[0]);
- return NULL;
- }
-
for (i = 0; i < npaths; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
- RDS_CONN_CONNECTING)) {
+ RDS_CONN_CONNECTING))
return cp->cp_transport_data;
- }
}
return NULL;
}
-int rds_tcp_accept_one(struct socket *sock)
+void rds_tcp_conn_slots_available(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc;
+ struct rds_tcp_net *rtn;
+
+ smp_rmb();
+ if (test_bit(RDS_DESTROY_PENDING, &conn->c_path->cp_flags))
+ return;
+
+ tc = conn->c_path->cp_transport_data;
+ rtn = tc->t_rtn;
+ if (!rtn)
+ return;
+
+ /* As soon as a connection went down,
+ * it is safe to schedule a "rds_tcp_accept_one"
+ * attempt even if there are no connections pending:
+ * Function "rds_tcp_accept_one" won't block
+ * but simply return -EAGAIN in that case.
+ *
+ * Doing so is necessary to address the case where an
+ * incoming connection on "rds_tcp_listen_sock" is ready
+ * to be acccepted prior to a free slot being available:
+ * the -ENOBUFS case in "rds_tcp_accept_one".
+ */
+ rds_tcp_accept_work(rtn);
+}
+
+int rds_tcp_accept_one(struct rds_tcp_net *rtn)
{
+ struct socket *listen_sock = rtn->rds_tcp_listen_sock;
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
@@ -109,37 +126,45 @@ int rds_tcp_accept_one(struct socket *sock)
#endif
int dev_if = 0;
- if (!sock) /* module unload or netns delete in progress */
- return -ENETUNREACH;
+ mutex_lock(&rtn->rds_tcp_accept_lock);
- ret = sock_create_lite(sock->sk->sk_family,
- sock->sk->sk_type, sock->sk->sk_protocol,
- &new_sock);
- if (ret)
- goto out;
+ if (!listen_sock)
+ return -ENETUNREACH;
- ret = sock->ops->accept(sock, new_sock, &arg);
- if (ret < 0)
- goto out;
+ new_sock = rtn->rds_tcp_accepted_sock;
+ rtn->rds_tcp_accepted_sock = NULL;
+
+ if (!new_sock) {
+ ret = sock_create_lite(listen_sock->sk->sk_family,
+ listen_sock->sk->sk_type,
+ listen_sock->sk->sk_protocol,
+ &new_sock);
+ if (ret)
+ goto out;
+
+ ret = listen_sock->ops->accept(listen_sock, new_sock, &arg);
+ if (ret < 0)
+ goto out;
+
+ /* sock_create_lite() does not get a hold on the owner module so we
+ * need to do it here. Note that sock_release() uses sock->ops to
+ * determine if it needs to decrement the reference count. So set
+ * sock->ops after calling accept() in case that fails. And there's
+ * no need to do try_module_get() as the listener should have a hold
+ * already.
+ */
+ new_sock->ops = listen_sock->ops;
+ __module_get(new_sock->ops->owner);
- /* sock_create_lite() does not get a hold on the owner module so we
- * need to do it here. Note that sock_release() uses sock->ops to
- * determine if it needs to decrement the reference count. So set
- * sock->ops after calling accept() in case that fails. And there's
- * no need to do try_module_get() as the listener should have a hold
- * already.
- */
- new_sock->ops = sock->ops;
- __module_get(new_sock->ops->owner);
+ rds_tcp_keepalive(new_sock);
+ if (!rds_tcp_tune(new_sock)) {
+ ret = -EINVAL;
+ goto out;
+ }
- rds_tcp_keepalive(new_sock);
- if (!rds_tcp_tune(new_sock)) {
- ret = -EINVAL;
- goto out;
+ inet = inet_sk(new_sock->sk);
}
- inet = inet_sk(new_sock->sk);
-
#if IS_ENABLED(CONFIG_IPV6)
my_addr = &new_sock->sk->sk_v6_rcv_saddr;
peer_addr = &new_sock->sk->sk_v6_daddr;
@@ -150,7 +175,7 @@ int rds_tcp_accept_one(struct socket *sock)
peer_addr = &daddr;
#endif
rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
- sock->sk->sk_family,
+ listen_sock->sk->sk_family,
my_addr, ntohs(inet->inet_sport),
peer_addr, ntohs(inet->inet_dport));
@@ -170,13 +195,13 @@ int rds_tcp_accept_one(struct socket *sock)
}
#endif
- if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) {
+ if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) {
/* local address connection is only allowed via loopback */
ret = -EOPNOTSUPP;
goto out;
}
- conn = rds_conn_create(sock_net(sock->sk),
+ conn = rds_conn_create(sock_net(listen_sock->sk),
my_addr, peer_addr,
&rds_tcp_transport, 0, GFP_KERNEL, dev_if);
@@ -189,15 +214,51 @@ int rds_tcp_accept_one(struct socket *sock)
* If the client reboots, this conn will need to be cleaned up.
* rds_tcp_state_change() will do that cleanup
*/
- rs_tcp = rds_tcp_accept_one_path(conn);
- if (!rs_tcp)
+ if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) {
+ /* Try to obtain a free connection slot.
+ * If unsuccessful, we need to preserve "new_sock"
+ * that we just accepted, since its "sk_receive_queue"
+ * may contain messages already that have been acknowledged
+ * to and discarded by the sender.
+ * We must not throw those away!
+ */
+ rs_tcp = rds_tcp_accept_one_path(conn);
+ if (!rs_tcp) {
+ /* It's okay to stash "new_sock", since
+ * "rds_tcp_conn_slots_available" triggers "rds_tcp_accept_one"
+ * again as soon as one of the connection slots
+ * becomes available again
+ */
+ rtn->rds_tcp_accepted_sock = new_sock;
+ new_sock = NULL;
+ ret = -ENOBUFS;
+ goto out;
+ }
+ } else {
+ /* This connection request came from a peer with
+ * a larger address.
+ * Function "rds_tcp_state_change" makes sure
+ * that the connection doesn't transition
+ * to state "RDS_CONN_UP", and therefore
+ * we should not have received any messages
+ * on this socket yet.
+ * This is the only case where it's okay to
+ * not dequeue messages from "sk_receive_queue".
+ */
+ if (conn->c_npaths <= 1)
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
+ rs_tcp = NULL;
goto rst_nsk;
+ }
+
mutex_lock(&rs_tcp->t_conn_path_lock);
cp = rs_tcp->t_cpath;
conn_state = rds_conn_path_state(cp);
WARN_ON(conn_state == RDS_CONN_UP);
- if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) {
+ rds_conn_path_drop(cp, 0);
goto rst_nsk;
+ }
if (rs_tcp->t_sock) {
/* Duelling SYN has been handled in rds_tcp_accept_one() */
rds_tcp_reset_callbacks(new_sock, cp);
@@ -228,6 +289,9 @@ int rds_tcp_accept_one(struct socket *sock)
mutex_unlock(&rs_tcp->t_conn_path_lock);
if (new_sock)
sock_release(new_sock);
+
+ mutex_unlock(&rtn->rds_tcp_accept_lock);
+
return ret;
}
@@ -255,7 +319,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
* the listen socket is being torn down.
*/
if (sk->sk_state == TCP_LISTEN)
- rds_tcp_accept_work(sk);
+ rds_tcp_accept_work(net_generic(sock_net(sk), rds_tcp_netid));
else
ready = rds_tcp_listen_sock_def_readable(sock_net(sk));
--
2.43.0
* Re: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-02-27 4:26 ` [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages allison.henderson
@ 2025-03-01 0:21 ` Jakub Kicinski
2025-03-06 16:41 ` Allison Henderson
2025-03-01 23:22 ` kernel test robot
2025-03-04 10:28 ` Dan Carpenter
2 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2025-03-01 0:21 UTC (permalink / raw)
To: allison.henderson; +Cc: netdev
On Wed, 26 Feb 2025 21:26:37 -0700 allison.henderson@oracle.com wrote:
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index 85b47ce52266..422d5e26410e 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -548,6 +548,7 @@ struct rds_transport {
> __u32 scope_id);
> int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
> void (*conn_free)(void *data);
> + void (*conn_slots_available)(struct rds_connection *conn);
> int (*conn_path_connect)(struct rds_conn_path *cp);
> void (*conn_path_shutdown)(struct rds_conn_path *conn);
> void (*xmit_path_prepare)(struct rds_conn_path *cp);
This struct has a kdoc, you need to document the new member.
Or make the comment not a kdoc, if full documentation isn't necessary.
--
pw-bot: cr
* Re: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-03-01 0:21 ` Jakub Kicinski
@ 2025-03-06 16:41 ` Allison Henderson
0 siblings, 0 replies; 27+ messages in thread
From: Allison Henderson @ 2025-03-06 16:41 UTC (permalink / raw)
To: kuba@kernel.org; +Cc: netdev@vger.kernel.org
On Fri, 2025-02-28 at 16:21 -0800, Jakub Kicinski wrote:
> On Wed, 26 Feb 2025 21:26:37 -0700 allison.henderson@oracle.com wrote:
> > diff --git a/net/rds/rds.h b/net/rds/rds.h
> > index 85b47ce52266..422d5e26410e 100644
> > --- a/net/rds/rds.h
> > +++ b/net/rds/rds.h
> > @@ -548,6 +548,7 @@ struct rds_transport {
> > __u32 scope_id);
> > int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
> > void (*conn_free)(void *data);
> > + void (*conn_slots_available)(struct rds_connection *conn);
> > int (*conn_path_connect)(struct rds_conn_path *cp);
> > void (*conn_path_shutdown)(struct rds_conn_path *conn);
> > void (*xmit_path_prepare)(struct rds_conn_path *cp);
>
> This struct has a kdoc, you need to document the new member.
> Or make the comment not a kdoc, if full documentation isn't necessary.
Hi Jakub,
Sure, how about I break the kdoc into comments for their respective members and then add a comment for the new function
pointer? How does the below new comment sound:
/*
 * conn_slots_available is invoked when a previously unavailable connection slot
 * becomes available again. rds_tcp_accept_one may return -ENOBUFS if it
 * cannot find an available slot, in which case it stashes the newly accepted
 * socket in "rds_tcp_accepted_sock". This function re-issues "rds_tcp_accept_one",
 * which picks up the stashed socket and continues where it left off with
 * -ENOBUFS last time. This ensures messages received on the new socket are not
 * discarded when no connection path was available at the time.
 */
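In context, the struct would then look roughly like this (sketch only;
neighbouring members shown just for placement, wording still open):

struct rds_transport {
	...
	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
	void (*conn_free)(void *data);
	/* invoked when a previously unavailable connection slot becomes
	 * available again, so that a socket stashed by rds_tcp_accept_one()
	 * in its -ENOBUFS case can be picked up and processed
	 */
	void (*conn_slots_available)(struct rds_connection *conn);
	int (*conn_path_connect)(struct rds_conn_path *cp);
	void (*conn_path_shutdown)(struct rds_conn_path *conn);
	...
};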
Let me know what you think. Thanks!
Allison
* Re: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-02-27 4:26 ` [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages allison.henderson
2025-03-01 0:21 ` Jakub Kicinski
@ 2025-03-01 23:22 ` kernel test robot
2025-03-05 0:43 ` Allison Henderson
2025-03-04 10:28 ` Dan Carpenter
2 siblings, 1 reply; 27+ messages in thread
From: kernel test robot @ 2025-03-01 23:22 UTC (permalink / raw)
To: allison.henderson, netdev; +Cc: llvm, oe-kbuild-all
Hi,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net/main]
[also build test WARNING on net-next/main linus/master v6.14-rc4 next-20250228]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/allison-henderson-oracle-com/net-rds-Avoid-queuing-superfluous-send-and-recv-work/20250227-123038
base: net/main
patch link: https://lore.kernel.org/r/20250227042638.82553-6-allison.henderson%40oracle.com
patch subject: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
config: riscv-randconfig-002-20250302 (https://download.01.org/0day-ci/archive/20250302/202503020739.xWWyG7zO-lkp@intel.com/config)
compiler: clang version 16.0.6 (https://github.com/llvm/llvm-project 7cbf1a2591520c2491aa35339f227775f4d3adf6)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250302/202503020739.xWWyG7zO-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503020739.xWWyG7zO-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> net/rds/tcp_listen.c:137:6: warning: variable 'inet' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
if (!new_sock) {
^~~~~~~~~
net/rds/tcp_listen.c:179:19: note: uninitialized use occurs here
my_addr, ntohs(inet->inet_sport),
^~~~
include/linux/byteorder/generic.h:142:27: note: expanded from macro 'ntohs'
#define ntohs(x) ___ntohs(x)
^
include/linux/byteorder/generic.h:137:35: note: expanded from macro '___ntohs'
#define ___ntohs(x) __be16_to_cpu(x)
^
include/uapi/linux/byteorder/little_endian.h:43:59: note: expanded from macro '__be16_to_cpu'
#define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x))
^
include/uapi/linux/swab.h:105:31: note: expanded from macro '__swab16'
(__u16)(__builtin_constant_p(x) ? \
^
net/rds/tcp_listen.c:137:2: note: remove the 'if' if its condition is always true
if (!new_sock) {
^~~~~~~~~~~~~~~
net/rds/tcp_listen.c:115:24: note: initialize the variable 'inet' to silence this warning
struct inet_sock *inet;
^
= NULL
1 warning generated.
vim +137 net/rds/tcp_listen.c
108
109 int rds_tcp_accept_one(struct rds_tcp_net *rtn)
110 {
111 struct socket *listen_sock = rtn->rds_tcp_listen_sock;
112 struct socket *new_sock = NULL;
113 struct rds_connection *conn;
114 int ret;
115 struct inet_sock *inet;
116 struct rds_tcp_connection *rs_tcp = NULL;
117 int conn_state;
118 struct rds_conn_path *cp;
119 struct in6_addr *my_addr, *peer_addr;
120 struct proto_accept_arg arg = {
121 .flags = O_NONBLOCK,
122 .kern = true,
123 };
124 #if !IS_ENABLED(CONFIG_IPV6)
125 struct in6_addr saddr, daddr;
126 #endif
127 int dev_if = 0;
128
129 mutex_lock(&rtn->rds_tcp_accept_lock);
130
131 if (!listen_sock)
132 return -ENETUNREACH;
133
134 new_sock = rtn->rds_tcp_accepted_sock;
135 rtn->rds_tcp_accepted_sock = NULL;
136
> 137 if (!new_sock) {
138 ret = sock_create_lite(listen_sock->sk->sk_family,
139 listen_sock->sk->sk_type,
140 listen_sock->sk->sk_protocol,
141 &new_sock);
142 if (ret)
143 goto out;
144
145 ret = listen_sock->ops->accept(listen_sock, new_sock, &arg);
146 if (ret < 0)
147 goto out;
148
149 /* sock_create_lite() does not get a hold on the owner module so we
150 * need to do it here. Note that sock_release() uses sock->ops to
151 * determine if it needs to decrement the reference count. So set
152 * sock->ops after calling accept() in case that fails. And there's
153 * no need to do try_module_get() as the listener should have a hold
154 * already.
155 */
156 new_sock->ops = listen_sock->ops;
157 __module_get(new_sock->ops->owner);
158
159 rds_tcp_keepalive(new_sock);
160 if (!rds_tcp_tune(new_sock)) {
161 ret = -EINVAL;
162 goto out;
163 }
164
165 inet = inet_sk(new_sock->sk);
166 }
167
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-03-01 23:22 ` kernel test robot
@ 2025-03-05 0:43 ` Allison Henderson
0 siblings, 0 replies; 27+ messages in thread
From: Allison Henderson @ 2025-03-05 0:43 UTC (permalink / raw)
To: netdev@vger.kernel.org, lkp@intel.com
Cc: llvm@lists.linux.dev, oe-kbuild-all@lists.linux.dev
On Sun, 2025-03-02 at 07:22 +0800, kernel test robot wrote:
> Hi,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on net/main]
> [also build test WARNING on net-next/main linus/master v6.14-rc4 next-20250228]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://urldefense.com/v3/__https://git-scm.com/docs/git-format-patch*_base_tree_information__;Iw!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-C8voWMQ$ ]
>
> url: https://urldefense.com/v3/__https://github.com/intel-lab-lkp/linux/commits/allison-henderson-oracle-com/net-rds-Avoid-queuing-superfluous-send-and-recv-work/20250227-123038__;!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-YEva8Ys$
> base: net/main
> patch link: https://urldefense.com/v3/__https://lore.kernel.org/r/20250227042638.82553-6-allison.henderson*40oracle.com__;JQ!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-O0k5mu0$
> patch subject: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
> config: riscv-randconfig-002-20250302 (https://urldefense.com/v3/__https://download.01.org/0day-ci/archive/20250302/202503020739.xWWyG7zO-lkp@intel.com/config__;!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-4P6Kfjw$ )
> compiler: clang version 16.0.6 (https://urldefense.com/v3/__https://github.com/llvm/llvm-project__;!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-p4RKZfU$ 7cbf1a2591520c2491aa35339f227775f4d3adf6)
> reproduce (this is a W=1 build): (https://urldefense.com/v3/__https://download.01.org/0day-ci/archive/20250302/202503020739.xWWyG7zO-lkp@intel.com/reproduce__;!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-3tTQLjw$ )
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> > Reported-by: kernel test robot <lkp@intel.com>
> > Closes: https://urldefense.com/v3/__https://lore.kernel.org/oe-kbuild-all/202503020739.xWWyG7zO-lkp@intel.com/__;!!ACWV5N9M2RV99hQ!N3fhQzk3WA-OAbKUbFqZo2JRgVJCSPX22UjSEDrigRVQ8QvFOo479ljGaZ4Xkdig5e4AXzn-gr3kV7I$
>
> All warnings (new ones prefixed by >>):
>
> > > net/rds/tcp_listen.c:137:6: warning: variable 'inet' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
> if (!new_sock) {
> ^~~~~~~~~
> net/rds/tcp_listen.c:179:19: note: uninitialized use occurs here
> my_addr, ntohs(inet->inet_sport),
> ^~~~
> include/linux/byteorder/generic.h:142:27: note: expanded from macro 'ntohs'
> #define ntohs(x) ___ntohs(x)
> ^
> include/linux/byteorder/generic.h:137:35: note: expanded from macro '___ntohs'
> #define ___ntohs(x) __be16_to_cpu(x)
> ^
> include/uapi/linux/byteorder/little_endian.h:43:59: note: expanded from macro '__be16_to_cpu'
> #define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x))
> ^
> include/uapi/linux/swab.h:105:31: note: expanded from macro '__swab16'
> (__u16)(__builtin_constant_p(x) ? \
> ^
> net/rds/tcp_listen.c:137:2: note: remove the 'if' if its condition is always true
> if (!new_sock) {
> ^~~~~~~~~~~~~~~
> net/rds/tcp_listen.c:115:24: note: initialize the variable 'inet' to silence this warning
> struct inet_sock *inet;
> ^
> = NULL
> 1 warning generated.
>
>
> vim +137 net/rds/tcp_listen.c
>
> 108
> 109 int rds_tcp_accept_one(struct rds_tcp_net *rtn)
> 110 {
> 111 struct socket *listen_sock = rtn->rds_tcp_listen_sock;
> 112 struct socket *new_sock = NULL;
> 113 struct rds_connection *conn;
> 114 int ret;
> 115 struct inet_sock *inet;
> 116 struct rds_tcp_connection *rs_tcp = NULL;
> 117 int conn_state;
> 118 struct rds_conn_path *cp;
> 119 struct in6_addr *my_addr, *peer_addr;
> 120 struct proto_accept_arg arg = {
> 121 .flags = O_NONBLOCK,
> 122 .kern = true,
> 123 };
> 124 #if !IS_ENABLED(CONFIG_IPV6)
> 125 struct in6_addr saddr, daddr;
> 126 #endif
> 127 int dev_if = 0;
> 128
> 129 mutex_lock(&rtn->rds_tcp_accept_lock);
> 130
> 131 if (!listen_sock)
> 132 return -ENETUNREACH;
> 133
> 134 new_sock = rtn->rds_tcp_accepted_sock;
> 135 rtn->rds_tcp_accepted_sock = NULL;
> 136
> > 137 if (!new_sock) {
> 138 ret = sock_create_lite(listen_sock->sk->sk_family,
> 139 listen_sock->sk->sk_type,
> 140 listen_sock->sk->sk_protocol,
> 141 &new_sock);
> 142 if (ret)
> 143 goto out;
> 144
> 145 ret = listen_sock->ops->accept(listen_sock, new_sock, &arg);
> 146 if (ret < 0)
> 147 goto out;
> 148
> 149 /* sock_create_lite() does not get a hold on the owner module so we
> 150 * need to do it here. Note that sock_release() uses sock->ops to
> 151 * determine if it needs to decrement the reference count. So set
> 152 * sock->ops after calling accept() in case that fails. And there's
> 153 * no need to do try_module_get() as the listener should have a hold
> 154 * already.
> 155 */
> 156 new_sock->ops = listen_sock->ops;
> 157 __module_get(new_sock->ops->owner);
> 158
> 159 rds_tcp_keepalive(new_sock);
> 160 if (!rds_tcp_tune(new_sock)) {
> 161 ret = -EINVAL;
> 162 goto out;
> 163 }
> 164
> 165 inet = inet_sk(new_sock->sk);
> 166 }
I think just moving the inet assignment below the last bracket here should correct this warning. Will fix :-)
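Roughly (sketch of the intended change):

	if (!new_sock) {
		...
		rds_tcp_keepalive(new_sock);
		if (!rds_tcp_tune(new_sock)) {
			ret = -EINVAL;
			goto out;
		}
	}

	/* valid for both a freshly accepted socket and a stashed one */
	inet = inet_sk(new_sock->sk);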
Thanks!
Allison
> 167
>
* Re: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-02-27 4:26 ` [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages allison.henderson
2025-03-01 0:21 ` Jakub Kicinski
2025-03-01 23:22 ` kernel test robot
@ 2025-03-04 10:28 ` Dan Carpenter
2025-03-05 0:39 ` Allison Henderson
2 siblings, 1 reply; 27+ messages in thread
From: Dan Carpenter @ 2025-03-04 10:28 UTC (permalink / raw)
To: oe-kbuild, allison.henderson, netdev; +Cc: lkp, oe-kbuild-all
Hi,
kernel test robot noticed the following build warnings:
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/allison-henderson-oracle-com/net-rds-Avoid-queuing-superfluous-send-and-recv-work/20250227-123038
base: net/main
patch link: https://lore.kernel.org/r/20250227042638.82553-6-allison.henderson%40oracle.com
patch subject: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
config: x86_64-randconfig-161-20250304 (https://download.01.org/0day-ci/archive/20250304/202503041749.YwkRic8W-lkp@intel.com/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202503041749.YwkRic8W-lkp@intel.com/
smatch warnings:
net/rds/tcp_listen.c:295 rds_tcp_accept_one() warn: inconsistent returns '&rtn->rds_tcp_accept_lock'.
vim +/new_sock +156 net/rds/tcp_listen.c
112b9a7a012048 Gerd Rausch 2025-02-26 109 int rds_tcp_accept_one(struct rds_tcp_net *rtn)
70041088e3b976 Andy Grover 2009-08-21 110 {
112b9a7a012048 Gerd Rausch 2025-02-26 111 struct socket *listen_sock = rtn->rds_tcp_listen_sock;
70041088e3b976 Andy Grover 2009-08-21 112 struct socket *new_sock = NULL;
70041088e3b976 Andy Grover 2009-08-21 113 struct rds_connection *conn;
70041088e3b976 Andy Grover 2009-08-21 114 int ret;
70041088e3b976 Andy Grover 2009-08-21 115 struct inet_sock *inet;
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 116 struct rds_tcp_connection *rs_tcp = NULL;
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 117 int conn_state;
ea3b1ea5393087 Sowmini Varadhan 2016-06-30 118 struct rds_conn_path *cp;
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 119 struct in6_addr *my_addr, *peer_addr;
92ef0fd55ac80d Jens Axboe 2024-05-09 120 struct proto_accept_arg arg = {
92ef0fd55ac80d Jens Axboe 2024-05-09 121 .flags = O_NONBLOCK,
92ef0fd55ac80d Jens Axboe 2024-05-09 122 .kern = true,
92ef0fd55ac80d Jens Axboe 2024-05-09 123 };
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 124 #if !IS_ENABLED(CONFIG_IPV6)
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 125 struct in6_addr saddr, daddr;
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 126 #endif
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 127 int dev_if = 0;
70041088e3b976 Andy Grover 2009-08-21 128
112b9a7a012048 Gerd Rausch 2025-02-26 129 mutex_lock(&rtn->rds_tcp_accept_lock);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
112b9a7a012048 Gerd Rausch 2025-02-26 130
112b9a7a012048 Gerd Rausch 2025-02-26 131 if (!listen_sock)
37e14f4fe2991f Sowmini Varadhan 2016-05-18 132 return -ENETUNREACH;
^^^^^^^^^^^^^^^^^^^^
Do this before taking the lock?
37e14f4fe2991f Sowmini Varadhan 2016-05-18 133
112b9a7a012048 Gerd Rausch 2025-02-26 134 new_sock = rtn->rds_tcp_accepted_sock;
112b9a7a012048 Gerd Rausch 2025-02-26 135 rtn->rds_tcp_accepted_sock = NULL;
112b9a7a012048 Gerd Rausch 2025-02-26 136
112b9a7a012048 Gerd Rausch 2025-02-26 @137 if (!new_sock) {
112b9a7a012048 Gerd Rausch 2025-02-26 138 ret = sock_create_lite(listen_sock->sk->sk_family,
112b9a7a012048 Gerd Rausch 2025-02-26 139 listen_sock->sk->sk_type,
112b9a7a012048 Gerd Rausch 2025-02-26 140 listen_sock->sk->sk_protocol,
d5a8ac28a7ff2f Sowmini Varadhan 2015-08-05 141 &new_sock);
70041088e3b976 Andy Grover 2009-08-21 142 if (ret)
70041088e3b976 Andy Grover 2009-08-21 143 goto out;
70041088e3b976 Andy Grover 2009-08-21 144
112b9a7a012048 Gerd Rausch 2025-02-26 145 ret = listen_sock->ops->accept(listen_sock, new_sock, &arg);
70041088e3b976 Andy Grover 2009-08-21 146 if (ret < 0)
70041088e3b976 Andy Grover 2009-08-21 147 goto out;
70041088e3b976 Andy Grover 2009-08-21 148
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 149 /* sock_create_lite() does not get a hold on the owner module so we
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 150 * need to do it here. Note that sock_release() uses sock->ops to
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 151 * determine if it needs to decrement the reference count. So set
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 152 * sock->ops after calling accept() in case that fails. And there's
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 153 * no need to do try_module_get() as the listener should have a hold
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 154 * already.
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 155 */
112b9a7a012048 Gerd Rausch 2025-02-26 @156 new_sock->ops = listen_sock->ops;
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 157 __module_get(new_sock->ops->owner);
84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 158
480aeb9639d6a0 Christoph Hellwig 2020-05-28 159 rds_tcp_keepalive(new_sock);
6997fbd7a3dafa Tetsuo Handa 2022-05-05 160 if (!rds_tcp_tune(new_sock)) {
6997fbd7a3dafa Tetsuo Handa 2022-05-05 161 ret = -EINVAL;
6997fbd7a3dafa Tetsuo Handa 2022-05-05 162 goto out;
6997fbd7a3dafa Tetsuo Handa 2022-05-05 163 }
70041088e3b976 Andy Grover 2009-08-21 164
70041088e3b976 Andy Grover 2009-08-21 165 inet = inet_sk(new_sock->sk);
112b9a7a012048 Gerd Rausch 2025-02-26 166 }
70041088e3b976 Andy Grover 2009-08-21 167
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 168 #if IS_ENABLED(CONFIG_IPV6)
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 169 my_addr = &new_sock->sk->sk_v6_rcv_saddr;
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 170 peer_addr = &new_sock->sk->sk_v6_daddr;
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 171 #else
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 172 ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 173 ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 174 my_addr = &saddr;
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 175 peer_addr = &daddr;
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 176 #endif
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 177 rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
112b9a7a012048 Gerd Rausch 2025-02-26 178 listen_sock->sk->sk_family,
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 179 my_addr, ntohs(inet->inet_sport),
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 180 peer_addr, ntohs(inet->inet_dport));
70041088e3b976 Andy Grover 2009-08-21 181
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 182 #if IS_ENABLED(CONFIG_IPV6)
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 183 /* sk_bound_dev_if is not set if the peer address is not link local
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 184 * address. In this case, it happens that mcast_oif is set. So
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 185 * just use it.
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 186 */
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 187 if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 188 !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 189 struct ipv6_pinfo *inet6;
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 190
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 191 inet6 = inet6_sk(new_sock->sk);
d2f011a0bf28c0 Eric Dumazet 2023-12-08 192 dev_if = READ_ONCE(inet6->mcast_oif);
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 193 } else {
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 194 dev_if = new_sock->sk->sk_bound_dev_if;
1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 195 }
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 196 #endif
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 197
112b9a7a012048 Gerd Rausch 2025-02-26 198 if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) {
aced3ce57cd37b Rao Shoaib 2021-05-21 199 /* local address connection is only allowed via loopback */
aced3ce57cd37b Rao Shoaib 2021-05-21 200 ret = -EOPNOTSUPP;
aced3ce57cd37b Rao Shoaib 2021-05-21 201 goto out;
aced3ce57cd37b Rao Shoaib 2021-05-21 202 }
aced3ce57cd37b Rao Shoaib 2021-05-21 203
112b9a7a012048 Gerd Rausch 2025-02-26 204 conn = rds_conn_create(sock_net(listen_sock->sk),
e65d4d96334e3f Ka-Cheong Poon 2018-07-30 205 my_addr, peer_addr,
3eb450367d0823 Santosh Shilimkar 2018-10-23 206 &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
eee2fa6ab32251 Ka-Cheong Poon 2018-07-23 207
70041088e3b976 Andy Grover 2009-08-21 208 if (IS_ERR(conn)) {
70041088e3b976 Andy Grover 2009-08-21 209 ret = PTR_ERR(conn);
70041088e3b976 Andy Grover 2009-08-21 210 goto out;
70041088e3b976 Andy Grover 2009-08-21 211 }
f711a6ae062cae Sowmini Varadhan 2015-05-05 212 /* An incoming SYN request came in, and TCP just accepted it.
f711a6ae062cae Sowmini Varadhan 2015-05-05 213 *
f711a6ae062cae Sowmini Varadhan 2015-05-05 214 * If the client reboots, this conn will need to be cleaned up.
f711a6ae062cae Sowmini Varadhan 2015-05-05 215 * rds_tcp_state_change() will do that cleanup
f711a6ae062cae Sowmini Varadhan 2015-05-05 216 */
112b9a7a012048 Gerd Rausch 2025-02-26 217 if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) {
112b9a7a012048 Gerd Rausch 2025-02-26 218 /* Try to obtain a free connection slot.
112b9a7a012048 Gerd Rausch 2025-02-26 219 * If unsuccessful, we need to preserve "new_sock"
112b9a7a012048 Gerd Rausch 2025-02-26 220 * that we just accepted, since its "sk_receive_queue"
112b9a7a012048 Gerd Rausch 2025-02-26 221 * may contain messages already that have been acknowledged
112b9a7a012048 Gerd Rausch 2025-02-26 222 * to and discarded by the sender.
112b9a7a012048 Gerd Rausch 2025-02-26 223 * We must not throw those away!
112b9a7a012048 Gerd Rausch 2025-02-26 224 */
5916e2c1554f3e Sowmini Varadhan 2016-07-14 225 rs_tcp = rds_tcp_accept_one_path(conn);
112b9a7a012048 Gerd Rausch 2025-02-26 226 if (!rs_tcp) {
112b9a7a012048 Gerd Rausch 2025-02-26 227 /* It's okay to stash "new_sock", since
112b9a7a012048 Gerd Rausch 2025-02-26 228 * "rds_tcp_conn_slots_available" triggers "rds_tcp_accept_one"
112b9a7a012048 Gerd Rausch 2025-02-26 229 * again as soon as one of the connection slots
112b9a7a012048 Gerd Rausch 2025-02-26 230 * becomes available again
112b9a7a012048 Gerd Rausch 2025-02-26 231 */
112b9a7a012048 Gerd Rausch 2025-02-26 232 rtn->rds_tcp_accepted_sock = new_sock;
112b9a7a012048 Gerd Rausch 2025-02-26 233 new_sock = NULL;
112b9a7a012048 Gerd Rausch 2025-02-26 234 ret = -ENOBUFS;
112b9a7a012048 Gerd Rausch 2025-02-26 235 goto out;
112b9a7a012048 Gerd Rausch 2025-02-26 236 }
112b9a7a012048 Gerd Rausch 2025-02-26 237 } else {
112b9a7a012048 Gerd Rausch 2025-02-26 238 /* This connection request came from a peer with
112b9a7a012048 Gerd Rausch 2025-02-26 239 * a larger address.
112b9a7a012048 Gerd Rausch 2025-02-26 240 * Function "rds_tcp_state_change" makes sure
112b9a7a012048 Gerd Rausch 2025-02-26 241 * that the connection doesn't transition
112b9a7a012048 Gerd Rausch 2025-02-26 242 * to state "RDS_CONN_UP", and therefore
112b9a7a012048 Gerd Rausch 2025-02-26 243 * we should not have received any messages
112b9a7a012048 Gerd Rausch 2025-02-26 244 * on this socket yet.
112b9a7a012048 Gerd Rausch 2025-02-26 245 * This is the only case where it's okay to
112b9a7a012048 Gerd Rausch 2025-02-26 246 * not dequeue messages from "sk_receive_queue".
112b9a7a012048 Gerd Rausch 2025-02-26 247 */
112b9a7a012048 Gerd Rausch 2025-02-26 248 if (conn->c_npaths <= 1)
112b9a7a012048 Gerd Rausch 2025-02-26 249 rds_conn_path_connect_if_down(&conn->c_path[0]);
112b9a7a012048 Gerd Rausch 2025-02-26 250 rs_tcp = NULL;
5916e2c1554f3e Sowmini Varadhan 2016-07-14 251 goto rst_nsk;
112b9a7a012048 Gerd Rausch 2025-02-26 252 }
112b9a7a012048 Gerd Rausch 2025-02-26 253
02105b2ccdd634 Sowmini Varadhan 2016-06-30 254 mutex_lock(&rs_tcp->t_conn_path_lock);
5916e2c1554f3e Sowmini Varadhan 2016-07-14 255 cp = rs_tcp->t_cpath;
5916e2c1554f3e Sowmini Varadhan 2016-07-14 256 conn_state = rds_conn_path_state(cp);
1a0e100fb2c966 Sowmini Varadhan 2016-11-16 257 WARN_ON(conn_state == RDS_CONN_UP);
112b9a7a012048 Gerd Rausch 2025-02-26 258 if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) {
112b9a7a012048 Gerd Rausch 2025-02-26 259 rds_conn_path_drop(cp, 0);
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 260 goto rst_nsk;
112b9a7a012048 Gerd Rausch 2025-02-26 261 }
eb192840266fab Sowmini Varadhan 2016-05-02 262 if (rs_tcp->t_sock) {
41500c3e2a19ff Sowmini Varadhan 2017-06-15 263 /* Duelling SYN has been handled in rds_tcp_accept_one() */
ea3b1ea5393087 Sowmini Varadhan 2016-06-30 264 rds_tcp_reset_callbacks(new_sock, cp);
9c79440e2c5e25 Sowmini Varadhan 2016-06-04 265 /* rds_connect_path_complete() marks RDS_CONN_UP */
ea3b1ea5393087 Sowmini Varadhan 2016-06-30 266 rds_connect_path_complete(cp, RDS_CONN_RESETTING);
335b48d980f631 Sowmini Varadhan 2016-06-04 267 } else {
ea3b1ea5393087 Sowmini Varadhan 2016-06-30 268 rds_tcp_set_callbacks(new_sock, cp);
ea3b1ea5393087 Sowmini Varadhan 2016-06-30 269 rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
e70845e6ecd132 Ka-Cheong Poon 2025-02-26 270 wake_up(&cp->cp_up_waitq);
335b48d980f631 Sowmini Varadhan 2016-06-04 271 }
70041088e3b976 Andy Grover 2009-08-21 272 new_sock = NULL;
70041088e3b976 Andy Grover 2009-08-21 273 ret = 0;
69b92b5b741984 Sowmini Varadhan 2017-06-21 274 if (conn->c_npaths == 0)
69b92b5b741984 Sowmini Varadhan 2017-06-21 275 rds_send_ping(cp->cp_conn, cp->cp_index);
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 276 goto out;
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 277 rst_nsk:
10beea7d7408d0 Sowmini Varadhan 2017-06-15 278 /* reset the newly returned accept sock and bail.
10beea7d7408d0 Sowmini Varadhan 2017-06-15 279 * It is safe to set linger on new_sock because the RDS connection
10beea7d7408d0 Sowmini Varadhan 2017-06-15 280 * has not been brought up on new_sock, so no RDS-level data could
10beea7d7408d0 Sowmini Varadhan 2017-06-15 281 * be pending on it. By setting linger, we achieve the side-effect
10beea7d7408d0 Sowmini Varadhan 2017-06-15 282 * of avoiding TIME_WAIT state on new_sock.
10beea7d7408d0 Sowmini Varadhan 2017-06-15 283 */
c433594c07457d Christoph Hellwig 2020-05-28 284 sock_no_linger(new_sock->sk);
335b48d980f631 Sowmini Varadhan 2016-06-04 285 kernel_sock_shutdown(new_sock, SHUT_RDWR);
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 286 ret = 0;
70041088e3b976 Andy Grover 2009-08-21 287 out:
bd7c5f983f3185 Sowmini Varadhan 2016-05-02 288 if (rs_tcp)
02105b2ccdd634 Sowmini Varadhan 2016-06-30 289 mutex_unlock(&rs_tcp->t_conn_path_lock);
70041088e3b976 Andy Grover 2009-08-21 290 if (new_sock)
70041088e3b976 Andy Grover 2009-08-21 291 sock_release(new_sock);
112b9a7a012048 Gerd Rausch 2025-02-26 292
112b9a7a012048 Gerd Rausch 2025-02-26 293 mutex_unlock(&rtn->rds_tcp_accept_lock);
112b9a7a012048 Gerd Rausch 2025-02-26 294
70041088e3b976 Andy Grover 2009-08-21 @295 return ret;
70041088e3b976 Andy Grover 2009-08-21 296 }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
2025-03-04 10:28 ` Dan Carpenter
@ 2025-03-05 0:39 ` Allison Henderson
0 siblings, 0 replies; 27+ messages in thread
From: Allison Henderson @ 2025-03-05 0:39 UTC (permalink / raw)
To: dan.carpenter@linaro.org, netdev@vger.kernel.org,
oe-kbuild@lists.linux.dev
Cc: lkp@intel.com, oe-kbuild-all@lists.linux.dev
On Tue, 2025-03-04 at 13:28 +0300, Dan Carpenter wrote:
> Hi,
>
> kernel test robot noticed the following build warnings:
>
> https://urldefense.com/v3/__https://git-scm.com/docs/git-format-patch*_base_tree_information__;Iw!!ACWV5N9M2RV99hQ!KxXf0dDbI8bmxH2KHU1J7gopr6EmtIqkgdl5UKsE1BS29rZeqrqJGWZJPQxyKauzGVdgxG8sw0TgBbaK6hO2wPnB5xus5w$ ]
>
> url: https://urldefense.com/v3/__https://github.com/intel-lab-lkp/linux/commits/allison-henderson-oracle-com/net-rds-Avoid-queuing-superfluous-send-and-recv-work/20250227-123038__;!!ACWV5N9M2RV99hQ!KxXf0dDbI8bmxH2KHU1J7gopr6EmtIqkgdl5UKsE1BS29rZeqrqJGWZJPQxyKauzGVdgxG8sw0TgBbaK6hO2wPlkiiOVHg$
> base: net/main
> patch link: https://urldefense.com/v3/__https://lore.kernel.org/r/20250227042638.82553-6-allison.henderson*40oracle.com__;JQ!!ACWV5N9M2RV99hQ!KxXf0dDbI8bmxH2KHU1J7gopr6EmtIqkgdl5UKsE1BS29rZeqrqJGWZJPQxyKauzGVdgxG8sw0TgBbaK6hO2wPl-4cL2HA$
> patch subject: [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages
> config: x86_64-randconfig-161-20250304 (https://urldefense.com/v3/__https://download.01.org/0day-ci/archive/20250304/202503041749.YwkRic8W-lkp@intel.com/config__;!!ACWV5N9M2RV99hQ!KxXf0dDbI8bmxH2KHU1J7gopr6EmtIqkgdl5UKsE1BS29rZeqrqJGWZJPQxyKauzGVdgxG8sw0TgBbaK6hO2wPko9q8sNw$ )
> compiler: clang version 19.1.7 (https://urldefense.com/v3/__https://github.com/llvm/llvm-project__;!!ACWV5N9M2RV99hQ!KxXf0dDbI8bmxH2KHU1J7gopr6EmtIqkgdl5UKsE1BS29rZeqrqJGWZJPQxyKauzGVdgxG8sw0TgBbaK6hO2wPlPIG0jYA$ cd708029e0b2869e80abe31ddb175f7c35361f90)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> > Reported-by: kernel test robot <lkp@intel.com>
> > Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> > Closes: https://urldefense.com/v3/__https://lore.kernel.org/r/202503041749.YwkRic8W-lkp@intel.com/__;!!ACWV5N9M2RV99hQ!KxXf0dDbI8bmxH2KHU1J7gopr6EmtIqkgdl5UKsE1BS29rZeqrqJGWZJPQxyKauzGVdgxG8sw0TgBbaK6hO2wPnrmKrVLA$
>
> smatch warnings:
> net/rds/tcp_listen.c:295 rds_tcp_accept_one() warn: inconsistent returns '&rtn->rds_tcp_accept_lock'.
>
> vim +/new_sock +156 net/rds/tcp_listen.c
>
> 112b9a7a012048 Gerd Rausch 2025-02-26 109 int rds_tcp_accept_one(struct rds_tcp_net *rtn)
> 70041088e3b976 Andy Grover 2009-08-21 110 {
> 112b9a7a012048 Gerd Rausch 2025-02-26 111 struct socket *listen_sock = rtn->rds_tcp_listen_sock;
> 70041088e3b976 Andy Grover 2009-08-21 112 struct socket *new_sock = NULL;
> 70041088e3b976 Andy Grover 2009-08-21 113 struct rds_connection *conn;
> 70041088e3b976 Andy Grover 2009-08-21 114 int ret;
> 70041088e3b976 Andy Grover 2009-08-21 115 struct inet_sock *inet;
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 116 struct rds_tcp_connection *rs_tcp = NULL;
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 117 int conn_state;
> ea3b1ea5393087 Sowmini Varadhan 2016-06-30 118 struct rds_conn_path *cp;
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 119 struct in6_addr *my_addr, *peer_addr;
> 92ef0fd55ac80d Jens Axboe 2024-05-09 120 struct proto_accept_arg arg = {
> 92ef0fd55ac80d Jens Axboe 2024-05-09 121 .flags = O_NONBLOCK,
> 92ef0fd55ac80d Jens Axboe 2024-05-09 122 .kern = true,
> 92ef0fd55ac80d Jens Axboe 2024-05-09 123 };
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 124 #if !IS_ENABLED(CONFIG_IPV6)
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 125 struct in6_addr saddr, daddr;
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 126 #endif
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 127 int dev_if = 0;
> 70041088e3b976 Andy Grover 2009-08-21 128
> 112b9a7a012048 Gerd Rausch 2025-02-26 129 mutex_lock(&rtn->rds_tcp_accept_lock);
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> 112b9a7a012048 Gerd Rausch 2025-02-26 130
> 112b9a7a012048 Gerd Rausch 2025-02-26 131 if (!listen_sock)
> 37e14f4fe2991f Sowmini Varadhan 2016-05-18 132 return -ENETUNREACH;
> ^^^^^^^^^^^^^^^^^^^^
> Do this before taking the lock?
Yep, I think you're right. Will fix. Thank you!
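Something like this at the top of rds_tcp_accept_one() (sketch):

	struct socket *listen_sock = rtn->rds_tcp_listen_sock;
	...
	/* module unload or netns delete in progress */
	if (!listen_sock)
		return -ENETUNREACH;

	mutex_lock(&rtn->rds_tcp_accept_lock);

so the only early return happens before the lock is taken, and every
later exit still funnels through "out:", which drops it.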
Allison
>
> 37e14f4fe2991f Sowmini Varadhan 2016-05-18 133
> 112b9a7a012048 Gerd Rausch 2025-02-26 134 new_sock = rtn->rds_tcp_accepted_sock;
> 112b9a7a012048 Gerd Rausch 2025-02-26 135 rtn->rds_tcp_accepted_sock = NULL;
> 112b9a7a012048 Gerd Rausch 2025-02-26 136
> 112b9a7a012048 Gerd Rausch 2025-02-26 @137 if (!new_sock) {
> 112b9a7a012048 Gerd Rausch 2025-02-26 138 ret = sock_create_lite(listen_sock->sk->sk_family,
> 112b9a7a012048 Gerd Rausch 2025-02-26 139 listen_sock->sk->sk_type,
> 112b9a7a012048 Gerd Rausch 2025-02-26 140 listen_sock->sk->sk_protocol,
> d5a8ac28a7ff2f Sowmini Varadhan 2015-08-05 141 &new_sock);
> 70041088e3b976 Andy Grover 2009-08-21 142 if (ret)
> 70041088e3b976 Andy Grover 2009-08-21 143 goto out;
> 70041088e3b976 Andy Grover 2009-08-21 144
> 112b9a7a012048 Gerd Rausch 2025-02-26 145 ret = listen_sock->ops->accept(listen_sock, new_sock, &arg);
> 70041088e3b976 Andy Grover 2009-08-21 146 if (ret < 0)
> 70041088e3b976 Andy Grover 2009-08-21 147 goto out;
> 70041088e3b976 Andy Grover 2009-08-21 148
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 149 /* sock_create_lite() does not get a hold on the owner module so we
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 150 * need to do it here. Note that sock_release() uses sock->ops to
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 151 * determine if it needs to decrement the reference count. So set
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 152 * sock->ops after calling accept() in case that fails. And there's
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 153 * no need to do try_module_get() as the listener should have a hold
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 154 * already.
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 155 */
> 112b9a7a012048 Gerd Rausch 2025-02-26 @156 new_sock->ops = listen_sock->ops;
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 157 __module_get(new_sock->ops->owner);
> 84eef2b2187ed7 Ka-Cheong Poon 2018-03-01 158
> 480aeb9639d6a0 Christoph Hellwig 2020-05-28 159 rds_tcp_keepalive(new_sock);
> 6997fbd7a3dafa Tetsuo Handa 2022-05-05 160 if (!rds_tcp_tune(new_sock)) {
> 6997fbd7a3dafa Tetsuo Handa 2022-05-05 161 ret = -EINVAL;
> 6997fbd7a3dafa Tetsuo Handa 2022-05-05 162 goto out;
> 6997fbd7a3dafa Tetsuo Handa 2022-05-05 163 }
> 70041088e3b976 Andy Grover 2009-08-21 164
> 70041088e3b976 Andy Grover 2009-08-21 165 inet = inet_sk(new_sock->sk);
> 112b9a7a012048 Gerd Rausch 2025-02-26 166 }
> 70041088e3b976 Andy Grover 2009-08-21 167
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 168 #if IS_ENABLED(CONFIG_IPV6)
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 169 my_addr = &new_sock->sk->sk_v6_rcv_saddr;
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 170 peer_addr = &new_sock->sk->sk_v6_daddr;
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 171 #else
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 172 ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 173 ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 174 my_addr = &saddr;
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 175 peer_addr = &daddr;
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 176 #endif
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 177 rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
> 112b9a7a012048 Gerd Rausch 2025-02-26 178 listen_sock->sk->sk_family,
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 179 my_addr, ntohs(inet->inet_sport),
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 180 peer_addr, ntohs(inet->inet_dport));
> 70041088e3b976 Andy Grover 2009-08-21 181
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 182 #if IS_ENABLED(CONFIG_IPV6)
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 183 /* sk_bound_dev_if is not set if the peer address is not link local
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 184 * address. In this case, it happens that mcast_oif is set. So
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 185 * just use it.
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 186 */
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 187 if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 188 !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 189 struct ipv6_pinfo *inet6;
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 190
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 191 inet6 = inet6_sk(new_sock->sk);
> d2f011a0bf28c0 Eric Dumazet 2023-12-08 192 dev_if = READ_ONCE(inet6->mcast_oif);
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 193 } else {
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 194 dev_if = new_sock->sk->sk_bound_dev_if;
> 1e2b44e78eead7 Ka-Cheong Poon 2018-07-23 195 }
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 196 #endif
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 197
> 112b9a7a012048 Gerd Rausch 2025-02-26 198 if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) {
> aced3ce57cd37b Rao Shoaib 2021-05-21 199 /* local address connection is only allowed via loopback */
> aced3ce57cd37b Rao Shoaib 2021-05-21 200 ret = -EOPNOTSUPP;
> aced3ce57cd37b Rao Shoaib 2021-05-21 201 goto out;
> aced3ce57cd37b Rao Shoaib 2021-05-21 202 }
> aced3ce57cd37b Rao Shoaib 2021-05-21 203
> 112b9a7a012048 Gerd Rausch 2025-02-26 204 conn = rds_conn_create(sock_net(listen_sock->sk),
> e65d4d96334e3f Ka-Cheong Poon 2018-07-30 205 my_addr, peer_addr,
> 3eb450367d0823 Santosh Shilimkar 2018-10-23 206 &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
> eee2fa6ab32251 Ka-Cheong Poon 2018-07-23 207
> 70041088e3b976 Andy Grover 2009-08-21 208 if (IS_ERR(conn)) {
> 70041088e3b976 Andy Grover 2009-08-21 209 ret = PTR_ERR(conn);
> 70041088e3b976 Andy Grover 2009-08-21 210 goto out;
> 70041088e3b976 Andy Grover 2009-08-21 211 }
> f711a6ae062cae Sowmini Varadhan 2015-05-05 212 /* An incoming SYN request came in, and TCP just accepted it.
> f711a6ae062cae Sowmini Varadhan 2015-05-05 213 *
> f711a6ae062cae Sowmini Varadhan 2015-05-05 214 * If the client reboots, this conn will need to be cleaned up.
> f711a6ae062cae Sowmini Varadhan 2015-05-05 215 * rds_tcp_state_change() will do that cleanup
> f711a6ae062cae Sowmini Varadhan 2015-05-05 216 */
> 112b9a7a012048 Gerd Rausch 2025-02-26 217 if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) {
> 112b9a7a012048 Gerd Rausch 2025-02-26 218 /* Try to obtain a free connection slot.
> 112b9a7a012048 Gerd Rausch 2025-02-26 219 * If unsuccessful, we need to preserve "new_sock"
> 112b9a7a012048 Gerd Rausch 2025-02-26 220 * that we just accepted, since its "sk_receive_queue"
> 112b9a7a012048 Gerd Rausch 2025-02-26 221 * may contain messages already that have been acknowledged
> 112b9a7a012048 Gerd Rausch 2025-02-26 222 * to and discarded by the sender.
> 112b9a7a012048 Gerd Rausch 2025-02-26 223 * We must not throw those away!
> 112b9a7a012048 Gerd Rausch 2025-02-26 224 */
> 5916e2c1554f3e Sowmini Varadhan 2016-07-14 225 rs_tcp = rds_tcp_accept_one_path(conn);
> 112b9a7a012048 Gerd Rausch 2025-02-26 226 if (!rs_tcp) {
> 112b9a7a012048 Gerd Rausch 2025-02-26 227 /* It's okay to stash "new_sock", since
> 112b9a7a012048 Gerd Rausch 2025-02-26 228 * "rds_tcp_conn_slots_available" triggers "rds_tcp_accept_one"
> 112b9a7a012048 Gerd Rausch 2025-02-26 229 * again as soon as one of the connection slots
> 112b9a7a012048 Gerd Rausch 2025-02-26 230 * becomes available again
> 112b9a7a012048 Gerd Rausch 2025-02-26 231 */
> 112b9a7a012048 Gerd Rausch 2025-02-26 232 rtn->rds_tcp_accepted_sock = new_sock;
> 112b9a7a012048 Gerd Rausch 2025-02-26 233 new_sock = NULL;
> 112b9a7a012048 Gerd Rausch 2025-02-26 234 ret = -ENOBUFS;
> 112b9a7a012048 Gerd Rausch 2025-02-26 235 goto out;
> 112b9a7a012048 Gerd Rausch 2025-02-26 236 }
> 112b9a7a012048 Gerd Rausch 2025-02-26 237 } else {
> 112b9a7a012048 Gerd Rausch 2025-02-26 238 /* This connection request came from a peer with
> 112b9a7a012048 Gerd Rausch 2025-02-26 239 * a larger address.
> 112b9a7a012048 Gerd Rausch 2025-02-26 240 * Function "rds_tcp_state_change" makes sure
> 112b9a7a012048 Gerd Rausch 2025-02-26 241 * that the connection doesn't transition
> 112b9a7a012048 Gerd Rausch 2025-02-26 242 * to state "RDS_CONN_UP", and therefore
> 112b9a7a012048 Gerd Rausch 2025-02-26 243 * we should not have received any messages
> 112b9a7a012048 Gerd Rausch 2025-02-26 244 * on this socket yet.
> 112b9a7a012048 Gerd Rausch 2025-02-26 245 * This is the only case where it's okay to
> 112b9a7a012048 Gerd Rausch 2025-02-26 246 * not dequeue messages from "sk_receive_queue".
> 112b9a7a012048 Gerd Rausch 2025-02-26 247 */
> 112b9a7a012048 Gerd Rausch 2025-02-26 248 if (conn->c_npaths <= 1)
> 112b9a7a012048 Gerd Rausch 2025-02-26 249 rds_conn_path_connect_if_down(&conn->c_path[0]);
> 112b9a7a012048 Gerd Rausch 2025-02-26 250 rs_tcp = NULL;
> 5916e2c1554f3e Sowmini Varadhan 2016-07-14 251 goto rst_nsk;
> 112b9a7a012048 Gerd Rausch 2025-02-26 252 }
> 112b9a7a012048 Gerd Rausch 2025-02-26 253
> 02105b2ccdd634 Sowmini Varadhan 2016-06-30 254 mutex_lock(&rs_tcp->t_conn_path_lock);
> 5916e2c1554f3e Sowmini Varadhan 2016-07-14 255 cp = rs_tcp->t_cpath;
> 5916e2c1554f3e Sowmini Varadhan 2016-07-14 256 conn_state = rds_conn_path_state(cp);
> 1a0e100fb2c966 Sowmini Varadhan 2016-11-16 257 WARN_ON(conn_state == RDS_CONN_UP);
> 112b9a7a012048 Gerd Rausch 2025-02-26 258 if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) {
> 112b9a7a012048 Gerd Rausch 2025-02-26 259 rds_conn_path_drop(cp, 0);
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 260 goto rst_nsk;
> 112b9a7a012048 Gerd Rausch 2025-02-26 261 }
> eb192840266fab Sowmini Varadhan 2016-05-02 262 if (rs_tcp->t_sock) {
> 41500c3e2a19ff Sowmini Varadhan 2017-06-15 263 /* Duelling SYN has been handled in rds_tcp_accept_one() */
> ea3b1ea5393087 Sowmini Varadhan 2016-06-30 264 rds_tcp_reset_callbacks(new_sock, cp);
> 9c79440e2c5e25 Sowmini Varadhan 2016-06-04 265 /* rds_connect_path_complete() marks RDS_CONN_UP */
> ea3b1ea5393087 Sowmini Varadhan 2016-06-30 266 rds_connect_path_complete(cp, RDS_CONN_RESETTING);
> 335b48d980f631 Sowmini Varadhan 2016-06-04 267 } else {
> ea3b1ea5393087 Sowmini Varadhan 2016-06-30 268 rds_tcp_set_callbacks(new_sock, cp);
> ea3b1ea5393087 Sowmini Varadhan 2016-06-30 269 rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
> e70845e6ecd132 Ka-Cheong Poon 2025-02-26 270 wake_up(&cp->cp_up_waitq);
> 335b48d980f631 Sowmini Varadhan 2016-06-04 271 }
> 70041088e3b976 Andy Grover 2009-08-21 272 new_sock = NULL;
> 70041088e3b976 Andy Grover 2009-08-21 273 ret = 0;
> 69b92b5b741984 Sowmini Varadhan 2017-06-21 274 if (conn->c_npaths == 0)
> 69b92b5b741984 Sowmini Varadhan 2017-06-21 275 rds_send_ping(cp->cp_conn, cp->cp_index);
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 276 goto out;
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 277 rst_nsk:
> 10beea7d7408d0 Sowmini Varadhan 2017-06-15 278 /* reset the newly returned accept sock and bail.
> 10beea7d7408d0 Sowmini Varadhan 2017-06-15 279 * It is safe to set linger on new_sock because the RDS connection
> 10beea7d7408d0 Sowmini Varadhan 2017-06-15 280 * has not been brought up on new_sock, so no RDS-level data could
> 10beea7d7408d0 Sowmini Varadhan 2017-06-15 281 * be pending on it. By setting linger, we achieve the side-effect
> 10beea7d7408d0 Sowmini Varadhan 2017-06-15 282 * of avoiding TIME_WAIT state on new_sock.
> 10beea7d7408d0 Sowmini Varadhan 2017-06-15 283 */
> c433594c07457d Christoph Hellwig 2020-05-28 284 sock_no_linger(new_sock->sk);
> 335b48d980f631 Sowmini Varadhan 2016-06-04 285 kernel_sock_shutdown(new_sock, SHUT_RDWR);
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 286 ret = 0;
> 70041088e3b976 Andy Grover 2009-08-21 287 out:
> bd7c5f983f3185 Sowmini Varadhan 2016-05-02 288 if (rs_tcp)
> 02105b2ccdd634 Sowmini Varadhan 2016-06-30 289 mutex_unlock(&rs_tcp->t_conn_path_lock);
> 70041088e3b976 Andy Grover 2009-08-21 290 if (new_sock)
> 70041088e3b976 Andy Grover 2009-08-21 291 sock_release(new_sock);
> 112b9a7a012048 Gerd Rausch 2025-02-26 292
> 112b9a7a012048 Gerd Rausch 2025-02-26 293 mutex_unlock(&rtn->rds_tcp_accept_lock);
> 112b9a7a012048 Gerd Rausch 2025-02-26 294
> 70041088e3b976 Andy Grover 2009-08-21 @295 return ret;
> 70041088e3b976 Andy Grover 2009-08-21 296 }
>
* [PATCH 6/6] net/rds: Encode cp_index in TCP source port
2025-02-27 4:26 [PATCH 0/6] RDS bug fix collection allison.henderson
` (4 preceding siblings ...)
2025-02-27 4:26 ` [PATCH 5/6] net/rds: rds_tcp_accept_one ought to not discard messages allison.henderson
@ 2025-02-27 4:26 ` allison.henderson
5 siblings, 0 replies; 27+ messages in thread
From: allison.henderson @ 2025-02-27 4:26 UTC (permalink / raw)
To: netdev
From: Gerd Rausch <gerd.rausch@oracle.com>
Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").
However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.
Therefore the mapping of the sender's "cp->cp_index"
to that of the receiver changes if the backend
connections are dropped and reconnected.
However, connection state that's preserved across reconnects
(e.g. "cp_next_rx_seq") relies on that sender<->receiver
mapping to never change.
So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.
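For example (illustrative only, assuming RDS_MPATH_WORKERS == 8 and a
local port range starting at a multiple of 8, say 32768): the path with
cp_index == 3 binds to source port 32768 + 3 = 32771, so after any
reconnect the accepting side recovers the same index from
ntohs(sport) % 8 == 3, regardless of the order in which the backend
connections are re-accepted.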
A new extension "RDS_EXTHDR_SPORT_IDX" is introduced,
that allows the server to tell the difference between
clients that do the "cp->cp_index" encoding, and
legacy clients that pick source ports randomly.
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/message.c | 1 +
net/rds/rds.h | 3 +++
net/rds/recv.c | 7 +++++++
net/rds/send.c | 5 +++++
net/rds/tcp.h | 1 +
net/rds/tcp_connect.c | 22 ++++++++++++++++++++-
net/rds/tcp_listen.c | 45 +++++++++++++++++++++++++++++++++++++------
7 files changed, 77 insertions(+), 7 deletions(-)
diff --git a/net/rds/message.c b/net/rds/message.c
index 7af59d2443e5..31990ac4f3ef 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -46,6 +46,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_NPATHS] = sizeof(u16),
[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
+[RDS_EXTHDR_SPORT_IDX] = 1,
};
void rds_message_addref(struct rds_message *rm)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 422d5e26410e..1df58011e796 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -150,6 +150,7 @@ struct rds_connection {
c_ping_triggered:1,
c_pad_to_32:29;
int c_npaths;
+ bool c_with_sport_idx;
struct rds_connection *c_passive;
struct rds_transport *c_trans;
@@ -266,8 +267,10 @@ struct rds_ext_header_rdma_dest {
*/
#define RDS_EXTHDR_NPATHS 5
#define RDS_EXTHDR_GEN_NUM 6
+#define RDS_EXTHDR_SPORT_IDX 8
#define __RDS_EXTHDR_MAX 16 /* for now */
+
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
#define RDS_MSG_RX_HDR 0
#define RDS_MSG_RX_START 1
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c0a89af29d1b..f33b4904073d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
struct rds_ext_header_version version;
u16 rds_npaths;
u32 rds_gen_num;
+ u8 dummy;
} buffer;
+ bool new_with_sport_idx = false;
u32 new_peer_gen_num = 0;
while (1) {
@@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
case RDS_EXTHDR_GEN_NUM:
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
break;
+ case RDS_EXTHDR_SPORT_IDX:
+ new_with_sport_idx = true;
+ break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
}
}
+
+ conn->c_with_sport_idx = new_with_sport_idx;
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1);
conn->c_ping_triggered = 0;
diff --git a/net/rds/send.c b/net/rds/send.c
index 85ab9e32105e..4a08b9774420 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1482,6 +1482,7 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
cp->cp_conn->c_trans->t_mp_capable) {
u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+ u8 dummy = 0;
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
@@ -1490,6 +1491,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
RDS_EXTHDR_GEN_NUM,
&my_gen_num,
sizeof(u32));
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_SPORT_IDX,
+ &dummy,
+ sizeof(u8));
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 2000f4acd57a..3beb0557104e 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -34,6 +34,7 @@ struct rds_tcp_connection {
*/
struct mutex t_conn_path_lock;
struct socket *t_sock;
+ u32 t_client_port_group;
struct rds_tcp_net *t_rtn;
void *t_orig_write_space;
void *t_orig_data_ready;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 97596a3c346a..950de39ac942 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -94,6 +94,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
struct sockaddr *addr;
+ int port_low, port_high, port;
+ int port_groups, groups_left;
int addrlen;
bool isv6;
int ret;
@@ -146,7 +148,25 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
addrlen = sizeof(sin);
}
- ret = kernel_bind(sock, addr, addrlen);
+ /* encode cp->cp_index in lowest bits of source-port */
+ inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
+ port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
+ port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
+ ret = -EADDRINUSE;
+ groups_left = port_groups;
+ while (groups_left-- > 0 && ret) {
+ if (++tc->t_client_port_group >= port_groups)
+ tc->t_client_port_group = 0;
+ port = port_low +
+ tc->t_client_port_group * RDS_MPATH_WORKERS +
+ cp->cp_index;
+
+ if (isv6)
+ sin6.sin6_port = htons(port);
+ else
+ sin.sin_port = htons(port);
+ ret = sock->ops->bind(sock, addr, addrlen);
+ }
if (ret) {
rdsdebug("bind failed with %d at address %pI6c\n",
ret, &conn->c_laddr);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index e44384f0adf7..9590db118457 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -62,19 +62,52 @@ void rds_tcp_keepalive(struct socket *sock)
* we special case cp_index 0 is to allow the rds probe ping itself to itself
* get through efficiently.
*/
-static
-struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
+static struct rds_tcp_connection *
+rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
{
- int i;
- int npaths = max_t(int, 1, conn->c_npaths);
+ union {
+ struct sockaddr_storage storage;
+ struct sockaddr addr;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } saddr;
+ int sport, npaths, i_min, i_max, i;
+
+ if (conn->c_with_sport_idx &&
+ kernel_getpeername(sock, &saddr.addr) == 0) {
+ /* cp->cp_index is encoded in lowest bits of source-port */
+ switch (saddr.addr.sa_family) {
+ case AF_INET:
+ sport = ntohs(saddr.sin.sin_port);
+ break;
+ case AF_INET6:
+ sport = ntohs(saddr.sin6.sin6_port);
+ break;
+ default:
+ sport = -1;
+ }
+ } else {
+ sport = -1;
+ }
+
+ npaths = max_t(int, 1, conn->c_npaths);
- for (i = 0; i < npaths; i++) {
+ if (sport >= 0) {
+ i_min = sport % npaths;
+ i_max = i_min;
+ } else {
+ i_min = 0;
+ i_max = npaths - 1;
+ }
+
+ for (i = i_min; i <= i_max; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
RDS_CONN_CONNECTING))
return cp->cp_transport_data;
}
+
return NULL;
}
@@ -222,7 +255,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
* to and discarded by the sender.
* We must not throw those away!
*/
- rs_tcp = rds_tcp_accept_one_path(conn);
+ rs_tcp = rds_tcp_accept_one_path(conn, listen_sock);
if (!rs_tcp) {
/* It's okay to stash "new_sock", since
* "rds_tcp_conn_slots_available" triggers "rds_tcp_accept_one"
--
2.43.0