* [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel
@ 2023-09-27 18:27 David Morley
2023-09-27 18:27 ` [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup David Morley
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: David Morley @ 2023-09-27 18:27 UTC (permalink / raw)
To: David Miller, Eric Dumazet, Jakub Kicinski
Cc: netdev, David Morley, Neal Cardwell, Yuchung Cheng
From: David Morley <morleyd@google.com>
In order to better estimate whether a data packet has been
retransmitted or is the result of a TLP, we save the last received
ipv6 flowlabel.
To make space for this field we resize the "ato" field in
inet_connection_sock as the current value of TCP_DELACK_MAX can be
fully contained in 8 bits and add a compile_time_assert ensuring this
field is the required size.
Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Tested-by: David Morley <morleyd@google.com>
---
include/net/inet_connection_sock.h | 5 ++++-
include/net/tcp.h | 2 ++
net/ipv4/tcp_input.c | 15 +++++++++++++++
net/ipv4/tcp_timer.c | 2 +-
4 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5d2fcc137b88..d6d9d1c1985a 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -114,7 +114,10 @@ struct inet_connection_sock {
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 retry; /* Number of attempts */
- __u32 ato; /* Predicted tick of soft clock */
+ #define ATO_BITS 8
+ __u32 ato:ATO_BITS, /* Predicted tick of soft clock */
+ lrcv_flowlabel:20, /* last received ipv6 flowlabel */
+ unused:4;
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 91688d0dadcd..8a3720c7d082 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -131,6 +131,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
+static_assert(1<<ATO_BITS > TCP_DELACK_MAX);
+
#if HZ >= 100
#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN ((unsigned)(HZ/25))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 584825ddd0a0..abe7494361c0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -765,6 +765,16 @@ void tcp_rcv_space_adjust(struct sock *sk)
tp->rcvq_space.time = tp->tcp_mstamp;
}
+static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
+#endif
+}
+
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
* connection starts up, we want to ack as quickly as possible. The
@@ -813,6 +823,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
}
}
icsk->icsk_ack.lrcvtime = now;
+ tcp_save_lrcv_flowlabel(sk, skb);
tcp_ecn_check_ce(sk, skb);
@@ -4506,6 +4517,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
sk_rethink_txhash(sk))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+
+ /* Save last flowlabel after a spurious retrans. */
+ tcp_save_lrcv_flowlabel(sk, skb);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4822,6 +4836,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
u32 seq, end_seq;
bool fragstolen;
+ tcp_save_lrcv_flowlabel(sk, skb);
tcp_ecn_check_ce(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3f61c6a70a1f..0862b73dd3b5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -322,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk)
if (inet_csk_ack_scheduled(sk)) {
if (!inet_csk_in_pingpong_mode(sk)) {
/* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
--
2.42.0.582.g8ccd20d70d-goog
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup
2023-09-27 18:27 [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel David Morley
@ 2023-09-27 18:27 ` David Morley
2023-09-29 0:54 ` Eric Dumazet
2023-09-29 0:52 ` [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel Eric Dumazet
2023-09-29 5:57 ` kernel test robot
2 siblings, 1 reply; 5+ messages in thread
From: David Morley @ 2023-09-27 18:27 UTC (permalink / raw)
To: David Miller, Eric Dumazet, Jakub Kicinski
Cc: netdev, David Morley, Neal Cardwell, Yuchung Cheng
From: David Morley <morleyd@google.com>
This commit changes the data receiver repath behavior to occur after
receiving a single duplicate. This can help recover ACK connectivity
quicker if a TLP was sent along a nonworking path.
For instance, consider the case where we have an initially nonworking
forward path and reverse path and subsequently switch to only working
forward paths. Before this patch we would have the following behavior.
+---------+--------+--------+----------+----------+----------+
| Event | For FL | Rev FL | FP Works | RP Works | Data Del |
+---------+--------+--------+----------+----------+----------+
| Initial | A | 1 | N | N | 0 |
+---------+--------+--------+----------+----------+----------+
| TLP | A | 1 | N | N | 0 |
+---------+--------+--------+----------+----------+----------+
| RTO 1 | B | 1 | Y | N | 1 |
+---------+--------+--------+----------+----------+----------+
| RTO 2 | C | 1 | Y | N | 2 |
+---------+--------+--------+----------+----------+----------+
| RTO 3 | D | 2 | Y | Y | 3 |
+---------+--------+--------+----------+----------+----------+
This patch gets rid of at least RTO 3, avoiding additional unnecessary
repaths of a working forward path to a (potentially) nonworking one.
In addition, this commit changes the behavior to avoid repathing upon
rx of duplicate data if the local endpoint is in CA_Loss (in which
case the RTOs will already be changing the outgoing flowlabel).
Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Tested-by: David Morley <morleyd@google.com>
---
net/ipv4/tcp_input.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index abe7494361c0..f77fbdb3103d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4511,15 +4511,23 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
{
/* When the ACK path fails or drops most ACKs, the sender would
* timeout and spuriously retransmit the same segment repeatedly.
- * The receiver remembers and reflects via DSACKs. Leverage the
- * DSACK state and change the txhash to re-route speculatively.
+ * If it seems our ACKs are not reaching the other side,
+ * based on receiving a duplicate data segment with new flowlabel
+ * (suggesting the sender suffered an RTO), and we are not already
+ * repathing due to our own RTO, then rehash the socket to repath our
+ * packets.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
+ ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
sk_rethink_txhash(sk))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
/* Save last flowlabel after a spurious retrans. */
tcp_save_lrcv_flowlabel(sk, skb);
+#endif
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
--
2.42.0.582.g8ccd20d70d-goog
^ permalink raw reply related [flat|nested] 5+ messages in thread* Re: [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup
2023-09-27 18:27 ` [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup David Morley
@ 2023-09-29 0:54 ` Eric Dumazet
0 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2023-09-29 0:54 UTC (permalink / raw)
To: David Morley
Cc: David Miller, Jakub Kicinski, netdev, David Morley, Neal Cardwell,
Yuchung Cheng
On Wed, Sep 27, 2023 at 8:28 PM David Morley <morleyd.kernel@gmail.com> wrote:
>
> From: David Morley <morleyd@google.com>
>
> This commit changes the data receiver repath behavior to occur after
> receiving a single duplicate. This can help recover ACK connectivity
> quicker if a TLP was sent along a nonworking path.
>
> For instance, consider the case where we have an initially nonworking
> forward path and reverse path and subsequently switch to only working
> forward paths. Before this patch we would have the following behavior.
>
> +---------+--------+--------+----------+----------+----------+
> | Event | For FL | Rev FL | FP Works | RP Works | Data Del |
> +---------+--------+--------+----------+----------+----------+
> | Initial | A | 1 | N | N | 0 |
> +---------+--------+--------+----------+----------+----------+
> | TLP | A | 1 | N | N | 0 |
> +---------+--------+--------+----------+----------+----------+
> | RTO 1 | B | 1 | Y | N | 1 |
> +---------+--------+--------+----------+----------+----------+
> | RTO 2 | C | 1 | Y | N | 2 |
> +---------+--------+--------+----------+----------+----------+
> | RTO 3 | D | 2 | Y | Y | 3 |
> +---------+--------+--------+----------+----------+----------+
>
> This patch gets rid of at least RTO 3, avoiding additional unnecessary
> repaths of a working forward path to a (potentially) nonworking one.
>
> In addition, this commit changes the behavior to avoid repathing upon
> rx of duplicate data if the local endpoint is in CA_Loss (in which
> case the RTOs will already be changing the outgoing flowlabel).
>
> Signed-off-by: David Morley <morleyd@google.com>
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> Tested-by: David Morley <morleyd@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel
2023-09-27 18:27 [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel David Morley
2023-09-27 18:27 ` [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup David Morley
@ 2023-09-29 0:52 ` Eric Dumazet
2023-09-29 5:57 ` kernel test robot
2 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2023-09-29 0:52 UTC (permalink / raw)
To: David Morley
Cc: David Miller, Jakub Kicinski, netdev, David Morley, Neal Cardwell,
Yuchung Cheng
On Wed, Sep 27, 2023 at 8:28 PM David Morley <morleyd.kernel@gmail.com> wrote:
>
> From: David Morley <morleyd@google.com>
>
> In order to better estimate whether a data packet has been
> retransmitted or is the result of a TLP, we save the last received
> ipv6 flowlabel.
>
> To make space for this field we resize the "ato" field in
> inet_connection_sock as the current value of TCP_DELACK_MAX can be
> fully contained in 8 bits and add a compile_time_assert ensuring this
> field is the required size.
>
> Signed-off-by: David Morley <morleyd@google.com>
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> Tested-by: David Morley <morleyd@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel
2023-09-27 18:27 [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel David Morley
2023-09-27 18:27 ` [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup David Morley
2023-09-29 0:52 ` [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel Eric Dumazet
@ 2023-09-29 5:57 ` kernel test robot
2 siblings, 0 replies; 5+ messages in thread
From: kernel test robot @ 2023-09-29 5:57 UTC (permalink / raw)
To: David Morley, David Miller, Eric Dumazet, Jakub Kicinski
Cc: llvm, oe-kbuild-all, netdev, David Morley, Neal Cardwell,
Yuchung Cheng
Hi David,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/David-Morley/tcp-change-data-receiver-flowlabel-after-one-dup/20230928-022955
base: net-next/main
patch link: https://lore.kernel.org/r/20230927182747.2005960-1-morleyd.kernel%40gmail.com
patch subject: [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20230929/202309291312.vJyV8G9L-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20230929/202309291312.vJyV8G9L-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202309291312.vJyV8G9L-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> net/dccp/timer.c:199:25: warning: comparison of distinct pointer types ('typeof (icsk->icsk_ack.ato << 1) *' (aka 'int *') and 'typeof (icsk->icsk_rto) *' (aka 'unsigned int *')) [-Wcompare-distinct-pointer-types]
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/minmax.h:68:19: note: expanded from macro 'min'
#define min(x, y) __careful_cmp(x, y, <)
^~~~~~~~~~~~~~~~~~~~~~
include/linux/minmax.h:37:24: note: expanded from macro '__careful_cmp'
__builtin_choose_expr(__safe_cmp(x, y), \
^~~~~~~~~~~~~~~~
include/linux/minmax.h:27:4: note: expanded from macro '__safe_cmp'
(__typecheck(x, y) && __no_side_effects(x, y))
^~~~~~~~~~~~~~~~~
include/linux/minmax.h:21:28: note: expanded from macro '__typecheck'
(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~
1 warning generated.
vim +199 net/dccp/timer.c
4ed800d02cfb63 Gerrit Renker 2006-11-13 168
4ed800d02cfb63 Gerrit Renker 2006-11-13 169 /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
59f379f9046a9e Kees Cook 2017-10-16 170 static void dccp_delack_timer(struct timer_list *t)
4ed800d02cfb63 Gerrit Renker 2006-11-13 171 {
59f379f9046a9e Kees Cook 2017-10-16 172 struct inet_connection_sock *icsk =
59f379f9046a9e Kees Cook 2017-10-16 173 from_timer(icsk, t, icsk_delack_timer);
59f379f9046a9e Kees Cook 2017-10-16 174 struct sock *sk = &icsk->icsk_inet.sk;
4ed800d02cfb63 Gerrit Renker 2006-11-13 175
4ed800d02cfb63 Gerrit Renker 2006-11-13 176 bh_lock_sock(sk);
4ed800d02cfb63 Gerrit Renker 2006-11-13 177 if (sock_owned_by_user(sk)) {
4ed800d02cfb63 Gerrit Renker 2006-11-13 178 /* Try again later. */
02a1d6e7a6bb02 Eric Dumazet 2016-04-27 179 __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
4ed800d02cfb63 Gerrit Renker 2006-11-13 180 sk_reset_timer(sk, &icsk->icsk_delack_timer,
4ed800d02cfb63 Gerrit Renker 2006-11-13 181 jiffies + TCP_DELACK_MIN);
4ed800d02cfb63 Gerrit Renker 2006-11-13 182 goto out;
4ed800d02cfb63 Gerrit Renker 2006-11-13 183 }
4ed800d02cfb63 Gerrit Renker 2006-11-13 184
4ed800d02cfb63 Gerrit Renker 2006-11-13 185 if (sk->sk_state == DCCP_CLOSED ||
4ed800d02cfb63 Gerrit Renker 2006-11-13 186 !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
4ed800d02cfb63 Gerrit Renker 2006-11-13 187 goto out;
4ed800d02cfb63 Gerrit Renker 2006-11-13 188 if (time_after(icsk->icsk_ack.timeout, jiffies)) {
4ed800d02cfb63 Gerrit Renker 2006-11-13 189 sk_reset_timer(sk, &icsk->icsk_delack_timer,
4ed800d02cfb63 Gerrit Renker 2006-11-13 190 icsk->icsk_ack.timeout);
4ed800d02cfb63 Gerrit Renker 2006-11-13 191 goto out;
4ed800d02cfb63 Gerrit Renker 2006-11-13 192 }
4ed800d02cfb63 Gerrit Renker 2006-11-13 193
4ed800d02cfb63 Gerrit Renker 2006-11-13 194 icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
4ed800d02cfb63 Gerrit Renker 2006-11-13 195
4ed800d02cfb63 Gerrit Renker 2006-11-13 196 if (inet_csk_ack_scheduled(sk)) {
31954cd8bb6670 Wei Wang 2019-01-25 197 if (!inet_csk_in_pingpong_mode(sk)) {
4ed800d02cfb63 Gerrit Renker 2006-11-13 198 /* Delayed ACK missed: inflate ATO. */
4ed800d02cfb63 Gerrit Renker 2006-11-13 @199 icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
4ed800d02cfb63 Gerrit Renker 2006-11-13 200 icsk->icsk_rto);
4ed800d02cfb63 Gerrit Renker 2006-11-13 201 } else {
4ed800d02cfb63 Gerrit Renker 2006-11-13 202 /* Delayed ACK missed: leave pingpong mode and
4ed800d02cfb63 Gerrit Renker 2006-11-13 203 * deflate ATO.
4ed800d02cfb63 Gerrit Renker 2006-11-13 204 */
31954cd8bb6670 Wei Wang 2019-01-25 205 inet_csk_exit_pingpong_mode(sk);
4ed800d02cfb63 Gerrit Renker 2006-11-13 206 icsk->icsk_ack.ato = TCP_ATO_MIN;
4ed800d02cfb63 Gerrit Renker 2006-11-13 207 }
4ed800d02cfb63 Gerrit Renker 2006-11-13 208 dccp_send_ack(sk);
02a1d6e7a6bb02 Eric Dumazet 2016-04-27 209 __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
4ed800d02cfb63 Gerrit Renker 2006-11-13 210 }
4ed800d02cfb63 Gerrit Renker 2006-11-13 211 out:
4ed800d02cfb63 Gerrit Renker 2006-11-13 212 bh_unlock_sock(sk);
4ed800d02cfb63 Gerrit Renker 2006-11-13 213 sock_put(sk);
4ed800d02cfb63 Gerrit Renker 2006-11-13 214 }
4ed800d02cfb63 Gerrit Renker 2006-11-13 215
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2023-09-29 5:57 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-09-27 18:27 [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel David Morley
2023-09-27 18:27 ` [PATCH net-next 2/2] tcp: change data receiver flowlabel after one dup David Morley
2023-09-29 0:54 ` Eric Dumazet
2023-09-29 0:52 ` [PATCH net-next 1/2] tcp: record last received ipv6 flowlabel Eric Dumazet
2023-09-29 5:57 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).