netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net 0/2] fix DCTCP delayed ACK
@ 2018-07-12 13:04 Yuchung Cheng
  2018-07-12 13:04 ` [PATCH net 1/2] tcp: fix dctcp delayed ACK schedule Yuchung Cheng
  2018-07-12 13:04 ` [PATCH net 2/2] tcp: remove DELAYED ACK events in DCTCP Yuchung Cheng
  0 siblings, 2 replies; 5+ messages in thread
From: Yuchung Cheng @ 2018-07-12 13:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, brakmo, ncardwell, ysseung, edumazet, Yuchung Cheng

This patch series addresses the issue that sometimes DCTCP
fail to acknowledge the latest sequence and result in sender timeout
if inflight is small.

Yuchung Cheng (2):
  tcp: fix dctcp delayed ACK schedule
  tcp: remove DELAYED ACK events in DCTCP

 include/net/tcp.h     |  2 --
 net/ipv4/tcp_dctcp.c  | 31 ++++---------------------------
 net/ipv4/tcp_output.c |  4 ----
 3 files changed, 4 insertions(+), 33 deletions(-)

-- 
2.18.0.203.gfac676dfb9-goog

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH net 1/2] tcp: fix dctcp delayed ACK schedule
  2018-07-12 13:04 [PATCH net 0/2] fix DCTCP delayed ACK Yuchung Cheng
@ 2018-07-12 13:04 ` Yuchung Cheng
  2018-07-12 13:09   ` Lawrence Brakmo
  2018-07-12 13:04 ` [PATCH net 2/2] tcp: remove DELAYED ACK events in DCTCP Yuchung Cheng
  1 sibling, 1 reply; 5+ messages in thread
From: Yuchung Cheng @ 2018-07-12 13:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, brakmo, ncardwell, ysseung, edumazet, Yuchung Cheng

Previously, when a data segment was sent an ACK was piggybacked
on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
event to notify congestion control modules. So the DCTCP
ca->delayed_ack_reserved flag could incorrectly stay set when
in fact there were no delayed ACKs being reserved. This could result
in sending a special ECN notification ACK that carries an older
ACK sequence, when in fact there was no need for such an ACK.
DCTCP keeps track of the delayed ACK status with its own separate
state ca->delayed_ack_reserved. Previously it may accidentally cancel
the delayed ACK without updating this field upon sending a special
ACK that carries a older ACK sequence. This inconsistency would
lead to DCTCP receiver never acknowledging the latest data until the
sender times out and retry in some cases.

Packetdrill script (provided by Larry Brakmo)

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 2:3(1) ack 2001

0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
0.200 > [ect01] . 3:3(0) ack 4001

0.210 < [ce] P. 4001:4501(500) ack 3 win 257

+0.001 read(4, ..., 4500) = 4500
+0 write(4, ..., 1) = 1
+0 > [ect01] PE. 3:4(1) ack 4501

+0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
// Previously the ACK sequence below would be 4501, causing a long RTO
+0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack

+0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
+0 > [ect01] . 4:4(0) ack 6501     // now acks everything

+0.500 < F. 9501:9501(0) ack 4 win 257

Reported-by: Larry Brakmo <brakmo@fb.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
---
 net/ipv4/tcp_dctcp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..89f88b0d8167 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	/* State has changed from CE=0 to CE=1 and delayed
 	 * ACK has not sent yet.
 	 */
-	if (!ca->ce_state && ca->delayed_ack_reserved) {
+	if (!ca->ce_state &&
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 		u32 tmp_rcv_nxt;
 
 		/* Save current rcv_nxt. */
@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	/* State has changed from CE=1 to CE=0 and delayed
 	 * ACK has not sent yet.
 	 */
-	if (ca->ce_state && ca->delayed_ack_reserved) {
+	if (ca->ce_state &&
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 		u32 tmp_rcv_nxt;
 
 		/* Save current rcv_nxt. */
-- 
2.18.0.203.gfac676dfb9-goog

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH net 2/2] tcp: remove DELAYED ACK events in DCTCP
  2018-07-12 13:04 [PATCH net 0/2] fix DCTCP delayed ACK Yuchung Cheng
  2018-07-12 13:04 ` [PATCH net 1/2] tcp: fix dctcp delayed ACK schedule Yuchung Cheng
@ 2018-07-12 13:04 ` Yuchung Cheng
  2018-07-12 13:09   ` Lawrence Brakmo
  1 sibling, 1 reply; 5+ messages in thread
From: Yuchung Cheng @ 2018-07-12 13:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, brakmo, ncardwell, ysseung, edumazet, Yuchung Cheng

After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK
related callbacks are no longer needed

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
---
 include/net/tcp.h     |  2 --
 net/ipv4/tcp_dctcp.c  | 25 -------------------------
 net/ipv4/tcp_output.c |  4 ----
 3 files changed, 31 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index af3ec72d5d41..3482d13d655b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -912,8 +912,6 @@ enum tcp_ca_event {
 	CA_EVENT_LOSS,		/* loss timeout */
 	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
 	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
-	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
-	CA_EVENT_NON_DELAYED_ACK,
 };
 
 /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 89f88b0d8167..5869f89ca656 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -55,7 +55,6 @@ struct dctcp {
 	u32 dctcp_alpha;
 	u32 next_seq;
 	u32 ce_state;
-	u32 delayed_ack_reserved;
 	u32 loss_cwnd;
 };
 
@@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
 
 		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
 
-		ca->delayed_ack_reserved = 0;
 		ca->loss_cwnd = 0;
 		ca->ce_state = 0;
 
@@ -250,25 +248,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
 	}
 }
 
-static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
-{
-	struct dctcp *ca = inet_csk_ca(sk);
-
-	switch (ev) {
-	case CA_EVENT_DELAYED_ACK:
-		if (!ca->delayed_ack_reserved)
-			ca->delayed_ack_reserved = 1;
-		break;
-	case CA_EVENT_NON_DELAYED_ACK:
-		if (ca->delayed_ack_reserved)
-			ca->delayed_ack_reserved = 0;
-		break;
-	default:
-		/* Don't care for the rest. */
-		break;
-	}
-}
-
 static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 {
 	switch (ev) {
@@ -278,10 +257,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 	case CA_EVENT_ECN_NO_CE:
 		dctcp_ce_state_1_to_0(sk);
 		break;
-	case CA_EVENT_DELAYED_ACK:
-	case CA_EVENT_NON_DELAYED_ACK:
-		dctcp_update_ack_reserved(sk, ev);
-		break;
 	default:
 		/* Don't care for the rest. */
 		break;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8e08b409c71e..00e5a300ddb9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3523,8 +3523,6 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
-	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
-
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3581,8 +3579,6 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
-	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
-
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
-- 
2.18.0.203.gfac676dfb9-goog

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH net 1/2] tcp: fix dctcp delayed ACK schedule
  2018-07-12 13:04 ` [PATCH net 1/2] tcp: fix dctcp delayed ACK schedule Yuchung Cheng
@ 2018-07-12 13:09   ` Lawrence Brakmo
  0 siblings, 0 replies; 5+ messages in thread
From: Lawrence Brakmo @ 2018-07-12 13:09 UTC (permalink / raw)
  To: Yuchung Cheng, davem@davemloft.net
  Cc: netdev@vger.kernel.org, ncardwell@google.com, ysseung@google.com,
	edumazet@google.com

On 7/12/18, 9:05 AM, "netdev-owner@vger.kernel.org on behalf of Yuchung Cheng" <netdev-owner@vger.kernel.org on behalf of ycheng@google.com> wrote:

    Previously, when a data segment was sent an ACK was piggybacked
    on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
    event to notify congestion control modules. So the DCTCP
    ca->delayed_ack_reserved flag could incorrectly stay set when
    in fact there were no delayed ACKs being reserved. This could result
    in sending a special ECN notification ACK that carries an older
    ACK sequence, when in fact there was no need for such an ACK.
    DCTCP keeps track of the delayed ACK status with its own separate
    state ca->delayed_ack_reserved. Previously it may accidentally cancel
    the delayed ACK without updating this field upon sending a special
    ACK that carries a older ACK sequence. This inconsistency would
    lead to DCTCP receiver never acknowledging the latest data until the
    sender times out and retry in some cases.
    
    Packetdrill script (provided by Larry Brakmo)
    
    0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
    0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
    0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
    0.000 bind(3, ..., ...) = 0
    0.000 listen(3, 1) = 0
    
    0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
    0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
    0.110 < [ect0] . 1:1(0) ack 1 win 257
    0.200 accept(3, ..., ...) = 4
    
    0.200 < [ect0] . 1:1001(1000) ack 1 win 257
    0.200 > [ect01] . 1:1(0) ack 1001
    
    0.200 write(4, ..., 1) = 1
    0.200 > [ect01] P. 1:2(1) ack 1001
    
    0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
    0.200 write(4, ..., 1) = 1
    0.200 > [ect01] P. 2:3(1) ack 2001
    
    0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
    0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
    0.200 > [ect01] . 3:3(0) ack 4001
    
    0.210 < [ce] P. 4001:4501(500) ack 3 win 257
    
    +0.001 read(4, ..., 4500) = 4500
    +0 write(4, ..., 1) = 1
    +0 > [ect01] PE. 3:4(1) ack 4501
    
    +0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
    // Previously the ACK sequence below would be 4501, causing a long RTO
    +0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack
    
    +0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
    +0 > [ect01] . 4:4(0) ack 6501     // now acks everything
    
    +0.500 < F. 9501:9501(0) ack 4 win 257
    
    Reported-by: Larry Brakmo <brakmo@fb.com>
    Signed-off-by: Yuchung Cheng <ycheng@google.com>
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Neal Cardwell <ncardwell@google.com>
    ---
     net/ipv4/tcp_dctcp.c | 6 ++++--
     1 file changed, 4 insertions(+), 2 deletions(-)
    
    diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
    index 5f5e5936760e..89f88b0d8167 100644
    --- a/net/ipv4/tcp_dctcp.c
    +++ b/net/ipv4/tcp_dctcp.c
    @@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
     	/* State has changed from CE=0 to CE=1 and delayed
     	 * ACK has not sent yet.
     	 */
    -	if (!ca->ce_state && ca->delayed_ack_reserved) {
    +	if (!ca->ce_state &&
    +	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
     		u32 tmp_rcv_nxt;
     
     		/* Save current rcv_nxt. */
    @@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
     	/* State has changed from CE=1 to CE=0 and delayed
     	 * ACK has not sent yet.
     	 */
    -	if (ca->ce_state && ca->delayed_ack_reserved) {
    +	if (ca->ce_state &&
    +	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
     		u32 tmp_rcv_nxt;
     
     		/* Save current rcv_nxt. */
    -- 
    2.18.0.203.gfac676dfb9-goog
    
LGTM. Thanks for the patch.

Acked-by: Lawrence Brakmo <brakmo@fb.com>
    

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH net 2/2] tcp: remove DELAYED ACK events in DCTCP
  2018-07-12 13:04 ` [PATCH net 2/2] tcp: remove DELAYED ACK events in DCTCP Yuchung Cheng
@ 2018-07-12 13:09   ` Lawrence Brakmo
  0 siblings, 0 replies; 5+ messages in thread
From: Lawrence Brakmo @ 2018-07-12 13:09 UTC (permalink / raw)
  To: Yuchung Cheng, davem@davemloft.net
  Cc: netdev@vger.kernel.org, ncardwell@google.com, ysseung@google.com,
	edumazet@google.com

LGTM. Thanks for the patch.

Acked-by: Lawrence Brakmo <brakmo@fb.com>

On 7/12/18, 9:05 AM, "Yuchung Cheng" <ycheng@google.com> wrote:

    After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK
    related callbacks are no longer needed
    
    Signed-off-by: Yuchung Cheng <ycheng@google.com>
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Neal Cardwell <ncardwell@google.com>
    ---
     include/net/tcp.h     |  2 --
     net/ipv4/tcp_dctcp.c  | 25 -------------------------
     net/ipv4/tcp_output.c |  4 ----
     3 files changed, 31 deletions(-)
    
    diff --git a/include/net/tcp.h b/include/net/tcp.h
    index af3ec72d5d41..3482d13d655b 100644
    --- a/include/net/tcp.h
    +++ b/include/net/tcp.h
    @@ -912,8 +912,6 @@ enum tcp_ca_event {
     	CA_EVENT_LOSS,		/* loss timeout */
     	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
     	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
    -	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
    -	CA_EVENT_NON_DELAYED_ACK,
     };
     
     /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
    diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
    index 89f88b0d8167..5869f89ca656 100644
    --- a/net/ipv4/tcp_dctcp.c
    +++ b/net/ipv4/tcp_dctcp.c
    @@ -55,7 +55,6 @@ struct dctcp {
     	u32 dctcp_alpha;
     	u32 next_seq;
     	u32 ce_state;
    -	u32 delayed_ack_reserved;
     	u32 loss_cwnd;
     };
     
    @@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
     
     		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
     
    -		ca->delayed_ack_reserved = 0;
     		ca->loss_cwnd = 0;
     		ca->ce_state = 0;
     
    @@ -250,25 +248,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
     	}
     }
     
    -static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
    -{
    -	struct dctcp *ca = inet_csk_ca(sk);
    -
    -	switch (ev) {
    -	case CA_EVENT_DELAYED_ACK:
    -		if (!ca->delayed_ack_reserved)
    -			ca->delayed_ack_reserved = 1;
    -		break;
    -	case CA_EVENT_NON_DELAYED_ACK:
    -		if (ca->delayed_ack_reserved)
    -			ca->delayed_ack_reserved = 0;
    -		break;
    -	default:
    -		/* Don't care for the rest. */
    -		break;
    -	}
    -}
    -
     static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
     {
     	switch (ev) {
    @@ -278,10 +257,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
     	case CA_EVENT_ECN_NO_CE:
     		dctcp_ce_state_1_to_0(sk);
     		break;
    -	case CA_EVENT_DELAYED_ACK:
    -	case CA_EVENT_NON_DELAYED_ACK:
    -		dctcp_update_ack_reserved(sk, ev);
    -		break;
     	default:
     		/* Don't care for the rest. */
     		break;
    diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
    index 8e08b409c71e..00e5a300ddb9 100644
    --- a/net/ipv4/tcp_output.c
    +++ b/net/ipv4/tcp_output.c
    @@ -3523,8 +3523,6 @@ void tcp_send_delayed_ack(struct sock *sk)
     	int ato = icsk->icsk_ack.ato;
     	unsigned long timeout;
     
    -	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
    -
     	if (ato > TCP_DELACK_MIN) {
     		const struct tcp_sock *tp = tcp_sk(sk);
     		int max_ato = HZ / 2;
    @@ -3581,8 +3579,6 @@ void tcp_send_ack(struct sock *sk)
     	if (sk->sk_state == TCP_CLOSE)
     		return;
     
    -	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
    -
     	/* We are not putting this on the write queue, so
     	 * tcp_transmit_skb() will set the ownership to this
     	 * sock.
    -- 
    2.18.0.203.gfac676dfb9-goog
    
    

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-07-12 13:19 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-07-12 13:04 [PATCH net 0/2] fix DCTCP delayed ACK Yuchung Cheng
2018-07-12 13:04 ` [PATCH net 1/2] tcp: fix dctcp delayed ACK schedule Yuchung Cheng
2018-07-12 13:09   ` Lawrence Brakmo
2018-07-12 13:04 ` [PATCH net 2/2] tcp: remove DELAYED ACK events in DCTCP Yuchung Cheng
2018-07-12 13:09   ` Lawrence Brakmo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).