netdev.vger.kernel.org archive mirror
* [RFC] TCP burst control
@ 2004-07-06 22:58 Stephen Hemminger
  2004-07-06 23:04 ` David S. Miller
  0 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2004-07-06 22:58 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

When using advanced congestion control it is possible for TCP to decide that
it has a large window to fill with data right away. The problem is that if TCP
creates long bursts, it becomes unfriendly to other flows and is more likely
to overrun intermediate queues.

This patch limits the amount of data in flight. It came from BICTCP 1.1 but it 
has been generalized to all TCP congestion algorithms. It has had some testing,
but needs to be more widely tested.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h	2004-07-06 15:52:39 -07:00
+++ b/include/linux/sysctl.h	2004-07-06 15:52:39 -07:00
@@ -339,6 +339,7 @@
 	NET_TCP_BIC_LOW_WINDOW=104,
 	NET_TCP_DEFAULT_WIN_SCALE=105,
 	NET_TCP_MODERATE_RCVBUF=106,
+	NET_TCP_BURST_MODERATION=107,
 };
 
 enum {
diff -Nru a/include/linux/tcp.h b/include/linux/tcp.h
--- a/include/linux/tcp.h	2004-07-06 15:52:39 -07:00
+++ b/include/linux/tcp.h	2004-07-06 15:52:39 -07:00
@@ -341,6 +341,7 @@
 	__u32	sacked_out;	/* SACK'd packets			*/
 	__u32	fackets_out;	/* FACK'd packets			*/
 	__u32	high_seq;	/* snd_nxt at onset of congestion	*/
+	__u32   max_in_flight;	/* for burst moderation */
 
 	__u32	retrans_stamp;	/* Timestamp of the last retransmit,
 				 * also used in SYN-SENT to remember stamp of
diff -Nru a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h	2004-07-06 15:52:39 -07:00
+++ b/include/net/tcp.h	2004-07-06 15:52:39 -07:00
@@ -613,6 +613,7 @@
 extern int sysctl_tcp_bic_low_window;
 extern int sysctl_tcp_default_win_scale;
 extern int sysctl_tcp_moderate_rcvbuf;
+extern int sysctl_tcp_burst_moderation;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -1335,8 +1336,11 @@
 {
 	tp->undo_marker = 0;
 	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
-	tp->snd_cwnd = min(tp->snd_cwnd,
-			   tcp_packets_in_flight(tp) + 1U);
+	if (sysctl_tcp_burst_moderation) 
+		tp->snd_cwnd = min(tp->snd_cwnd, 
+				   max(tp->snd_ssthresh, tcp_packets_in_flight(tp) + 1U));
+	else 
+		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U);
 	tp->snd_cwnd_cnt = 0;
 	tp->high_seq = tp->snd_nxt;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1393,6 +1397,24 @@
 		  tcp_minshall_check(tp))));
 }
 
+/*
+ * If doing packet burst moderation, check whether
+ * we have used up our limit.
+ */
+static __inline__ int
+tcp_burst_exhausted(struct tcp_opt *tp)
+{
+	u32 cap = tp->max_in_flight;
+
+	if (!sysctl_tcp_burst_moderation || cap == 0)
+		return 0;
+	
+	if (likely(tp->ca_state != TCP_CA_Recovery))
+		cap += tcp_max_burst(tp) + (tp->snd_cwnd>>7);
+
+	return (tcp_packets_in_flight(tp) >= cap);
+}
+
 /* This checks if the data bearing packet SKB (usually sk->sk_send_head)
  * should be put on the wire right now.
  */
@@ -1423,11 +1445,19 @@
 	/* Don't be strict about the congestion window for the
 	 * final FIN frame.  -DaveM
 	 */
-	return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
-		 || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
-		((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
-		 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
-		!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
+	if ((tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
+	     tcp_burst_exhausted(tp)) && 
+	    !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+		return 0;	/* no space in congestion window */
+
+	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd))
+		return 0;	/* send window full */
+
+	if (!((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
+	      || !tcp_nagle_check(tp, skb, cur_mss, nonagle)))
+		return 0;	 /* limited by sender */
+
+	return 1;
 }
 
 static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
diff -Nru a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
--- a/net/ipv4/sysctl_net_ipv4.c	2004-07-06 15:52:39 -07:00
+++ b/net/ipv4/sysctl_net_ipv4.c	2004-07-06 15:52:39 -07:00
@@ -682,6 +682,14 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_BURST_MODERATION,
+		.procname	= "tcp_burst_moderation",
+		.data		= &sysctl_tcp_burst_moderation,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
diff -Nru a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c	2004-07-06 15:52:39 -07:00
+++ b/net/ipv4/tcp_input.c	2004-07-06 15:52:39 -07:00
@@ -91,6 +91,7 @@
 int sysctl_tcp_vegas_cong_avoid;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_burst_moderation = 1;
 
 /* Default values of the Vegas variables, in fixed-point representation
  * with V_PARAM_SHIFT bits to the right of the binary point.
@@ -1596,7 +1597,11 @@
 	if (decr && tp->snd_cwnd > limit)
 		tp->snd_cwnd -= decr;
 
-	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
+	limit = tcp_packets_in_flight(tp)+1;
+	if (sysctl_tcp_burst_moderation)
+		limit = max(tp->snd_ssthresh, limit);
+	
+	tp->snd_cwnd = min(tp->snd_cwnd, limit);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -3823,8 +3828,13 @@
 		/* Limited by application or receiver window. */
 		u32 win_used = max(tp->snd_cwnd_used, 2U);
 		if (win_used < tp->snd_cwnd) {
+			u32 limit = (tp->snd_cwnd + win_used) >> 1;
 			tp->snd_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+			if (sysctl_tcp_burst_moderation) 
+				tp->snd_cwnd = min(tp->snd_cwnd,
+						   max(tp->snd_ssthresh, limit));
+			else
+				tp->snd_cwnd = limit;
 		}
 		tp->snd_cwnd_used = 0;
 	}
@@ -4097,6 +4107,8 @@
 			struct tcphdr *th, unsigned len)
 {
 	struct tcp_opt *tp = tcp_sk(sk);
+
+	tp->max_in_flight = 0;
 
 	/*
 	 *	Header prediction.
diff -Nru a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c	2004-07-06 15:52:39 -07:00
+++ b/net/ipv4/tcp_output.c	2004-07-06 15:52:39 -07:00
@@ -205,6 +205,10 @@
 #define SYSCTL_FLAG_WSCALE	0x2
 #define SYSCTL_FLAG_SACK	0x4
 
+		if (sysctl_tcp_burst_moderation && !tp->max_in_flight)
+			tp->max_in_flight = tcp_packets_in_flight(tp) 
+				+ tcp_max_burst(tp);
+
 		sysctl_flags = 0;
 		if (tcb->flags & TCPCB_FLAG_SYN) {
 			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -948,6 +952,11 @@
 			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
 				return;
 
+			if (sysctl_tcp_burst_moderation && tp->max_in_flight) {
+				if (tcp_packets_in_flight(tp) >= tp->max_in_flight)
+					return;
+			}
+
 			if (sacked&TCPCB_LOST) {
 				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
 					if (tcp_retransmit_skb(sk, skb))
@@ -996,6 +1005,11 @@
 
 		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
 			break;
+
+		if (sysctl_tcp_burst_moderation && tp->max_in_flight) {
+			if (tcp_packets_in_flight(tp) >= tp->max_in_flight)
+				return;
+		}
 
 		if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
 			continue;
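
(As a rough illustration of what the cap does, here is a standalone sketch
with simplified types -- in_flight, cwnd, and max_burst stand in for
tcp_packets_in_flight(tp), tp->snd_cwnd, and tcp_max_burst(tp); it is an
illustration, not the kernel code itself:)

	/* Latch the cap at first transmit: whatever is in flight now,
	 * plus a small burst allowance. */
	static unsigned int latch_cap(unsigned int in_flight,
				      unsigned int max_burst)
	{
		return in_flight + max_burst;
	}

	/* Mirrors tcp_burst_exhausted() above: outside recovery the cap
	 * is softened by another burst allowance plus cwnd/128. */
	static int burst_exhausted(unsigned int in_flight, unsigned int cwnd,
				   unsigned int cap, unsigned int max_burst,
				   int in_recovery)
	{
		if (cap == 0)
			return 0;
		if (!in_recovery)
			cap += max_burst + (cwnd >> 7);
		return in_flight >= cap;
	}

With cwnd = 256, max_burst = 3, and 40 packets in flight when the cap is
latched (cap = 43), transmission stops once in_flight reaches
43 + 3 + (256 >> 7) = 48, even though cwnd alone would allow 256.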


* Re: [RFC] TCP burst control
  2004-07-06 22:58 [RFC] TCP burst control Stephen Hemminger
@ 2004-07-06 23:04 ` David S. Miller
  2004-07-07  0:09   ` Injong Rhee
  0 siblings, 1 reply; 12+ messages in thread
From: David S. Miller @ 2004-07-06 23:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, rhee

On Tue, 6 Jul 2004 15:58:58 -0700
Stephen Hemminger <shemminger@osdl.org> wrote:

> When using advanced congestion control it is possible for TCP to decide that
> it has a large window to fill with data right away. The problem is that if TCP
> creates long bursts, it becomes unfriendly to other flows and is more likely
> to overrun intermediate queues.
> 
> This patch limits the amount of data in flight. It came from BICTCP 1.1 but it 
> has been generalized to all TCP congestion algorithms. It has had some testing,
> but needs to be more widely tested.

Both the New Reno and Westwood+ algorithms implement rate-halving to
solve this problem.

Why can't BICTCP use that instead of this special burst control hack?


* RE: [RFC] TCP burst control
  2004-07-06 23:04 ` David S. Miller
@ 2004-07-07  0:09   ` Injong Rhee
  2004-07-07  0:29     ` David S. Miller
                       ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Injong Rhee @ 2004-07-07  0:09 UTC (permalink / raw)
  To: 'David S. Miller', 'Stephen Hemminger'; +Cc: netdev, rhee, lxu2

Hi David and Stephen,

We have tested rate halving, and in fact it degrades performance quite a
bit; we can send you more information about it. Our tests indicate that
this feature introduces many timeouts (because of bursts) and also causes
unnecessary cwnd backoff that reduces the transmission rate unjustifiably
low -- so there are many (I will repeat, many) window and transmission
oscillations during packet losses. We fix this problem
completely using our own burst control. It is a very simple and easy
technique to implement; if you need data to back up our claims, I will
send you more. Once we implemented our burst control, we saw no timeouts
and not much fluctuation beyond what congestion control itself causes.
With rate halving, the current Linux TCP stack is full of hacks that, in
fact, hurt its performance (sorry to say this). Our burst control
simplifies a lot of that and makes cwnd follow very closely whatever the
congestion control algorithm intends it to be. The Linux Reno burst
control interferes with the original congestion control (in effect, it
tries to do its own), and its performance is very hard to predict.

Hope this helps.

Injong Rhee, Associate Professor
North Carolina State University
Raleigh, NC 27699
rhee@eos.ncsu.edu, http://www.csc.ncsu.edu/faculty/rhee



* Re: [RFC] TCP burst control
  2004-07-07  0:09   ` Injong Rhee
@ 2004-07-07  0:29     ` David S. Miller
  2004-07-07  5:46       ` Injong Rhee
  2004-07-07  2:20     ` Nivedita Singhvi
  2004-07-28  9:48     ` Xiaoliang (David) Wei
  2 siblings, 1 reply; 12+ messages in thread
From: David S. Miller @ 2004-07-07  0:29 UTC (permalink / raw)
  To: Injong Rhee; +Cc: shemminger, netdev, rhee, lxu2, mathis

On Tue, 6 Jul 2004 20:09:41 -0400
"Injong Rhee" <rhee@eos.ncsu.edu> wrote:

> With rate halving, the current Linux TCP stack is full of hacks that, in
> fact, hurt its performance (sorry to say this).

If rate-halving is broken, have you taken this up with its creator,
Mr. Mathis?  What was his response?

I've added him to the CC: list so this can be properly discussed.


* Re: [RFC] TCP burst control
  2004-07-07  0:09   ` Injong Rhee
  2004-07-07  0:29     ` David S. Miller
@ 2004-07-07  2:20     ` Nivedita Singhvi
  2004-07-28  9:48     ` Xiaoliang (David) Wei
  2 siblings, 0 replies; 12+ messages in thread
From: Nivedita Singhvi @ 2004-07-07  2:20 UTC (permalink / raw)
  To: Injong Rhee
  Cc: 'David S. Miller', 'Stephen Hemminger', netdev,
	rhee, lxu2

Injong Rhee wrote:
> Hi David and Stephen,
> 
> We have tested rate halving, and in fact it degrades performance quite a
> bit; we can send you more information about it. Our tests indicate that
> this feature introduces many timeouts (because of bursts) and also causes
> unnecessary cwnd backoff that reduces the transmission rate unjustifiably
> low -- so there are many (I will repeat, many) window and transmission
> oscillations during packet losses. We fix this problem

Could you point me to a paper or summary of your info?

> completely using our own burst control. It is a very simple and easy
> technique to implement; if you need data to back up our claims, I will
> send you more. Once we implemented our burst control, we saw no timeouts
> and not much fluctuation beyond what congestion control itself causes.
> With rate halving, the current Linux TCP stack is full of hacks that, in
> fact, hurt its performance (sorry to say this). Our burst control
> simplifies a lot of that and makes cwnd follow very closely whatever the
> congestion control algorithm intends it to be. The Linux Reno burst
> control interferes with the original congestion control (in effect, it
> tries to do its own), and its performance is very hard to predict.

Can you characterize the workload/traffic/error rate that each
would be best suited for?

thanks,
Nivedita


* RE: [RFC] TCP burst control
  2004-07-07  0:29     ` David S. Miller
@ 2004-07-07  5:46       ` Injong Rhee
  2004-07-07  5:49         ` Injong Rhee
                           ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Injong Rhee @ 2004-07-07  5:46 UTC (permalink / raw)
  To: 'David S. Miller'; +Cc: shemminger, netdev, rhee, lxu2, mathis


Hi David,

Let me clarify the issue a little. In my earlier message, I may have
sounded as if I were accusing rate halving itself of the burst problem and
window oscillation. I may have misrepresented it a little in the heat of
writing the email too fast :-). In fact, rate halving helps ease bursts
during fast recovery, as described in the Internet draft.

The main problem lies in the variable that rate halving interacts with
closely in the TCP SACK implementation: packet_in_flight (or pipe_). In
the current implementation of Linux TCP SACK, cwnd is set to
packet_in_flight + C on every ack during CWR, recovery, and timeout
handling -- here C is 1 to 3. But packet_in_flight often drops *far* below
cwnd during fast recovery. In high speed networks, a lot of packets can be
lost in one RTT (even acks, because of slow CPUs). When that happens,
packet_in_flight becomes very small. At that point, Linux cwnd moderation
(or burst control) kicks in, setting cwnd to packet_in_flight + C so that
the sender does not burst all those packets between packet_in_flight and
cwnd at once. However, there is a problem with this approach. Since cwnd
is kept very small, the transmission rate drops to almost zero during fast
recovery -- it should drop only to half of the current transmission rate
(or, in high-speed protocols like BIC, to 87% of the current rate). Since
fast recovery lasts several RTTs or more, the network capacity is highly
underutilized during fast recovery. Furthermore, right after fast
recovery, cwnd goes into slow start, since cwnd is typically far smaller
than ssthresh after fast recovery. This also creates a lot of burst --
likely causing back-to-back losses or even timeouts.
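
(To make the moderation step concrete: a rough sketch of the behavior
described above, with simplified types and C as the small constant -- an
illustration, not the kernel source:)

	/* On each ack during CWR/recovery, cwnd is clamped to the number
	 * of packets in flight plus a small constant C (1 to 3), so a
	 * deep drop in packet_in_flight drags cwnd down with it,
	 * regardless of what congestion control wanted. */
	static unsigned int moderate_cwnd(unsigned int cwnd,
					  unsigned int packets_in_flight,
					  unsigned int C)
	{
		unsigned int limit = packets_in_flight + C;

		return cwnd < limit ? cwnd : limit;
	}

For example, if BIC leaves cwnd at 800 after its reduction but heavy loss
leaves only 15 packets in flight, moderation with C = 3 yields cwnd = 18.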

You can see this behavior at the following link:

http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/tiny_release/experiments/BIC-600-75-7500-1-0-0-noburst/index.htm

We ran this in a dummynet without any change to the burst control. You can
see that whenever there is fast recovery, the rate drops to almost zero.
The pink line is the throughput observed from the dummynet every second,
and the red one is from iperf. In the second figure you can see cwnd. It
drops to the bottom during fast recovery -- this is not part of congestion
control; it is the burst control of Linux SACK doing it.

But with our new burst control:

http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/tiny_release/experiments/BIC-600-75-7500-1-0-0/index.htm

you can see that cwnd is quite stable and the throughput does not dip as
much as in the original case.

Here is what we do: instead of reducing cwnd to packet_in_flight (which
is, in fact, meddling with congestion control), we reduce the gap between
the two numbers by allowing the sender to transmit a few more packets per
ack (we set this to three more packets per ack) until packet_in_flight
gets close to cwnd. Also, right after fast recovery, we increase
packet_in_flight by 1% of packet_in_flight, up to cwnd. This removes the
huge burst after fast recovery. Our implementation leaves cwnd to
congestion control alone and separates burst control from congestion
control, which makes the behavior of congestion control more predictable.
We will report more on this tomorrow when we get back to the lab to test
some other environments, especially ones with smaller buffers. This scheme
may not be a cure-all and needs more testing, but so far it has been
working very well.
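
(A rough sketch of the per-ack pacing just described, with hypothetical
names -- it sketches the idea, not the actual patch:)

	/* Per-ack send quota: never clamp cwnd itself; instead, per ack,
	 * allow at most the newly acked packets plus a small extra
	 * (3 here), so in_flight closes the gap to cwnd gradually rather
	 * than in one burst. */
	static unsigned int ack_send_quota(unsigned int cwnd,
					   unsigned int in_flight,
					   unsigned int newly_acked)
	{
		unsigned int room = cwnd > in_flight ? cwnd - in_flight : 0;
		unsigned int quota = newly_acked + 3;

		return quota < room ? quota : room;
	}

	/* Right after fast recovery, grow the in-flight target by 1% of
	 * its value (at least one packet) per step, up to cwnd, instead
	 * of releasing the whole gap at once. */
	static unsigned int grow_target(unsigned int target,
					unsigned int cwnd)
	{
		unsigned int next = target + target / 100 + 1;

		return next < cwnd ? next : cwnd;
	}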

Stay tuned.
Injong.
---
Injong Rhee, Associate Professor
North Carolina State University
Raleigh, NC 27699
rhee@eos.ncsu.edu, http://www.csc.ncsu.edu/faculty/rhee





* RE: [RFC] TCP burst control
  2004-07-07  5:46       ` Injong Rhee
@ 2004-07-07  5:49         ` Injong Rhee
  2004-07-07 15:31         ` Matt Mathis
  2004-07-15  0:11         ` Weiguang Shi
  2 siblings, 0 replies; 12+ messages in thread
From: Injong Rhee @ 2004-07-07  5:49 UTC (permalink / raw)
  To: 'Injong Rhee', 'David S. Miller'
  Cc: shemminger, netdev, rhee, lxu2, mathis

Also, some additional reports on this moderation problem can be found at
http://www.hep.ucl.ac.uk/~ytl/tcpip/linux/tcp_moderate_cwnd/


Injong Rhee, Associate Professor
North Carolina State University
Raleigh, NC 27699
rhee@eos.ncsu.edu, http://www.csc.ncsu.edu/faculty/rhee




* RE: [RFC] TCP burst control
  2004-07-07  5:46       ` Injong Rhee
  2004-07-07  5:49         ` Injong Rhee
@ 2004-07-07 15:31         ` Matt Mathis
  2004-07-09 15:36           ` Injong Rhee
  2004-07-15  0:11         ` Weiguang Shi
  2 siblings, 1 reply; 12+ messages in thread
From: Matt Mathis @ 2004-07-07 15:31 UTC (permalink / raw)
  To: Injong Rhee
  Cc: 'David S. Miller', Stephen Hemminger, netdev, rhee, lxu2,
	John Heffner, Jamshid Mahdavi

Yes, Injong is correct about the problem with rate halving.  I haven't
looked at his fix to see if I agree with it, but let me summarize the
problem that we never finished (and that prevented us from formally
publishing it).

Basically, if the sender is under fluctuating resource stress such that
awnd (the actual window) is frequently less than cwnd (remember, this was
written BEFORE cwnd validation), then a loss detected while awnd is at a
minimum sets cwnd to an unreasonably small value that has nothing to do
with the actual state of the network.  Since we also set ssthresh down to
cwnd, this takes "forever" to recover.

The real killer is an application that is periodically bursty, such as
copying from older unbuffered disks, or timesharing systems running other
compute-bound applications (normal for supercomputers).  Under these
conditions TCP suffers frequent idle intervals, each restarting with a
full-window burst (pre-cwnd-validation!).  If the burst was just slightly
larger than the network pipe size, then the packets at the end of the
burst are the ones most at risk of being dropped.  When the ACKs for
subsequent packets arrive and the loss is detected, awnd will be nearly
zero, resulting in nearly zero cwnd and ssthresh...

We were in the heat of investigating solutions to this and other related
problems (there are multiple potential solutions) when we realized that
autotuning is orders of magnitude more important (at least to our users).

As the code points out, there are other corner cases that we missed, such
as reordering.  In principle I would like to come back to congestion
control work, revisit FACK-RH, and fold in all of the new stuff such as
cwnd validation and window moderation.  (BTW, I would not be surprised if
these algorithms don't play nicely with rate halving -- each was designed
and tested without considering the effects of the others.)

Thanks,
--MM--
-------------------------------------------
Matt Mathis      http://www.psc.edu/~mathis
Work:412.268.3319    Home/Cell:412.654.7529
-------------------------------------------
Evil is defined by mortals who think they know
"The Truth" and forcibly apply it to others.



* RE: [RFC] TCP burst control
  2004-07-07 15:31         ` Matt Mathis
@ 2004-07-09 15:36           ` Injong Rhee
  0 siblings, 0 replies; 12+ messages in thread
From: Injong Rhee @ 2004-07-09 15:36 UTC (permalink / raw)
  To: 'Matt Mathis'
  Cc: 'David S. Miller', 'Stephen Hemminger', netdev,
	rhee, lxu2, 'John Heffner', 'Jamshid Mahdavi'

Hi David and Matt,

The main cause of the problem is that the current Linux implementation of
TCP does not follow the RFC, in particular by incorporating its own burst
control mechanisms. Rate halving is a form of burst control -- but it is
not part of the problem. *The cause is the use of packet-in-flight to
moderate cwnd.* This is not part of the standard, and it interferes with a
lot of congestion control functions, including rate halving. Let me give
you some more information below.

When fast recovery kicks in, packet-in-flight (flightSize) can be very
small because of lost packets. This is even more so for high speed
connections with large cwnd values (i.e., you lose more packets on these
connections). Because flightSize is small while cwnd is still very large
compared to it (even after the cwnd reduction due to fast recovery), there
could be a lot of burst: the connection can send as many as (cwnd -
flightSize) packets at a time. Here the current implementation makes a
mistake by reducing cwnd to flightSize + C -- a violation of RFC 2581.
This surely helps with burstiness -- because the sender does not send any
new packets for a long time :-)

Since cwnd becomes very close to flightSize, rate halving obviously does
not work here. Note that rate halving gradually reduces cwnd to half its
value as duplicate acks arrive. But this Linux burst control makes rate
halving ineffective, because cwnd is reduced far below half anyway, since
flightSize is far less than half of cwnd.
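
(For reference, rate halving itself amounts roughly to: send one new
segment for every two dupacks, so cwnd glides to half over one round trip
instead of dropping at once. A sketch of that idea, not the FACK-RH code:)

	/* Rough rate-halving pacing: release a new segment on every
	 * other ack during recovery, halving the effective window
	 * smoothly over one RTT. */
	static int rate_halving_may_send(unsigned int acks_in_recovery)
	{
		return (acks_in_recovery % 2) == 0;
	}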

This feature not only creates a lot of oscillation in the transmission
(making the flow very unstable), but also reduces throughput very
significantly. Look at the link below for experimental results with the
current Linux burst control on a Scalable TCP flow. You can see that a
back-to-back loss leaves big gaps in transmission. We chose Scalable TCP
for the demo because STCP may show a bigger difference, as it has more
fast recoveries per second. There are three of them.

http://www4.ncsu.edu/~lxu2/stcp_burst/

In the link you can also find the experiment with our new burst control.
It makes the flow quite smooth.

This is a real problem (I would say a bug) in the current implementation:
it does not follow the RFC. In contrast, our implementation does not
violate the RFC, since we do not modify cwnd at all based on flightSize.
We leave cwnd to congestion control. The burst control just regulates the
transmission so that it does not create bursts; cwnd is only guidance on
how many packets you can have outstanding at a time, and the burst control
simply adjusts the transmission to match cwnd without creating too much
burst. In this regard, our control is pretty good. It may not be the best
one, and more work is needed to find the optimal control, but you cannot
go wrong with our technique, since it does not violate the RFC and it only
makes improvements.

Along with burst control, I would like to suggest one additional mechanism
in ack processing. This does not cause any behavior change in the TCP
stack -- it just makes it run faster; it is purely an implementation
suggestion. When processing selective acks, the current Linux
implementation does too much redundant work: some SACK blocks are
duplicates, and duplicate SACK blocks do not need to be processed again.
We need to remove this redundancy to speed up SACK processing. Also, every
time a new SACK arrives, the current implementation performs a linear
search over a list. This causes a lot of overhead, especially at high
speed, because you are receiving many ACKs. This search can be greatly
sped up with a simple caching scheme. All these changes require only a few
lines of code. Again: no behavior changes, just improved speed. It is also
general enough to be used with any TCP variant. Our tests indicate this
simple mechanism is good enough (no need for anything fancier, as HTCP
suggests); it was originally implemented by Tom Kelly.
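
(For illustration, a sketch of the duplicate-skipping part of that idea,
with hypothetical structures -- not Tom Kelly's actual code:)

	/* Cache the most recently processed SACK block; a new ack whose
	 * block matches the cache carries no new information and can
	 * skip the linear walk over the retransmit queue entirely. */
	struct sack_block {
		unsigned int start_seq, end_seq;
	};

	struct sack_cache {
		struct sack_block last;	/* last block processed */
		int valid;
	};

	static int sack_block_is_dup(struct sack_cache *c,
				     struct sack_block b)
	{
		if (c->valid && c->last.start_seq == b.start_seq &&
		    c->last.end_seq == b.end_seq)
			return 1;	/* duplicate: skip reprocessing */
		c->last = b;
		c->valid = 1;
		return 0;
	}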

OK, enuf arguing -- I rest my case here.
Injong and Lisong.



* RE: [RFC] TCP burst control
  2004-07-07  5:46       ` Injong Rhee
  2004-07-07  5:49         ` Injong Rhee
  2004-07-07 15:31         ` Matt Mathis
@ 2004-07-15  0:11         ` Weiguang Shi
  2 siblings, 0 replies; 12+ messages in thread
From: Weiguang Shi @ 2004-07-15  0:11 UTC (permalink / raw)
  To: Injong Rhee; +Cc: netdev linux, Qiang Ye

Hi,

My question is: why does in_flight drop *far* below cwnd in
the first place?

The assumption is that each time an ack comes, TCP SHOULD
decrease in_flight by the number of new segments that the ack
acknowledges.

During fast recovery, each packet after the lost one is acked
immediately (in the form of a duplicate ack) by the
receiver, since it is out of order. Each dupack should
carry the latest SACK info, i.e., one more packet received.
Therefore a packet cannot trigger a dupack acknowledging
more than one new segment, not even in the multiple-packet-drop
scenario.

That is: sacked_out++ upon each ack during fast recovery;
lost_out = 0 under conservative SACK accounting; and before
the first (partial) ack that advances snd.una, retrans_out = 1.
Therefore,

      in_flight = packets_out - sacked_out + 1

This seems to indicate that with SACK, in_flight should
decrease gradually instead of dropping suddenly during
fast recovery.
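
(As a concrete instance of the formula: with packets_out = 100 and a
single early loss, the k-th dupack gives sacked_out = k, so
in_flight = 100 - k + 1, stepping down by exactly one per ack -- a
gradual decline, not a cliff.)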

On the other hand, I have seen sudden drops in my experiments. What
happened?

Regards,
Wei



* Re: [RFC] TCP burst control
  2004-07-07  0:09   ` Injong Rhee
  2004-07-07  0:29     ` David S. Miller
  2004-07-07  2:20     ` Nivedita Singhvi
@ 2004-07-28  9:48     ` Xiaoliang (David) Wei
  2004-07-28 13:45       ` Lisong Xu
  2 siblings, 1 reply; 12+ messages in thread
From: Xiaoliang (David) Wei @ 2004-07-28  9:48 UTC (permalink / raw)
  To: Injong Rhee, 'David S. Miller',
	'Stephen Hemminger'
  Cc: netdev, rhee, lxu2

Hi Injong and other Gurus,

    I support this approach of decoupling congestion control from
burstiness control, but I have a question about the patch code.

    My question is about tcp_cwnd_application_limited() in tcp_input.c
(@@ -3823,8 +3828,13 @@ in the patch). My understanding is that
tcp_cwnd_application_limited() is meant to reduce cwnd, per RFC 2861, to
avoid a large sending rate when the connection resumes sending at full
speed. The window reduction there is not meant to reduce burstiness, but
to reduce cwnd itself (since cwnd is not a good congestion measure after
an idle period). The patch, however, seems to treat this congestion
window reduction as a burstiness reduction mechanism too. I guess we
don't need to change the window reduction in this function? Please
correct me if I made any mistake. Thanks :)
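
(For reference, a loose paraphrase of the RFC 2861 behavior referred to
above -- decay a stale cwnd after an idle period; my paraphrase from
memory, not the RFC's text:)

	/* After an idle period, halve cwnd once per RTO elapsed, down to
	 * the restart window, remembering the pre-decay value in
	 * ssthresh. */
	static unsigned int decay_idle_cwnd(unsigned int cwnd,
					    unsigned int *ssthresh,
					    unsigned int idle_rtos,
					    unsigned int restart_wnd)
	{
		unsigned int i;

		for (i = 0; i < idle_rtos && cwnd > restart_wnd; i++) {
			if (*ssthresh < (3 * cwnd) / 4)
				*ssthresh = (3 * cwnd) / 4;
			cwnd /= 2;
			if (cwnd < restart_wnd)
				cwnd = restart_wnd;
		}
		return cwnd;
	}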

     I copied the part of the patch code that I am not sure about:
------------------------------------------------------------
@@ -3823,8 +3828,13 @@
 		/* Limited by application or receiver window. */
 		u32 win_used = max(tp->snd_cwnd_used, 2U);
 		if (win_used < tp->snd_cwnd) {
+			u32 limit = (tp->snd_cwnd + win_used) >> 1;
 			tp->snd_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+			if (sysctl_tcp_burst_moderation)
+				tp->snd_cwnd = min(tp->snd_cwnd,
+						   max(tp->snd_ssthresh, limit));
+			else
+				tp->snd_cwnd = limit;
 		}
 		tp->snd_cwnd_used = 0;
 	}
------------------------------------------------------------

-David

Xiaoliang (David) Wei             Graduate Student in CS@Caltech
http://www.cs.caltech.edu/~weixl


* Re: [RFC] TCP burst control
  2004-07-28  9:48     ` Xiaoliang (David) Wei
@ 2004-07-28 13:45       ` Lisong Xu
  0 siblings, 0 replies; 12+ messages in thread
From: Lisong Xu @ 2004-07-28 13:45 UTC (permalink / raw)
  To: Xiaoliang (David) Wei, Injong Rhee, 'David S. Miller',
	'Stephen Hemminger'
  Cc: netdev, rhee, lxu2

Hi Xiaoliang:

You are right that tcp_cwnd_application_limited() is designed to avoid a
large sending rate when the connection resumes full sending speed after
an idle period. But we have actually observed that
tcp_cwnd_application_limited() is also triggered in non-idling cases,
where it interferes with our burst control algorithm, so we modified
tcp_cwnd_application_limited().

We understand that our current implementation is not a perfect solution,
and we are working on a better one.

Thank you!
Lisong

>


end of thread

Thread overview: 12+ messages
2004-07-06 22:58 [RFC] TCP burst control Stephen Hemminger
2004-07-06 23:04 ` David S. Miller
2004-07-07  0:09   ` Injong Rhee
2004-07-07  0:29     ` David S. Miller
2004-07-07  5:46       ` Injong Rhee
2004-07-07  5:49         ` Injong Rhee
2004-07-07 15:31         ` Matt Mathis
2004-07-09 15:36           ` Injong Rhee
2004-07-15  0:11         ` Weiguang Shi
2004-07-07  2:20     ` Nivedita Singhvi
2004-07-28  9:48     ` Xiaoliang (David) Wei
2004-07-28 13:45       ` Lisong Xu
