[RFC] Vegas and tcp parameters per route

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC] Vegas and tcp parameters per route
@ 2004-03-12 23:17 Stephen Hemminger
  2004-03-13  6:41 ` Pasi Sarolahti
  2004-03-15 17:38 ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 2 replies; 8+ messages in thread
From: Stephen Hemminger @ 2004-03-12 23:17 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, linux-net

This is the second, more complete version of TCP Vegas, which allows setting
the options based on route.  It reuses the RTAX_FEATURES metric, which was
defined but never used, to provide the ability to select vegas, westwood,
and/or frto per route.

There is a modified version of iproute2 available at
	http://developer.osdl.org/shemminger/tcp/iproute2-exp.tar.bz2
With this it is possible to set up options per route with ip.
	ip route add to 10.0.0.1 features vegas/frto

This is intended for comment and testing; please don't put it into 2.6 yet.

P.S.: The netdevice.h change is to make building iproute2 less painful.

diff -urNp -X dontdiff linux-2.6/include/linux/netdevice.h tcp-vegas-2.6/include/linux/netdevice.h
--- linux-2.6/include/linux/netdevice.h	2004-03-12 14:22:19.000000000 -0800
+++ tcp-vegas-2.6/include/linux/netdevice.h	2004-03-12 14:36:07.000000000 -0800
@@ -29,11 +29,11 @@
 #include <linux/if_ether.h>
 #include <linux/if_packet.h>
 
+#ifdef __KERNEL__
 #include <asm/atomic.h>
 #include <asm/cache.h>
 #include <asm/byteorder.h>
 
-#ifdef __KERNEL__
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
diff -urNp -X dontdiff linux-2.6/include/linux/rtnetlink.h tcp-vegas-2.6/include/linux/rtnetlink.h
--- linux-2.6/include/linux/rtnetlink.h	2004-01-23 09:39:20.000000000 -0800
+++ tcp-vegas-2.6/include/linux/rtnetlink.h	2004-03-12 09:54:06.000000000 -0800
@@ -294,9 +294,10 @@ enum
 
 #define RTAX_MAX RTAX_FEATURES
 
-#define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
+#define RTAX_FEATURE_NOMETRIC	0x00000001
+#define RTAX_FEATURE_FRTO	0x00000002
+#define RTAX_FEATURE_VEGAS	0x00000004
+#define RTAX_FEATURE_WESTWOOD	0x00000008
 
 struct rta_session
 {
diff -urNp -X dontdiff linux-2.6/include/linux/sysctl.h tcp-vegas-2.6/include/linux/sysctl.h
--- linux-2.6/include/linux/sysctl.h	2004-03-09 16:24:23.000000000 -0800
+++ tcp-vegas-2.6/include/linux/sysctl.h	2004-03-12 14:20:25.000000000 -0800
@@ -317,11 +317,12 @@ enum
 	NET_IPV4_ICMP_RATELIMIT=89,
 	NET_IPV4_ICMP_RATEMASK=90,
 	NET_TCP_TW_REUSE=91,
-	NET_TCP_FRTO=92,
-	NET_TCP_LOW_LATENCY=93,
-	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
-	NET_TCP_WESTWOOD=95,
-	NET_IPV4_IGMP_MAX_MSF=96,
+	NET_TCP_LOW_LATENCY=92,
+	NET_IPV4_IPFRAG_SECRET_INTERVAL=93,
+	NET_IPV4_IGMP_MAX_MSF=94,
+	NET_TCP_VEGAS_ALPHA=95,
+	NET_TCP_VEGAS_BETA=96,
+	NET_TCP_VEGAS_GAMMA=97,
 };
 
 enum {
diff -urNp -X dontdiff linux-2.6/include/linux/tcp.h tcp-vegas-2.6/include/linux/tcp.h
--- linux-2.6/include/linux/tcp.h	2004-02-05 14:44:29.000000000 -0800
+++ tcp-vegas-2.6/include/linux/tcp.h	2004-03-12 14:37:18.000000000 -0800
@@ -253,6 +253,8 @@ struct tcp_opt {
 	__u16	ext2_header_len;/* Options depending on route */
 	__u8	ca_state;	/* State of fast-retransmit machine 	*/
 	__u8	retransmits;	/* Number of unrecovered RTO timeouts.	*/
+	__u8	frto_counter;   /* Number of new acks after RTO */
+	__u8	features;	/* Feature (vegas, frto, ...) metric    */
 
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	queue_shrunk;	/* Write queue has been shrunk recently.*/
@@ -370,7 +372,6 @@ struct tcp_opt {
 	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
 	int			linger2;
 
-	int                     frto_counter; /* Number of new acks after RTO */
 	__u32                   frto_highmark; /* snd_nxt when RTO occurred */
 
 	unsigned long last_synq_overflow; 
@@ -388,6 +389,16 @@ struct tcp_opt {
                 __u32    rtt;
                 __u32    rtt_min;          /* minimum observed RTT */
         } westwood;
+/* Vegas variables */
+	struct {
+		__u32	beg_snd_nxt;	/* right edge during last RTT */
+		__u32	beg_snd_una;	/* left edge  during last RTT */
+		__u32	beg_snd_cwnd;	/* saves the size of the cwnd */
+		__u8	doing_vegas_now;/* if true, do vegas for this RTT */
+		__u16	cntRTT;		/* # of RTTs measured within last RTT */
+		__u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
+		__u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
+	} vegas;
 };
 
 /* WARNING: don't change the layout of the members in tcp_sock! */
diff -urNp -X dontdiff linux-2.6/include/net/tcp.h tcp-vegas-2.6/include/net/tcp.h
--- linux-2.6/include/net/tcp.h	2004-03-01 08:55:47.000000000 -0800
+++ tcp-vegas-2.6/include/net/tcp.h	2004-03-12 14:19:13.000000000 -0800
@@ -580,9 +580,10 @@ extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
 extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
-extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
+extern int sysctl_tcp_vegas_alpha;
+extern int sysctl_tcp_vegas_beta;
+extern int sysctl_tcp_vegas_gamma;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -1211,6 +1212,59 @@ static inline __u32 tcp_recalc_ssthresh(
 	return max(tp->snd_cwnd >> 1U, 2U);
 }
 
+/* Stop taking Vegas samples for now. */
+#define tcp_vegas_disable(__tp)	((__tp)->vegas.doing_vegas_now = 0)
+
+/* Is this TCP connection using Vegas (regardless of whether it is taking
+ * Vegas measurements at the current time)?
+ */
+#define tcp_is_vegas(__tp)	((__tp)->features & RTAX_FEATURE_VEGAS)
+    
+static inline void tcp_vegas_enable(struct tcp_opt *tp)
+{
+	/* There are several situations when we must "re-start" Vegas:
+	 *
+	 *  o when a connection is established
+	 *  o after an RTO
+	 *  o after fast recovery
+	 *  o when we send a packet and there is no outstanding
+	 *    unacknowledged data (restarting an idle connection)
+	 *
+	 * In these circumstances we cannot do a Vegas calculation at the
+	 * end of the first RTT, because any calculation we do is using
+	 * stale info -- both the saved cwnd and congestion feedback are
+	 * stale.
+	 *
+	 * Instead we must wait until the completion of an RTT during
+	 * which we actually receive ACKs.
+	 */
+    
+	/* Begin taking Vegas samples next time we send something. */
+	tp->vegas.doing_vegas_now = 1;
+     
+	/* Set the beginning of the next send window. */
+	tp->vegas.beg_snd_nxt = tp->snd_nxt;
+
+	tp->vegas.cntRTT = 0;
+	tp->vegas.minRTT = 0x7fffffff;
+}
+
+static inline void tcp_set_ca_state(struct tcp_opt *tp, u8 ca_state)
+{
+	if (tcp_is_vegas(tp)) {
+		if (ca_state == TCP_CA_Open) 
+			tcp_vegas_enable(tp);
+		else
+			tcp_vegas_disable(tp);
+	}
+	tp->ca_state = ca_state;
+}
+
+/* Should we be taking Vegas samples right now? */
+#define tcp_vegas_enabled(__tp)	((__tp)->vegas.doing_vegas_now)
+
+extern void tcp_vegas_init(struct tcp_opt *tp);
+
 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
  * The exception is rate halving phase, when cwnd is decreasing towards
  * ssthresh.
@@ -1270,7 +1324,7 @@ static inline void tcp_enter_cwr(struct 
 	tp->prior_ssthresh = 0;
 	if (tp->ca_state < TCP_CA_CWR) {
 		__tcp_enter_cwr(tp);
-		tp->ca_state = TCP_CA_CWR;
+		tcp_set_ca_state(tp, TCP_CA_CWR);
 	}
 }
 
@@ -1974,6 +2028,8 @@ static inline void tcp_v4_setup_caps(str
 
 #define TCP_CHECK_TIMER(sk) do { } while (0)
 
+#define tcp_is_frto(__tp)	((__tp)->features & RTAX_FEATURE_FRTO)
+
 static inline int tcp_use_frto(const struct sock *sk)
 {
 	const struct tcp_opt *tp = tcp_sk(sk);
@@ -1982,7 +2038,7 @@ static inline int tcp_use_frto(const str
 	 * unsent new data, and the advertised window should allow
 	 * sending it.
 	 */
-	return (sysctl_tcp_frto && tp->send_head &&
+	return (tcp_is_frto(tp) && tp->send_head &&
 		!after(TCP_SKB_CB(tp->send_head)->end_seq,
 		       tp->snd_una + tp->snd_wnd));
 }
@@ -2028,9 +2084,11 @@ extern void tcp_proc_unregister(struct t
 #define TCP_WESTWOOD_INIT_RTT  (20*HZ)           /* maybe too conservative?! */
 #define TCP_WESTWOOD_RTT_MIN   (HZ/20)           /* 50ms */
 
+#define tcp_is_westwood(__tp) ((__tp)->features & RTAX_FEATURE_WESTWOOD)
+
 static inline void tcp_westwood_update_rtt(struct tcp_opt *tp, __u32 rtt_seq)
 {
-        if (sysctl_tcp_westwood)
+        if (tcp_is_westwood(tp))
                 tp->westwood.rtt = rtt_seq;
 }
 
@@ -2039,13 +2097,13 @@ void __tcp_westwood_slow_bw(struct sock 
 
 static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
 {
-        if (sysctl_tcp_westwood)
+        if (tcp_is_westwood(tcp_sk(sk)))
                 __tcp_westwood_fast_bw(sk, skb);
 }
 
 static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
 {
-        if (sysctl_tcp_westwood)
+        if (tcp_is_westwood(tcp_sk(sk)))
                 __tcp_westwood_slow_bw(sk, skb);
 }
 
@@ -2058,14 +2116,14 @@ static inline __u32 __tcp_westwood_bw_rt
 
 static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_opt *tp)
 {
-	return sysctl_tcp_westwood ? __tcp_westwood_bw_rttmin(tp) : 0;
+	return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
 }
 
 static inline int tcp_westwood_ssthresh(struct tcp_opt *tp)
 {
 	__u32 ssthresh = 0;
 
-	if (sysctl_tcp_westwood) {
+	if (tcp_is_westwood(tp)) {
 		ssthresh = __tcp_westwood_bw_rttmin(tp);
 		if (ssthresh)
 			tp->snd_ssthresh = ssthresh;  
@@ -2078,7 +2136,7 @@ static inline int tcp_westwood_cwnd(stru
 {
 	__u32 cwnd = 0;
 
-	if (sysctl_tcp_westwood) {
+	if (tcp_is_westwood(tp)) {
 		cwnd = __tcp_westwood_bw_rttmin(tp);
 		if (cwnd)
 			tp->snd_cwnd = cwnd;
@@ -2086,4 +2144,5 @@ static inline int tcp_westwood_cwnd(stru
 
 	return (cwnd != 0);
 }
+ 
 #endif	/* _TCP_H */
diff -urNp -X dontdiff linux-2.6/net/ipv4/sysctl_net_ipv4.c tcp-vegas-2.6/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6/net/ipv4/sysctl_net_ipv4.c	2004-03-08 08:32:59.000000000 -0800
+++ tcp-vegas-2.6/net/ipv4/sysctl_net_ipv4.c	2004-03-12 14:20:50.000000000 -0800
@@ -569,14 +569,6 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_TCP_FRTO,
-		.procname	= "tcp_frto",
-		.data		= &sysctl_tcp_frto,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
 		.ctl_name	= NET_TCP_LOW_LATENCY,
 		.procname	= "tcp_low_latency",
 		.data		= &sysctl_tcp_low_latency,
@@ -594,9 +586,25 @@ ctl_table ipv4_table[] = {
 		.strategy	= &sysctl_jiffies
 	},
 	{
-		.ctl_name	= NET_TCP_WESTWOOD, 
-		.procname	= "tcp_westwood",
-		.data		= &sysctl_tcp_westwood,
+		.ctl_name	= NET_TCP_VEGAS_ALPHA,
+		.procname	= "tcp_vegas_alpha",
+		.data		= &sysctl_tcp_vegas_alpha,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_VEGAS_BETA,
+		.procname	= "tcp_vegas_beta",
+		.data		= &sysctl_tcp_vegas_beta,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_VEGAS_GAMMA,
+		.procname	= "tcp_vegas_gamma",
+		.data		= &sysctl_tcp_vegas_gamma,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
diff -urNp -X dontdiff linux-2.6/net/ipv4/tcp.c tcp-vegas-2.6/net/ipv4/tcp.c
--- linux-2.6/net/ipv4/tcp.c	2004-03-02 08:59:43.000000000 -0800
+++ tcp-vegas-2.6/net/ipv4/tcp.c	2004-03-09 16:37:21.000000000 -0800
@@ -2158,7 +2158,7 @@ int tcp_disconnect(struct sock *sk, int 
 	tp->packets_out = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
-	tp->ca_state = TCP_CA_Open;
+	tcp_set_ca_state(tp, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	tcp_delack_init(tp);
 	tp->send_head = NULL;
diff -urNp -X dontdiff linux-2.6/net/ipv4/tcp_input.c tcp-vegas-2.6/net/ipv4/tcp_input.c
--- linux-2.6/net/ipv4/tcp_input.c	2004-02-05 14:44:30.000000000 -0800
+++ tcp-vegas-2.6/net/ipv4/tcp_input.c	2004-03-12 14:25:47.000000000 -0800
@@ -89,8 +89,14 @@ int sysctl_tcp_adv_win_scale = 2;
 int sysctl_tcp_stdurg;
 int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
-int sysctl_tcp_frto;
-int sysctl_tcp_westwood;
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -407,6 +413,41 @@ static void tcp_event_data_recv(struct s
 		tcp_grow_window(sk, tp, skb);
 }
 
+/* Set up a new TCP connection, depending on whether it should be
+ * using Vegas or not.
+ */    
+void tcp_vegas_init(struct tcp_opt *tp)
+{
+	if (tcp_is_vegas(tp)) {
+		tp->vegas.baseRTT = 0x7fffffff;
+		tcp_vegas_enable(tp);
+	} else 
+		tcp_vegas_disable(tp);
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static inline void vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt)
+{
+	__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < tp->vegas.baseRTT) 
+		tp->vegas.baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
+	tp->vegas.cntRTT++;
+}
+
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -420,6 +461,9 @@ static void tcp_rtt_estimator(struct tcp
 {
 	long m = mrtt; /* RTT */
 
+	if (tcp_vegas_enabled(tp))
+		vegas_rtt_calc(tp, mrtt);
+
 	/*	The following amusing code comes from Jacobson's
 	 *	article in SIGCOMM '88.  Note that rtt and mdev
 	 *	are scaled versions of rtt and mean deviation.
@@ -537,6 +581,10 @@ void tcp_update_metrics(struct sock *sk)
 			return;
 		}
 
+		/* don't want to store metrics */
+		if (tp->features & RTAX_FEATURE_NOMETRIC)
+			return;
+
 		m = dst_metric(dst, RTAX_RTT) - tp->srtt;
 
 		/* If newly calculated rtt larger than stored one,
@@ -629,6 +677,12 @@ static void tcp_init_metrics(struct sock
 
 	dst_confirm(dst);
 
+	tp->features = dst_metric(dst, RTAX_FEATURES);
+	if (tp->features & RTAX_FEATURE_NOMETRIC) {
+		printk(KERN_DEBUG "skipping initial metric setup\n");
+		goto reset;
+	}
+
 	if (dst_metric_locked(dst, RTAX_CWND))
 		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
 	if (dst_metric(dst, RTAX_SSTHRESH)) {
@@ -1003,7 +1057,7 @@ void tcp_enter_frto(struct sock *sk)
 	}
 	tcp_sync_left_out(tp);
 
-	tp->ca_state = TCP_CA_Open;
+	tcp_set_ca_state(tp, TCP_CA_Open);
 	tp->frto_highmark = tp->snd_nxt;
 }
 
@@ -1049,7 +1103,7 @@ void tcp_enter_frto_loss(struct sock *sk
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
 					     sysctl_tcp_reordering);
-	tp->ca_state = TCP_CA_Loss;
+	tcp_set_ca_state(tp, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
 }
@@ -1112,7 +1166,7 @@ void tcp_enter_loss(struct sock *sk, int
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
 					     sysctl_tcp_reordering);
-	tp->ca_state = TCP_CA_Loss;
+	tcp_set_ca_state(tp, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 }
@@ -1489,7 +1543,7 @@ static int tcp_try_undo_recovery(struct 
 		tcp_moderate_cwnd(tp);
 		return 1;
 	}
-	tp->ca_state = TCP_CA_Open;
+	tcp_set_ca_state(tp, TCP_CA_Open);
 	return 0;
 }
 
@@ -1549,7 +1603,7 @@ static int tcp_try_undo_loss(struct sock
 		tp->retransmits = 0;
 		tp->undo_marker = 0;
 		if (!IsReno(tp))
-			tp->ca_state = TCP_CA_Open;
+			tcp_set_ca_state(tp, TCP_CA_Open);
 		return 1;
 	}
 	return 0;
@@ -1583,7 +1637,7 @@ static void tcp_try_to_open(struct sock 
 			state = TCP_CA_Disorder;
 
 		if (tp->ca_state != state) {
-			tp->ca_state = state;
+			tcp_set_ca_state(tp, state);
 			tp->high_seq = tp->snd_nxt;
 		}
 		tcp_moderate_cwnd(tp);
@@ -1642,7 +1696,7 @@ tcp_fastretrans_alert(struct sock *sk, u
 	/* E. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
 	if (tp->ca_state == TCP_CA_Open) {
-		if (!sysctl_tcp_frto)
+		if (!tcp_is_frto(tp))
 			BUG_TRAP(tp->retrans_out == 0);
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
@@ -1658,7 +1712,7 @@ tcp_fastretrans_alert(struct sock *sk, u
 			 * is ACKed for CWR bit to reach receiver. */
 			if (tp->snd_una != tp->high_seq) {
 				tcp_complete_cwr(tp);
-				tp->ca_state = TCP_CA_Open;
+				tcp_set_ca_state(tp, TCP_CA_Open);
 			}
 			break;
 
@@ -1669,7 +1723,7 @@ tcp_fastretrans_alert(struct sock *sk, u
 			     * catching for all duplicate ACKs. */
 			    IsReno(tp) || tp->snd_una != tp->high_seq) {
 				tp->undo_marker = 0;
-				tp->ca_state = TCP_CA_Open;
+				tcp_set_ca_state(tp, TCP_CA_Open);
 			}
 			break;
 
@@ -1743,7 +1797,7 @@ tcp_fastretrans_alert(struct sock *sk, u
 		}
 
 		tp->snd_cwnd_cnt = 0;
-		tp->ca_state = TCP_CA_Recovery;
+		tcp_set_ca_state(tp, TCP_CA_Recovery);
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
@@ -1814,7 +1868,7 @@ tcp_ack_update_rtt(struct tcp_opt *tp, i
 /* This is Jacobson's slow start and congestion avoidance. 
  * SIGCOMM '88, p. 328.
  */
-static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
+static __inline__ void reno_cong_avoid(struct tcp_opt *tp)
 {
         if (tp->snd_cwnd <= tp->snd_ssthresh) {
                 /* In "safe" area, increase. */
@@ -1834,6 +1888,236 @@ static __inline__ void tcp_cong_avoid(st
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+static void vegas_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
+{
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, tp->vegas.beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd, old_snd_cwnd;
+
+		
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received. Likewise, old_snd_cwnd
+		 * is the cwnd during the previous RTT.
+		 */
+		old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
+			tp->mss_cache;
+		old_snd_cwnd = tp->vegas.beg_snd_cwnd;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		tp->vegas.beg_snd_una  = tp->vegas.beg_snd_nxt;
+		tp->vegas.beg_snd_nxt  = tp->snd_nxt;
+		tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Take into account the current RTT sample too, to
+		 * decrease the impact of delayed acks. This double counts
+		 * this sample since we count it for the next window as well,
+		 * but that's not too awful, since we're taking the min,
+		 * rather than averaging.
+		 */
+		vegas_rtt_calc(tp, seq_rtt);
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (tp->vegas.cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			if (tp->snd_cwnd > tp->snd_ssthresh)
+				tp->snd_cwnd++;
+		} else {
+			u32 rtt, target_cwnd, diff;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = tp->vegas.minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 * We keep it as a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary point.
+			 */
+			target_cwnd = ((old_wnd * tp->vegas.baseRTT)
+				       << V_PARAM_SHIFT) / rtt;
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 *
+			 * Again, this is a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary
+			 * point.
+			 */
+			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+			if (tp->snd_cwnd < tp->snd_ssthresh) {
+				/* Slow start.  */
+				if (diff > sysctl_tcp_vegas_gamma) {
+					/* Going too fast. Time to slow down
+					 * and switch to congestion avoidance.
+					 */
+					tp->snd_ssthresh = 2;
+
+					/* Set cwnd to match the actual rate
+					 * exactly:
+					 *   cwnd = (actual rate) * baseRTT
+					 * Then we add 1 because the integer
+					 * truncation robs us of full link
+					 * utilization.
+					 */
+					tp->snd_cwnd = min(tp->snd_cwnd,
+							   (target_cwnd >>
+							    V_PARAM_SHIFT)+1);
+
+				}
+			} else {
+				/* Congestion avoidance. */
+				u32 next_snd_cwnd;
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > sysctl_tcp_vegas_beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					next_snd_cwnd = old_snd_cwnd - 1;
+				} else if (diff < sysctl_tcp_vegas_alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					next_snd_cwnd = old_snd_cwnd + 1;
+				} else {
+					/* Sending just as fast as we
+					 * should be.
+					 */
+					next_snd_cwnd = old_snd_cwnd;
+				}
+
+				/* Adjust cwnd upward or downward, toward the
+				 * desired value.
+				 */
+				if (next_snd_cwnd > tp->snd_cwnd)
+					tp->snd_cwnd++;
+				else if (next_snd_cwnd < tp->snd_cwnd)
+					tp->snd_cwnd--;
+			}
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		tp->vegas.cntRTT = 0;
+		tp->vegas.minRTT = 0x7fffffff;
+	}
+
+	/* The following code is executed for every ack we receive,
+	 * except for conditions checked in should_advance_cwnd()
+	 * before the call to tcp_cong_avoid(). Mainly this means that
+	 * we only execute this code if the ack actually acked some
+	 * data.
+	 */
+
+	/* If we are in slow start, increase our cwnd in response to this ACK.
+	 * (If we are not in slow start then we are in congestion avoidance,
+	 * and adjust our congestion window only once per RTT. See the code
+	 * above.)
+	 */
+	if (tp->snd_cwnd <= tp->snd_ssthresh) 
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound */
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static inline void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
+{
+	if (tcp_vegas_enabled(tp))
+		vegas_cong_avoid(tp, ack, seq_rtt);
+	else
+		reno_cong_avoid(tp);
+}
+
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
@@ -1848,7 +2132,7 @@ static __inline__ void tcp_ack_packets_o
 }
 
 /* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 {
 	struct tcp_opt *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -1934,6 +2218,7 @@ static int tcp_clean_rtx_queue(struct so
 		}
 	}
 #endif
+	*seq_rtt_p = seq_rtt;
 	return acked;
 }
 
@@ -2294,6 +2579,7 @@ static int tcp_ack(struct sock *sk, stru
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	u32 prior_in_flight;
+	s32 seq_rtt;
 	int prior_packets;
 
 	/* If the ack is newer than sent or older than previous acks
@@ -2345,7 +2631,7 @@ static int tcp_ack(struct sock *sk, stru
 	prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk);
+	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
 
 	if (tp->frto_counter)
 		tcp_process_frto(sk, prior_snd_una);
@@ -2353,13 +2639,14 @@ static int tcp_ack(struct sock *sk, stru
 	if (tcp_ack_is_dubious(tp, flag)) {
 		/* Advanve CWND, if state allows this. */
 		if ((flag & FLAG_DATA_ACKED) &&
-		    prior_in_flight >= tp->snd_cwnd &&
+		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
 		    tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp);
+			tcp_cong_avoid(tp, ack, seq_rtt);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
-		if ((flag & FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd)
-			tcp_cong_avoid(tp);
+		if ((flag & FLAG_DATA_ACKED) && 
+		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
+			tcp_cong_avoid(tp, ack, seq_rtt);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
diff -urNp -X dontdiff linux-2.6/net/ipv4/tcp_minisocks.c tcp-vegas-2.6/net/ipv4/tcp_minisocks.c
--- linux-2.6/net/ipv4/tcp_minisocks.c	2004-03-01 08:55:47.000000000 -0800
+++ tcp-vegas-2.6/net/ipv4/tcp_minisocks.c	2004-03-08 09:33:16.000000000 -0800
@@ -769,7 +769,7 @@ struct sock *tcp_create_openreq_child(st
 		newtp->frto_counter = 0;
 		newtp->frto_highmark = 0;
 
-		newtp->ca_state = TCP_CA_Open;
+		tcp_set_ca_state(newtp, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->send_head = NULL;
@@ -841,6 +841,8 @@ struct sock *tcp_create_openreq_child(st
 		if (newtp->ecn_flags&TCP_ECN_OK)
 			newsk->sk_no_largesend = 1;
 
+		tcp_vegas_init(newtp);
+
 		TCP_INC_STATS_BH(TcpPassiveOpens);
 	}
 	return newsk;
diff -urNp -X dontdiff linux-2.6/net/ipv4/tcp_output.c tcp-vegas-2.6/net/ipv4/tcp_output.c
--- linux-2.6/net/ipv4/tcp_output.c	2004-01-23 09:39:28.000000000 -0800
+++ tcp-vegas-2.6/net/ipv4/tcp_output.c	2004-03-09 14:05:56.000000000 -0800
@@ -105,7 +105,9 @@ static void tcp_cwnd_restart(struct tcp_
 	s32 delta = tcp_time_stamp - tp->lsndtime;
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 	u32 cwnd = tp->snd_cwnd;
-
+	
+	if (tcp_is_vegas(tp)) 
+		tcp_vegas_enable(tp);
 	tp->snd_ssthresh = tcp_current_ssthresh(tp);
 	restart_cwnd = min(restart_cwnd, cwnd);
 
@@ -225,6 +227,19 @@ int tcp_transmit_skb(struct sock *sk, st
 			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
 					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
 		}
+		
+		/*
+		 * If the connection is idle and we are restarting,
+		 * then we don't want to do any Vegas calculations
+		 * until we get fresh RTT samples.  So when we
+		 * restart, we reset our Vegas state to a clean
+		 * slate. After we get acks for this flight of
+		 * packets, _then_ we can make Vegas calculations
+		 * again.
+		 */
+		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
+			tcp_vegas_enable(tp);
+
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
 		skb_set_owner_w(skb, sk);
@@ -869,7 +884,7 @@ void tcp_simple_retransmit(struct sock *
 		tp->snd_ssthresh = tcp_current_ssthresh(tp);
 		tp->prior_ssthresh = 0;
 		tp->undo_marker = 0;
-		tp->ca_state = TCP_CA_Loss;
+		tcp_set_ca_state(tp, TCP_CA_Loss);
 	}
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -1268,6 +1283,7 @@ static inline void tcp_connect_init(stru
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(sk);
+	tcp_vegas_init(tp);
 
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1318,6 +1334,7 @@ int tcp_connect(struct sock *sk)
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
 	tp->snd_nxt = tp->write_seq;
 	tp->pushed_seq = tp->write_seq;
+	tcp_vegas_init(tp);
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-12 23:17 [RFC] Vegas and tcp parameters per route Stephen Hemminger
@ 2004-03-13  6:41 ` Pasi Sarolahti
  2004-03-15 17:20   ` Stephen Hemminger
  2004-03-15 17:38 ` YOSHIFUJI Hideaki / 吉藤英明
  1 sibling, 1 reply; 8+ messages in thread
From: Pasi Sarolahti @ 2004-03-13  6:41 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev, linux-net

Hi Stephen,

On Sat, 2004-03-13 at 01:17, Stephen Hemminger wrote:
> This is the second more complete version of TCP Vegas that allows setting
> the options based on route.  Reused the RTAX_FEATURE metric which got
> defined but never used, to provide the ability to select vegas, westwood,
> and/or frto per route.
> 
> There is a modified version of iproute2 available at
> 	http://developer.osdl.org/shemminger/tcp/iproute2-exp.tar.bz2
> With this it is possible to set up options per route with ip.
> 	ip route add to 10.0.0.1 features vegas/frto

I would still like to keep the good old sysctl interface (well, at least
for tcp_frto), because I believe it is more familiar and easier to use
for many and not dependent on external tools.

Would it be too complicated if the sysctl gave the global default,
from which one could deviate using RTAX_FEATURES?

- Pasi



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-13  6:41 ` Pasi Sarolahti
@ 2004-03-15 17:20   ` Stephen Hemminger
  2004-03-15 21:22     ` David S. Miller
  0 siblings, 1 reply; 8+ messages in thread
From: Stephen Hemminger @ 2004-03-15 17:20 UTC (permalink / raw)
  To: Pasi Sarolahti; +Cc: David S. Miller, netdev, linux-net

On Sat, 13 Mar 2004 08:41:05 +0200
Pasi Sarolahti <pasi.sarolahti@iki.fi> wrote:

> Hi Stephen,
> 
> On Sat, 2004-03-13 at 01:17, Stephen Hemminger wrote:
> > This is the second more complete version of TCP Vegas that allows setting
> > the options based on route.  Reused the RTAX_FEATURE metric which got
> > defined but never used, to provide the ability to select vegas, westwood,
> > and/or frto per route.
> > 
> > There is an modified version of iproute2 available at
> > 	http://developer.osdl.org/shemminger/tcp/iproute2-exp.tar.bz2
> > With this it is possible to setup options per route with ip.
> > 	ip route add to 10.0.0.1 features vegas/frto
> 
> I would still like to keep the good old sysctl interface (well, at least
> for tcp_frto), because I'd belive it is more familiar and easier to use
> for many and not dependent of external tools.
> 
> Would it be too complicated, if sysctl would give the global default,
> from which one can deviate using RTAX_FEATURE?

Maybe for frto it needs to stay, but sysctls are more painful and complex
than keeping the stuff in the routing info.  Also, the external tools are
part of every distro; except for a few embedded systems, the networking code
depends on user tools already.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-12 23:17 [RFC] Vegas and tcp parameters per route Stephen Hemminger
  2004-03-13  6:41 ` Pasi Sarolahti
@ 2004-03-15 17:38 ` YOSHIFUJI Hideaki / 吉藤英明
  1 sibling, 0 replies; 8+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-03-15 17:38 UTC (permalink / raw)
  To: shemminger; +Cc: davem, netdev, linux-net, yoshfuji

In article <20040312151729.25d9c696@dell_ss3.pdx.osdl.net> (at Fri, 12 Mar 2004 15:17:29 -0800), Stephen Hemminger <shemminger@osdl.org> says:

> diff -urNp -X dontdiff linux-2.6/include/linux/sysctl.h tcp-vegas-2.6/include/linux/sysctl.h
> --- linux-2.6/include/linux/sysctl.h	2004-03-09 16:24:23.000000000 -0800
> +++ tcp-vegas-2.6/include/linux/sysctl.h	2004-03-12 14:20:25.000000000 -0800
> @@ -317,11 +317,12 @@ enum
>  	NET_IPV4_ICMP_RATELIMIT=89,
>  	NET_IPV4_ICMP_RATEMASK=90,
>  	NET_TCP_TW_REUSE=91,
> -	NET_TCP_FRTO=92,
> -	NET_TCP_LOW_LATENCY=93,
> -	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
> -	NET_TCP_WESTWOOD=95,
> -	NET_IPV4_IGMP_MAX_MSF=96,
> +	NET_TCP_LOW_LATENCY=92,
> +	NET_IPV4_IPFRAG_SECRET_INTERVAL=93,
> +	NET_IPV4_IGMP_MAX_MSF=94,
> +	NET_TCP_VEGAS_ALPHA=95,
> +	NET_TCP_VEGAS_BETA=96,
> +	NET_TCP_VEGAS_GAMMA=97,
>  };
>  

Please do not change values for NET_IPV4_IPFRAG_SECRET_INTERVAL etc.

-- 
Hideaki YOSHIFUJI @ USAGI Project <yoshfuji@linux-ipv6.org>
GPG FP: 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-15 17:20   ` Stephen Hemminger
@ 2004-03-15 21:22     ` David S. Miller
  2004-03-15 21:29       ` Andi Kleen
  2004-03-15 21:30       ` Stephen Hemminger
  0 siblings, 2 replies; 8+ messages in thread
From: David S. Miller @ 2004-03-15 21:22 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: pasi.sarolahti, netdev, linux-net

On Mon, 15 Mar 2004 09:20:18 -0800
Stephen Hemminger <shemminger@osdl.org> wrote:

> > Would it be too complicated, if sysctl would give the global default,
> > from which one can deviate using RTAX_FEATURE?
> 
> Maybe for frto it needs to stay, but sysctl's are more painful and complex
> than keeping the stuff in the routing info.  Also, the external tools are
> part of every distro, except for a few embedded systems, the networking code
> depends on user tools already.

How do you propose to support some kind of "global enable" for features?

I think sysctl's support this quite well.  The test for the feature
becomes "sysctl || route_attribute".

Also, as Yoshfuji stated, you absolutely cannot change the existing
sysctl numbers, as tools that use the sysctl() system call use those
numbers explicitly; thus they are compiled into applications.

I really am not going to consider something that removes existing
sysctl tunables. :-)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-15 21:22     ` David S. Miller
@ 2004-03-15 21:29       ` Andi Kleen
  2004-03-15 21:30       ` Stephen Hemminger
  1 sibling, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2004-03-15 21:29 UTC (permalink / raw)
  To: David S. Miller; +Cc: Stephen Hemminger, pasi.sarolahti, netdev, linux-net

> Also, as Yoshfuji stated, you absolutely cannot change the existing
> sysctl numbers as tools that use the sysctl() system call use those
> numbers explicitly thus they are compiled into applications.

Actually it is fine these days. Numerical sysctl is deprecated
and should be removed soon.

-Andi

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-15 21:22     ` David S. Miller
  2004-03-15 21:29       ` Andi Kleen
@ 2004-03-15 21:30       ` Stephen Hemminger
  2004-03-15 21:37         ` David S. Miller
  1 sibling, 1 reply; 8+ messages in thread
From: Stephen Hemminger @ 2004-03-15 21:30 UTC (permalink / raw)
  To: David S. Miller; +Cc: pasi.sarolahti, netdev, linux-net

On Mon, 15 Mar 2004 13:22:14 -0800
"David S. Miller" <davem@redhat.com> wrote:

> On Mon, 15 Mar 2004 09:20:18 -0800
> Stephen Hemminger <shemminger@osdl.org> wrote:
> 
> > > Would it be too complicated, if sysctl would give the global default,
> > > from which one can deviate using RTAX_FEATURE?
> > 
> > Maybe for frto it needs to stay, but sysctl's are more painful and complex
> > than keeping the stuff in the routing info.  Also, the external tools are
> > part of every distro, except for a few embedded systems, the networking code
> > depends on user tools already.
> 
> How do you propose to support some kind of "global enable" for features.

The easiest way to do that is to initialize each TP with features from
sysctl when created.

> I think sysctl's support this quite well.  The test for the feature
> becomes "sysctl || route_attribute".

That is what the next version does for FRTO.

> Also, as Yoshfuji stated, you absolutely cannot change the existing
> sysctl numbers as tools that use the sysctl() system call use those
> numbers explicitly thus they are compiled into applications.

Okay, what about WESTWOOD?


> I really am not going to consider something that removes existing
> sysctl tunables. :-)

What about tcp_westwood which is new?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC] Vegas and tcp parameters per route
  2004-03-15 21:30       ` Stephen Hemminger
@ 2004-03-15 21:37         ` David S. Miller
  0 siblings, 0 replies; 8+ messages in thread
From: David S. Miller @ 2004-03-15 21:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: pasi.sarolahti, netdev, linux-net

On Mon, 15 Mar 2004 13:30:34 -0800
Stephen Hemminger <shemminger@osdl.org> wrote:

> > How do you propose to support some kind of "global enable" for features.
> 
> The easiest way to do that is to initialize each TP with features from
> sysctl when created.

Right.

> > I think sysctl's support this quite well.  The test for the feature
> > becomes "sysctl || route_attribute".
> 
> That is what the next version does for FRTO.

OK, but do not limit it to FRTO; I think all TCP features should
be handleable this way.

Actually, this is a problem with using an RTAX_* that is a bitmask
or single binary.  You need a "don't care" value, and thus effectively
a trinary route entry state to do this properly.  Right?

In this way, you have a global default, but you can also FORCE something
off per-route.  People can work-around ECN-fux0red sites without obviating
ECN completely.

Or perhaps you can come up with another method by which to achieve this?

> Okay, what about WESTWOOD?
 ...
> What about tcp_westwood which is new?

Just don't change sysctl numbers; we even keep them perfectly sync'd between
2.6.x and 2.4.x sources.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2004-03-15 21:37 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2004-03-12 23:17 [RFC] Vegas and tcp parameters per route Stephen Hemminger
2004-03-13  6:41 ` Pasi Sarolahti
2004-03-15 17:20   ` Stephen Hemminger
2004-03-15 21:22     ` David S. Miller
2004-03-15 21:29       ` Andi Kleen
2004-03-15 21:30       ` Stephen Hemminger
2004-03-15 21:37         ` David S. Miller
2004-03-15 17:38 ` YOSHIFUJI Hideaki / 吉藤英明

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).