This patch makes selection of congestion control algorithm simpler by using
a single sysctl for that purpose, rather than a cascade of sysctls.

The patch also does some minor cleanups to avoid cascade actions between
algorithms so that flow control is cleaner.

Possible improvements:
- Use a string when reading/writing from sysctl to make it more friendly
  to humans
- And/Or, provide a list of all available congestion control algorithms

The patch is against 2.6.11-rc4-bk9.

Signed-off-by: Yee-Ting Li
Signed-off-by: Baruch Even

Index: 2.6.11-select/include/linux/sysctl.h
===================================================================
--- 2.6.11-select.orig/include/linux/sysctl.h
+++ 2.6.11-select/include/linux/sysctl.h
@@ -344,6 +344,7 @@ enum
         NET_TCP_DEFAULT_WIN_SCALE=105,
         NET_TCP_MODERATE_RCVBUF=106,
         NET_TCP_TSO_WIN_DIVISOR=107,
+        NET_TCP_ADV_CONG=108,
 };
 
 enum {
Index: 2.6.11-select/include/net/tcp.h
===================================================================
--- 2.6.11-select.orig/include/net/tcp.h
+++ 2.6.11-select/include/net/tcp.h
@@ -597,13 +597,11 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
 extern int sysctl_tcp_vegas_alpha;
 extern int sysctl_tcp_vegas_beta;
 extern int sysctl_tcp_vegas_gamma;
 extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
+extern int sysctl_tcp_adv_cong;
 extern int sysctl_tcp_bic_fast_convergence;
 extern int sysctl_tcp_bic_low_window;
 extern int sysctl_tcp_moderate_rcvbuf;
@@ -1241,7 +1239,8 @@ static __inline__ unsigned int tcp_packe
  */
 static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
 {
-        if (tcp_is_bic(tp)) {
+        switch (tp->adv_cong) {
+        case TCP_BIC:
                 if (sysctl_tcp_bic_fast_convergence &&
                     tp->snd_cwnd < tp->bictcp.last_max_cwnd)
                         tp->bictcp.last_max_cwnd
@@ -1253,9 +1252,11 @@ static inline __u32 tcp_recalc_ssthresh(
                 if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
                         return max(tp->snd_cwnd - (tp->snd_cwnd/BICTCP_1_OVER_BETA),
                                    2U);
-        }
+                break;
 
-        return max(tp->snd_cwnd >> 1U, 2U);
+        default:
+                return max(tp->snd_cwnd >> 1U, 2U);
+        }
 }
 
 /* Stop taking Vegas samples for now. */
@@ -1980,24 +1981,19 @@ static inline void tcp_westwood_update_r
         tp->westwood.rtt = rtt_seq;
 }
 
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
+static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
 {
         return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
                    (__u32) (tp->mss_cache_std),
                    2U);
 }
 
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-        return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
 static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
 {
         __u32 ssthresh = 0;
 
         if (tcp_is_westwood(tp)) {
-                ssthresh = __tcp_westwood_bw_rttmin(tp);
+                ssthresh = tcp_westwood_bw_rttmin(tp);
                 if (ssthresh)
                         tp->snd_ssthresh = ssthresh;
         }
@@ -2010,7 +2006,7 @@ static inline int tcp_westwood_cwnd(stru
         __u32 cwnd = 0;
 
         if (tcp_is_westwood(tp)) {
-                cwnd = __tcp_westwood_bw_rttmin(tp);
+                cwnd = tcp_westwood_bw_rttmin(tp);
                 if (cwnd)
                         tp->snd_cwnd = cwnd;
         }
Index: 2.6.11-select/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- 2.6.11-select.orig/net/ipv4/sysctl_net_ipv4.c
+++ 2.6.11-select/net/ipv4/sysctl_net_ipv4.c
@@ -602,22 +602,14 @@ ctl_table ipv4_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
-        {
-                .ctl_name       = NET_TCP_WESTWOOD,
-                .procname       = "tcp_westwood",
-                .data           = &sysctl_tcp_westwood,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-        {
-                .ctl_name       = NET_TCP_VEGAS,
-                .procname       = "tcp_vegas_cong_avoid",
-                .data           = &sysctl_tcp_vegas_cong_avoid,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
+        {
+                .ctl_name       = NET_TCP_ADV_CONG,
+                .procname       = "tcp_adv_cong",
+                .data           = &sysctl_tcp_adv_cong,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
         {
                 .ctl_name       = NET_TCP_VEGAS_ALPHA,
                 .procname       = "tcp_vegas_alpha",
@@ -643,14 +635,6 @@ ctl_table ipv4_table[] = {
                 .proc_handler   = &proc_dointvec,
         },
         {
-                .ctl_name       = NET_TCP_BIC,
-                .procname       = "tcp_bic",
-                .data           = &sysctl_tcp_bic,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-        {
                 .ctl_name       = NET_TCP_BIC_FAST_CONVERGENCE,
                 .procname       = "tcp_bic_fast_convergence",
                 .data           = &sysctl_tcp_bic_fast_convergence,
Index: 2.6.11-select/net/ipv4/tcp_input.c
===================================================================
--- 2.6.11-select.orig/net/ipv4/tcp_input.c
+++ 2.6.11-select/net/ipv4/tcp_input.c
@@ -87,8 +87,6 @@ int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
-int sysctl_tcp_westwood;
-int sysctl_tcp_vegas_cong_avoid;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
 
@@ -99,10 +97,11 @@ int sysctl_tcp_moderate_rcvbuf = 1;
 int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
 int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
 int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
 
-int sysctl_tcp_bic = 1;
+int sysctl_tcp_adv_cong = TCP_BIC;
+
 int sysctl_tcp_bic_fast_convergence = 1;
 int sysctl_tcp_bic_low_window = 14;
 
@@ -556,15 +555,18 @@ void tcp_ca_init(struct tcp_sock *tp)
 
 void tcp_ca_init(struct tcp_sock *tp)
 {
-        if (sysctl_tcp_westwood)
-                tp->adv_cong = TCP_WESTWOOD;
-        else if (sysctl_tcp_bic)
-                tp->adv_cong = TCP_BIC;
-        else if (sysctl_tcp_vegas_cong_avoid) {
-                tp->adv_cong = TCP_VEGAS;
-                tp->vegas.baseRTT = 0x7fffffff;
-                tcp_vegas_enable(tp);
-        }
+        switch (sysctl_tcp_adv_cong) {
+        case TCP_VEGAS:
+                tp->vegas.baseRTT = 0x7fffffff;
+                tcp_vegas_enable(tp);
+                /* Fallthrough */
+        case TCP_BIC:
+        case TCP_WESTWOOD:
+                tp->adv_cong = sysctl_tcp_adv_cong;
+                break;
+        default:
+                tp->adv_cong = TCP_RENO;
+        }
 }
 
 /* Do RTT sampling needed for Vegas.
@@ -1600,18 +1602,25 @@ static void tcp_cwnd_down(struct tcp_soc
         int decr = tp->snd_cwnd_cnt + 1;
         __u32 limit;
 
-        /*
-         * TCP Westwood
-         * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
-         * in packets we use mss_cache). If sysctl_tcp_westwood is off
-         * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
-         * still used as usual. It prevents other strange cases in which
-         * BWE*RTTmin could assume value 0. It should not happen but...
-         */
+        switch (tp->adv_cong) {
+        case TCP_WESTWOOD:
+                /*
+                 * TCP Westwood
+                 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
+                 * in packets we use mss_cache). The guard is against
+                 * strange cases in which BWE*RTTmin could assume value
+                 * 0. It should not happen but...
+                 */
-        if (!(limit = tcp_westwood_bw_rttmin(tp)))
-                limit = tp->snd_ssthresh/2;
+                if (!(limit = tcp_westwood_bw_rttmin(tp)))
+                        limit = tp->snd_ssthresh/2;
+                break;
+        default:
+                limit = tp->snd_ssthresh/2;
+                break;
+        }
+
         tp->snd_cwnd_cnt = decr&1;
         decr >>= 1;
@@ -2014,6 +2023,27 @@ static inline void tcp_ack_update_rtt(st
                 tcp_ack_no_tstamp(tp, seq_rtt, flag);
 }
 
+static inline void tcp_slow_start(struct tcp_sock *tp)
+{
+        /* In "safe" area, increase. */
+        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                tp->snd_cwnd++;
+}
+
+static inline void tcp_increase_cwnd(struct tcp_sock *tp, __u32 window)
+{
+        /* In dangerous area, increase slowly.
+         * In theory, for standard tcp, this is tp->snd_cwnd += 1 / window
+         * (snd_cwnd for Reno)
+         */
+        if (tp->snd_cwnd_cnt >= window) {
+                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                        tp->snd_cwnd++;
+                tp->snd_cwnd_cnt = 0;
+        } else
+                tp->snd_cwnd_cnt++;
+}
+
 /*
  * Compute congestion window to use.
  *
@@ -2029,10 +2059,6 @@ static inline void tcp_ack_update_rtt(st
  */
 static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
 {
-        /* orignal Reno behaviour */
-        if (!tcp_is_bic(tp))
-                return tp->snd_cwnd;
-
         if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
             (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
                 return tp->bictcp.cnt;
@@ -2080,23 +2106,13 @@ static inline __u32 bictcp_cwnd(struct t
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-static inline void reno_cong_avoid(struct tcp_sock *tp)
+static inline void reno_cong_avoid(struct tcp_sock *tp, u32 snd_cwnd)
 {
-        if (tp->snd_cwnd <= tp->snd_ssthresh) {
-                /* In "safe" area, increase. */
-                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-                        tp->snd_cwnd++;
-        } else {
-                /* In dangerous area, increase slowly.
-                 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-                 */
-                if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
-                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-                                tp->snd_cwnd++;
-                        tp->snd_cwnd_cnt=0;
-                } else
-                        tp->snd_cwnd_cnt++;
-        }
+        if (tp->snd_cwnd <= tp->snd_ssthresh)
+                tcp_slow_start(tp);
+        else
+                tcp_increase_cwnd(tp, snd_cwnd);
+
         tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -2324,10 +2340,22 @@ static void vegas_cong_avoid(struct tcp_
 
 static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
 {
-        if (tcp_vegas_enabled(tp))
-                vegas_cong_avoid(tp, ack, seq_rtt);
-        else
-                reno_cong_avoid(tp);
+        if (tp->snd_cwnd >= tp->snd_cwnd_clamp)
+                return;
+
+        switch (sysctl_tcp_adv_cong) {
+        case TCP_VEGAS:
+                vegas_cong_avoid(tp, ack, seq_rtt);
+                break;
+
+        case TCP_BIC:
+                reno_cong_avoid(tp, bictcp_cwnd(tp));
+                break;
+
+        default:
+                reno_cong_avoid(tp, tp->snd_cwnd);
+                break;
+        }
 }
 
 /* Restart timer after forward progress on connection.