From mboxrd@z Thu Jan 1 00:00:00 1970 From: Lawrence Brakmo Subject: Re: [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control Date: Fri, 24 Jul 2015 00:47:36 +0000 Message-ID: References: <1437538896-1330704-1-git-send-email-brakmo@fb.com> <1437538896-1330704-4-git-send-email-brakmo@fb.com> Mime-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: netdev , Kernel Team , "Neal Cardwell" , Eric Dumazet To: Yuchung Cheng Return-path: Received: from mx0a-00082601.pphosted.com ([67.231.145.42]:39473 "EHLO mx0a-00082601.pphosted.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752694AbbGXArn convert rfc822-to-8bit (ORCPT ); Thu, 23 Jul 2015 20:47:43 -0400 In-Reply-To: Content-Language: en-US Content-ID: Sender: netdev-owner@vger.kernel.org List-ID: Thank you all for your comments, I'm currently testing the changes. Other comments inline. On 7/21/15, 11:50 PM, "Yuchung Cheng" wrote: >On Tue, Jul 21, 2015 at 9:21 PM, Lawrence Brakmo wrote= : >> This is a request for comments. >> >> TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier versio= n of >> NV was presented at 2010's LPC (slides). It is a delayed based >> congestion avoidance for the data center. This version has been test= ed >> within a 10G rack where the HW RTTs are 20-50us. >> >> A description of TCP-NV, including implementation and experimental >> results, can be found at: >>=20 >>https://urldefense.proofpoint.com/v1/url?u=3Dhttp://www.brakmo.org/ne= tworki >>ng/tcp-nv/TCPNV.html&k=3DZVNjlDMF0FElm4dQtryO4A%3D%3D%0A&r=3Dm30SgjN0= 7T%2FK%2 >>FdV1ZIt1iA%3D%3D%0A&m=3DXeELWxnafKynbNgkHg6RW%2F85hv1bPWlufUn2Dh4cOH4= %3D%0A >>&s=3D0029c47e62d84d6ffd22bd33e1895a3f61eaa21d88cbfb553aa1df780bbbdcf9 >> >> The current version includes many module parameters to support >> experimentation with the parameters. 
>> >> Signed-off-by: Lawrence Brakmo >> --- >> include/net/tcp.h | 1 + >> net/ipv4/Kconfig | 16 ++ >> net/ipv4/Makefile | 1 + >> net/ipv4/sysctl_net_ipv4.c | 9 + >> net/ipv4/tcp_input.c | 2 + >> net/ipv4/tcp_nv.c | 479 >>+++++++++++++++++++++++++++++++++++++++++++++ >> 6 files changed, 508 insertions(+) >> create mode 100644 net/ipv4/tcp_nv.c >> >> diff --git a/include/net/tcp.h b/include/net/tcp.h >> index 2e62efe..c0690ae 100644 >> --- a/include/net/tcp.h >> +++ b/include/net/tcp.h >> @@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat; >> extern int sysctl_tcp_min_tso_segs; >> extern int sysctl_tcp_autocorking; >> extern int sysctl_tcp_invalid_ratelimit; >> +extern int sysctl_tcp_nv_enable; >> >> extern atomic_long_t tcp_memory_allocated; >> extern struct percpu_counter tcp_sockets_allocated; >> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig >> index 6fb3c90..c37b374 100644 >> --- a/net/ipv4/Kconfig >> +++ b/net/ipv4/Kconfig >> @@ -539,6 +539,22 @@ config TCP_CONG_VEGAS >> window. TCP Vegas should provide less packet loss, but it is >> not as aggressive as TCP Reno. >> >> +config TCP_CONG_NV >> + tristate "TCP NV" >> + default m >> + ---help--- >> + TCP NV is a follow up to TCP Vegas. It has been modified to >>deal with >> + 10G networks, measurement noise introduced by LRO, GRO and >>interrupt >> + coalescence. In addition, it will decrease its cwnd >>multiplicative >multiplicatively > >> + instead of linearly. >> + >> + Note that in general congestion avoidance (cwnd decreased wh= en >># packets >> + queued grows) cannot coexist with congestion control (cwnd >>decreased only >> + when there is packet loss) due to fairness issues. One scena= rio >>when the >s/the/they >> + can coexist safely is when the CA flows have RTTs << CC flow= s >>RTTs. 
>> + >> + For further details see >>https://urldefense.proofpoint.com/v1/url?u=3Dhttp://www.brakmo.org/ne= tworki >>ng/tcp-nv/&k=3DZVNjlDMF0FElm4dQtryO4A%3D%3D%0A&r=3Dm30SgjN07T%2FK%2Fd= V1ZIt1iA >>%3D%3D%0A&m=3DXeELWxnafKynbNgkHg6RW%2F85hv1bPWlufUn2Dh4cOH4%3D%0A&s=3D= 3441162 >>a0eefcad01003dbf0ba478e00a2080f76cd460eaf12213eb74f2eedbd >> + >> config TCP_CONG_SCALABLE >> tristate "Scalable TCP" >> default n >> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile >> index efc43f3..06f335f 100644 >> --- a/net/ipv4/Makefile >> +++ b/net/ipv4/Makefile >> @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) +=3D tcp_highspeed.o >> obj-$(CONFIG_TCP_CONG_HYBLA) +=3D tcp_hybla.o >> obj-$(CONFIG_TCP_CONG_HTCP) +=3D tcp_htcp.o >> obj-$(CONFIG_TCP_CONG_VEGAS) +=3D tcp_vegas.o >> +obj-$(CONFIG_TCP_CONG_NV) +=3D tcp_nv.o >> obj-$(CONFIG_TCP_CONG_VENO) +=3D tcp_veno.o >> obj-$(CONFIG_TCP_CONG_SCALABLE) +=3D tcp_scalable.o >> obj-$(CONFIG_TCP_CONG_LP) +=3D tcp_lp.o >> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c >> index 433231c..31846d5 100644 >> --- a/net/ipv4/sysctl_net_ipv4.c >> +++ b/net/ipv4/sysctl_net_ipv4.c >> @@ -730,6 +730,15 @@ static struct ctl_table ipv4_table[] =3D { >> .proc_handler =3D proc_dointvec_ms_jiffies, >> }, >> { >> + .procname =3D "tcp_nv_enable", >> + .data =3D &sysctl_tcp_nv_enable, >> + .maxlen =3D sizeof(int), >> + .mode =3D 0644, >> + .proc_handler =3D proc_dointvec_minmax, >> + .extra1 =3D &zero, >> + .extra2 =3D &one, >> + }, >> + { >> .procname =3D "icmp_msgs_per_sec", >> .data =3D &sysctl_icmp_msgs_per_sec, >> .maxlen =3D sizeof(int), >> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c >> index aca4ae5..87560d9 100644 >> --- a/net/ipv4/tcp_input.c >> +++ b/net/ipv4/tcp_input.c >> @@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly; >> int sysctl_tcp_moderate_rcvbuf __read_mostly =3D 1; >> int sysctl_tcp_early_retrans __read_mostly =3D 3; >> int sysctl_tcp_invalid_ratelimit __read_mostly =3D HZ/2; >> +int 
sysctl_tcp_nv_enable __read_mostly =3D 1; >> +EXPORT_SYMBOL(sysctl_tcp_nv_enable); >> >> #define FLAG_DATA 0x01 /* Incoming frame contained dat= a. >> */ >> #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a >>window update. */ >> diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c >> new file mode 100644 >> index 0000000..af451b6 >> --- /dev/null >> +++ b/net/ipv4/tcp_nv.c >> @@ -0,0 +1,479 @@ >> +/* >> + * TCP NV: TCP with Congestion Avoidance >> + * >> + * TCP-NV is a successor of TCP-Vegas that has been developed to >> + * deal with the issues that occur in modern networks. >> + * Like TCP-Vegas, TCP-NV supports true congestion avoidance, >> + * the ability to detect congestion before packet losses occur. >> + * When congestion (queue buildup) starts to occur, TCP-NV >> + * predicts what the cwnd size should be for the current >> + * throughput and it reduces the cwnd proportionally to >> + * the difference between the current cwnd and the predicted cwnd. >> + * TCP-NV behaves like Reno when no congestion is detected, or when >> + * recovering from packet losses. >what happens when loss is not related to congestion? This version makes no distinction. I will update a patch later (after I've done enough testing) that adds the option of not decreasing cwnd in some cases (such as no congestion detected) as long as they are reasonable. >> + * >> + * TODO: >> + * 1) Modify the behavior so cwnd can grow faster under certain >>conditions >> + * 2) Add mechanism to deal with reverse congestion. 
>> + */ >> + >> +#include >> +#include >> +#include >> +#include >> +#include >> + >> +/* TCP NV parameters */ >> +static int nv_pad __read_mostly =3D 10; >> +static int nv_pad_buffer __read_mostly =3D 2; >> +static int nv_reset_period __read_mostly =3D 5; >> +static int nv_min_cwnd =3D 10; >> +static int nv_dec_eval_min_calls =3D 100; >> +static int nv_ssthresh_eval_min_calls =3D 30; >> +static int nv_rtt_min_cnt =3D 2; >> +static int nv_cong_decrease_mult =3D 30*128/100; >> +static int nv_ssthresh_factor =3D 8; >> +static int nv_rtt_factor =3D 128; >> +static int nv_rtt_cnt_dec_delta =3D 20; /* dec cwnd by this many RT= Ts */ >> +static int nv_dec_factor =3D 5; /* actual value is factor/8 */ >> +static int nv_loss_dec_factor =3D 820; /* on loss reduce cwnd by 20= % */ >> +static int nv_cwnd_growth_factor =3D 2; /* larger =3D> cwnd grows s= lower */ >> + >> +module_param(nv_pad, int, 0644); >> +MODULE_PARM_DESC(nv_pad, "extra packets above congestion level"); >> +module_param(nv_pad_buffer, int, 0644); >> +MODULE_PARM_DESC(nv_pad_buffer, "no growth buffer zone"); >> +module_param(nv_reset_period, int, 0644); >> +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)")= ; >> +module_param(nv_min_cwnd, int, 0644); >> +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this >>value" >> + " without losses"); >> +module_param(nv_dec_eval_min_calls, int, 0644); >> +MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data >>points " >> + "before declaring congestion (< 256)"); >> +module_param(nv_ssthresh_eval_min_calls, int, 0644); >> +MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many da= ta >>points " >> + "before declaring congestion during initial >>slow-start"); >> +module_param(nv_rtt_min_cnt, int, 0644); >> +MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before >>declaring" >> + " congestion (<64)"); >> +module_param(nv_cong_decrease_mult, int, 0644); >> +MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion 
decrease factor= "); >> +module_param(nv_ssthresh_factor, int, 0644); >> +MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor"); >> +module_param(nv_rtt_factor, int, 0644); >> +MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)"); >> +module_param(nv_rtt_cnt_dec_delta, int, 0644); >> +MODULE_PARM_DESC(nv_rtt_cnt_dec_delta, "decrease cwnd for this many >>RTTs " >> + "every 100 RTTs"); >> +module_param(nv_dec_factor, int, 0644); >> +MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTS by >>factor/8"); >> +module_param(nv_loss_dec_factor, int, 0644); >> +MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd =3D cwnd * t= his / >>1024"); >> +module_param(nv_cwnd_growth_factor, int, 0644); >> +MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger =3D> cwnd grows slo= wer"); >> + >> +/* TCP NV Parameters */ >> +struct tcpnv { >> + unsigned long nv_min_rtt_reset_jiffies; /* when to switch t= o >> + * nv_min_rtt_new *= / >> + u32 cnt; /* increase cwnd by 1 after ACKs */ >> + u32 loss_cwnd; /* cwnd at last loss */ >> + u8 nv_enable:1, >> + nv_allow_cwnd_growth:1, /* whether cwnd can >>grow */ >> + nv_rtt_cnt:6; /* RTTs without making ca decision *= / >> + u8 nv_eval_call_cnt;/* call count since last eval */ >> + u8 nv_min_cwnd; /* nv won't make a ca decision if cw= nd >>is >> + * smaller than this. It may grow to >>handle >> + * TSO, LRO and interrupt coalescenc= e >>because >> + * with these a small cwnd cannot >>saturate >> + * the link. Note that this is >>different from >> + * sysctl_tcp_nv_min_cwnd */ >> + u8 available; >> + u32 nv_last_rtt; /* last rtt */ >> + u32 nv_min_rtt; /* active min rtt. Used to determine >>slope */ >> + u32 nv_min_rtt_new; /* min rtt for future use */ >> + u32 nv_rtt_max_rate; /* max rate seen during current RTT = */ >> + u32 nv_rtt_start_seq; /* current RTT ends when packet arri= ves >> + * acking beyond nv_rtt_start_seq */ >> + u32 nv_last_snd_una; /* Previous value of tp->snd_una. 
It= is >> + * used to determine bytes acked sin= ce >>last >> + * call to bictcp_acked */ >> + u32 nv_no_cong_cnt; /* Consecutive no congestion decisio= ns >>*/ >> + u32 nv_rtt_cnt_dec; /* RTTs since last temporary cwnd >>decrease */ >> +}; >> + >> +#define NV_INIT_RTT 0xffffffff >> +#define NV_MIN_CWND 4 >> +#define NV_MIN_CWND_GROW 2 >> +#define NV_TSO_CWND_BOUND 80 >> + >> +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) >> +{ >> + struct tcp_sock *tp =3D tcp_sk(sk); >> + >> + ca->loss_cwnd =3D 0; >> + ca->nv_no_cong_cnt =3D 0; >> + ca->cnt =3D 0; >> + ca->nv_rtt_cnt =3D 0; >> + ca->nv_rtt_cnt_dec =3D 0; >> + ca->nv_allow_cwnd_growth =3D 1; >> + ca->nv_last_rtt =3D 0; >> + ca->nv_rtt_max_rate =3D 0; >> + ca->nv_rtt_start_seq =3D tp->snd_una; >> + ca->nv_eval_call_cnt =3D 0; >> + ca->nv_last_snd_una =3D tp->snd_una; >> +} >> + >> +static void tcpnv_init(struct sock *sk) >> +{ >> + struct tcpnv *ca =3D inet_csk_ca(sk); >> + >> + tcpnv_reset(ca, sk); >> + >> + ca->nv_min_rtt_reset_jiffies =3D jiffies + 2*HZ; >> + ca->nv_min_rtt =3D NV_INIT_RTT; >> + ca->nv_min_rtt_new =3D NV_INIT_RTT; >> + ca->nv_enable =3D sysctl_tcp_nv_enable; >> + ca->nv_min_cwnd =3D NV_MIN_CWND; >> + if (nv_dec_eval_min_calls > 255) >> + nv_dec_eval_min_calls =3D 255; >> + if (nv_rtt_min_cnt > 63) >> + nv_rtt_min_cnt =3D 63; >> +} >> + >> +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) >> +{ >> + struct tcp_sock *tp =3D tcp_sk(sk); >> + struct tcpnv *ca =3D inet_csk_ca(sk); >> + >> + if (!tcp_is_cwnd_limited(sk)) >> + return; >> + >> + /* Only grow cwnd if NV has not detected congestion */ >> + if (sysctl_tcp_nv_enable && ca->nv_enable && >> + !ca->nv_allow_cwnd_growth) >> + return; >> + >> + if (tp->snd_cwnd <=3D tp->snd_ssthresh) { >> + acked =3D tcp_slow_start(tp, acked); >> + if (!acked) >> + return; >> + } >> + if (ca->cnt =3D=3D 0 || !(sysctl_tcp_nv_enable || ca->nv_ena= ble)) >> + ca->cnt =3D tp->snd_cwnd; >> + >> + tcp_cong_avoid_ai(tp, ca->cnt, 
acked); >> +} >> + >> +static u32 tcpnv_recalc_ssthresh(struct sock *sk) >> +{ >> + const struct tcp_sock *tp =3D tcp_sk(sk); >> + struct tcpnv *ca =3D inet_csk_ca(sk); >> + >> + ca->loss_cwnd =3D tp->snd_cwnd; >> + return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); >> +} >> + >> +static u32 tcpnv_undo_cwnd(struct sock *sk) >> +{ >> + struct tcpnv *ca =3D inet_csk_ca(sk); >> + >> + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); >> +} >> + >> +static void tcpnv_state(struct sock *sk, u8 new_state) >> +{ >> + struct tcpnv *ca =3D inet_csk_ca(sk); >> + >> + if (new_state =3D=3D TCP_CA_Open) { >> + ca->nv_enable =3D 1; >> + tcpnv_reset(ca, sk); >> + } else if (new_state =3D=3D TCP_CA_Loss) { >> + ca->nv_enable =3D 0; >> + } >> +} >> + >> +/* Do congestion avoidance calculaitons for TCP-NV >> + */ >> +static void tcpnv_acked(struct sock *sk, struct ack_sample sample) >> +{ >> + const struct inet_connection_sock *icsk =3D inet_csk(sk); >> + struct tcp_sock *tp =3D tcp_sk(sk); >> + struct tcpnv *ca =3D inet_csk_ca(sk); >> + unsigned long now =3D jiffies; >> + s64 rate64 =3D 0; >> + u32 rate, max_win, cwnd_by_slope; >> + u32 avg_rtt; >> + u32 bytes_acked =3D 0; >> + >> + /* Some calls are for duplicates without timetamps */ >> + if (sample.rtt_us < 0) >> + return; >> + >> + /* If not in TCP_CA_Open state, skip. */ >> + if (icsk->icsk_ca_state !=3D TCP_CA_Open) >> + return; >> + >> + /* If NV mode is not enabled, behave like Reno */ >> + if (!sysctl_tcp_nv_enable || !ca->nv_enable) { >> + ca->nv_allow_cwnd_growth =3D 1; >what's the use case of using NV with sysctl_tcp_nv_enable=3D0? I'm paranoid, this is to turn NV behavior off in case it starts acting badly. I've done some testing within a rack, have plans to do more extensive testing. As per Neal's comment, I've changed it to a module parameter making NV even less intrusive. 
=20 > >> + return; >> + } >> + >> + bytes_acked =3D tp->snd_una - ca->nv_last_snd_una; >> + ca->nv_last_snd_una =3D tp->snd_una; >> + >> + if (sample.in_flight =3D=3D 0) >> + return; >> + >> + /* Calculate moving average of RTT */ >> + if (nv_rtt_factor > 0) { >> + if (ca->nv_last_rtt > 0) { >> + avg_rtt =3D (((u64)sample.rtt_us) * nv_rtt_f= actor >>+ >> + ((u64)ca->nv_last_rtt) >> + * (256 - nv_rtt_factor)) >> 8; >> + } else { >> + avg_rtt =3D sample.rtt_us; >> + ca->nv_min_rtt =3D avg_rtt << 1; >> + } >> + ca->nv_last_rtt =3D avg_rtt; >> + } else { >> + avg_rtt =3D sample.rtt_us; >> + } >> + >> + /* rate in 100's bits per second */ >> + rate64 =3D ((u64)sample.in_flight) * 8000000; >> + rate =3D (u32)div64_u64(rate64, (u64)(avg_rtt*100)); >> + >> + /* Remember the maximum rate seen during this RTT >> + * Note: It may be more than one RTT. This function should b= e >> + * called at least nv_dec_eval_min_calls times. >> + */ >> + if (ca->nv_rtt_max_rate < rate) >> + ca->nv_rtt_max_rate =3D rate; >> + >> + /* We have valid information, increment counter */ >> + if (ca->nv_eval_call_cnt < 255) >> + ca->nv_eval_call_cnt++; >> + >> + /* update min rtt if necessary */ >> + if (avg_rtt < ca->nv_min_rtt) >> + ca->nv_min_rtt =3D avg_rtt; >> + >> + /* update future min_rtt if necessary */ >> + if (avg_rtt < ca->nv_min_rtt_new) >> + ca->nv_min_rtt_new =3D avg_rtt; >> + >> + /* nv_min_rtt is updated with the minimum (possibley average= d) >>rtt >> + * seen in the last sysctl_tcp_nv_reset_period seconds (i.e.= a >> + * warm reset). This new nv_min_rtt will be continued to be >>updated >> + * and be used for another sysctl_tcp_nv_reset_period second= s, >> + * when it will be updated again. 
>> + * In practice we introduce some randomness, so the actual >>period used >> + * is chosen randomly from the range: >> + * [sysctl_tcp_nv_reset_period*3/4, >>sysctl_tcp_nv_reset_period*5/4) >> + */ >> + if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { >> + unsigned char rand; >> + ca->nv_min_rtt =3D ca->nv_min_rtt_new; >> + ca->nv_min_rtt_new =3D NV_INIT_RTT; >> + get_random_bytes(&rand, 1); >> + ca->nv_min_rtt_reset_jiffies =3D >> + now + ((nv_reset_period*(384 + rand)*HZ)>>9)= ; >> + /* Every so often we decrease nv_min_cwnd in case >>previous >> + * value is no longer accurate. >> + */ >> + ca->nv_min_cwnd =3D max(ca->nv_min_cwnd/2, NV_MIN_CW= ND); >> + } >> + >> + /* Once per RTT check if we need to do congestion avoidance = */ >> + if (before(ca->nv_rtt_start_seq, tp->snd_una)) { >> + ca->nv_rtt_start_seq =3D tp->snd_nxt; >> + if (ca->nv_rtt_cnt < 63) >> + /* Increase counter for RTTs without CA >>decision */ >> + ca->nv_rtt_cnt++; >> + if (ca->nv_rtt_cnt_dec < 255) >> + /* Increase counter for temporary cwnd decre= ase >>*/ >> + ca->nv_rtt_cnt_dec++; >> + >> + /* If this function is only called once within an RT= T >> + * the cwnd is probably too small (in some cases due= to >> + * tso, lro or interrupt coalescence), so we increas= e >> + * nv_min_cwnd. >> + */ >> + if (ca->nv_eval_call_cnt =3D=3D 1 >> + && bytes_acked >=3D (ca->nv_min_cwnd - 1) * >>tp->mss_cache >> + && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1) >> + && ca->nv_rtt_cnt_dec < 192) { >> + ca->nv_min_cwnd =3D min(ca->nv_min_cwnd >> + + NV_MIN_CWND_GROW, >> + NV_TSO_CWND_BOUND + 1)= ; >> + ca->nv_rtt_start_seq =3D tp->snd_nxt + >> + ca->nv_min_cwnd*tp->mss_cache; >> + ca->nv_eval_call_cnt =3D 0; >> + ca->nv_allow_cwnd_growth =3D 1; >> + return; >> + } >> + >> + /* Every 192 to 320 RTTs decrease cwnd to get better >>min RTT >> + * measurement. In practice we accomplish this by >>initializing >> + * nv_rtt_cnd_dec randomly form the range [0, 128) a= nd >> + * stopping at 320. 
>> + * We keep the value low for nv_rtt_cnt_dec_delta RT= Ts >>and then >> + * we restore cwnd to its previous value (by setting >> + * ssthresh to the previous value). >> + */ >> + if (ca->nv_rtt_cnt_dec =3D=3D 320) { >> + /* decrease cwnd and ssthresh */ >> + tp->snd_cwnd =3D >> + max((unsigned int)nv_min_cwnd, >> + ((tp->snd_cwnd * nv_dec_factor) = >> >>3)); >> + tp->snd_ssthresh =3D >> + max(tp->snd_cwnd, >> + ((tp->snd_ssthresh * nv_dec_fact= or) >>>> 3)); >> + ca->nv_allow_cwnd_growth =3D 0; >> + return; >> + } else if (ca->nv_rtt_cnt_dec > 320) { >> + if (ca->nv_rtt_cnt_dec - 320 >=3D >>nv_rtt_cnt_dec_delta) { >> + /* Restore ssthresh to restore cwnd = */ >> + unsigned char rand; >> + get_random_bytes(&rand, 1); >> + ca->nv_rtt_cnt_dec =3D rand >> 1; >> + tp->snd_ssthresh =3D (tp->snd_ssthre= sh << >>3) >> + / nv_dec_factor; >> + ca->nv_allow_cwnd_growth =3D 1; >> + ca->nv_no_cong_cnt =3D 0; >> + } >> + return; >> + } >> + >> + /* Find the ideal cwnd for current rate from slope >> + * slope =3D 80000.0 * mss / nv_min_rtt >> + * cwnd_by_slope =3D nv_rtt_max_rate / slope >> + */ >> + cwnd_by_slope =3D (u32) >> + div64_u64(((u64)ca->nv_rtt_max_rate) * >>ca->nv_min_rtt, >> + (u64)(80000 * tp->mss_cache)); >> + max_win =3D cwnd_by_slope + nv_pad; >> + >> + /* If cwnd > max_win, decrease cwnd >> + * if cwnd < max_win, grow cwnd >> + * else leave the same >> + */ >> + if (tp->snd_cwnd > max_win) { >> + /* there is congestion, check that it is ok >> + * to make a CA decision >> + * 1. We should have at least >>nv_dec_eval_min_calls >> + * data points before making a CA decisi= on >> + * 2. 
We only make a congesion decision afte= r >> + * nv_rtt_min_cnt RTTs >> + */ >> + if (ca->nv_rtt_cnt < nv_rtt_min_cnt) >> + return; >> + else if (tp->snd_ssthresh =3D=3D >>TCP_INFINITE_SSTHRESH) { >> + if (ca->nv_eval_call_cnt < >> + nv_ssthresh_eval_min_calls) >> + return; >> + } else if (ca->nv_eval_call_cnt < >> + nv_dec_eval_min_calls) { >> + return; >> + } >> + >> + /* We have enough data to determine we are >>congested */ >> + ca->nv_allow_cwnd_growth =3D 0; >> + tp->snd_ssthresh =3D >> + (nv_ssthresh_factor * max_win) >> 3; >> + if (tp->snd_cwnd - max_win > 2) { >> + /* gap > 2, we do exponential cwnd >>decrease */ >> + int dec; >> + dec =3D max(2U, ((tp->snd_cwnd - max= _win) >>* >> + nv_cong_decrease_mult= ) >>>> 7); >> + tp->snd_cwnd -=3D dec; >> + } else if (nv_cong_decrease_mult > 0) { >> + tp->snd_cwnd =3D max_win; >> + } >> + ca->cnt =3D tp->snd_cwnd; >> + ca->nv_no_cong_cnt =3D 0; >> + } else if (tp->snd_cwnd <=3D max_win - nv_pad_buffe= r) { >> + /* We allow growth of cwnd every RTT since w= e >>would >> + * have grown even if we waited (just slower= ) >> + */ >> + ca->nv_allow_cwnd_growth =3D 1; >> + ca->nv_no_cong_cnt++; >> + if (nv_cwnd_growth_factor > 0 && >> + ca->nv_no_cong_cnt > nv_cwnd_growth_fact= or) >>{ >> + ca->cnt =3D max(ca->cnt >> 1, (u32) = 4); >> + ca->nv_no_cong_cnt =3D 0; >> + } >> + } else { >> + ca->nv_allow_cwnd_growth =3D 0; >> + } >> + >> + /* update state */ >> + ca->nv_eval_call_cnt =3D 0; >> + ca->nv_rtt_cnt =3D 0; >> + ca->nv_rtt_max_rate =3D 0; >> + >> + /* Don't want to make cwnd < nv_min_cwnd >> + * (it wasn't before, if it is now is because nv >> + * decreased it). 
>> + */ >> + if (tp->snd_cwnd < nv_min_cwnd) >> + tp->snd_cwnd =3D nv_min_cwnd; >> + >> + } >> +} >> + >> +/* Extract info for Tcp socket info provided via netlink */ >> +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, >> + union tcp_cc_info *info) >> +{ >> + const struct tcpnv *ca =3D inet_csk_ca(sk); >> + >> + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { >> + info->vegas.tcpv_enabled =3D ca->nv_enable >> + && sysctl_tcp_nv_enable; >> + info->vegas.tcpv_rttcnt =3D ca->nv_rtt_cnt; >> + info->vegas.tcpv_rtt =3D ca->nv_last_rtt; >> + info->vegas.tcpv_minrtt =3D ca->nv_min_rtt; >> + >> + *attr =3D INET_DIAG_VEGASINFO; >> + return sizeof(struct tcpvegas_info); >> + } >> + return 0; >> +} >> +EXPORT_SYMBOL_GPL(tcpnv_get_info); >> + >> +static struct tcp_congestion_ops tcpnv __read_mostly =3D { >> + .init =3D tcpnv_init, >> + .ssthresh =3D tcpnv_recalc_ssthresh, >> + .cong_avoid =3D tcpnv_cong_avoid, >> + .set_state =3D tcpnv_state, >> + .undo_cwnd =3D tcpnv_undo_cwnd, >> + .pkts_acked =3D tcpnv_acked, >> + .get_info =3D tcpnv_get_info, >> + >> + .owner =3D THIS_MODULE, >> + .name =3D "nv", >> +}; >> + >> +static int __init tcpnv_register(void) >> +{ >> + BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); >> + >> + return tcp_register_congestion_control(&tcpnv); >> +} >> + >> +static void __exit tcpnv_unregister(void) >> +{ >> + tcp_unregister_congestion_control(&tcpnv); >> +} >> + >> +module_init(tcpnv_register); >> +module_exit(tcpnv_unregister); >> + >> +MODULE_AUTHOR("Lawrence Brakmo"); >> +MODULE_LICENSE("GPL"); >> +MODULE_DESCRIPTION("TCP NV"); >> +MODULE_VERSION("1.0"); >> -- >> 1.8.1 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe netdev" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html