* [RFC PATCH v5 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
2015-08-05 1:39 [RFC PATCH v5 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
@ 2015-08-05 1:39 ` Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 2/4] tcp: refactor struct tcp_skb_cb Lawrence Brakmo
` (2 subsequent siblings)
3 siblings, 0 replies; 8+ messages in thread
From: Lawrence Brakmo @ 2015-08-05 1:39 UTC (permalink / raw)
To: netdev
Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
Replace 2 arguments (cnt and rtt) in the congestion control modules'
pkts_acked() function with a struct. This will allow adding more
information without having to modify existing congestion control
modules (tcp_nv in particular needs bytes in flight when packet
was sent).
As proposed by Neal Cardwell in his comments to the tcp_nv patch.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/net/tcp.h | 7 ++++++-
net/ipv4/tcp_bic.c | 6 +++---
net/ipv4/tcp_cdg.c | 14 +++++++-------
net/ipv4/tcp_cubic.c | 6 +++---
net/ipv4/tcp_htcp.c | 10 +++++-----
net/ipv4/tcp_illinois.c | 20 ++++++++++----------
net/ipv4/tcp_input.c | 7 +++++--
net/ipv4/tcp_lp.c | 6 +++---
net/ipv4/tcp_vegas.c | 6 +++---
net/ipv4/tcp_vegas.h | 2 +-
net/ipv4/tcp_veno.c | 7 ++++---
net/ipv4/tcp_westwood.c | 7 ++++---
net/ipv4/tcp_yeah.c | 7 ++++---
13 files changed, 58 insertions(+), 47 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 364426a..0121529 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -834,6 +834,11 @@ enum tcp_ca_ack_event_flags {
union tcp_cc_info;
+struct ack_sample {
+ u32 pkts_acked;
+ s32 rtt_us;
+};
+
struct tcp_congestion_ops {
struct list_head list;
u32 key;
@@ -857,7 +862,7 @@ struct tcp_congestion_ops {
/* new value of cwnd after loss (optional) */
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
- void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
+ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
/* get info for inet_diag (optional) */
size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index fd1405d..f469f1b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
- cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ca->delayed_ack += cnt;
+ ca->delayed_ack += sample->pkts_acked -
+ (ca->delayed_ack >> ACK_RATIO_SHIFT);
}
}
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 167b6a3..b4e5af7 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}
-static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- if (rtt_us <= 0)
+ if (sample->rtt_us <= 0)
return;
/* A heuristic for filtering delayed ACKs, adapted from:
@@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
* delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
*/
if (tp->sacked_out == 0) {
- if (num_acked == 1 && ca->delack) {
+ if (sample->pkts_acked == 1 && ca->delack) {
/* A delayed ACK is only used for the minimum if it is
* provenly lower than an existing non-zero minimum.
*/
- ca->rtt.min = min(ca->rtt.min, rtt_us);
+ ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
ca->delack--;
return;
- } else if (num_acked > 1 && ca->delack < 5) {
+ } else if (sample->pkts_acked > 1 && ca->delack < 5) {
ca->delack++;
}
}
- ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
- ca->rtt.max = max(ca->rtt.max, rtt_us);
+ ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
+ ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
}
static u32 tcp_cdg_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 28011fb..c5d0ba5 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -416,21 +416,21 @@ static void hystart_update(struct sock *sk, u32 delay)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;
/* Some calls are for duplicates without timetamps */
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Discard delay samples right after fast recovery */
if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
return;
- delay = (rtt_us << 3) / USEC_PER_MSEC;
+ delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
if (delay == 0)
delay = 1;
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 82f0d9e..4a4d8e7 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
}
static void measure_achieved_throughput(struct sock *sk,
- u32 pkts_acked, s32 rtt)
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk,
u32 now = tcp_time_stamp;
if (icsk->icsk_ca_state == TCP_CA_Open)
- ca->pkts_acked = pkts_acked;
+ ca->pkts_acked = sample->pkts_acked;
- if (rtt > 0)
- measure_rtt(sk, usecs_to_jiffies(rtt));
+ if (sample->rtt_us > 0)
+ measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
if (!use_bandwidth_switch)
return;
@@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk,
return;
}
- ca->packetcount += pkts_acked;
+ ca->packetcount += sample->pkts_acked;
if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
now - ca->lasttime >= ca->minRTT &&
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 2ab9bbb..77ad119 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -82,30 +82,30 @@ static void tcp_illinois_init(struct sock *sk)
}
/* Measure RTT for each ack. */
-static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample)
{
struct illinois *ca = inet_csk_ca(sk);
- ca->acked = pkts_acked;
+ ca->acked = sample->pkts_acked;
/* dup ack, no rtt sample */
- if (rtt < 0)
+ if (sample->rtt_us < 0)
return;
/* ignore bogus values, this prevents wraparound in alpha math */
- if (rtt > RTT_MAX)
- rtt = RTT_MAX;
+ if (sample->rtt_us > RTT_MAX)
+ sample->rtt_us = RTT_MAX;
/* keep track of minimum RTT seen so far */
- if (ca->base_rtt > rtt)
- ca->base_rtt = rtt;
+ if (ca->base_rtt > sample->rtt_us)
+ ca->base_rtt = sample->rtt_us;
/* and max */
- if (ca->max_rtt < rtt)
- ca->max_rtt = rtt;
+ if (ca->max_rtt < sample->rtt_us)
+ ca->max_rtt = sample->rtt_us;
++ca->cnt_rtt;
- ca->sum_rtt += rtt;
+ ca->sum_rtt += sample->rtt_us;
}
/* Maximum queuing delay */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4d6bc..f506a0a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3196,8 +3196,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
- if (icsk->icsk_ca_ops->pkts_acked)
- icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+ if (icsk->icsk_ca_ops->pkts_acked) {
+ struct ack_sample sample = {pkts_acked, ca_rtt_us};
+
+ icsk->icsk_ca_ops->pkts_acked(sk, &sample);
+ }
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 1e70fa8..c67ece1 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
* newReno in increase case.
* We work it out by following the idea from TCP-LP's paper directly
*/
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
- if (rtt_us > 0)
- tcp_lp_rtt_sample(sk, rtt_us);
+ if (sample->rtt_us > 0)
+ tcp_lp_rtt_sample(sk, sample->rtt_us);
/* calc inference */
if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13951c4..4c4bac1 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init);
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct vegas *vegas = inet_csk_ca(sk);
u32 vrtt;
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Never allow zero rtt or baseRTT */
- vrtt = rtt_us + 1;
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < vegas->baseRTT)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index ef9da53..248cfc0 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -17,7 +17,7 @@ struct vegas {
void tcp_vegas_init(struct sock *sk);
void tcp_vegas_state(struct sock *sk, u8 ca_state);
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample);
void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 0d094b9..40171e1 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -69,16 +69,17 @@ static void tcp_veno_init(struct sock *sk)
}
/* Do rtt sampling needed for Veno. */
-static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void tcp_veno_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct veno *veno = inet_csk_ca(sk);
u32 vrtt;
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Never allow zero rtt or baseRTT */
- vrtt = rtt_us + 1;
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < veno->basertt)
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c10732e..4b03a2e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -99,12 +99,13 @@ static void westwood_filter(struct westwood *w, u32 delta)
* Called after processing group of packets.
* but all westwood needs is the last sample of srtt.
*/
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void tcp_westwood_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct westwood *w = inet_csk_ca(sk);
- if (rtt > 0)
- w->rtt = usecs_to_jiffies(rtt);
+ if (sample->rtt_us > 0)
+ w->rtt = usecs_to_jiffies(sample->rtt_us);
}
/*
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 17d3566..1b9d013 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -56,15 +56,16 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+static void tcp_yeah_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct yeah *yeah = inet_csk_ca(sk);
if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = pkts_acked;
+ yeah->pkts_acked = sample->pkts_acked;
- tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+ tcp_vegas_pkts_acked(sk, sample);
}
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
--
1.8.5.6
^ permalink raw reply related [flat|nested] 8+ messages in thread* [RFC PATCH v5 net-next 2/4] tcp: refactor struct tcp_skb_cb
2015-08-05 1:39 [RFC PATCH v5 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
@ 2015-08-05 1:39 ` Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 3/4] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control Lawrence Brakmo
3 siblings, 0 replies; 8+ messages in thread
From: Lawrence Brakmo @ 2015-08-05 1:39 UTC (permalink / raw)
To: netdev
Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
Refactor tcp_skb_cb to create two overlaping areas to store
state for incoming or outgoing skbs based on comments by
Neal Cardwell to tcp_nv patch:
AFAICT this patch would not require an increase in the size of
sk_buff cb[] if it were to take advantage of the fact that the
tcp_skb_cb header.h4 and header.h6 fields are only used in the packet
reception code path, and this in_flight field is only used on the
transmit side.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/net/tcp.h | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0121529..a086a98 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -755,11 +755,16 @@ struct tcp_skb_cb {
/* 1 byte hole */
__u32 ack_seq; /* Sequence number ACK'd */
union {
- struct inet_skb_parm h4;
+ struct {
+ /* There is space for up to 20 bytes */
+ } tx; /* only used for outgoing skbs */
+ union {
+ struct inet_skb_parm h4;
#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_skb_parm h6;
+ struct inet6_skb_parm h6;
#endif
- } header; /* For incoming frames */
+ } header; /* For incoming skbs */
+ };
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
--
1.8.5.6
^ permalink raw reply related [flat|nested] 8+ messages in thread* [RFC PATCH v5 net-next 3/4] tcp: add in_flight to tcp_skb_cb
2015-08-05 1:39 [RFC PATCH v5 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 2/4] tcp: refactor struct tcp_skb_cb Lawrence Brakmo
@ 2015-08-05 1:39 ` Lawrence Brakmo
2015-08-05 1:39 ` [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control Lawrence Brakmo
3 siblings, 0 replies; 8+ messages in thread
From: Lawrence Brakmo @ 2015-08-05 1:39 UTC (permalink / raw)
To: netdev
Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
Add in_flight (bytes in flight when packet was sent) field
to tx component of tcp_skb_cb and make it available to
congestion modules' pkts_acked() function through the
ack_sample function argument.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/net/tcp.h | 2 ++
net/ipv4/tcp_input.c | 5 ++++-
net/ipv4/tcp_output.c | 4 +++-
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a086a98..cdd93e5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -757,6 +757,7 @@ struct tcp_skb_cb {
union {
struct {
/* There is space for up to 20 bytes */
+ __u32 in_flight;/* Bytes in flight when packet sent */
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -842,6 +843,7 @@ union tcp_cc_info;
struct ack_sample {
u32 pkts_acked;
s32 rtt_us;
+ u32 in_flight;
};
struct tcp_congestion_ops {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f506a0a..338e6bb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3069,6 +3069,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
long ca_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
+ u32 last_in_flight = 0;
bool rtt_update;
int flag = 0;
@@ -3108,6 +3109,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!first_ackt.v64)
first_ackt = last_ackt;
+ last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
reord = min(pkts_acked, reord);
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
@@ -3197,7 +3199,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
if (icsk->icsk_ca_ops->pkts_acked) {
- struct ack_sample sample = {pkts_acked, ca_rtt_us};
+ struct ack_sample sample = {pkts_acked, ca_rtt_us,
+ last_in_flight};
icsk->icsk_ca_ops->pkts_acked(sk, &sample);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d1efa7..0fccd5f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -920,9 +920,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
+ tp = tcp_sk(sk);
if (clone_it) {
skb_mstamp_get(&skb->skb_mstamp);
+ TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
+ - tp->snd_una;
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -933,7 +936,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
inet = inet_sk(sk);
- tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
--
1.8.5.6
^ permalink raw reply related [flat|nested] 8+ messages in thread* [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control
2015-08-05 1:39 [RFC PATCH v5 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
` (2 preceding siblings ...)
2015-08-05 1:39 ` [RFC PATCH v5 net-next 3/4] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
@ 2015-08-05 1:39 ` Lawrence Brakmo
2015-08-06 0:51 ` Kenneth Klette Jonassen
3 siblings, 1 reply; 8+ messages in thread
From: Lawrence Brakmo @ 2015-08-05 1:39 UTC (permalink / raw)
To: netdev
Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
This is a request for comments.
TCP-NV (New Vegas) is a major update to TCP-Vegas.
An earlier version of NV was presented at 2010's LPC.
It is a delayed based congestion avoidance for the
data center. This version has been tested within a
10G rack where the HW RTTs are 20-50us.
A description of TCP-NV, including implementation
details as well as experimental results, can be found at:
http://www.brakmo.org/networking/tcp-nv/TCPNV.html
The current version includes many module parameters to support
experimentation with the parameters.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
net/ipv4/Kconfig | 16 ++
net/ipv4/Makefile | 1 +
net/ipv4/tcp_nv.c | 479 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 496 insertions(+)
create mode 100644 net/ipv4/tcp_nv.c
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6fb3c90..f11f2f8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
window. TCP Vegas should provide less packet loss, but it is
not as aggressive as TCP Reno.
+config TCP_CONG_NV
+ tristate "TCP NV"
+ default n
+ ---help---
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
+
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+ For further details see http://www.brakmo.org/networking/tcp-nv/
+
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
default n
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index efc43f3..06f335f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 0000000..c4379b8
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,479 @@
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks.
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ * TCP-NV behaves like Reno when no congestion is detected, or when
+ * recovering from packet losses.
+ *
+ * TODO:
+ * 1) Add option to not decrease cwnd on losses below certain level
+ * 2) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+/* TCP NV parameters */
+static int nv_enable __read_mostly = 1;
+static int nv_pad __read_mostly = 10;
+static int nv_pad_buffer __read_mostly = 2;
+static int nv_reset_period __read_mostly = 5;
+static int nv_min_cwnd = 10;
+static int nv_dec_eval_min_calls = 100;
+static int nv_ssthresh_eval_min_calls = 30;
+static int nv_rtt_min_cnt = 2;
+static int nv_cong_decrease_mult = 30*128/100;
+static int nv_ssthresh_factor = 8;
+static int nv_rtt_factor = 128;
+static int nv_rtt_cnt_dec_delta = 20; /* dec cwnd by this many RTTs */
+static int nv_dec_factor = 5; /* actual value is factor/8 */
+static int nv_loss_dec_factor = 820; /* on loss reduce cwnd by 20% */
+static int nv_cwnd_growth_factor = 2; /* larger => cwnd grows slower */
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
+module_param(nv_pad_buffer, int, 0644);
+MODULE_PARM_DESC(nv_pad_buffer, "no growth buffer zone");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+ " without losses");
+module_param(nv_dec_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data points "
+ "before declaring congestion (< 256)");
+module_param(nv_ssthresh_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data points "
+ "before declaring congestion during initial slow-start");
+module_param(nv_rtt_min_cnt, int, 0644);
+MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before declaring"
+ " congestion (<64)");
+module_param(nv_cong_decrease_mult, int, 0644);
+MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
+module_param(nv_ssthresh_factor, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
+module_param(nv_rtt_factor, int, 0644);
+MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
+module_param(nv_rtt_cnt_dec_delta, int, 0644);
+MODULE_PARM_DESC(nv_rtt_cnt_dec_delta, "decrease cwnd for this many RTTs "
+ "every 100 RTTs");
+module_param(nv_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTS by factor/8");
+module_param(nv_loss_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this / 1024");
+module_param(nv_cwnd_growth_factor, int, 0644);
+MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
+
+/* TCP NV Parameters */
+struct tcpnv {
+ unsigned long nv_min_rtt_reset_jiffies; /* when to switch to
+ * nv_min_rtt_new */
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 loss_cwnd; /* cwnd at last loss */
+ u8 nv_enable:1,
+ nv_allow_cwnd_growth:1, /* whether cwnd can grow */
+ nv_rtt_cnt:6; /* RTTs without making ca decision */
+ u8 nv_eval_call_cnt;/* call count since last eval */
+ u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is
+ * smaller than this. It may grow to handle
+ * TSO, LRO and interrupt coalescence because
+ * with these a small cwnd cannot saturate
+ * the link. Note that this is different from
+ * sysctl_tcp_nv_min_cwnd */
+ u8 available;
+ u32 nv_last_rtt; /* last rtt */
+ u32 nv_min_rtt; /* active min rtt. Used to determine slope */
+ u32 nv_min_rtt_new; /* min rtt for future use */
+ u32 nv_rtt_max_rate; /* max rate seen during current RTT */
+ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives
+ * acking beyond nv_rtt_start_seq */
+ u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is
+ * used to determine bytes acked since last
+ * call to bictcp_acked */
+ u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */
+ u32 nv_rtt_cnt_dec; /* RTTs since last temporary cwnd decrease */
+};
+
+#define NV_INIT_RTT 0xffffffff
+#define NV_MIN_CWND 4
+#define NV_MIN_CWND_GROW 2
+#define NV_TSO_CWND_BOUND 80
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->loss_cwnd = 0;
+ ca->nv_no_cong_cnt = 0;
+ ca->cnt = 0;
+ ca->nv_rtt_cnt = 0;
+ ca->nv_rtt_cnt_dec = 0;
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_last_rtt = 0;
+ ca->nv_rtt_max_rate = 0;
+ ca->nv_rtt_start_seq = tp->snd_una;
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_last_snd_una = tp->snd_una;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ tcpnv_reset(ca, sk);
+
+ ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
+ ca->nv_min_rtt = NV_INIT_RTT;
+ ca->nv_min_rtt_new = NV_INIT_RTT;
+ ca->nv_enable = nv_enable;
+ ca->nv_min_cwnd = NV_MIN_CWND;
+ if (nv_dec_eval_min_calls > 255)
+ nv_dec_eval_min_calls = 255;
+ if (nv_rtt_min_cnt > 63)
+ nv_rtt_min_cnt = 63;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* Only grow cwnd if NV has not detected congestion */
+ if (nv_enable && ca->nv_enable && !ca->nv_allow_cwnd_growth)
+ return;
+
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ if (ca->cnt == 0 || !(nv_enable || ca->nv_enable))
+ ca->cnt = tp->snd_cwnd;
+
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
+ return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
+}
+
+static u32 tcpnv_undo_cwnd(struct sock *sk)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Open) {
+ ca->nv_enable = 1;
+ tcpnv_reset(ca, sk);
+ } else if (new_state == TCP_CA_Loss) {
+ ca->nv_enable = 0;
+ }
+}
+
+/* Do congestion avoidance calculaitons for TCP-NV
+ */
+static void tcpnv_acked(struct sock *sk, struct ack_sample *sample)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+ unsigned long now = jiffies;
+ s64 rate64 = 0;
+ u32 rate, max_win, cwnd_by_slope;
+ u32 avg_rtt;
+ u32 bytes_acked = 0;
+
+ /* Some calls are for duplicates without timetamps */
+ if (sample->rtt_us < 0)
+ return;
+
+ /* If not in TCP_CA_Open state, skip. */
+ if (icsk->icsk_ca_state != TCP_CA_Open)
+ return;
+
+ /* If NV mode is not enabled, behave like Reno */
+ if (!nv_enable || !ca->nv_enable) {
+ ca->nv_allow_cwnd_growth = 1;
+ return;
+ }
+
+ bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+ ca->nv_last_snd_una = tp->snd_una;
+
+ if (sample->in_flight == 0)
+ return;
+
+ /* Calculate moving average of RTT */
+ if (nv_rtt_factor > 0) {
+ if (ca->nv_last_rtt > 0) {
+ avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
+ ((u64)ca->nv_last_rtt)
+ * (256 - nv_rtt_factor)) >> 8;
+ } else {
+ avg_rtt = sample->rtt_us;
+ ca->nv_min_rtt = avg_rtt << 1;
+ }
+ ca->nv_last_rtt = avg_rtt;
+ } else {
+ avg_rtt = sample->rtt_us;
+ }
+
+ /* rate in 100's bits per second */
+ rate64 = ((u64)sample->in_flight) * 8000000;
+ rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
+
+ /* Remember the maximum rate seen during this RTT
+ * Note: It may be more than one RTT. This function should be
+ * called at least nv_dec_eval_min_calls times.
+ */
+ if (ca->nv_rtt_max_rate < rate)
+ ca->nv_rtt_max_rate = rate;
+
+ /* We have valid information, increment counter */
+ if (ca->nv_eval_call_cnt < 255)
+ ca->nv_eval_call_cnt++;
+
+ /* update min rtt if necessary */
+ if (avg_rtt < ca->nv_min_rtt)
+ ca->nv_min_rtt = avg_rtt;
+
+ /* update future min_rtt if necessary */
+ if (avg_rtt < ca->nv_min_rtt_new)
+ ca->nv_min_rtt_new = avg_rtt;
+
+ /* nv_min_rtt is updated with the minimum (possibley averaged) rtt
+ * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
+ * warm reset). This new nv_min_rtt will be continued to be updated
+ * and be used for another sysctl_tcp_nv_reset_period seconds,
+ * when it will be updated again.
+ * In practice we introduce some randomness, so the actual period used
+ * is chosen randomly from the range:
+ * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
+ */
+ if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+ unsigned char rand;
+ ca->nv_min_rtt = ca->nv_min_rtt_new;
+ ca->nv_min_rtt_new = NV_INIT_RTT;
+ get_random_bytes(&rand, 1);
+ ca->nv_min_rtt_reset_jiffies =
+ now + ((nv_reset_period*(384 + rand)*HZ)>>9);
+ /* Every so often we decrease nv_min_cwnd in case previous
+ * value is no longer accurate.
+ */
+ ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
+ }
+
+ /* Once per RTT check if we need to do congestion avoidance */
+ if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+ ca->nv_rtt_start_seq = tp->snd_nxt;
+ if (ca->nv_rtt_cnt < 63)
+ /* Increase counter for RTTs without CA decision */
+ ca->nv_rtt_cnt++;
+ if (ca->nv_rtt_cnt_dec < 255)
+ /* Increase counter for temporary cwnd decrease */
+ ca->nv_rtt_cnt_dec++;
+
+ /* If this function is only called once within an RTT
+ * the cwnd is probably too small (in some cases due to
+ * tso, lro or interrupt coalescence), so we increase
+ * nv_min_cwnd.
+ */
+ if (ca->nv_eval_call_cnt == 1
+ && bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache
+ && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)
+ && ca->nv_rtt_cnt_dec < 192) {
+ ca->nv_min_cwnd = min(ca->nv_min_cwnd
+ + NV_MIN_CWND_GROW,
+ NV_TSO_CWND_BOUND + 1);
+ ca->nv_rtt_start_seq = tp->snd_nxt +
+ ca->nv_min_cwnd*tp->mss_cache;
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_allow_cwnd_growth = 1;
+ return;
+ }
+
+ /* Every 192 to 320 RTTs decrease cwnd to get better min RTT
+ * measurement. In practice we accomplish this by initializing
+ * nv_rtt_cnd_dec randomly form the range [0, 128) and
+ * stopping at 320.
+ * We keep the value low for nv_rtt_cnt_dec_delta RTTs and then
+ * we restore cwnd to its previous value (by setting
+ * ssthresh to the previous value).
+ */
+ if (ca->nv_rtt_cnt_dec == 320) {
+ /* decrease cwnd and ssthresh */
+ tp->snd_cwnd =
+ max((unsigned int)nv_min_cwnd,
+ ((tp->snd_cwnd * nv_dec_factor) >> 3));
+ tp->snd_ssthresh =
+ max(tp->snd_cwnd,
+ ((tp->snd_ssthresh * nv_dec_factor) >> 3));
+ ca->nv_allow_cwnd_growth = 0;
+ return;
+ } else if (ca->nv_rtt_cnt_dec > 320) {
+ if (ca->nv_rtt_cnt_dec - 320 >= nv_rtt_cnt_dec_delta) {
+ /* Restore ssthresh to restore cwnd */
+ unsigned char rand;
+ get_random_bytes(&rand, 1);
+ ca->nv_rtt_cnt_dec = rand >> 1;
+ tp->snd_ssthresh = (tp->snd_ssthresh << 3)
+ / nv_dec_factor;
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_no_cong_cnt = 0;
+ }
+ return;
+ }
+
+ /* Find the ideal cwnd for current rate from slope
+ * slope = 80000.0 * mss / nv_min_rtt
+ * cwnd_by_slope = nv_rtt_max_rate / slope
+ */
+ cwnd_by_slope = (u32)
+ div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+ (u64)(80000 * tp->mss_cache));
+ max_win = cwnd_by_slope + nv_pad;
+
+ /* If cwnd > max_win, decrease cwnd
+ * if cwnd < max_win, grow cwnd
+ * else leave the same
+ */
+ if (tp->snd_cwnd > max_win) {
+ /* there is congestion, check that it is ok
+ * to make a CA decision
+ * 1. We should have at least nv_dec_eval_min_calls
+ * data points before making a CA decision
+ * 2. We only make a congesion decision after
+ * nv_rtt_min_cnt RTTs
+ */
+ if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
+ return;
+ else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+ if (ca->nv_eval_call_cnt <
+ nv_ssthresh_eval_min_calls)
+ return;
+ } else if (ca->nv_eval_call_cnt <
+ nv_dec_eval_min_calls) {
+ return;
+ }
+
+ /* We have enough data to determine we are congested */
+ ca->nv_allow_cwnd_growth = 0;
+ tp->snd_ssthresh =
+ (nv_ssthresh_factor * max_win) >> 3;
+ if (tp->snd_cwnd - max_win > 2) {
+ /* gap > 2, we do exponential cwnd decrease */
+ int dec;
+ dec = max(2U, ((tp->snd_cwnd - max_win) *
+ nv_cong_decrease_mult) >> 7);
+ tp->snd_cwnd -= dec;
+ } else if (nv_cong_decrease_mult > 0) {
+ tp->snd_cwnd = max_win;
+ }
+ ca->cnt = tp->snd_cwnd;
+ ca->nv_no_cong_cnt = 0;
+ } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) {
+ /* We allow growth of cwnd every RTT since we would
+ * have grown even if we waited (just slower)
+ */
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_no_cong_cnt++;
+ if (nv_cwnd_growth_factor > 0 &&
+ ca->nv_no_cong_cnt > nv_cwnd_growth_factor) {
+ ca->cnt = max(ca->cnt >> 1, (u32) 4);
+ ca->nv_no_cong_cnt = 0;
+ }
+ } else {
+ ca->nv_allow_cwnd_growth = 0;
+ }
+
+ /* update state */
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_rtt_cnt = 0;
+ ca->nv_rtt_max_rate = 0;
+
+ /* Don't want to make cwnd < nv_min_cwnd
+ * (it wasn't before, if it is now is because nv
+ * decreased it).
+ */
+ if (tp->snd_cwnd < nv_min_cwnd)
+ tp->snd_cwnd = nv_min_cwnd;
+
+ }
+}
+
+/* Extract info for Tcp socket info provided via netlink */
+size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = ca->nv_enable
+ && nv_enable;
+ info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+ info->vegas.tcpv_rtt = ca->nv_last_rtt;
+ info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcpnv_get_info);
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+ .init = tcpnv_init,
+ .ssthresh = tcpnv_recalc_ssthresh,
+ .cong_avoid = tcpnv_cong_avoid,
+ .set_state = tcpnv_state,
+ .undo_cwnd = tcpnv_undo_cwnd,
+ .pkts_acked = tcpnv_acked,
+ .get_info = tcpnv_get_info,
+
+ .owner = THIS_MODULE,
+ .name = "nv",
+};
+
+static int __init tcpnv_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+
+ return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcpnv);
+}
+
+module_init(tcpnv_register);
+module_exit(tcpnv_unregister);
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
--
1.8.5.6
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control
2015-08-05 1:39 ` [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control Lawrence Brakmo
@ 2015-08-06 0:51 ` Kenneth Klette Jonassen
2015-08-21 21:29 ` Lawrence Brakmo
2015-08-26 0:05 ` Lawrence Brakmo
0 siblings, 2 replies; 8+ messages in thread
From: Kenneth Klette Jonassen @ 2015-08-06 0:51 UTC (permalink / raw)
To: Lawrence Brakmo
Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
On Wed, Aug 5, 2015 at 3:39 AM, Lawrence Brakmo <brakmo@fb.com> wrote:
> This is a request for comments.
Nice to see more development on delay-based congestion control.
It would be good to see how NV stacks up against CDG. Any chance of
adding cdg as a congestion control parameter to your experiments?
Experiments on NV without its temporary cwnd reductions would also be
of interest -- to get a reference of how effective this mechanism is.
> +#define NV_INIT_RTT 0xffffffff
Maybe use U32_MAX?
> +static void tcpnv_init(struct sock *sk)
> +{
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + tcpnv_reset(ca, sk);
> +
> + ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
> + ca->nv_min_rtt = NV_INIT_RTT;
> + ca->nv_min_rtt_new = NV_INIT_RTT;
> + ca->nv_enable = nv_enable;
Can this assignment be ca->nv_enable = 1? That would match the
TCP_CA_Open case in tcpnv_state().
> + if (nv_dec_eval_min_calls > 255)
> + nv_dec_eval_min_calls = 255;
> + if (nv_rtt_min_cnt > 63)
> + nv_rtt_min_cnt = 63;
nv_dec_eval_min_calls can be clamped to 0-255 by changing its type to u8.
nv_rtt_min_cnt can also be u8? In struct tcpnv, perhaps move
nv_rtt_cnt to the available byte.
> +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + if (!tcp_is_cwnd_limited(sk))
> + return;
> +
> + /* Only grow cwnd if NV has not detected congestion */
> + if (nv_enable && ca->nv_enable && !ca->nv_allow_cwnd_growth)
> + return;
The check for ca->nv_enable might be overly harsh on some unfortunate
sockets in TCP_CA_Disorder. Is it needed here?
> +static void tcpnv_acked(struct sock *sk, struct ack_sample *sample)
Maybe move some of this function to tcpnv_cong_avoid()?
> +{
> + const struct inet_connection_sock *icsk = inet_csk(sk);
> + struct tcp_sock *tp = tcp_sk(sk);
> + struct tcpnv *ca = inet_csk_ca(sk);
> + unsigned long now = jiffies;
> + s64 rate64 = 0;
> + u32 rate, max_win, cwnd_by_slope;
> + u32 avg_rtt;
> + u32 bytes_acked = 0;
> +
> + /* Some calls are for duplicates without timetamps */
> + if (sample->rtt_us < 0)
> + return;
> +
> + /* If not in TCP_CA_Open state, skip. */
> + if (icsk->icsk_ca_state != TCP_CA_Open)
> + return;
Consider using samples in other states too, especially
TCP_CA_Disorder. Linux 4.2 enhances RTT sampling from SACKs, so any
non-negative RTT sample should be fully usable.
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control
2015-08-06 0:51 ` Kenneth Klette Jonassen
@ 2015-08-21 21:29 ` Lawrence Brakmo
2015-08-26 0:05 ` Lawrence Brakmo
1 sibling, 0 replies; 8+ messages in thread
From: Lawrence Brakmo @ 2015-08-21 21:29 UTC (permalink / raw)
To: Kenneth Klette Jonassen
Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
Kenneth, thank you for your comments, I¹ve implemented most of the
improvements you've mentioned.
I¹m finishing the new patch and the updated results, they should
be done by Monday (including cdg).
On 8/5/15, 5:51 PM, "knneth@gmail.com on behalf of Kenneth Klette
Jonassen" <knneth@gmail.com on behalf of kennetkl@ifi.uio.no> wrote:
>On Wed, Aug 5, 2015 at 3:39 AM, Lawrence Brakmo <brakmo@fb.com> wrote:
>> This is a request for comments.
>
>Nice to see more development on delay-based congestion control.
Thank you.
>
>It would be good to see how NV stacks up against CDG. Any chance of
>adding cdg as a congestion control parameter to your experiments?
>Experiments on NV without its temporary cwnd reductions would also be
>of interest -- to get a reference of how effective this mechanism is.
I¹m finishing with cdg experiments, they will be up on Monday together
with an update to the NV patch.
I will also have some experiments with variations in the temporary cwnd
reduction. This mechanism is meant to reduce min_rtt creep, but it is
now always successful. Its drawback is that it can increase high
percentile latency.
>
>
>> +#define NV_INIT_RTT 0xffffffff
>
>Maybe use U32_MAX?
Done
>
>
>> +static void tcpnv_init(struct sock *sk)
>> +{
>> + struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> + tcpnv_reset(ca, sk);
>> +
>> + ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
>> + ca->nv_min_rtt = NV_INIT_RTT;
>> + ca->nv_min_rtt_new = NV_INIT_RTT;
>> + ca->nv_enable = nv_enable;
>
>Can this assignment be ca->nv_enable = 1? That would match the
>TCP_CA_Open case in tcpnv_state().
Done
>
>
>> + if (nv_dec_eval_min_calls > 255)
>> + nv_dec_eval_min_calls = 255;
>> + if (nv_rtt_min_cnt > 63)
>> + nv_rtt_min_cnt = 63;
>
>nv_dec_eval_min_calls can be clamped to 0-255 by changing its type to u8.
>
>nv_rtt_min_cnt can also be u8? In struct tcpnv, perhaps move
>nv_rtt_cnt to the available byte.
Done
>
>
>> +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
>> +{
>> + struct tcp_sock *tp = tcp_sk(sk);
>> + struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> + if (!tcp_is_cwnd_limited(sk))
>> + return;
>> +
>> + /* Only grow cwnd if NV has not detected congestion */
>> + if (nv_enable && ca->nv_enable && !ca->nv_allow_cwnd_growth)
>> + return;
>
>The check for ca->nv_enable might be overly harsh on some unfortunate
>sockets in TCP_CA_Disorder. Is it needed here?
TCP_CA_Disorder should not affect ca->nv_enable in the new patch
>
>
>> +static void tcpnv_acked(struct sock *sk, struct ack_sample *sample)
>
>Maybe move some of this function to tcpnv_cong_avoid()?
It needs to be here since We need the information provided in argument
sample
>
>
>> +{
>> + const struct inet_connection_sock *icsk = inet_csk(sk);
>> + struct tcp_sock *tp = tcp_sk(sk);
>> + struct tcpnv *ca = inet_csk_ca(sk);
>> + unsigned long now = jiffies;
>> + s64 rate64 = 0;
>> + u32 rate, max_win, cwnd_by_slope;
>> + u32 avg_rtt;
>> + u32 bytes_acked = 0;
>> +
>> + /* Some calls are for duplicates without timetamps */
>> + if (sample->rtt_us < 0)
>> + return;
>> +
>> + /* If not in TCP_CA_Open state, skip. */
>> + if (icsk->icsk_ca_state != TCP_CA_Open)
>> + return;
>
>Consider using samples in other states too, especially
>TCP_CA_Disorder. Linux 4.2 enhances RTT sampling from SACKs, so any
>non-negative RTT sample should be fully usable.
Done
>
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [RFC PATCH v5 net-next 4/4] tcp: add NV congestion control
2015-08-06 0:51 ` Kenneth Klette Jonassen
2015-08-21 21:29 ` Lawrence Brakmo
@ 2015-08-26 0:05 ` Lawrence Brakmo
1 sibling, 0 replies; 8+ messages in thread
From: Lawrence Brakmo @ 2015-08-26 0:05 UTC (permalink / raw)
To: Kenneth Klette Jonassen
Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng,
Stephen Hemminger
On 8/5/15, 5:51 PM, "knneth@gmail.com on behalf of Kenneth Klette
Jonassen" <knneth@gmail.com on behalf of kennetkl@ifi.uio.no> wrote:
>On Wed, Aug 5, 2015 at 3:39 AM, Lawrence Brakmo <brakmo@fb.com> wrote:
>> This is a request for comments.
>
>Nice to see more development on delay-based congestion control.
>
>It would be good to see how NV stacks up against CDG. Any chance of
>adding cdg as a congestion control parameter to your experiments?
Done. I¹m updating the TCP-NV page with the updated results, should be
done by tomorrow (8/26).
>Experiments on NV without its temporary cwnd reductions would also be
>of interest -- to get a reference of how effective this mechanism is.
Done. It turns out that it only improves fairness a little but hurts
P99 latencies more significantly. So it is now off by default.
Thanks for making me re-examine this feature.
^ permalink raw reply [flat|nested] 8+ messages in thread