public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] TCP timestamp hardening (update)
@ 2007-07-25 10:44 shemminger
  2007-07-25 10:44 ` [PATCH 1/3] TCP: congestion control API pass RTT in microseconds shemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: shemminger @ 2007-07-25 10:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

These patches deal with issues brought up by Gavin McCullagh
about reactions of Cubic and HTCP to hostile receivers that return
bogus timestamp options. If the receiver crafts a timestamp that is
larger than the original, then some of the congestion control algorithms
may become unfair.

The solution in these patches is to only use local values to measure
RTT for congestion control.  The timestamp is still used as described
in the RFCs to measure the RTT used for the retransmit timer.

Thank you to Sangtae Ha for testing these; see:
  http://netsrv.csc.ncsu.edu/net-2.6.22/stephen_lowres/
He also found some pre-existing problems with TCP-LP that might
be related to NAPI on the receiver.

This should go into 2.6.23, but not into the stable kernel,
since the risk of causing a regression is greater than the possible
risk exposure.

-- 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] TCP: congestion control API pass RTT in microseconds
  2007-07-25 10:44 [PATCH 0/3] TCP timestamp hardening (update) shemminger
@ 2007-07-25 10:44 ` shemminger
  2007-07-25 10:44 ` [PATCH 2/3] TCP: cubic - eliminate use of receive time stamp shemminger
  2007-07-25 10:44 ` [PATCH 3/3] TCP: htcp - use measured rtt shemminger
  2 siblings, 0 replies; 4+ messages in thread
From: shemminger @ 2007-07-25 10:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: tcp-cong-rtt.patch --]
[-- Type: text/plain, Size: 8466 bytes --]

This patch changes the API for the callback that is done after an ACK is
received. It solves a couple of issues:

  * Some congestion controls want a higher-resolution value of RTT
    (controlled by the TCP_CONG_RTT_STAMP flag). These don't really want a
    ktime; they all compute an RTT in microseconds.

  * Other congestion control methods could use RTT at jiffies resolution.

To keep the API consistent, the units should be the same in both cases;
only the resolution should change.

A value of -1 is used to indicate that no valid RTT sample is available.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- a/include/net/tcp.h	2007-07-23 10:29:42.000000000 +0100
+++ b/include/net/tcp.h	2007-07-23 10:33:50.000000000 +0100
@@ -660,7 +660,7 @@ struct tcp_congestion_ops {
 	/* new value of cwnd after loss (optional) */
 	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
-	void (*pkts_acked)(struct sock *sk, u32 num_acked, ktime_t last);
+	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
 	/* get info for inet_diag (optional) */
 	void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb);
 
--- a/net/ipv4/tcp_input.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_input.c	2007-07-23 10:33:50.000000000 +0100
@@ -2490,12 +2490,23 @@ static int tcp_clean_rtx_queue(struct so
 		tcp_ack_update_rtt(sk, acked, seq_rtt);
 		tcp_ack_packets_out(sk);
 
-		/* Is the ACK triggering packet unambiguous? */
-		if (acked & FLAG_RETRANS_DATA_ACKED)
-			last_ackt = net_invalid_timestamp();
+		if (ca_ops->pkts_acked) {
+			s32 rtt_us = -1;
 
-		if (ca_ops->pkts_acked)
-			ca_ops->pkts_acked(sk, pkts_acked, last_ackt);
+			/* Is the ACK triggering packet unambiguous? */
+			if (!(acked & FLAG_RETRANS_DATA_ACKED)) {
+				/* High resolution needed and available? */
+				if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
+				    !ktime_equal(last_ackt,
+						 net_invalid_timestamp()))
+					rtt_us = ktime_us_delta(ktime_get_real(),
+								last_ackt);
+				else if (seq_rtt > 0)
+					rtt_us = jiffies_to_usecs(seq_rtt);
+			}
+
+			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+		}
 	}
 
 #if FASTRETRANS_DEBUG > 0
--- a/net/ipv4/tcp_bic.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_bic.c	2007-07-23 10:33:50.000000000 +0100
@@ -206,7 +206,7 @@ static void bictcp_state(struct sock *sk
 /* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last)
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
--- a/net/ipv4/tcp_cubic.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_cubic.c	2007-07-23 10:33:50.000000000 +0100
@@ -334,7 +334,7 @@ static void bictcp_state(struct sock *sk
 /* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last)
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
--- a/net/ipv4/tcp_htcp.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_htcp.c	2007-07-23 10:33:50.000000000 +0100
@@ -98,7 +98,7 @@ static inline void measure_rtt(struct so
 	}
 }
 
-static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t last)
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
--- a/net/ipv4/tcp_illinois.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_illinois.c	2007-07-23 10:33:50.000000000 +0100
@@ -83,18 +83,16 @@ static void tcp_illinois_init(struct soc
 }
 
 /* Measure RTT for each ack. */
-static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last)
+static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
 {
 	struct illinois *ca = inet_csk_ca(sk);
-	u32 rtt;
 
 	ca->acked = pkts_acked;
 
-	if (ktime_equal(last, net_invalid_timestamp()))
+	/* dup ack, no rtt sample */
+	if (rtt < 0)
 		return;
 
-	rtt = ktime_to_us(net_timedelta(last));
-
 	/* ignore bogus values, this prevents wraparound in alpha math */
 	if (rtt > RTT_MAX)
 		rtt = RTT_MAX;
--- a/net/ipv4/tcp_lp.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_lp.c	2007-07-23 10:33:50.000000000 +0100
@@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct soc
  * newReno in increase case.
  * We work it out by following the idea from TCP-LP's paper directly
  */
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last)
+static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct lp *lp = inet_csk_ca(sk);
 
-	if (!ktime_equal(last, net_invalid_timestamp()))
-		tcp_lp_rtt_sample(sk,  ktime_to_us(net_timedelta(last)));
+	if (rtt_us > 0)
+		tcp_lp_rtt_sample(sk, rtt_us);
 
 	/* calc inference */
 	if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
--- a/net/ipv4/tcp_vegas.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_vegas.c	2007-07-23 10:33:50.000000000 +0100
@@ -112,16 +112,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init);
  *   o min-filter RTT samples from a much longer window (forever for now)
  *     to find the propagation delay (baseRTT)
  */
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 {
 	struct vegas *vegas = inet_csk_ca(sk);
 	u32 vrtt;
 
-	if (ktime_equal(last, net_invalid_timestamp()))
+	if (rtt_us < 0)
 		return;
 
 	/* Never allow zero rtt or baseRTT */
-	vrtt = ktime_to_us(net_timedelta(last)) + 1;
+	vrtt = rtt_us + 1;
 
 	/* Filter to find propagation delay: */
 	if (vrtt < vegas->baseRTT)
--- a/net/ipv4/tcp_vegas.h	2007-07-23 10:27:43.000000000 +0100
+++ b/net/ipv4/tcp_vegas.h	2007-07-23 10:33:50.000000000 +0100
@@ -17,7 +17,7 @@ struct vegas {
 
 extern void tcp_vegas_init(struct sock *sk);
 extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
-extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last);
+extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
 extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
 extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
 
--- a/net/ipv4/tcp_veno.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_veno.c	2007-07-23 10:33:50.000000000 +0100
@@ -69,16 +69,16 @@ static void tcp_veno_init(struct sock *s
 }
 
 /* Do rtt sampling needed for Veno. */
-static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
+static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 {
 	struct veno *veno = inet_csk_ca(sk);
 	u32 vrtt;
 
-	if (ktime_equal(last, net_invalid_timestamp()))
+	if (rtt_us < 0)
 		return;
 
 	/* Never allow zero rtt or baseRTT */
-	vrtt = ktime_to_us(net_timedelta(last)) + 1;
+	vrtt = rtt_us + 1;
 
 	/* Filter to find propagation delay: */
 	if (vrtt < veno->basertt)
--- a/net/ipv4/tcp_westwood.c	2007-07-23 10:27:43.000000000 +0100
+++ b/net/ipv4/tcp_westwood.c	2007-07-23 10:33:50.000000000 +0100
@@ -100,11 +100,12 @@ static void westwood_filter(struct westw
  * Called after processing group of packets.
  * but all westwood needs is the last sample of srtt.
  */
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
 {
 	struct westwood *w = inet_csk_ca(sk);
-	if (cnt > 0)
-		w->rtt = tcp_sk(sk)->srtt >> 3;
+
+	if (rtt > 0)
+		w->rtt = usecs_to_jiffies(rtt);
 }
 
 /*
--- a/net/ipv4/tcp_yeah.c	2007-07-23 10:29:42.000000000 +0100
+++ b/net/ipv4/tcp_yeah.c	2007-07-23 10:33:50.000000000 +0100
@@ -58,7 +58,7 @@ static void tcp_yeah_init(struct sock *s
 }
 
 
-static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last)
+static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct yeah *yeah = inet_csk_ca(sk);
@@ -66,7 +66,7 @@ static void tcp_yeah_pkts_acked(struct s
 	if (icsk->icsk_ca_state == TCP_CA_Open)
 		yeah->pkts_acked = pkts_acked;
 
-	tcp_vegas_pkts_acked(sk, pkts_acked, last);
+	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
 }
 
 static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack,

-- 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 2/3] TCP: cubic - eliminate use of receive time stamp
  2007-07-25 10:44 [PATCH 0/3] TCP timestamp hardening (update) shemminger
  2007-07-25 10:44 ` [PATCH 1/3] TCP: congestion control API pass RTT in microseconds shemminger
@ 2007-07-25 10:44 ` shemminger
  2007-07-25 10:44 ` [PATCH 3/3] TCP: htcp - use measured rtt shemminger
  2 siblings, 0 replies; 4+ messages in thread
From: shemminger @ 2007-07-25 10:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: tcp-cubic-rtt.patch --]
[-- Type: text/plain, Size: 2502 bytes --]

Remove use of received timestamp option value from RTT calculation in Cubic.
A hostile receiver may be returning a larger timestamp option than the original
value. This would cause the sender to believe the malevolent receiver had
a larger RTT and because Cubic tries to provide some RTT friendliness, the
sender would then favor the liar.

Instead, use the jiffies-resolution RTT value already computed and
passed back after the ACK.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- a/net/ipv4/tcp_cubic.c	2007-07-23 10:33:50.000000000 +0100
+++ b/net/ipv4/tcp_cubic.c	2007-07-23 10:35:26.000000000 +0100
@@ -246,38 +246,12 @@ static inline void bictcp_update(struct 
 		ca->cnt = 1;
 }
 
-
-/* Keep track of minimum rtt */
-static inline void measure_delay(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
-	u32 delay;
-
-	/* No time stamp */
-	if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
-	     /* Discard delay samples right after fast recovery */
-	    (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
-		return;
-
-	delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3;
-	if (delay == 0)
-		delay = 1;
-
-	/* first time call or link delay decreases */
-	if (ca->delay_min == 0 || ca->delay_min > delay)
-		ca->delay_min = delay;
-}
-
 static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 			      u32 in_flight, int data_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
-	if (data_acked)
-		measure_delay(sk);
-
 	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
@@ -337,14 +311,30 @@ static void bictcp_state(struct sock *sk
 static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 delay;
 
 	if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
-		struct bictcp *ca = inet_csk_ca(sk);
 		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
 		ca->delayed_ack += cnt;
 	}
-}
 
+	/* Some calls are for duplicates without timestamps */
+	if (rtt_us < 0)
+		return;
+
+	/* Discard delay samples right after fast recovery */
+	if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+		return;
+
+	delay = usecs_to_jiffies(rtt_us) << 3;
+	if (delay == 0)
+		delay = 1;
+
+	/* first time call or link delay decreases */
+	if (ca->delay_min == 0 || ca->delay_min > delay)
+		ca->delay_min = delay;
+}
 
 static struct tcp_congestion_ops cubictcp = {
 	.init		= bictcp_init,

-- 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 3/3] TCP: htcp - use measured rtt
  2007-07-25 10:44 [PATCH 0/3] TCP timestamp hardening (update) shemminger
  2007-07-25 10:44 ` [PATCH 1/3] TCP: congestion control API pass RTT in microseconds shemminger
  2007-07-25 10:44 ` [PATCH 2/3] TCP: cubic - eliminate use of receive time stamp shemminger
@ 2007-07-25 10:44 ` shemminger
  2 siblings, 0 replies; 4+ messages in thread
From: shemminger @ 2007-07-25 10:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: htcp-fix.patch --]
[-- Type: text/plain, Size: 1447 bytes --]

Change HTCP to use the measured RTT rather than the smoothed RTT.
Srtt is computed using the TCP receive timestamp
options, so it is vulnerable to hostile receivers. To avoid any problems
this might cause, use the measured RTT instead.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- a/net/ipv4/tcp_htcp.c	2007-07-19 08:26:40.000000000 +0100
+++ b/net/ipv4/tcp_htcp.c	2007-07-19 08:28:07.000000000 +0100
@@ -76,12 +76,11 @@ static u32 htcp_cwnd_undo(struct sock *s
 	return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
 }
 
-static inline void measure_rtt(struct sock *sk)
+static inline void measure_rtt(struct sock *sk, u32 srtt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct htcp *ca = inet_csk_ca(sk);
-	u32 srtt = tp->srtt >> 3;
 
 	/* keep track of minimum RTT seen so far, minRTT is zero at first */
 	if (ca->minRTT > srtt || !ca->minRTT)
@@ -108,6 +107,9 @@ static void measure_achieved_throughput(
 	if (icsk->icsk_ca_state == TCP_CA_Open)
 		ca->pkts_acked = pkts_acked;
 
+	if (rtt > 0)
+		measure_rtt(sk, usecs_to_jiffies(rtt));
+
 	if (!use_bandwidth_switch)
 		return;
 
@@ -237,8 +239,6 @@ static void htcp_cong_avoid(struct sock 
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
 		tcp_slow_start(tp);
 	else {
-		measure_rtt(sk);
-
 		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
 		 */

-- 


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2007-07-25 11:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-25 10:44 [PATCH 0/3] TCP timestamp hardening (update) shemminger
2007-07-25 10:44 ` [PATCH 1/3] TCP: congestion control API pass RTT in microseconds shemminger
2007-07-25 10:44 ` [PATCH 2/3] TCP: cubic - eliminate use of receive time stamp shemminger
2007-07-25 10:44 ` [PATCH 3/3] TCP: htcp - use measured rtt shemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox