netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] TCP_FAILFAST: a new socket option to timeout/abort a connection quicker
@ 2010-08-24  6:20 H.K. Jerry Chu
  2010-08-24  6:44 ` Eric Dumazet
  0 siblings, 1 reply; 17+ messages in thread
From: H.K. Jerry Chu @ 2010-08-24  6:20 UTC (permalink / raw)
  To: ilpo.jarvinen, davem; +Cc: netdev, Jerry Chu

From: Jerry Chu <hkchu@google.com>

This is a TCP level socket option that takes an unsigned int to specify
how long (in ms) TCP should keep retransmitting a lost data packet before giving up
and returning ETIMEDOUT. The normal TCP retry/abort timeout limit still
applies. In other words this option is only meant for those applications
that need to "fail faster" than the default TCP timeout. The latter
may take up to 20 minutes in a normal WAN environment.

The option is disabled when set to 0, which is the default. It also does not
apply during the connection establishment phase.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
---
 include/linux/tcp.h                |    1 +
 include/net/inet_connection_sock.h |    1 +
 net/ipv4/tcp.c                     |   11 ++++++++-
 net/ipv4/tcp_timer.c               |   42 +++++++++++++++++++++++++++++++----
 4 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a778ee0..60b7244 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -105,6 +105,7 @@ enum {
 #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
 #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
 #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
+#define TCP_FAILFAST		18	/* Abort connection in loss retry sooner*/
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b6d3b55..6553921 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -125,6 +125,7 @@ struct inet_connection_sock {
 		int		  probe_size;
 	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
+	u32			  icsk_max_timeout;
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 176e11a..ddb548a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2391,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		err = tp->af_specific->md5_parse(sk, optval, optlen);
 		break;
 #endif
-
+	case TCP_FAILFAST:
+		/* Cap the max timeout in ms TCP will retry/retrans
+		 * before giving up and aborting (ETIMEDOUT) a connection.
+		 */
+		icsk->icsk_max_timeout = msecs_to_jiffies(val);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2610,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		val = tp->thin_dupack;
 		break;
+
+	case TCP_FAILFAST:
+		val = jiffies_to_msecs(icsk->icsk_max_timeout);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 808bb92..95c2548 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -138,7 +138,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
  * retransmissions with an initial RTO of TCP_RTO_MIN.
  */
 static bool retransmits_timed_out(struct sock *sk,
-				  unsigned int boundary)
+				  unsigned int boundary,
+				  unsigned int max_timeout)
 {
 	unsigned int timeout, linear_backoff_thresh;
 	unsigned int start_ts;
@@ -159,6 +160,9 @@ static bool retransmits_timed_out(struct sock *sk,
 		timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
 			  (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
 
+	if (max_timeout != 0 && timeout > max_timeout)
+		timeout = max_timeout;
+
 	return (tcp_time_stamp - start_ts) >= timeout;
 }
 
@@ -174,7 +178,7 @@ static int tcp_write_timeout(struct sock *sk)
 			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
+		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -187,14 +191,16 @@ static int tcp_write_timeout(struct sock *sk)
 
 			retry_until = tcp_orphan_retries(sk, alive);
 			do_reset = alive ||
-				   !retransmits_timed_out(sk, retry_until);
+				   !retransmits_timed_out(sk, retry_until, 0);
 
 			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
 	}
 
-	if (retransmits_timed_out(sk, retry_until)) {
+	if (retransmits_timed_out(sk, retry_until,
+	    (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 :
+	    icsk->icsk_max_timeout)) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -434,9 +440,35 @@ out_reset_timer:
 	} else {
 		/* Use normal (exponential) backoff */
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+		if (icsk->icsk_max_timeout &&
+		    ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) == 0) {
+			int ts;
+			unsigned int base_rto =
+			    min(__tcp_set_rto(tp), TCP_RTO_MAX);
+
+			if (unlikely(!tp->retrans_stamp))
+				ts = (int)TCP_SKB_CB(tcp_write_queue_head(sk))->when;
+			else
+				ts = (int)tp->retrans_stamp;
+			ts = icsk->icsk_max_timeout - (tcp_time_stamp - ts) -
+				base_rto-1;
+			/*
+			 * Adjust rto so that the total timeout is not far off
+			 * the max_timeout range. Also if the total # of
+			 * retries would be less than 6, allow one more shot.
+			 */
+			if (icsk->icsk_rto > ts && icsk->icsk_retransmits < 6)
+				icsk->icsk_rto >>= 1;
+			if ((int)(icsk->icsk_rto) > ts) {
+				if (ts < (int)base_rto)
+					icsk->icsk_rto = base_rto;
+				else
+					icsk->icsk_rto = ts;
+			}
+		}
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
+	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
 		__sk_dst_reset(sk);
 
 out:;
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-08-26  7:42 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2010-08-24  6:20 [PATCH] TCP_FAILFAST: a new socket option to timeout/abort a connection quicker H.K. Jerry Chu
2010-08-24  6:44 ` Eric Dumazet
2010-08-24  8:04   ` Arnd Hannemann
2010-08-24  9:10     ` Hagen Paul Pfeifer
2010-08-24 14:58       ` Arnd Hannemann
2010-08-24 16:28         ` Hagen Paul Pfeifer
2010-08-24 22:13           ` Jerry Chu
2010-08-25  8:21             ` Hagen Paul Pfeifer
2010-08-25 20:20               ` Jerry Chu
2010-08-25 22:59                 ` Hagen Paul Pfeifer
2010-08-26  1:49                   ` Jerry Chu
2010-08-26  6:01                     ` Lars Eggert
2010-08-26  7:12                       ` Arnd Hannemann
2010-08-26  7:42                         ` Hagen Paul Pfeifer
2010-08-26  7:27                       ` Hagen Paul Pfeifer
2010-08-24 21:56     ` Jerry Chu
2010-08-24 20:47   ` Jerry Chu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).