netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Ben Woodard <woodard@redhat.com>
To: Vlad Yasevich <vladislav.yasevich@hp.com>
Cc: David Miller <davem@davemloft.net>,
	netdev@vger.kernel.org, mgrondona@llnl.gov, behlendorf1@llnl.gov
Subject: Re: [PATCH] Customizable TCP backoff patch
Date: Wed, 11 Oct 2006 17:51:24 -0700	[thread overview]
Message-ID: <452D918C.6050300@redhat.com> (raw)
In-Reply-To: <452CF94B.9000309@hp.com>

[-- Attachment #1: Type: text/plain, Size: 4968 bytes --]

Here we go again. The changes are as follows:
1) the spaces are gone and now it is tabs.
2) used msecs_to_jiffies() and jiffies_to_msecs(). I'm much happier with 
  that. I didn't know those functions existed.
3) changed over to proc_doulongvec_ms_jiffies_minmax(). Last night's 
test compile failed because I was out of disk. I figured that the code I 
changed had compiled. I guess not.

I really didn't understand Yoshifuji Hideaki's comment. (sorry)

-ben


Vlad Yasevich wrote:
> Ben Woodard wrote:
>  ------------------------------------------------------------------------
>> diff -ru linux-2.6.18/include/net/tcp.h linux-2.6.18.new/include/net/tcp.h
>> --- linux-2.6.18/include/net/tcp.h	2006-09-19 20:42:06.000000000 -0700
>> +++ linux-2.6.18.new/include/net/tcp.h	2006-10-10 18:42:00.000000000 -0700
>> @@ -227,11 +227,23 @@
>>  extern int sysctl_tcp_base_mss;
>>  extern int sysctl_tcp_workaround_signed_windows;
>>  extern int sysctl_tcp_slow_start_after_idle;
>> +extern unsigned long sysctl_tcp_rto_max;
>> +extern unsigned long sysctl_tcp_rto_init;
>>  
>>  extern atomic_t tcp_memory_allocated;
>>  extern atomic_t tcp_sockets_allocated;
>>  extern int tcp_memory_pressure;
>>  
>> +static inline unsigned long tcp_rto_max(struct tcp_sock *tp)
>> +{
>> +        return tp->rto_max ? tp->rto_max*HZ/1000 : sysctl_tcp_rto_max;
>                                 ^^^^^^^^^^^^^^^^^^^
> That should probably be msecs_to_jiffies(tp->rto_max)
> 
>> +}
>> +
>> +static inline unsigned long tcp_rto_init(struct tcp_sock *tp)
>> +{
>> +        return tp->rto_init ? tp->rto_init*HZ/1000 : sysctl_tcp_rto_init;
> 
> Ditto.
> 
>> +}
>> +
>>  /*
>>   * The next routines deal with comparing 32 bit unsigned ints
>>   * and worry about wraparound (automatic with unsigned arithmetic).
>> diff -ru linux-2.6.18/net/ipv4/sysctl_net_ipv4.c linux-2.6.18.new/net/ipv4/sysctl_net_ipv4.c
>> --- linux-2.6.18/net/ipv4/sysctl_net_ipv4.c	2006-09-19 20:42:06.000000000 -0700
>> +++ linux-2.6.18.new/net/ipv4/sysctl_net_ipv4.c	2006-10-10 16:32:08.000000000 -0700
>> @@ -128,6 +128,8 @@
>>  	return ret;
>>  }
>>  
>> +static unsigned long tcp_rto_min=0;
>> +static unsigned long tcp_rto_max=65535;
>>  
>>  ctl_table ipv4_table[] = {
>>          {
>> @@ -697,6 +699,26 @@
>>  		.mode		= 0644,
>>  		.proc_handler	= &proc_dointvec
>>  	},
>> +	{
>> +	        .ctl_name       = NET_TCP_RTO_MAX,
>> +		.procname       = "tcp_rto_max",
>> +		.data           = &sysctl_tcp_rto_max,
>> +		.maxlen         = sizeof(unsigned),
>> +		.mode           = 0644,
>> +		.proc_handler   = &proc_doulongvec_ms_jiffies,
>> +		.extra1         = &tcp_rto_min_constant,
>> +		.extra2         = &tcp_rto_max_constant,
>> +	},
>> +	{
>> +	        .ctl_name       = NET_TCP_RTO_INIT,
>> +		.procname       = "tcp_rto_init",
>> +		.data           = &sysctl_tcp_rto_init,
>> +		.maxlen         = sizeof(unsigned),
>> +		.mode           = 0644,
>> +		.proc_handler   = &proc_doulongvec_ms_jiffies,
>> +		.extra1         = &tcp_rto_min_constant,
>> +		.extra2         = &tcp_rto_max_constant,
>> +	},
>>  	{ .ctl_name = 0 }
>>  };
> 
> Try as I might, I can't find proc_doulongvec_ms_jiffies() anywhere.  I think
> you meant proc_doulongvec_ms_jiffies_minmax()?
> 
> Also, this function has a bug in that it doesn't do corrections for potentially different
> HZ values.  Look at the msecs_to_jiffies...  When I've used proc_doulongvec_ms_jiffies_minmax()
> before, I would seen rather interesting truncation errors such that sysctl input didn't match
> sysctl output.
> 
>>  
>> diff -ru linux-2.6.18/net/ipv4/tcp.c linux-2.6.18.new/net/ipv4/tcp.c
>> --- linux-2.6.18/net/ipv4/tcp.c	2006-09-19 20:42:06.000000000 -0700
>> +++ linux-2.6.18.new/net/ipv4/tcp.c	2006-10-10 18:37:40.000000000 -0700
>> @@ -1764,6 +1764,8 @@
>>  	return err;
>>  }
>>  
>> +#define TCP_BACKOFF_MAXVAL 65535
>> +
>>  /*
>>   *	Socket option code for TCP.
>>   */
>> @@ -1939,6 +1941,21 @@
>>  		}
>>  		break;
>>  
>> +        case TCP_BACKOFF_MAX:
>> +                if (val < 1 || val > TCP_BACKOFF_MAXVAL)
>> +                        err = -EINVAL;
>> +                else
>> +                        tp->rto_max = val;
>> +                break;
>> + 
>> +        case TCP_BACKOFF_INIT:
>> +                if (val < 1 || val > TCP_BACKOFF_MAXVAL)
>> +                        err = -EINVAL;
>> +                else
>> +                        tp->rto_init = val;
>> +                break;
>> + 
>> + 
>>  	default:
>>  		err = -ENOPROTOOPT;
>>  		break;
>> @@ -2110,6 +2127,12 @@
>>  		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
>>  			return -EFAULT;
>>  		return 0;
>> +        case TCP_BACKOFF_MAX:
>> +	        val = tcp_rto_max(tp)*1000/HZ;
>> +                break;
>> +        case TCP_BACKOFF_INIT:
>> +	        val = tcp_rto_init(tp)*1000/HZ;
> 
> The two divisions above introduce truncation errors such the input doesn't
> match the output.  Better to use jiffies_to_msecs().
> 
> Regards
> -vlad


[-- Attachment #2: tcp-backoff-18-5.diff --]
[-- Type: text/x-patch, Size: 6796 bytes --]

diff -ru linux-2.6.18/include/linux/sysctl.h linux-2.6.18.new/include/linux/sysctl.h
--- linux-2.6.18/include/linux/sysctl.h	2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.new/include/linux/sysctl.h	2006-10-11 10:27:52.000000000 -0700
@@ -411,6 +411,8 @@
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
 	NET_TCP_SLOW_START_AFTER_IDLE=117,
+	NET_TCP_RTO_MAX=118,
+	NET_TCP_RTO_INIT=119,
 };
 
 enum {
diff -ru linux-2.6.18/include/linux/tcp.h linux-2.6.18.new/include/linux/tcp.h
--- linux-2.6.18/include/linux/tcp.h	2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.new/include/linux/tcp.h	2006-10-11 10:41:40.000000000 -0700
@@ -94,6 +94,8 @@
 #define TCP_INFO		11	/* Information about this connection. */
 #define TCP_QUICKACK		12	/* Block/reenable quick acks */
 #define TCP_CONGESTION		13	/* Congestion control algorithm */
+#define TCP_BACKOFF_MAX         14      /* Maximum backoff value */
+#define TCP_BACKOFF_INIT        15      /* Initial backoff value */
 
 #define TCPI_OPT_TIMESTAMPS	1
 #define TCPI_OPT_SACK		2
@@ -257,6 +259,8 @@
 	__u8	frto_counter;	/* Number of new acks after RTO */
 	__u8	nonagle;	/* Disable Nagle algorithm?             */
 	__u8	keepalive_probes; /* num of allowed keep alive probes	*/
+	__u16	rto_max;	/* Maximum backoff value in ms		*/
+	__u16	rto_init;	/* Initial backoff value in ms		*/
 
 /* RTT measurement */
 	__u32	srtt;		/* smoothed round trip time << 3	*/
diff -ru linux-2.6.18/include/net/tcp.h linux-2.6.18.new/include/net/tcp.h
--- linux-2.6.18/include/net/tcp.h	2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.new/include/net/tcp.h	2006-10-11 17:43:23.091431000 -0700
@@ -227,11 +227,23 @@
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
+extern unsigned long sysctl_tcp_rto_max;
+extern unsigned long sysctl_tcp_rto_init;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
+static inline unsigned long tcp_rto_max(struct tcp_sock *tp)
+{
+	return tp->rto_max ? msecs_to_jiffies(tp->rto_max) : sysctl_tcp_rto_max;
+}
+
+static inline unsigned long tcp_rto_init(struct tcp_sock *tp)
+{
+	return tp->rto_init ? msecs_to_jiffies(tp->rto_init) : sysctl_tcp_rto_init;
+}
+
 /*
  * The next routines deal with comparing 32 bit unsigned ints
  * and worry about wraparound (automatic with unsigned arithmetic).
Only in linux-2.6.18.new/include/net: tcp.h~
diff -ru linux-2.6.18/net/ipv4/sysctl_net_ipv4.c linux-2.6.18.new/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.18/net/ipv4/sysctl_net_ipv4.c	2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.new/net/ipv4/sysctl_net_ipv4.c	2006-10-11 16:00:37.000000000 -0700
@@ -128,6 +128,8 @@
 	return ret;
 }
 
+static unsigned long tcp_rto_min=0;
+static unsigned long tcp_rto_max=65535;
 
 ctl_table ipv4_table[] = {
         {
@@ -697,6 +699,26 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_TCP_RTO_MAX,
+		.procname	= "tcp_rto_max",
+		.data		= &sysctl_tcp_rto_max,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &tcp_rto_min_constant,
+		.extra2		= &tcp_rto_max_constant,
+	},
+	{
+		.ctl_name	= NET_TCP_RTO_INIT,
+		.procname	= "tcp_rto_init",
+		.data		= &sysctl_tcp_rto_init,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &tcp_rto_min_constant,
+		.extra2		= &tcp_rto_max_constant,
+	},
 	{ .ctl_name = 0 }
 };
 
diff -ru linux-2.6.18/net/ipv4/tcp.c linux-2.6.18.new/net/ipv4/tcp.c
--- linux-2.6.18/net/ipv4/tcp.c	2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.new/net/ipv4/tcp.c	2006-10-11 16:00:37.000000000 -0700
@@ -1764,6 +1764,8 @@
 	return err;
 }
 
+#define TCP_BACKOFF_MAXVAL 65535
+
 /*
  *	Socket option code for TCP.
  */
@@ -1939,6 +1941,20 @@
 		}
 		break;
 
+	case TCP_BACKOFF_MAX:
+		if (val < 1 || val > TCP_BACKOFF_MAXVAL)
+			err = -EINVAL;
+		else
+			tp->rto_max = val;
+		break;
+ 
+	case TCP_BACKOFF_INIT:
+		if (val < 1 || val > TCP_BACKOFF_MAXVAL)
+			err = -EINVAL;
+		else
+			tp->rto_init = val;
+		break;
+ 
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2110,6 +2126,12 @@
 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
 			return -EFAULT;
 		return 0;
+	case TCP_BACKOFF_MAX:
+		val = jiffies_to_msecs(tcp_rto_max(tp));
+		break;
+	case TCP_BACKOFF_INIT:
+		val = jiffies_to_msecs(tcp_rto_init(tp));
+		break;
 	default:
 		return -ENOPROTOOPT;
 	};
diff -ru linux-2.6.18/net/ipv4/tcp_timer.c linux-2.6.18.new/net/ipv4/tcp_timer.c
--- linux-2.6.18/net/ipv4/tcp_timer.c	2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.new/net/ipv4/tcp_timer.c	2006-10-11 10:41:39.000000000 -0700
@@ -31,6 +31,8 @@
 int sysctl_tcp_retries1 = TCP_RETR1;
 int sysctl_tcp_retries2 = TCP_RETR2;
 int sysctl_tcp_orphan_retries;
+unsigned long sysctl_tcp_rto_max  = TCP_RTO_MAX;
+unsigned long sysctl_tcp_rto_init = TCP_TIMEOUT_INIT;
 
 static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
@@ -71,7 +73,8 @@
 
 	/* If peer does not open window for long time, or did not transmit 
 	 * anything for long time, penalize it. */
-	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*tcp_rto_max(tp) || 
+	    !do_reset)
 		orphans <<= 1;
 
 	/* If some dubious ICMP arrived, penalize even more. */
@@ -256,8 +259,8 @@
 	max_probes = sysctl_tcp_retries2;
 
 	if (sock_flag(sk, SOCK_DEAD)) {
-		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
- 
+		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < 
+				   tcp_rto_max(tp));
 		max_probes = tcp_orphan_retries(sk, alive);
 
 		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
@@ -301,7 +304,7 @@
 			       inet->num, tp->snd_una, tp->snd_nxt);
 		}
 #endif
-		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+		if (tcp_time_stamp - tp->rcv_tstamp > tcp_rto_max(tp)) {
 			tcp_write_err(sk);
 			goto out;
 		}
@@ -373,7 +376,8 @@
 
 out_reset_timer:
 	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, 
+				  tcp_rto_max(tp));
 	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 		__sk_dst_reset(sk);
 
@@ -427,8 +431,8 @@
 
 static void tcp_synack_timer(struct sock *sk)
 {
-	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
-				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, TCP_TIMEOUT_INIT, 
+				   tcp_rto_max(tcp_sk(sk)));
 }
 
 void tcp_set_keepalive(struct sock *sk, int val)

  reply	other threads:[~2006-10-12  0:51 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-09-27 18:52 [PATCH] Customizable TCP backoff patch Ben Woodard
2006-09-27 23:16 ` David Miller
2006-09-27 23:00   ` Stephen Hemminger
2006-09-28  2:06     ` David Miller
2006-10-03 18:14   ` Ben Woodard
2006-10-04  7:07     ` David Miller
2006-10-04  8:56       ` Ingo Oeser
2006-10-04  9:08         ` David Miller
2006-10-04 17:17       ` Stephen Hemminger
2006-10-11  1:46       ` Ben Woodard
2006-10-11  1:59         ` YOSHIFUJI Hideaki / 吉藤英明
2006-10-11  2:53         ` David Miller
2006-10-11  2:54         ` David Miller
2006-10-11 14:01         ` Vlad Yasevich
2006-10-12  0:51           ` Ben Woodard [this message]
2006-10-12  1:06             ` YOSHIFUJI Hideaki / 吉藤英明
2006-10-12 15:49               ` Ben Woodard

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=452D918C.6050300@redhat.com \
    --to=woodard@redhat.com \
    --cc=behlendorf1@llnl.gov \
    --cc=davem@davemloft.net \
    --cc=mgrondona@llnl.gov \
    --cc=netdev@vger.kernel.org \
    --cc=vladislav.yasevich@hp.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).