netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Daniele Lacamera <root@danielinux.net>
To: Stephen Hemminger <shemminger@osdl.org>,
	"David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org, Carlo Caini <ccaini@deis.unibo.it>,
	Rosario Firrincieli <rfirrincieli@arces.unibo.it>,
	Giovanni Pau <gpau@cs.ucla.edu>
Subject: TCP Pacing
Date: Tue, 12 Sep 2006 19:58:21 +0200	[thread overview]
Message-ID: <200609121958.22820.root@danielinux.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 698 bytes --]

Hello,

Please let me insist once again on the importance of adding a TCP Pacing
mechanism to our TCP, as many people are including this algorithm in
their congestion control proposals. Recent research has found
that it really can help improve performance in different scenarios,
such as satellite links and long-delay high-speed channels (>100ms RTT, Gbit).
The Hybla module itself is crippled without this feature in its natural
scenario.

The following patch is totally non-invasive: it has a config option and 
a sysctl switch, both turned off by default. When the config option is 
enabled, it adds only 6B to the tcp_sock.

Signed-off-by: Daniele Lacamera <root@danielinux.net>
--- 







[-- Attachment #2: TCP_Pacing.diff --]
[-- Type: text/x-diff, Size: 10098 bytes --]

diff -ruN linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt linux-pacing/Documentation/networking/ip-sysctl.txt
--- linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/Documentation/networking/ip-sysctl.txt	2006-09-12 16:38:14.000000000 +0200
@@ -369,6 +369,12 @@
 	be timed out after an idle period.
 	Default: 1
 
+tcp_pacing - BOOLEAN
+	If set, enable time-based TCP segment sending, instead of normal
+	ack-based sending. A software timer is set every time a new ack 
+	is received, then packets are spread across the round-trip time.
+	Default: 0
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff -ruN linux-2.6.18-rc6/include/linux/sysctl.h linux-pacing/include/linux/sysctl.h
--- linux-2.6.18-rc6/include/linux/sysctl.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/sysctl.h	2006-09-12 18:13:38.000000000 +0200
@@ -411,6 +411,7 @@
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
 	NET_TCP_SLOW_START_AFTER_IDLE=117,
+	NET_TCP_PACING=118,
 };
 
 enum {
diff -ruN linux-2.6.18-rc6/include/linux/tcp.h linux-pacing/include/linux/tcp.h
--- linux-2.6.18-rc6/include/linux/tcp.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/tcp.h	2006-09-12 16:45:32.000000000 +0200
@@ -356,6 +356,17 @@
 		__u32		  probe_seq_start;
 		__u32		  probe_seq_end;
 	} mtu_probe;
+	
+#ifdef CONFIG_TCP_PACING
+/* TCP Pacing structure */
+	struct {
+		struct timer_list timer;
+		__u16   count;
+		__u16   burst;
+		__u8    lock;
+		__u8    delta;
+	} pacing;
+#endif
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -ruN linux-2.6.18-rc6/include/net/tcp.h linux-pacing/include/net/tcp.h
--- linux-2.6.18-rc6/include/net/tcp.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/net/tcp.h	2006-09-12 17:07:49.000000000 +0200
@@ -227,6 +227,9 @@
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
+#ifdef CONFIG_TCP_PACING
+extern int sysctl_tcp_pacing;
+#endif
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -449,6 +452,11 @@
 extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
 extern unsigned int tcp_current_mss(struct sock *sk, int large);
 
+#ifdef CONFIG_TCP_PACING
+extern void tcp_pacing_recalc_delta(struct sock *sk);
+extern void tcp_pacing_reset_timer(struct sock *sk);
+#endif
+
 /* tcp.c */
 extern void tcp_get_info(struct sock *, struct tcp_info *);
 
diff -ruN linux-2.6.18-rc6/net/ipv4/Kconfig linux-pacing/net/ipv4/Kconfig
--- linux-2.6.18-rc6/net/ipv4/Kconfig	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/Kconfig	2006-09-12 16:59:37.000000000 +0200
@@ -572,6 +572,20 @@
 	loss packets.
 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
 
+config TCP_PACING
+	bool "TCP Pacing"
+	depends on EXPERIMENTAL
+	select HZ_1000
+	default n
+	---help---
+	Many researchers have observed that TCP's congestion control mechanisms 
+	can lead to bursty traffic flows on modern high-speed networks, with a 
+	negative impact on overall network efficiency. A proposed solution to this 
+	problem is to evenly space, or "pace", data sent into the network over an 
+	entire round-trip time, so that data is not sent in a burst.
+	To enable this feature, please refer to Documentation/networking/ip-sysctl.txt.
+	If unsure, say N.
+	
 endmenu
 
 config TCP_CONG_BIC
diff -ruN linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c linux-pacing/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/sysctl_net_ipv4.c	2006-09-12 18:33:36.000000000 +0200
@@ -697,6 +697,16 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_TCP_PACING
+	{
+		.ctl_name	= NET_TCP_PACING,
+		.procname	= "tcp_pacing",
+		.data		= &sysctl_tcp_pacing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_input.c linux-pacing/net/ipv4/tcp_input.c
--- linux-2.6.18-rc6/net/ipv4/tcp_input.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_input.c	2006-09-12 17:11:38.000000000 +0200
@@ -2569,6 +2569,11 @@
 			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
 	}
 
+#ifdef CONFIG_TCP_PACING
+	if(sysctl_tcp_pacing)
+		tcp_pacing_recalc_delta(sk);
+#endif
+
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
 		dst_confirm(sk->sk_dst_cache);
 
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_output.c linux-pacing/net/ipv4/tcp_output.c
--- linux-2.6.18-rc6/net/ipv4/tcp_output.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_output.c	2006-09-12 18:12:38.000000000 +0200
@@ -62,6 +62,10 @@
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle = 1;
 
+#ifdef CONFIG_TCP_PACING
+int sysctl_tcp_pacing=0;
+#endif
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -414,7 +418,13 @@
 		
 	if (tcp_packets_in_flight(tp) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
-
+	
+#ifdef CONFIG_TCP_PACING
+	if(sysctl_tcp_pacing) {
+		tcp_pacing_reset_timer(sk);
+		tp->pacing.lock = 1;
+	}
+#endif
 	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 	skb->h.th = th;
 	skb_set_owner_w(skb, sk);
@@ -1085,7 +1095,15 @@
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
-
+	
+#ifdef CONFIG_TCP_PACING
+	/* TCP Pacing conflicts with this algorithm.
+	 * When Pacing is enabled, don't try to defer.
+	 */
+	if(sysctl_tcp_pacing)
+		return 0;
+#endif
+	
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 0;
 
@@ -1308,7 +1326,12 @@
 
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
-
+		
+#ifdef CONFIG_TCP_PACING
+		if (sysctl_tcp_pacing && tp->pacing.lock)
+			return 0;
+#endif
+		
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
@@ -1323,6 +1346,10 @@
 		if (tso_segs > 1) {
 			limit = tcp_window_allows(tp, skb,
 						  mss_now, cwnd_quota);
+#ifdef CONFIG_TCP_PACING
+		if (sysctl_tcp_pacing && sent_pkts >= tp->pacing.burst)
+			tp->pacing.lock=1;
+#endif
 
 			if (skb->len < limit) {
 				unsigned int trim = skb->len % mss_now;
@@ -1733,6 +1760,11 @@
 		}
 	}
 
+#ifdef CONFIG_TCP_PACING
+	if (sysctl_tcp_pacing && tp->pacing.lock)
+		return -EAGAIN;
+#endif
+
 	/* Make a copy, if the first transmission SKB clone we made
 	 * is still in somebody's hands, else make a clone.
 	 */
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_timer.c linux-pacing/net/ipv4/tcp_timer.c
--- linux-2.6.18-rc6/net/ipv4/tcp_timer.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_timer.c	2006-09-12 18:03:17.000000000 +0200
@@ -36,10 +36,21 @@
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
+#ifdef CONFIG_TCP_PACING
+static void tcp_pacing_timer(unsigned long data);
+#endif
+
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
+	
+#ifdef CONFIG_TCP_PACING
+	init_timer(&(tcp_sk(sk)->pacing.timer));
+	tcp_sk(sk)->pacing.timer.function=&tcp_pacing_timer;
+	tcp_sk(sk)->pacing.timer.data = (unsigned long) sk;
+#endif
+
 }
 
 EXPORT_SYMBOL(tcp_init_xmit_timers);
@@ -522,3 +533,115 @@
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
+
+#ifdef CONFIG_TCP_PACING
+/*
+ * Pacing timer handler: clears tp->pacing.lock and pushes any pending frames.
+ * tcp_pacing_recalc_delta() derives the expiry interval from srtt and the
+ * window; tcp_pacing_reset_timer() re-arms the timer, taking a sock reference
+ * that every exit path of this handler must drop with sock_put().
+ */
+static void tcp_pacing_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	bh_lock_sock(sk);
+	/* Pacing switched off meanwhile: still drop the timer's reference. */
+	if (!sysctl_tcp_pacing)
+		goto out_unlock;
+	if (sock_owned_by_user(sk)) {
+		/* Try again later */
+		if (!mod_timer(&tp->pacing.timer, jiffies + 1))
+			sock_hold(sk);
+		goto out_unlock;
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/* Unlock sending, so when next ack is received it will pass.
+	 * If there are no packets scheduled, do nothing.
+	 */
+	tp->pacing.lock = 0;
+
+	if (!sk->sk_send_head) {
+		/* Sending queue empty */
+		goto out;
+	}
+
+	/* Push the pending frames now that sending is unlocked */
+	tcp_push_pending_frames(sk, tp);
+
+out:
+	if (tcp_memory_pressure)
+		sk_stream_mem_reclaim(sk);
+
+out_unlock:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+void tcp_pacing_reset_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Expiry must be unsigned long like jiffies; __u32 truncates on 64-bit. */
+	unsigned long timeout = jiffies + tp->pacing.delta;
+
+	/* Arm the pacing timer; it takes a sock reference if it was not pending. */
+	if (sysctl_tcp_pacing && !mod_timer(&tp->pacing.timer, timeout))
+		sock_hold(sk);
+}
+EXPORT_SYMBOL(tcp_pacing_reset_timer);
+
+/* Recompute tp->pacing.delta (added to jiffies when the timer is armed) and
+ * tp->pacing.burst, using a simplified uniform pacing policy.
+ */
+void tcp_pacing_recalc_delta(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 window = tp->snd_cwnd << 3;
+	__u32 srtt = tp->srtt;
+	__u32 round = 0;
+	__u32 delta;	/* untruncated interval; tp->pacing.delta is only a __u8 */
+	int state = inet_csk(sk)->icsk_ca_state;
+
+	if (state == TCP_CA_Recovery && tp->snd_cwnd < tp->snd_ssthresh)
+		window = tp->snd_ssthresh << 3;
+	if ((tp->snd_wnd / tp->mss_cache) < tp->snd_cwnd)
+		window = (tp->snd_wnd / tp->mss_cache) << 3;
+
+	if (window > 1 && srtt) {
+		if (window <= srtt) {
+			/* >= 1 here; a __u8 copy could wrap to 0 and be divided by */
+			delta = srtt / window;
+			if (srtt % window)
+				round = (srtt / (srtt % window)) / delta;
+			if (tp->pacing.count >= (round - 1) && round > 1) {
+				delta++;
+				tp->pacing.count = 0;
+			}
+			tp->pacing.delta = delta > 255 ? 255 : delta;
+			tp->pacing.burst = 1;
+		} else {
+			tp->pacing.delta = 1;
+			tp->pacing.burst = window / srtt;
+			if (window % srtt)
+				round = (window / (window % srtt)) * tp->pacing.burst;
+			if (tp->pacing.count >= (round - 1) && round > 1) {
+				tp->pacing.burst++;
+				tp->pacing.count = 0;
+			}
+		}
+	} else {
+		tp->pacing.delta = 0;
+		tp->pacing.burst = 1;
+	}
+}
+
+EXPORT_SYMBOL(tcp_pacing_recalc_delta);
+
+#endif
+
+
+

             reply	other threads:[~2006-09-12 17:58 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-09-12 17:58 Daniele Lacamera [this message]
2006-09-12 18:21 ` TCP Pacing Arnaldo Carvalho de Melo
2006-09-12 21:26 ` Ian McDonald
2006-09-13  8:18   ` Daniele Lacamera
2006-09-13 15:46     ` Daniele Lacamera
2006-09-16  0:41       ` Xiaoliang (David) Wei
2006-09-19 11:31         ` Daniele Lacamera
2006-09-13 18:30     ` Ian McDonald
2006-09-13  3:41 ` Stephen Hemminger
2006-09-13  8:18   ` Daniele Lacamera
2006-09-14  1:21     ` Stephen Hemminger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200609121958.22820.root@danielinux.net \
    --to=root@danielinux.net \
    --cc=ccaini@deis.unibo.it \
    --cc=davem@davemloft.net \
    --cc=gpau@cs.ucla.edu \
    --cc=netdev@vger.kernel.org \
    --cc=rfirrincieli@arces.unibo.it \
    --cc=shemminger@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).