From: Daniele Lacamera <root@danielinux.net>
To: Stephen Hemminger <shemminger@osdl.org>
Cc: "David S. Miller" <davem@davemloft.net>,
netdev@vger.kernel.org, Carlo Caini <ccaini@deis.unibo.it>,
Rosario Firrincieli <rfirrincieli@arces.unibo.it>,
Giovanni Pau <gpau@cs.ucla.edu>
Subject: Re: TCP Pacing
Date: Wed, 13 Sep 2006 10:18:31 +0200 [thread overview]
Message-ID: <200609131018.33231.root@danielinux.net> (raw)
In-Reply-To: <20060913124152.350cd9b2@localhost.localdomain>
[-- Attachment #1: Type: text/plain, Size: 407 bytes --]
On Wednesday 13 September 2006 05:41, Stephen Hemminger wrote:
> Pacing in itself isn't a bad idea, but:
<cut>
> * Since it is most useful over long delay links, maybe it should be a
route parameter.
What does this mean? Should I move the sysctl switch elsewhere?
A new (cleaner) patch follows.
Thanks to you all for your attention and advice.
Signed-off-by: Daniele Lacamera <root@danielinux.net>
---
[-- Attachment #2: TCP_Pacing.diff --]
[-- Type: text/x-diff, Size: 10844 bytes --]
diff -ruN linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt linux-pacing/Documentation/networking/ip-sysctl.txt
--- linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/Documentation/networking/ip-sysctl.txt 2006-09-12 16:38:14.000000000 +0200
@@ -369,6 +369,12 @@
be timed out after an idle period.
Default: 1
+tcp_pacing - BOOLEAN
+ If set, enable time-based TCP segment sending, instead of normal
+ ack-based sending. A software timer is set every time a new ack
+ is received, then packets are spread across the round-trip time.
+ Default: 0
+
IP Variables:
ip_local_port_range - 2 INTEGERS
diff -ruN linux-2.6.18-rc6/include/linux/sysctl.h linux-pacing/include/linux/sysctl.h
--- linux-2.6.18-rc6/include/linux/sysctl.h 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/sysctl.h 2006-09-12 18:13:38.000000000 +0200
@@ -411,6 +411,7 @@
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
NET_TCP_DMA_COPYBREAK=116,
NET_TCP_SLOW_START_AFTER_IDLE=117,
+ NET_TCP_PACING=118,
};
enum {
diff -ruN linux-2.6.18-rc6/include/linux/tcp.h linux-pacing/include/linux/tcp.h
--- linux-2.6.18-rc6/include/linux/tcp.h 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/tcp.h 2006-09-12 16:45:32.000000000 +0200
@@ -356,6 +356,17 @@
__u32 probe_seq_start;
__u32 probe_seq_end;
} mtu_probe;
+
+#ifdef CONFIG_TCP_PACING
+/* TCP Pacing structure */
+ struct {
+ struct timer_list timer;
+ __u16 count;
+ __u16 burst;
+ __u8 lock;
+ __u8 delta;
+ } pacing;
+#endif
};
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -ruN linux-2.6.18-rc6/include/net/tcp.h linux-pacing/include/net/tcp.h
--- linux-2.6.18-rc6/include/net/tcp.h 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/net/tcp.h 2006-09-13 09:33:02.000000000 +0200
@@ -449,6 +449,58 @@
extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
extern unsigned int tcp_current_mss(struct sock *sk, int large);
+#ifdef CONFIG_TCP_PACING
+extern int sysctl_tcp_pacing;
+extern void __tcp_pacing_recalc_delta(struct sock *sk);
+extern void __tcp_pacing_reset_timer(struct sock *sk);
+static inline void tcp_pacing_recalc_delta(struct sock *sk)
+{
+ if (sysctl_tcp_pacing)
+ __tcp_pacing_recalc_delta(sk);
+}
+
+static inline void tcp_pacing_reset_timer(struct sock *sk)
+{
+ if (sysctl_tcp_pacing)
+ __tcp_pacing_reset_timer(sk);
+}
+
+static inline void tcp_pacing_lock_tx(struct sock *sk)
+{
+ if (sysctl_tcp_pacing)
+ tcp_sk(sk)->pacing.lock=1;
+}
+
+static inline int tcp_pacing_locked(struct sock *sk)
+{
+ if (sysctl_tcp_pacing)
+ return tcp_sk(sk)->pacing.lock;
+ else
+ return 0;
+}
+
+static inline int tcp_pacing_enabled(struct sock *sk)
+{
+ return sysctl_tcp_pacing;
+}
+
+static inline int tcp_pacing_burst(struct sock *sk)
+{
+ if (sysctl_tcp_pacing)
+ return tcp_sk(sk)->pacing.burst;
+ else
+ return 0;
+}
+
+#else
+static inline void tcp_pacing_recalc_delta(struct sock *sk) {};
+static inline void tcp_pacing_reset_timer(struct sock *sk) {};
+static inline void tcp_pacing_lock_tx(struct sock *sk) {};
+#define tcp_pacing_locked(sk) 0
+#define tcp_pacing_enabled(sk) 0
+#define tcp_pacing_burst(sk) 0
+#endif
+
/* tcp.c */
extern void tcp_get_info(struct sock *, struct tcp_info *);
diff -ruN linux-2.6.18-rc6/net/ipv4/Kconfig linux-pacing/net/ipv4/Kconfig
--- linux-2.6.18-rc6/net/ipv4/Kconfig 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/Kconfig 2006-09-13 09:31:27.000000000 +0200
@@ -572,6 +572,19 @@
loss packets.
See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+config TCP_PACING
+ bool "TCP Pacing"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ Many researchers have observed that TCP's congestion control mechanisms
+ can lead to bursty traffic flows on modern high-speed networks, with a
+ negative impact on overall network efficiency. A proposed solution to this
+ problem is to evenly space, or "pace", data sent into the network over an
+ entire round-trip time, so that data is not sent in a burst.
+ To enable this feature, please refer to Documentation/networking/ip-sysctl.txt.
+ If unsure, say N.
+
endmenu
config TCP_CONG_BIC
diff -ruN linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c linux-pacing/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/sysctl_net_ipv4.c 2006-09-12 18:33:36.000000000 +0200
@@ -697,6 +697,16 @@
.mode = 0644,
.proc_handler = &proc_dointvec
},
+#ifdef CONFIG_TCP_PACING
+ {
+ .ctl_name = NET_TCP_PACING,
+ .procname = "tcp_pacing",
+ .data = &sysctl_tcp_pacing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#endif
{ .ctl_name = 0 }
};
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_input.c linux-pacing/net/ipv4/tcp_input.c
--- linux-2.6.18-rc6/net/ipv4/tcp_input.c 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_input.c 2006-09-13 08:08:32.000000000 +0200
@@ -2569,6 +2569,8 @@
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
}
+ tcp_pacing_recalc_delta(sk);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
dst_confirm(sk->sk_dst_cache);
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_output.c linux-pacing/net/ipv4/tcp_output.c
--- linux-2.6.18-rc6/net/ipv4/tcp_output.c 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_output.c 2006-09-13 09:19:05.000000000 +0200
@@ -414,6 +414,9 @@
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
+
+ tcp_pacing_reset_timer(sk);
+ tcp_pacing_lock_tx(sk);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
@@ -1086,6 +1089,12 @@
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
+ /* TCP Pacing conflicts with this algorithm.
+ * When Pacing is enabled, don't try to defer.
+ */
+ if (tcp_pacing_enabled(sk))
+ return 0;
+
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
return 0;
@@ -1309,6 +1318,9 @@
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
+ if (tcp_pacing_locked(sk))
+ return 0;
+
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
@@ -1323,6 +1335,8 @@
if (tso_segs > 1) {
limit = tcp_window_allows(tp, skb,
mss_now, cwnd_quota);
+ if (tcp_pacing_enabled(sk) && sent_pkts >= tcp_pacing_burst(sk))
+ tcp_pacing_lock_tx(sk);
if (skb->len < limit) {
unsigned int trim = skb->len % mss_now;
@@ -1733,6 +1747,9 @@
}
}
+ if (tcp_pacing_locked(sk))
+ return -EAGAIN;
+
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_timer.c linux-pacing/net/ipv4/tcp_timer.c
--- linux-2.6.18-rc6/net/ipv4/tcp_timer.c 2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_timer.c 2006-09-13 09:10:58.000000000 +0200
@@ -19,6 +19,9 @@
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/
+/* Changes:
+ * Daniele Lacamera, <root@danielinux.net> TCP Pacing algorithm
+ */
#include <linux/module.h>
#include <net/tcp.h>
@@ -36,10 +39,22 @@
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);
+#ifdef CONFIG_TCP_PACING
+int sysctl_tcp_pacing = 0;
+static void tcp_pacing_timer(unsigned long data);
+#endif
+
void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
+
+#ifdef CONFIG_TCP_PACING
+ init_timer(&(tcp_sk(sk)->pacing.timer));
+ tcp_sk(sk)->pacing.timer.function = &tcp_pacing_timer;
+ tcp_sk(sk)->pacing.timer.data = (unsigned long) sk;
+#endif
+
}
EXPORT_SYMBOL(tcp_init_xmit_timers);
@@ -522,3 +537,117 @@
bh_unlock_sock(sk);
sock_put(sk);
}
+
+#ifdef CONFIG_TCP_PACING
+/* Routines for TCP Pacing.
+ *
+ * Amit Aggarwal, Stefan Savage, and Thomas Anderson, "Understanding the Performance of TCP Pacing"
+ * Proc. of the IEEE INFOCOM 2000 Conference on Computer Communications, March 2000, pages 1157 - 1165.
+ *
+ * This is the timer used to spread packets.
+ * a delta value is computed on rtt/cwnd,
+ * and will be our expire interval.
+ */
+static void tcp_pacing_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*) data;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!sysctl_tcp_pacing)
+ return;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later */
+ if (!mod_timer(&tp->pacing.timer, jiffies + 1))
+ sock_hold(sk);
+ goto out_unlock;
+ }
+
+ if (sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ /* Unlock sending, so when next ack is received it will pass.
+ * If there are no packets scheduled, do nothing.
+ */
+ tp->pacing.lock = 0;
+
+ if (!sk->sk_send_head){
+ /* Sending queue empty */
+ goto out;
+ }
+
+ /* Handler */
+ tcp_push_pending_frames(sk, tp);
+
+ out:
+ if (tcp_memory_pressure)
+ sk_stream_mem_reclaim(sk);
+
+ out_unlock:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * The timer has to be restarted when a segment is sent out.
+ */
+void __tcp_pacing_reset_timer(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 timeout = jiffies + tp->pacing.delta;
+
+ if (!mod_timer(&tp->pacing.timer, timeout))
+ sock_hold(sk);
+}
+EXPORT_SYMBOL(__tcp_pacing_reset_timer);
+
+/*
+ * This routine computes tcp_pacing delay, using
+ * a simplified uniform pacing policy.
+ */
+void __tcp_pacing_recalc_delta(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 window = (tp->snd_cwnd)<<3;
+ __u32 srtt = tp->srtt;
+ __u32 round = 0;
+ __u32 curmss = tp->mss_cache;
+ int state = inet_csk(sk)->icsk_ca_state;
+
+ if (state == TCP_CA_Recovery && tp->snd_cwnd < tp->snd_ssthresh)
+ window = tp->snd_ssthresh << 3;
+
+ if (tp->snd_wnd/curmss < tp->snd_cwnd)
+ window = (tp->snd_wnd / curmss) << 3;
+
+ if (window>1 && srtt){
+ if (window <= srtt){
+ tp->pacing.delta = (srtt/window);
+ if (srtt % window)
+ round=((srtt / (srtt % window)) / tp->pacing.delta);
+ if (tp->pacing.count >= (round - 1) && round > 1){
+ tp->pacing.delta++;
+ tp->pacing.count = 0;
+ }
+ tp->pacing.burst = 1;
+ } else {
+ tp->pacing.delta = 1;
+ tp->pacing.burst = (window / srtt);
+ if (window % srtt)
+ round=( (window / (window % srtt)) * tp->pacing.burst);
+ if (tp->pacing.count >= (round - 1) && (round > 1)){
+ tp->pacing.burst++;
+ tp->pacing.count = 0;
+ }
+ }
+ } else {
+ tp->pacing.delta = 0;
+ tp->pacing.burst = 1;
+ }
+}
+
+EXPORT_SYMBOL(__tcp_pacing_recalc_delta);
+
+#endif
+
next prev parent reply other threads:[~2006-09-13 8:18 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-09-12 17:58 TCP Pacing Daniele Lacamera
2006-09-12 18:21 ` Arnaldo Carvalho de Melo
2006-09-12 21:26 ` Ian McDonald
2006-09-13 8:18 ` Daniele Lacamera
2006-09-13 15:46 ` Daniele Lacamera
2006-09-16 0:41 ` Xiaoliang (David) Wei
2006-09-19 11:31 ` Daniele Lacamera
2006-09-13 18:30 ` Ian McDonald
2006-09-13 3:41 ` Stephen Hemminger
2006-09-13 8:18 ` Daniele Lacamera [this message]
2006-09-14 1:21 ` Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200609131018.33231.root@danielinux.net \
--to=root@danielinux.net \
--cc=ccaini@deis.unibo.it \
--cc=davem@davemloft.net \
--cc=gpau@cs.ucla.edu \
--cc=netdev@vger.kernel.org \
--cc=rfirrincieli@arces.unibo.it \
--cc=shemminger@osdl.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).