Re: TCP Pacing - Daniele Lacamera

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Daniele Lacamera <root@danielinux.net>
To: Stephen Hemminger <shemminger@osdl.org>
Cc: "David S. Miller" <davem@davemloft.net>,
	netdev@vger.kernel.org, Carlo Caini <ccaini@deis.unibo.it>,
	Rosario Firrincieli <rfirrincieli@arces.unibo.it>,
	Giovanni Pau <gpau@cs.ucla.edu>
Subject: Re: TCP Pacing
Date: Wed, 13 Sep 2006 10:18:31 +0200	[thread overview]
Message-ID: <200609131018.33231.root@danielinux.net> (raw)
In-Reply-To: <20060913124152.350cd9b2@localhost.localdomain>

[-- Attachment #1: Type: text/plain, Size: 407 bytes --]

On Wednesday 13 September 2006 05:41, Stephen Hemminger wrote:
> Pacing in itself isn't a bad idea, but:
<cut>
> * Since it is most useful over long delay links, maybe it should be a 
> route parameter.

What does this mean? Should I move the sysctl switch elsewhere?

A new (cleaner) patch follows.
Thanks to you all for your attention & advice.

Signed-off-by: Daniele Lacamera <root@danielinux.net>
--- 


[-- Attachment #2: TCP_Pacing.diff --]
[-- Type: text/x-diff, Size: 10844 bytes --]

diff -ruN linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt linux-pacing/Documentation/networking/ip-sysctl.txt
--- linux-2.6.18-rc6/Documentation/networking/ip-sysctl.txt	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/Documentation/networking/ip-sysctl.txt	2006-09-12 16:38:14.000000000 +0200
@@ -369,6 +369,12 @@
 	be timed out after an idle period.
 	Default: 1
 
+tcp_pacing - BOOLEAN
+	If set, enable time-based TCP segment sending, instead of normal
+	ack-based sending. A software timer is set every time a new ack 
+	is received, then packets are spread across the round-trip time.
+	Default: 0
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff -ruN linux-2.6.18-rc6/include/linux/sysctl.h linux-pacing/include/linux/sysctl.h
--- linux-2.6.18-rc6/include/linux/sysctl.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/sysctl.h	2006-09-12 18:13:38.000000000 +0200
@@ -411,6 +411,7 @@
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
 	NET_TCP_SLOW_START_AFTER_IDLE=117,
+	NET_TCP_PACING=118,
 };
 
 enum {
diff -ruN linux-2.6.18-rc6/include/linux/tcp.h linux-pacing/include/linux/tcp.h
--- linux-2.6.18-rc6/include/linux/tcp.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/linux/tcp.h	2006-09-12 16:45:32.000000000 +0200
@@ -356,6 +356,17 @@
 		__u32		  probe_seq_start;
 		__u32		  probe_seq_end;
 	} mtu_probe;
+	
+#ifdef CONFIG_TCP_PACING
+/* TCP Pacing structure */
+	struct {
+		struct timer_list timer;
+		__u16   count;
+		__u16   burst;
+		__u8    lock;
+		__u8    delta;
+	} pacing;
+#endif
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -ruN linux-2.6.18-rc6/include/net/tcp.h linux-pacing/include/net/tcp.h
--- linux-2.6.18-rc6/include/net/tcp.h	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/include/net/tcp.h	2006-09-13 09:33:02.000000000 +0200
@@ -449,6 +449,58 @@
 extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
 extern unsigned int tcp_current_mss(struct sock *sk, int large);
 
+#ifdef CONFIG_TCP_PACING
+extern int sysctl_tcp_pacing;
+extern void __tcp_pacing_recalc_delta(struct sock *sk);
+extern void __tcp_pacing_reset_timer(struct sock *sk);
+static inline void tcp_pacing_recalc_delta(struct sock *sk)
+{
+	if (sysctl_tcp_pacing) 
+		__tcp_pacing_recalc_delta(sk);
+}
+
+static inline void tcp_pacing_reset_timer(struct sock *sk)
+{
+	if (sysctl_tcp_pacing)
+		__tcp_pacing_reset_timer(sk);
+}
+
+static inline void tcp_pacing_lock_tx(struct sock *sk)
+{
+	if (sysctl_tcp_pacing) 
+		tcp_sk(sk)->pacing.lock=1;
+}
+
+static inline int tcp_pacing_locked(struct sock *sk)
+{
+	if (sysctl_tcp_pacing)
+		return tcp_sk(sk)->pacing.lock;
+	else 
+		return 0;
+}
+
+static inline int tcp_pacing_enabled(struct sock *sk)
+{
+	return sysctl_tcp_pacing;
+}
+
+static inline int tcp_pacing_burst(struct sock *sk)
+{
+	if (sysctl_tcp_pacing)
+		return tcp_sk(sk)->pacing.burst;
+	else 
+		return 0;
+}
+	
+#else
+static inline void tcp_pacing_recalc_delta(struct sock *sk) {};
+static inline void tcp_pacing_reset_timer(struct sock *sk) {};
+static inline void tcp_pacing_lock_tx(struct sock *sk) {};
+#define tcp_pacing_locked(sk) 0 
+#define tcp_pacing_enabled(sk) 0
+#define tcp_pacing_burst(sk) 0
+#endif
+
 /* tcp.c */
 extern void tcp_get_info(struct sock *, struct tcp_info *);
 
diff -ruN linux-2.6.18-rc6/net/ipv4/Kconfig linux-pacing/net/ipv4/Kconfig
--- linux-2.6.18-rc6/net/ipv4/Kconfig	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/Kconfig	2006-09-13 09:31:27.000000000 +0200
@@ -572,6 +572,19 @@
 	loss packets.
 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
 
+config TCP_PACING
+	bool "TCP Pacing"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	Many researchers have observed that TCP's congestion control mechanisms 
+	can lead to bursty traffic flows on modern high-speed networks, with a 
+	negative impact on overall network efficiency. A proposed solution to this 
+	problem is to evenly space, or "pace", data sent into the network over an 
+	entire round-trip time, so that data is not sent in a burst.
+	To enable this feature, please refer to Documentation/networking/ip-sysctl.txt.
+	If unsure, say N.
+	
 endmenu
 
 config TCP_CONG_BIC
diff -ruN linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c linux-pacing/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.18-rc6/net/ipv4/sysctl_net_ipv4.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/sysctl_net_ipv4.c	2006-09-12 18:33:36.000000000 +0200
@@ -697,6 +697,16 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_TCP_PACING
+	{
+		.ctl_name	= NET_TCP_PACING,
+		.procname	= "tcp_pacing",
+		.data		= &sysctl_tcp_pacing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_input.c linux-pacing/net/ipv4/tcp_input.c
--- linux-2.6.18-rc6/net/ipv4/tcp_input.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_input.c	2006-09-13 08:08:32.000000000 +0200
@@ -2569,6 +2569,8 @@
 			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
 	}
 
+	tcp_pacing_recalc_delta(sk);
+
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
 		dst_confirm(sk->sk_dst_cache);
 
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_output.c linux-pacing/net/ipv4/tcp_output.c
--- linux-2.6.18-rc6/net/ipv4/tcp_output.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_output.c	2006-09-13 09:19:05.000000000 +0200
@@ -414,6 +414,9 @@
 		
 	if (tcp_packets_in_flight(tp) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
+	
+	tcp_pacing_reset_timer(sk);
+	tcp_pacing_lock_tx(sk);
 
 	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 	skb->h.th = th;
@@ -1086,6 +1089,12 @@
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 
+	/* TCP Pacing conflicts with this algorithm.
+	 * When Pacing is enabled, don't try to defer.
+	 */
+	if (tcp_pacing_enabled(sk))
+		return 0;
+	
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 0;
 
@@ -1309,6 +1318,9 @@
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
 
+		if (tcp_pacing_locked(sk))
+			return 0;
+		
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
@@ -1323,6 +1335,8 @@
 		if (tso_segs > 1) {
 			limit = tcp_window_allows(tp, skb,
 						  mss_now, cwnd_quota);
+		if (tcp_pacing_enabled(sk) && sent_pkts >= tcp_pacing_burst(sk))
+			tcp_pacing_lock_tx(sk);
 
 			if (skb->len < limit) {
 				unsigned int trim = skb->len % mss_now;
@@ -1733,6 +1747,9 @@
 		}
 	}
 
+	if (tcp_pacing_locked(sk))
+		return -EAGAIN;
+
 	/* Make a copy, if the first transmission SKB clone we made
 	 * is still in somebody's hands, else make a clone.
 	 */
diff -ruN linux-2.6.18-rc6/net/ipv4/tcp_timer.c linux-pacing/net/ipv4/tcp_timer.c
--- linux-2.6.18-rc6/net/ipv4/tcp_timer.c	2006-09-04 04:19:48.000000000 +0200
+++ linux-pacing/net/ipv4/tcp_timer.c	2006-09-13 09:10:58.000000000 +0200
@@ -19,6 +19,9 @@
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  *		Jorge Cwik, <jorge@laser.satlink.net>
  */
+/* Changes:
+ * 		Daniele Lacamera, <root@danielinux.net> TCP Pacing algorithm
+ */
 
 #include <linux/module.h>
 #include <net/tcp.h>
@@ -36,10 +39,22 @@
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
+#ifdef CONFIG_TCP_PACING
+int sysctl_tcp_pacing = 0;
+static void tcp_pacing_timer(unsigned long data);
+#endif
+
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
+	
+#ifdef CONFIG_TCP_PACING
+	init_timer(&(tcp_sk(sk)->pacing.timer));
+	tcp_sk(sk)->pacing.timer.function = &tcp_pacing_timer;
+	tcp_sk(sk)->pacing.timer.data = (unsigned long) sk;
+#endif
+
 }
 
 EXPORT_SYMBOL(tcp_init_xmit_timers);
@@ -522,3 +537,117 @@
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
+
+#ifdef CONFIG_TCP_PACING
+/* Routines for TCP Pacing.
+ *
+ * Amit Aggarwal, Stefan Savage, and Thomas Anderson, "Understanding the Performance of TCP Pacing"
+ * Proc. of the IEEE INFOCOM 2000 Conference on Computer Communications, March 2000, pages 1157 - 1165.
+ *
+ * This is the timer used to spread packets.
+ * a delta value is computed on rtt/cwnd,
+ * and will be our expire interval.
+ */
+static void tcp_pacing_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*) data;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	bh_lock_sock(sk);
+	/* A fired timer owns a sock reference: always drop it on exit */
+	if (!sysctl_tcp_pacing)
+		goto out_unlock;
+	if (sock_owned_by_user(sk)) {
+		/* Try again later */
+		if (!mod_timer(&tp->pacing.timer, jiffies + 1))
+			sock_hold(sk);
+		goto out_unlock;
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/* Unlock sending, so when next ack is received it will pass.
+	 * If there are no packets scheduled, do nothing.
+	 */
+	tp->pacing.lock = 0;
+
+	if (!sk->sk_send_head){
+		/* Sending queue empty */
+		goto out;
+	}
+
+	/* Transmit what is queued now that sending is unlocked */
+	tcp_push_pending_frames(sk, tp);
+
+out:
+	if (tcp_memory_pressure)
+		sk_stream_mem_reclaim(sk);
+
+out_unlock:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/* Restart the pacing timer when a segment is sent out. The expiry must be
+ * an unsigned long like jiffies, or it is truncated on 64-bit machines.
+ */
+void __tcp_pacing_reset_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long timeout = jiffies + tp->pacing.delta;
+
+	if (!mod_timer(&tp->pacing.timer, timeout))
+		sock_hold(sk);
+}
+EXPORT_SYMBOL(__tcp_pacing_reset_timer);
+
+/*
+ * Compute the pacing timer interval (pacing.delta, in jiffies) and
+ * pacing.burst, using a simplified uniform pacing policy.
+ */
+void __tcp_pacing_recalc_delta(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 window = (tp->snd_cwnd)<<3;
+	__u32 srtt = tp->srtt;
+	__u32 round = 0;
+	__u32 curmss = tp->mss_cache;
+	int state = inet_csk(sk)->icsk_ca_state;
+
+	if (state == TCP_CA_Recovery && tp->snd_cwnd < tp->snd_ssthresh)
+		window = tp->snd_ssthresh << 3;
+
+	if (tp->snd_wnd/curmss < tp->snd_cwnd)
+		window = (tp->snd_wnd / curmss) << 3;
+	/* delta is a __u8 used as a divisor: clamp so it never wraps to 0 */
+	if (window>1 && srtt){
+		if (window <= srtt){
+			tp->pacing.delta = min_t(__u32, srtt / window, 254);
+			if (srtt % window)
+				round=((srtt / (srtt % window)) / tp->pacing.delta);
+			if (tp->pacing.count >= (round - 1) && round > 1){
+				tp->pacing.delta++;
+				tp->pacing.count = 0;
+			}
+			tp->pacing.burst = 1;
+		} else {
+			tp->pacing.delta = 1;
+			tp->pacing.burst = (window / srtt);
+			if (window % srtt)
+				round=( (window / (window % srtt)) * tp->pacing.burst);
+			if (tp->pacing.count >= (round - 1) && (round > 1)){
+				tp->pacing.burst++;
+				tp->pacing.count = 0;
+			}
+		}
+	} else {
+		tp->pacing.delta = 0;
+		tp->pacing.burst = 1;
+	}
+}
+
+EXPORT_SYMBOL(__tcp_pacing_recalc_delta);
+
+#endif
+

next prev parent reply	other threads:[~2006-09-13  8:18 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-09-12 17:58 TCP Pacing Daniele Lacamera
2006-09-12 18:21 ` Arnaldo Carvalho de Melo
2006-09-12 21:26 ` Ian McDonald
2006-09-13  8:18   ` Daniele Lacamera
2006-09-13 15:46     ` Daniele Lacamera
2006-09-16  0:41       ` Xiaoliang (David) Wei
2006-09-19 11:31         ` Daniele Lacamera
2006-09-13 18:30     ` Ian McDonald
2006-09-13  3:41 ` Stephen Hemminger
2006-09-13  8:18   ` Daniele Lacamera [this message]
2006-09-14  1:21     ` Stephen Hemminger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200609131018.33231.root@danielinux.net \
    --to=root@danielinux.net \
    --cc=ccaini@deis.unibo.it \
    --cc=davem@davemloft.net \
    --cc=gpau@cs.ucla.edu \
    --cc=netdev@vger.kernel.org \
    --cc=rfirrincieli@arces.unibo.it \
    --cc=shemminger@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.