* [RFC] Make TCP prequeue configurable
@ 2007-09-27 22:08 Eric Dumazet
  2007-09-27 22:44 ` Stephen Hemminger
  2007-09-27 23:09 ` David Miller
  0 siblings, 2 replies; 6+ messages in thread
From: Eric Dumazet @ 2007-09-27 22:08 UTC (permalink / raw)
  To: Linux Netdev List

[-- Attachment #1: Type: text/plain, Size: 863 bytes --]

Hi all

I am sure some of you are going to tell me that prequeue is not
all black :)

Thank you

[RFC] Make TCP prequeue configurable

The TCP prequeue mechanism is based on old assumptions and has several drawbacks:

1) It adds 48 bytes per 'struct tcp_sock' (rough accounting in the
    sketch after this list).
2) It adds some ugly code in hot paths.
3) It has a small hit ratio on typical servers using many sockets.
4) It may have a high hit ratio on UP machines running one process,
    where the prequeue adds little gain. (In fact, letting the user
    do the copy after being woken up is better for cache reuse.)
5) Doing a copy to user space in the softirq handler is not good,
    because of potential page faults :(
6) Maybe NET_DMA is the only thing that really needs the prequeue.
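
For reference, a rough accounting of where those 48 bytes come from on a
64-bit build (a back-of-the-envelope sketch, not taken from the changelog;
exact sizes depend on spinlock debugging options). These are the 'ucopy'
fields the patch wraps in #ifdef CONFIG_TCP_PREQUEUE:

	struct {
		struct sk_buff_head	prequeue; /* 2 ptrs + qlen + lock ~ 24 bytes */
		struct task_struct	*task;	  /*  8 bytes */
		struct iovec		*iov;	  /*  8 bytes */
		int			memory;	  /*  4 bytes */
		int			len;	  /*  4 bytes */
	};					  /* total ~ 48 bytes */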

This patch introduces a CONFIG_TCP_PREQUEUE option, automatically selected
when CONFIG_NET_DMA is enabled.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
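
To make the intent of the #ifdefs concrete, here is what the inline in
include/net/tcp.h effectively reduces to once the preprocessor strips the
CONFIG_TCP_PREQUEUE block (a sketch of the preprocessed result for
illustration, not a separate definition in the patch):

	/* With CONFIG_TCP_PREQUEUE unset, tcp_prequeue() is a constant 0,
	 * meaning "not prequeued": callers fall through to the normal
	 * receive path, and the compiler can discard the branch entirely. */
	static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
	{
		return 0;
	}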


[-- Attachment #2: net_prequeue.patch --]
[-- Type: text/plain, Size: 7426 bytes --]

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 8f670da..14e3f01 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -16,6 +16,7 @@ comment "DMA Clients"
 config NET_DMA
 	bool "Network: TCP receive copy offload"
 	depends on DMA_ENGINE && NET
+	select TCP_PREQUEUE
 	default y
 	---help---
 	  This enables the use of DMA engines in the network stack to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c6b9f92..844a05e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -268,11 +268,13 @@ struct tcp_sock {
 
 	/* Data for direct copy to user */
 	struct {
+#ifdef CONFIG_TCP_PREQUEUE
 		struct sk_buff_head	prequeue;
 		struct task_struct	*task;
 		struct iovec		*iov;
 		int			memory;
 		int			len;
+#endif
 #ifdef CONFIG_NET_DMA
 		/* members for async copy */
 		struct dma_chan		*dma_chan;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 185c7ec..3430d8e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -835,10 +835,12 @@ static inline int tcp_checksum_complete(struct sk_buff *skb)
 
 static inline void tcp_prequeue_init(struct tcp_sock *tp)
 {
+#ifdef CONFIG_TCP_PREQUEUE
 	tp->ucopy.task = NULL;
 	tp->ucopy.len = 0;
 	tp->ucopy.memory = 0;
 	skb_queue_head_init(&tp->ucopy.prequeue);
+#endif
 #ifdef CONFIG_NET_DMA
 	tp->ucopy.dma_chan = NULL;
 	tp->ucopy.wakeup = 0;
@@ -857,6 +859,7 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
  */
 static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 {
+#ifdef CONFIG_TCP_PREQUEUE
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!sysctl_tcp_low_latency && tp->ucopy.task) {
@@ -882,6 +885,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 		}
 		return 1;
 	}
+#endif
 	return 0;
 }
 
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index fb79097..b770829 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -616,5 +616,20 @@ config TCP_MD5SIG
 
 	  If unsure, say N.
 
+config TCP_PREQUEUE
+	bool "Enable TCP prequeue"
+	default n
+	---help---
+	  TCP PREQUEUE is an 'optimization' loosely based on the famous
+	  "30 instruction TCP receive" Van Jacobson mail.
+	  Van's trick is to deposit buffers into the socket queue
+	  on a device interrupt, then to call the tcp_recv function
+	  in the receive process context to checksum and copy
+	  the buffer to user space. Smart...
+
+	  Some people believe this 'optimization' is not really needed
+	  except for some benchmarks. Also, taking potential page faults
+	  in the softirq handler seems a high price to pay.
+
 source "net/ipv4/ipvs/Kconfig"
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7e74011..8659533 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -994,6 +994,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }
 
+#ifdef CONFIG_TCP_PREQUEUE
 static void tcp_prequeue_process(struct sock *sk)
 {
 	struct sk_buff *skb;
@@ -1011,6 +1012,7 @@ static void tcp_prequeue_process(struct sock *sk)
 	/* Clear memory counter. */
 	tp->ucopy.memory = 0;
 }
+#endif
 
 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
@@ -1251,6 +1253,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 		tcp_cleanup_rbuf(sk, copied);
 
+#ifdef CONFIG_TCP_PREQUEUE
 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
@@ -1295,7 +1298,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 			/* __ Set realtime policy in scheduler __ */
 		}
-
+#endif
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1307,6 +1310,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		tp->ucopy.wakeup = 0;
 #endif
 
+#ifdef CONFIG_TCP_PREQUEUE
 		if (user_recv) {
 			int chunk;
 
@@ -1330,6 +1334,7 @@ do_prequeue:
 				}
 			}
 		}
+#endif
 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
 			if (net_ratelimit())
 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
@@ -1430,6 +1435,7 @@ skip_copy:
 		break;
 	} while (len > 0);
 
+#ifdef CONFIG_TCP_PREQUEUE
 	if (user_recv) {
 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 			int chunk;
@@ -1448,6 +1454,7 @@ skip_copy:
 		tp->ucopy.task = NULL;
 		tp->ucopy.len = 0;
 	}
+#endif
 
 #ifdef CONFIG_NET_DMA
 	if (tp->ucopy.dma_chan) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bbad2cd..85d3a5c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3467,6 +3467,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			goto out_of_window;
 
 		/* Ok. In sequence. In window. */
+#ifdef CONFIG_TCP_PREQUEUE
 		if (tp->ucopy.task == current &&
 		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
 		    sock_owned_by_user(sk) && !tp->urg_data) {
@@ -3484,7 +3485,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			}
 			local_bh_disable();
 		}
-
+#endif
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0 &&
@@ -4078,6 +4079,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
 	}
 }
 
+#ifdef CONFIG_TCP_PREQUEUE
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4100,6 +4102,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 	local_bh_disable();
 	return err;
 }
+#endif
 
 static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
 {
@@ -4279,8 +4282,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			}
 		} else {
 			int eaten = 0;
-			int copied_early = 0;
 
+#ifdef CONFIG_TCP_PREQUEUE
+			int copied_early = 0;
 			if (tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len) {
 #ifdef CONFIG_NET_DMA
@@ -4315,6 +4319,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				if (copied_early)
 					tcp_cleanup_rbuf(sk, skb->len);
 			}
+#endif
 			if (!eaten) {
 				if (tcp_checksum_complete_user(sk, skb))
 					goto csum_error;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9c94627..7ac5bc1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1916,8 +1916,10 @@ int tcp_v4_destroy_sock(struct sock *sk)
 	__skb_queue_purge(&sk->sk_async_wait_queue);
 #endif
 
+#ifdef CONFIG_TCP_PREQUEUE
 	/* Clean prequeue, it must be empty really */
 	__skb_queue_purge(&tp->ucopy.prequeue);
+#endif
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e9b151b..5f3b38c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -167,7 +167,9 @@ static int tcp_write_timeout(struct sock *sk)
 static void tcp_delack_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
+#ifdef CONFIG_TCP_PREQUEUE
 	struct tcp_sock *tp = tcp_sk(sk);
+#endif
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	bh_lock_sock(sk);
@@ -190,6 +192,7 @@ static void tcp_delack_timer(unsigned long data)
 	}
 	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
+#ifdef CONFIG_TCP_PREQUEUE
 	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
 
@@ -200,6 +203,7 @@ static void tcp_delack_timer(unsigned long data)
 
 		tp->ucopy.memory = 0;
 	}
+#endif
 
 	if (inet_csk_ack_scheduled(sk)) {
 		if (!icsk->icsk_ack.pingpong) {
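
For context on how the return value is consumed (this is not part of the
patch): at the time, the softirq-side caller in net/ipv4/tcp_ipv4.c used
tcp_prequeue() roughly as sketched below (paraphrased from memory, not
quoted from the tree), so the stub returning 0 simply means every packet
is processed directly in softirq context:

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		/* 0 => not handed to a sleeping reader: handle inline */
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		/* a process owns the socket: defer to its backlog */
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);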

Thread overview: 6+ messages
2007-09-27 22:08 [RFC] Make TCP prequeue configurable Eric Dumazet
2007-09-27 22:44 ` Stephen Hemminger
2007-09-28  2:26   ` John Heffner
2007-09-28 22:40     ` David Miller
2007-10-01 12:24       ` Andi Kleen
2007-09-27 23:09 ` David Miller
