netdev.vger.kernel.org archive mirror
* [PATCH] netif_rx: receive path optimization
@ 2005-03-30 21:28 Stephen Hemminger
  2005-03-30 21:57 ` jamal
  2005-03-31 20:04 ` [RFC] " Stephen Hemminger
  0 siblings, 2 replies; 24+ messages in thread
From: Stephen Hemminger @ 2005-03-30 21:28 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

This patch cleans up the netif_rx and related code in the network
receive core.

     - Eliminate vestiges of fastroute.
       The leftover statistics are no longer needed.

     - Get rid of the high/med/low threshold return from netif_rx.
       Drivers rarely check the return value of netif_rx, and those
       that do can handle the DROP vs. SUCCESS return.

     - Remove dead code for RAND_LIE and OFFLINE_SAMPLE

     - Get rid of weight_p since setting the sysctl has no effect.
       Increase the default weight of the netif_rx path because it can
       receive packets from multiple devices and loopback.

     - Separate out max packets per softirq vs. max queued packets.
       Today, netdev_max_backlog is used for both. Add a new parameter
       for the per-cpu maximum number of queued packets.

     - Increase queue defaults to match modern CPU speeds.
       Make max_backlog about 1 ms and max_queue about 10 ms of traffic
       (at roughly one packet per microsecond, 1000 packets is about 1 ms).

     - Switch to pure drop-tail when the queue fills.
       It is better for TCP performance under load to drop a few packets
       than to go into full discard mode.

This needs more testing on a range of hardware before possible inclusion
in 2.6.13. It will be split out into finer-grained patches then.

diff -Nru a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h	2005-03-30 13:17:14 -08:00
+++ b/include/linux/netdevice.h	2005-03-30 13:17:14 -08:00
@@ -164,12 +164,6 @@
 	unsigned total;
 	unsigned dropped;
 	unsigned time_squeeze;
-	unsigned throttled;
-	unsigned fastroute_hit;
-	unsigned fastroute_success;
-	unsigned fastroute_defer;
-	unsigned fastroute_deferred_out;
-	unsigned fastroute_latency_reduction;
 	unsigned cpu_collision;
 };
 
diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h	2005-03-30 13:17:14 -08:00
+++ b/include/linux/sysctl.h	2005-03-30 13:17:14 -08:00
@@ -242,6 +242,7 @@
 	NET_CORE_MOD_CONG=16,
 	NET_CORE_DEV_WEIGHT=17,
 	NET_CORE_SOMAXCONN=18,
+	NET_CORE_MAX_QUEUE=19,
 };
 
 /* /proc/sys/net/ethernet */
diff -Nru a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c	2005-03-30 13:17:14 -08:00
+++ b/net/core/dev.c	2005-03-30 13:17:14 -08:00
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate.  It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -159,11 +147,6 @@
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -215,7 +198,7 @@
  *	Device drivers call our routines to queue packets here. We empty the
  *	queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data);
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -1338,70 +1321,11 @@
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
-int weight_p = 64;            /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
-
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
-
-
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-			if (rq < avg_blog) /* unlucky bastard */
-				sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else  /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
+/* Reasonably fast CPU can process 1 packet per us */
+int netdev_max_backlog = 1000;
+int netdev_max_queue   = 10000;
 
+DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
 
 /**
  *	netif_rx	-	post buffer to the network code
@@ -1414,16 +1338,12 @@
  *
  *	return values:
  *	NET_RX_SUCCESS	(no congestion)
- *	NET_RX_CN_LOW   (low congestion)
- *	NET_RX_CN_MOD   (moderate congestion)
- *	NET_RX_CN_HIGH  (high congestion)
  *	NET_RX_DROP     (packet was dropped)
  *
  */
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;
 
@@ -1439,43 +1359,25 @@
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
-enqueue:
-			dev_hold(skb->dev);
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
-			local_irq_restore(flags);
-			return queue->cng_level;
-		}
+	if (likely(queue->input_pkt_queue.qlen <= netdev_max_queue)) {
+		if (unlikely(queue->input_pkt_queue.qlen == 0))
+			netif_rx_schedule(&queue->backlog_dev);
+
+		dev_hold(skb->dev);
+		__skb_queue_tail(&queue->input_pkt_queue, skb);
+		local_irq_restore(flags);
 
-		if (queue->throttle)
-			queue->throttle = 0;
-
-		netif_rx_schedule(&queue->backlog_dev);
-		goto enqueue;
-	}
+		return NET_RX_SUCCESS;
+	} else {
+		__get_cpu_var(netdev_rx_stat).dropped++;
+		local_irq_restore(flags);
 
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
+		kfree_skb(skb);
+		return NET_RX_DROP;
 	}
-
-drop:
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
-
-	kfree_skb(skb);
-	return NET_RX_DROP;
 }
 
 int netif_rx_ni(struct sk_buff *skb)
@@ -1754,8 +1656,6 @@
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -2024,20 +1924,18 @@
 {
 }
 
+/* Output softnet statistics.
+ * For compatibility include zeros for old deprecated values
+ * for throttling and fastroute statistics.
+ */
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		  );
+		   s->total, s->dropped, s->time_squeeze, 
+		   0, 0, 0, 0, 0,
+		   s->cpu_collision);
 	return 0;
 }
 
@@ -3279,21 +3177,13 @@
 
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
-		queue->backlog_dev.weight = weight_p;
+		queue->backlog_dev.weight = 128;
 		queue->backlog_dev.poll = process_backlog;
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
-
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
 
 	dev_boot_phase = 0;
 
diff -Nru a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
--- a/net/core/sysctl_net_core.c	2005-03-30 13:17:14 -08:00
+++ b/net/core/sysctl_net_core.c	2005-03-30 13:17:14 -08:00
@@ -13,12 +13,8 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
-extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
+extern int netdev_max_queue;
+
 extern int net_msg_cost;
 extern int net_msg_burst;
 
@@ -27,7 +23,6 @@
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
-extern int sysctl_core_destroy_delay;
 extern int sysctl_optmem_max;
 extern int sysctl_somaxconn;
 
@@ -83,14 +78,6 @@
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_DEV_WEIGHT,
-		.procname	= "dev_weight",
-		.data		= &weight_p,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
 		.ctl_name	= NET_CORE_MAX_BACKLOG,
 		.procname	= "netdev_max_backlog",
 		.data		= &netdev_max_backlog,
@@ -99,33 +86,9 @@
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_NO_CONG_THRESH,
-		.procname	= "no_cong_thresh",
-		.data		= &no_cong_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG,
-		.procname	= "no_cong",
-		.data		= &no_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_LO_CONG,
-		.procname	= "lo_cong",
-		.data		= &lo_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_MOD_CONG,
-		.procname	= "mod_cong",
-		.data		= &mod_cong,
+		.ctl_name	= NET_CORE_MAX_QUEUE,
+		.procname	= "netdev_max_queue",
+		.data		= &netdev_max_queue,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] netif_rx: receive path optimization
  2005-03-30 21:28 [PATCH] netif_rx: receive path optimization Stephen Hemminger
@ 2005-03-30 21:57 ` jamal
  2005-03-30 22:08   ` jamal
  2005-03-30 23:53   ` Stephen Hemminger
  2005-03-31 20:04 ` [RFC] " Stephen Hemminger
  1 sibling, 2 replies; 24+ messages in thread
From: jamal @ 2005-03-30 21:57 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Wed, 2005-03-30 at 16:28, Stephen Hemminger wrote:
> This patch cleans up the netif_rx and related code in the network
> receive core.
> 
>      - Eliminate vestiges of fastroute.
>        The leftover statistics no longer needed.
> 
>      - Get rid of high/med/low threshold return from netif_rx.
>        Drivers rarely check return value of netif_rx, and those
>        that do can handle the DROP vs SUCCESS return
> 

Please leave this feature in. Drivers that used it have moved on to a
better life under NAPI; however, it is still useful for anyone who wants
to take heed of congestion. In fact, it is highly advisable for
anyone not using NAPI to use it.
In other words: the work should be to convert users of netif_rx, not
to get rid of this feature.

>      - Remove dead code for RAND_LINE and OFFLINE_SAMPLE
> 

OFFLINE_SAMPLE can go. For the other, refer to my comments above.

>      - Get rid of weight_p since setting sysctl has no effect.
>        Increase default weight of netif_rx path because it can receive
>        packets from multiple devices and loopback.
> 

Same here.

>      - Separate out max packets per softirq vs. max queued packets.
>        Today, netdev_max_burst is used for both. Add new parameter
>        that is for the per-cpu max queued packets.
> 
>      - Increase queue defaults to meet modern CPU speeds.
>        Make max_backlog be about 1ms, and max_queue be about 10ms
> 

It is kind of hard to compute what 1 or 10 ms is in packet count, but it is
probably justified to make the defaults larger.

>      - Switch to pure drop tail when queue fills.
>        Better for TCP performance under load to drop a few packets
>        then go into full discard mode.
> 

As discussed in that thread with the person who enhanced the SACK queue
traversal: for serious use, a TCP user really ought to migrate to a
NAPI driver.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] netif_rx: receive path optimization
  2005-03-30 21:57 ` jamal
@ 2005-03-30 22:08   ` jamal
  2005-03-30 23:53   ` Stephen Hemminger
  1 sibling, 0 replies; 24+ messages in thread
From: jamal @ 2005-03-30 22:08 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Wed, 2005-03-30 at 16:57, jamal wrote:

> Like discussed in that thread with person who enhanced the SACK queue
> traversal that for a serious use a TCP user really oughta migrate to a
> NAPI driver. 
> 

I think I wasn't clear:
the change is fine - I am just saying that someone serious about benchmark
numbers should probably not use that interface.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] netif_rx: receive path optimization
  2005-03-30 21:57 ` jamal
  2005-03-30 22:08   ` jamal
@ 2005-03-30 23:53   ` Stephen Hemminger
  2005-03-31  3:16     ` jamal
  1 sibling, 1 reply; 24+ messages in thread
From: Stephen Hemminger @ 2005-03-30 23:53 UTC (permalink / raw)
  To: hadi; +Cc: David S. Miller, netdev

On 30 Mar 2005 16:57:29 -0500
jamal <hadi@cyberus.ca> wrote:

> On Wed, 2005-03-30 at 16:28, Stephen Hemminger wrote:
> > This patch cleans up the netif_rx and related code in the network
> > receive core.
> > 
> >      - Eliminate vestiges of fastroute.
> >        The leftover statistics no longer needed.
> > 
> >      - Get rid of high/med/low threshold return from netif_rx.
> >        Drivers rarely check return value of netif_rx, and those
> >        that do can handle the DROP vs SUCCESS return
> > 
> 
> Please leave this feature in. Drivers that used it have moved on to a
> better life under NAPI; however, it is still useful for anyone who wants
> to take  heed of congestion. And infact it is highly advisable for
> anyone not using NAPI to using it.
> In other words: the work should be to convert users of netif_rx and not
> to get rid of this feature.

How about percentages instead of multiple sysctl values? Or some relationship
between max_queue and max_backlog:
	success  qlen < max_backlog
	low 	 qlen > max_backlog
	medium   qlen > max_queue/2
	high	 qlen > max_queue - max_backlog
	drop	 qlen > max_queue
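
As a rough illustration (a sketch only, not part of the patch, and the
helper name is made up), that mapping would amount to something like:

#include <linux/netdevice.h>	/* NET_RX_* return codes */

extern int netdev_max_backlog, netdev_max_queue;

/* Hypothetical helper: map the instantaneous queue length to the old
 * NET_RX_* congestion levels using only the two sysctls above.
 */
static int rx_congestion_level(int qlen)
{
	if (qlen > netdev_max_queue)
		return NET_RX_DROP;
	if (qlen > netdev_max_queue - netdev_max_backlog)
		return NET_RX_CN_HIGH;
	if (qlen > netdev_max_queue / 2)
		return NET_RX_CN_MOD;
	if (qlen > netdev_max_backlog)
		return NET_RX_CN_LOW;
	return NET_RX_SUCCESS;
}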

Also, RAND_LIE (dead code) is
kind of confusing because I expected it to be a receive version of Random
Drop, but it really just lies back to the caller (and keeps the packet).

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] netif_rx: receive path optimization
  2005-03-30 23:53   ` Stephen Hemminger
@ 2005-03-31  3:16     ` jamal
  0 siblings, 0 replies; 24+ messages in thread
From: jamal @ 2005-03-31  3:16 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Wed, 2005-03-30 at 18:53, Stephen Hemminger wrote:

> How about percentages instead of multiple sysctl values? Or some relationship
> of max_queue and max_backlog.
> 	success  qlen < max_backlog
> 	low 	 qlen > max_backlog
> 	medium   qlen > max_queue/2
> 	high	 qlen > max_queue - max_backlog
> 	drop	 qlen > max_queue
> 

Well, you still need the moving window average computation to detect the
second-order effect (of an oncoming tsunami).
Unless I misunderstood, you are suggesting looking at instantaneous
values.
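
For reference, the computation being removed above is essentially an
exponentially weighted moving average of the backlog; a minimal sketch
of the idea (helper name made up, same arithmetic as get_sample_stats):

/* Each sample, the new average is half the old average plus half the
 * instantaneous queue length, so a single short burst does not
 * immediately read as congestion.
 */
static int rx_avg_backlog(int avg_blog, int qlen)
{
	return (avg_blog >> 1) + (qlen >> 1);
}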

> Also, RAND_LIE (dead code) is
> kind of confusing because I expected it to be a receive version of Random
> Drop, but it really just lies back to the caller (and keeps the packet).

If you have drivers which look at the feedback value, then they will
back off. We don't drop, but we do hope that by telling the driver to back
off it listens to us. Clearly, if you have drivers that don't listen,
then it's like running UDP on a wire with everyone trying to shove
packets.
For some numbers and experiments on this stuff circa 1999/2000 look at:
http://robur.slu.se/Linux/net-development/jamal/FF-html/
You may find at least one entertaining:
http://robur.slu.se/Linux/net-development/jamal/FF-html/img20.htm

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC] netif_rx: receive path optimization
  2005-03-30 21:28 [PATCH] netif_rx: receive path optimization Stephen Hemminger
  2005-03-30 21:57 ` jamal
@ 2005-03-31 20:04 ` Stephen Hemminger
  2005-03-31 21:10   ` Jamal Hadi Salim
  1 sibling, 1 reply; 24+ messages in thread
From: Stephen Hemminger @ 2005-03-31 20:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: Jamal Hadi Salim, netdev

Here is another alternative that seems better than the earlier posting. It uses
a per-device receive queue for non-NAPI devices.  The only issue is that we then
lose the per-cpu queues, and that could impact loopback device performance.
If that is really an issue, then the per-cpu magic should be moved to the loopback
device.

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/03/31 11:51:14-08:00 shemminger@linux.site 
#   Use per-device rx_queue for non NAPI devices.
# 
# net/core/dev.c
#   2005/03/31 11:51:00-08:00 shemminger@linux.site +28 -57
#   Use per-device rx_queue for non NAPI devices.
# 
# include/linux/netdevice.h
#   2005/03/31 11:51:00-08:00 shemminger@linux.site +2 -7
#   Use per-device rx_queue for non NAPI devices.
# 
# ChangeSet
#   2005/03/30 12:02:44-08:00 shemminger@linux.site 
#   netif_rx redux:
#     - eliminate vestiges of fastroute
#     - get rid of high/med/low return never used
#     - get rid of weight_p since setting sysctl has no effect
#     - separate out max packets per softirq vs. max queued packets
#     - increase queue defaults to meet modern CPU speeds
#     - switch to pure drop tail when queue fills
# 
# net/core/sysctl_net_core.c
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +5 -42
#   update net_core_sysctl
# 
# net/core/dev.c
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +26 -136
#   cleanup of netif_rx path.
# 
# include/linux/sysctl.h
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +1 -0
#   add max queue sysctl
# 
# include/linux/netdevice.h
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +0 -6
#   Get rid of unused statistics
# 
diff -Nru a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h	2005-03-31 11:52:39 -08:00
+++ b/include/linux/netdevice.h	2005-03-31 11:52:39 -08:00
@@ -164,12 +164,6 @@
 	unsigned total;
 	unsigned dropped;
 	unsigned time_squeeze;
-	unsigned throttled;
-	unsigned fastroute_hit;
-	unsigned fastroute_success;
-	unsigned fastroute_defer;
-	unsigned fastroute_deferred_out;
-	unsigned fastroute_latency_reduction;
 	unsigned cpu_collision;
 };
 
@@ -362,6 +356,7 @@
 	void			*ec_ptr;	/* Econet specific data	*/
 	void			*ax25_ptr;	/* AX.25 specific data */
 
+	struct sk_buff_head	rx_queue;	/* Receive queue (non NAPI) */
 	struct list_head	poll_list;	/* Link to poll list	*/
 	int			quota;
 	int			weight;
@@ -562,15 +557,9 @@
 
 struct softnet_data
 {
-	int			throttle;
-	int			cng_level;
-	int			avg_blog;
-	struct sk_buff_head	input_pkt_queue;
-	struct list_head	poll_list;
 	struct net_device	*output_queue;
+	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
-
-	struct net_device	backlog_dev;	/* Sorry. 8) */
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h	2005-03-31 11:52:39 -08:00
+++ b/include/linux/sysctl.h	2005-03-31 11:52:39 -08:00
@@ -242,6 +242,7 @@
 	NET_CORE_MOD_CONG=16,
 	NET_CORE_DEV_WEIGHT=17,
 	NET_CORE_SOMAXCONN=18,
+	NET_CORE_MAX_QUEUE=19,
 };
 
 /* /proc/sys/net/ethernet */
diff -Nru a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c	2005-03-31 11:52:39 -08:00
+++ b/net/core/dev.c	2005-03-31 11:52:39 -08:00
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate.  It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -159,11 +147,6 @@
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -215,7 +198,7 @@
  *	Device drivers call our routines to queue packets here. We empty the
  *	queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -1338,70 +1321,11 @@
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
-int weight_p = 64;            /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
-
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
-
-
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-			if (rq < avg_blog) /* unlucky bastard */
-				sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else  /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
+/* Reasonably fast CPU can process 1 packet per us */
+int netdev_max_backlog = 1000;
+int netdev_max_queue   = 10000;
 
+DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
 
 /**
  *	netif_rx	-	post buffer to the network code
@@ -1414,18 +1338,13 @@
  *
  *	return values:
  *	NET_RX_SUCCESS	(no congestion)
- *	NET_RX_CN_LOW   (low congestion)
- *	NET_RX_CN_MOD   (moderate congestion)
- *	NET_RX_CN_HIGH  (high congestion)
  *	NET_RX_DROP     (packet was dropped)
  *
  */
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
-	struct softnet_data *queue;
-	unsigned long flags;
+	struct net_device *dev = skb->dev;
 
 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -1434,48 +1353,20 @@
 	if (!skb->stamp.tv_sec)
 		net_timestamp(&skb->stamp);
 
-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	this_cpu = smp_processor_id();
-	queue = &__get_cpu_var(softnet_data);
-
 	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
-enqueue:
-			dev_hold(skb->dev);
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
-			local_irq_restore(flags);
-			return queue->cng_level;
-		}
+	if (likely(skb_queue_len(&dev->rx_queue) <= netdev_max_queue)) {
+		dev_hold(skb->dev);
+		skb_queue_tail(&dev->rx_queue, skb);
 
-		if (queue->throttle)
-			queue->throttle = 0;
+		if (!test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state))
+			__netif_rx_schedule(dev);
 
-		netif_rx_schedule(&queue->backlog_dev);
-		goto enqueue;
-	}
-
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
+		return NET_RX_SUCCESS;
+	} else {
+		__get_cpu_var(netdev_rx_stat).dropped++;
+		kfree_skb(skb);
+		return NET_RX_DROP;
 	}
-
-drop:
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
-
-	kfree_skb(skb);
-	return NET_RX_DROP;
 }
 
 int netif_rx_ni(struct sk_buff *skb)
@@ -1712,51 +1603,30 @@
 	return ret;
 }
 
-static int process_backlog(struct net_device *backlog_dev, int *budget)
+static int netrx_nonapi_poll(struct net_device *dev, int *budget)
 {
+	struct sk_buff *skb;
 	int work = 0;
-	int quota = min(backlog_dev->quota, *budget);
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	int quota = min(dev->quota, *budget);
 	unsigned long start_time = jiffies;
 
-	for (;;) {
-		struct sk_buff *skb;
-		struct net_device *dev;
-
-		local_irq_disable();
-		skb = __skb_dequeue(&queue->input_pkt_queue);
-		if (!skb)
-			goto job_done;
-		local_irq_enable();
-
-		dev = skb->dev;
-
+	while ((skb = skb_dequeue(&dev->rx_queue)) != NULL) {
 		netif_receive_skb(skb);
 
 		dev_put(dev);
 
 		work++;
 
-		if (work >= quota || jiffies - start_time > 1)
-			break;
-
+		if (work >= quota || jiffies - start_time > 1) {
+			dev->quota -= work;
+			*budget -= work;
+			return 1;	/* not done */
+		}
 	}
 
-	backlog_dev->quota -= work;
+	dev->quota -= work;
 	*budget -= work;
-	return -1;
-
-job_done:
-	backlog_dev->quota -= work;
-	*budget -= work;
-
-	list_del(&backlog_dev->poll_list);
-	smp_mb__before_clear_bit();
-	netif_poll_enable(backlog_dev);
-
-	if (queue->throttle)
-		queue->throttle = 0;
-	local_irq_enable();
+	netif_rx_complete(dev);
 	return 0;
 }
 
@@ -2024,20 +1894,18 @@
 {
 }
 
+/* Output softnet statistics.
+ * For compatibility include zeros for old deprecated values
+ * for throttling and fastroute statistics.
+ */
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		  );
+		   s->total, s->dropped, s->time_squeeze, 
+		   0, 0, 0, 0, 0,
+		   s->cpu_collision);
 	return 0;
 }
 
@@ -2722,6 +2590,7 @@
 
 	spin_lock_init(&dev->queue_lock);
 	spin_lock_init(&dev->xmit_lock);
+	skb_queue_head_init(&dev->rx_queue);
 	dev->xmit_lock_owner = -1;
 #ifdef CONFIG_NET_CLS_ACT
 	spin_lock_init(&dev->ingress_lock);
@@ -2790,6 +2659,14 @@
 		dev->rebuild_header = default_rebuild_header;
 
 	/*
+	 *	Simulate NAPI for non-NAPI devices
+	 */
+	if (!dev->poll) {
+		dev->weight = 64;
+		dev->poll = netrx_nonapi_poll;
+	}
+
+	/*
 	 *	Default initial state at registry is that the
 	 *	device is present.
 	 */
@@ -3275,25 +3152,9 @@
 	 */
 
 	for (i = 0; i < NR_CPUS; i++) {
-		struct softnet_data *queue;
-
-		queue = &per_cpu(softnet_data, i);
-		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
-		queue->completion_queue = NULL;
+		struct softnet_data *queue = &per_cpu(softnet_data, i);
 		INIT_LIST_HEAD(&queue->poll_list);
-		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
-		queue->backlog_dev.weight = weight_p;
-		queue->backlog_dev.poll = process_backlog;
-		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
-
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
 
 	dev_boot_phase = 0;
 
diff -Nru a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
--- a/net/core/sysctl_net_core.c	2005-03-31 11:52:39 -08:00
+++ b/net/core/sysctl_net_core.c	2005-03-31 11:52:39 -08:00
@@ -13,12 +13,8 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
-extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
+extern int netdev_max_queue;
+
 extern int net_msg_cost;
 extern int net_msg_burst;
 
@@ -27,7 +23,6 @@
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
-extern int sysctl_core_destroy_delay;
 extern int sysctl_optmem_max;
 extern int sysctl_somaxconn;
 
@@ -83,14 +78,6 @@
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_DEV_WEIGHT,
-		.procname	= "dev_weight",
-		.data		= &weight_p,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
 		.ctl_name	= NET_CORE_MAX_BACKLOG,
 		.procname	= "netdev_max_backlog",
 		.data		= &netdev_max_backlog,
@@ -99,33 +86,9 @@
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_NO_CONG_THRESH,
-		.procname	= "no_cong_thresh",
-		.data		= &no_cong_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG,
-		.procname	= "no_cong",
-		.data		= &no_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_LO_CONG,
-		.procname	= "lo_cong",
-		.data		= &lo_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_MOD_CONG,
-		.procname	= "mod_cong",
-		.data		= &mod_cong,
+		.ctl_name	= NET_CORE_MAX_QUEUE,
+		.procname	= "netdev_max_queue",
+		.data		= &netdev_max_queue,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 20:04 ` [RFC] " Stephen Hemminger
@ 2005-03-31 21:10   ` Jamal Hadi Salim
  2005-03-31 21:17     ` Stephen Hemminger
  2005-03-31 21:24     ` Rick Jones
  0 siblings, 2 replies; 24+ messages in thread
From: Jamal Hadi Salim @ 2005-03-31 21:10 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Thu, 2005-03-31 at 15:04, Stephen Hemminger wrote:
> Here is another alternative that seems better than the earlier posting. It uses
> a per device receive queue for non-NAPI devices.  The only issue is that then
> we lose the per-cpu queue's and that could impact the loopback device performance.
> If that is really an issue, then the per-cpu magic should be moved to the loopback
> device.
> 

The repercussions of going from a per-CPU-for-all-devices queue
(introduced by softnet) to per-device-for-all-CPUs may be huge in my
opinion, especially on SMP. A closer match to what's there now may be a
per-device-per-CPU backlog queue.
I think performance will be impacted on all devices. IMO, whatever
goes in needs to have some experimental data to back it.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:10   ` Jamal Hadi Salim
@ 2005-03-31 21:17     ` Stephen Hemminger
  2005-03-31 21:25       ` Jamal Hadi Salim
  2005-03-31 21:43       ` Eric Lemoine
  2005-03-31 21:24     ` Rick Jones
  1 sibling, 2 replies; 24+ messages in thread
From: Stephen Hemminger @ 2005-03-31 21:17 UTC (permalink / raw)
  To: hadi; +Cc: David S. Miller, netdev

On 31 Mar 2005 16:10:32 -0500
Jamal Hadi Salim <hadi@znyx.com> wrote:

> On Thu, 2005-03-31 at 15:04, Stephen Hemminger wrote:
> > Here is another alternative that seems better than the earlier posting. It uses
> > a per device receive queue for non-NAPI devices.  The only issue is that then
> > we lose the per-cpu queue's and that could impact the loopback device performance.
> > If that is really an issue, then the per-cpu magic should be moved to the loopback
> > device.
> > 
> 
> The repurcassions of going from per-CPU-for-all-devices queue
> (introduced by softnet) to per-device-for-all-CPUs maybe huge in my
> opinion especially in SMP. A closer view of whats there now maybe
> per-device-per-CPU backlog queue.

Any real hardware only has a single receive packet source (the interrupt routine),
and the only collision would be in the case of interrupt migration.  So having
per-device-per-CPU queues would be overkill and more complex because
the NAPI scheduling is per-netdevice rather than per-queue (though that
could be fixed).

> I think performance will be impacted in all devices. imo, whatever needs
> to go in needs to have some experimental data to back it

Experiment with what? Proving an absolute negative is impossible.
I will test loopback and non-NAPI versions of a couple of gigabit drivers
to see.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:10   ` Jamal Hadi Salim
  2005-03-31 21:17     ` Stephen Hemminger
@ 2005-03-31 21:24     ` Rick Jones
  2005-03-31 21:38       ` jamal
  2005-04-01 16:40       ` Andi Kleen
  1 sibling, 2 replies; 24+ messages in thread
From: Rick Jones @ 2005-03-31 21:24 UTC (permalink / raw)
  To: netdev

> The repurcassions of going from per-CPU-for-all-devices queue
> (introduced by softnet) to per-device-for-all-CPUs maybe huge in my
> opinion especially in SMP. A closer view of whats there now maybe
> per-device-per-CPU backlog queue.
> I think performance will be impacted in all devices. imo, whatever needs
> to go in needs to have some experimental data to back it

Indeed.

At the risk of again chewing on my toes (yum), if multiple CPUs are pulling 
packets from the per-device queue there will be packet reordering.  HP-UX 10.0 
did just that and it was quite nasty even at low CPU counts (<=4).  It was 
changed by HP-UX 10.20 (ca 1995) to per-CPU queues with queue selection computed 
from packet headers (hash the IP and TCP/UDP header to pick a CPU). It was called 
IPS, for Inbound Packet Scheduling.  11.0 (ca 1998) later changed that to "find 
where the connection last ran and queue to that CPU". That was called TOPS - 
Thread Optimized Packet Scheduling.
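
A sketch of that kind of header-hash queue selection (not the HP-UX
code; the helper name is made up) might look like:

#include <linux/ip.h>
#include <linux/tcp.h>

/* Hash addresses and ports so every packet of a given flow lands on the
 * same per-CPU queue, which preserves per-flow ordering.
 */
static unsigned int ips_pick_cpu(const struct iphdr *iph,
				 const struct tcphdr *th,
				 unsigned int ncpus)
{
	unsigned int h = (unsigned int)(iph->saddr ^ iph->daddr);

	h ^= ((unsigned int)th->source << 16) | th->dest;
	h ^= h >> 16;
	return h % ncpus;
}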

fwiw,

rick jones

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:17     ` Stephen Hemminger
@ 2005-03-31 21:25       ` Jamal Hadi Salim
  2005-03-31 21:43       ` Eric Lemoine
  1 sibling, 0 replies; 24+ messages in thread
From: Jamal Hadi Salim @ 2005-03-31 21:25 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Thu, 2005-03-31 at 16:17, Stephen Hemminger wrote:

> Any real hardware only has a single receive packet source (the interrupt routine),
> and the only collision would be in the case of interrupt migration.  So having
> per-device-per-CPU queue's would be overkill and more complex because
> the NAPI scheduling is per-netdevice rather than per-queue (though that
> could be fixed).

The idea behind the current per-CPU queues is to avoid cache
ping-ponging; the same queue shared across multiple CPUs with round-robin
interrupts will get expensive. In other words, these non-NAPI devices will
be migrating across CPUs based on interrupts a lot more under heavy
traffic.
In the case of NAPI, the issue doesn't exist: a device stays on the same
queue until all packets are offloaded from it. Depending on CPU capacity,
it could stay forever on the same CPU.

So my suggestion to do per-CPU queues for these devices is to avoid that.

> > I think performance will be impacted in all devices. imo, whatever needs
> > to go in needs to have some experimental data to back it
> 
> Experiment with what? Proving an absolute negative is impossible.
> I will test loopback and non-NAPI version of a couple of gigabit drivers
> to see. 

I think that will do. I don't know how heavy a traffic load you can pound it with.
Collecting and comparing some profiles between the two schemes will help.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:24     ` Rick Jones
@ 2005-03-31 21:38       ` jamal
  2005-03-31 22:42         ` Rick Jones
  2005-04-01 16:40       ` Andi Kleen
  1 sibling, 1 reply; 24+ messages in thread
From: jamal @ 2005-03-31 21:38 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

On Thu, 2005-03-31 at 16:24, Rick Jones wrote:
> > The repurcassions of going from per-CPU-for-all-devices queue
> > (introduced by softnet) to per-device-for-all-CPUs maybe huge in my
> > opinion especially in SMP. A closer view of whats there now maybe
> > per-device-per-CPU backlog queue.
> > I think performance will be impacted in all devices. imo, whatever needs
> > to go in needs to have some experimental data to back it
> 
> Indeed.
> 
> At the risk of again chewing on my toes (yum), if multiple CPUs are pulling 
> packets from the per-device queue there will be packet reordering. 

;-> This happens already _today_ on Linux on non-NAPI.

Take the following scenario in non-NAPI. 
-packet 1 arrives 
-interrupt happens, NIC bound to CPU0
- in the meantime packets 2,3 arrive
- 3 packets put on queue for CPU0
- interrupt processing done

- packet 4 arrives, interrupt, CPU1 is bound to NIC
- in the meantime packets 5,6 arrive
- CPU1 backlog queue used.
- interrupt processing done

Assume CPU0 is overloaded with other system work and CPU1 rx processing
kicks in first ...
TCP sees packets 4, 5, 6 before 1, 2, 3.

Note Linux is quite resilient to reordering compared to other OSes (as
you may know), but avoiding this is a better approach - hence my
suggestion to use NAPI when you want to do serious TCP.

Of course, NAPI is not a total panacea: under low traffic it eats a little
bit more CPU (but if you have CPU issues under low load you are in some
other deep shit).

>  HP-UX 10.0 
> did just that and it was quite nasty even at low CPU counts (<=4).  It was 
> changed by HP-UX 10.20 (ca 1995) to per-CPU queues with queue selection computed 
> from packet headers (hash the IP and TCP/UDP header to pick a CPU) It was called 
> IPS for Inbound Packet Scheduling.  11.0 (ca 1998) later changed that to "find 
> where the connection last ran and queue to that CPU" That was called TOPS - 
> Thread Optimized Packet Scheduling.
> 

Don't think we can do that, unfortunately: we are screwed by the APIC
architecture on x86.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:17     ` Stephen Hemminger
  2005-03-31 21:25       ` Jamal Hadi Salim
@ 2005-03-31 21:43       ` Eric Lemoine
  2005-03-31 22:02         ` Stephen Hemminger
  1 sibling, 1 reply; 24+ messages in thread
From: Eric Lemoine @ 2005-03-31 21:43 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: hadi, David S. Miller, netdev

> > > Here is another alternative that seems better than the earlier posting. It uses
> > > a per device receive queue for non-NAPI devices.  The only issue is that then
> > > we lose the per-cpu queue's and that could impact the loopback device performance.
> > > If that is really an issue, then the per-cpu magic should be moved to the loopback
> > > device.
> > >
> >
> > The repurcassions of going from per-CPU-for-all-devices queue
> > (introduced by softnet) to per-device-for-all-CPUs maybe huge in my
> > opinion especially in SMP. A closer view of whats there now maybe
> > per-device-per-CPU backlog queue.
> 
> Any real hardware only has a single receive packet source (the interrupt routine),
> and the only collision would be in the case of interrupt migration.  So having
> per-device-per-CPU queue's would be overkill and more complex because
> the NAPI scheduling is per-netdevice rather than per-queue (though that
> could be fixed).
> 
> > I think performance will be impacted in all devices. imo, whatever needs
> > to go in needs to have some experimental data to back it
> 
> Experiment with what? Proving an absolute negative is impossible.
> I will test loopback and non-NAPI version of a couple of gigabit drivers
> to see.

Just a naive question: why try to accelerate netif_rx at all?
Isn't NAPI the best choice for high-performance rx anyway?


-- 
Eric

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:43       ` Eric Lemoine
@ 2005-03-31 22:02         ` Stephen Hemminger
  0 siblings, 0 replies; 24+ messages in thread
From: Stephen Hemminger @ 2005-03-31 22:02 UTC (permalink / raw)
  To: Eric Lemoine; +Cc: hadi, David S. Miller, netdev

On Thu, 31 Mar 2005 23:43:27 +0200
Eric Lemoine <eric.lemoine@gmail.com> wrote:

> > > > Here is another alternative that seems better than the earlier posting. It uses
> > > > a per device receive queue for non-NAPI devices.  The only issue is that then
> > > > we lose the per-cpu queue's and that could impact the loopback device performance.
> > > > If that is really an issue, then the per-cpu magic should be moved to the loopback
> > > > device.
> > > >
> > >
> > > The repurcassions of going from per-CPU-for-all-devices queue
> > > (introduced by softnet) to per-device-for-all-CPUs maybe huge in my
> > > opinion especially in SMP. A closer view of whats there now maybe
> > > per-device-per-CPU backlog queue.
> > 
> > Any real hardware only has a single receive packet source (the interrupt routine),
> > and the only collision would be in the case of interrupt migration.  So having
> > per-device-per-CPU queue's would be overkill and more complex because
> > the NAPI scheduling is per-netdevice rather than per-queue (though that
> > could be fixed).
> > 
> > > I think performance will be impacted in all devices. imo, whatever needs
> > > to go in needs to have some experimental data to back it
> > 
> > Experiment with what? Proving an absolute negative is impossible.
> > I will test loopback and non-NAPI version of a couple of gigabit drivers
> > to see.
> 
> Just a naive question : why at all trying to accelerate netif_rx?
> Isn't NAPI the best choice for high performance rx anyway?

It was a side effect of trying to clean out the dead code left over from
the removal of fastroute and other old experiments. netif_rx still gets used
for loopback, and people do benchmarks on loopback.  Also, NAPI doesn't
work for all cases, like layered devices and devices like wifi that need to
receive packets when the network device is "not running".

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:38       ` jamal
@ 2005-03-31 22:42         ` Rick Jones
  2005-03-31 23:03           ` Nivedita Singhvi
  2005-03-31 23:36           ` jamal
  0 siblings, 2 replies; 24+ messages in thread
From: Rick Jones @ 2005-03-31 22:42 UTC (permalink / raw)
  To: netdev

>>At the risk of again chewing on my toes (yum), if multiple CPUs are pulling 
>>packets from the per-device queue there will be packet reordering. 
> 
> 
> ;-> This happens already _today_ on Linux on non-NAPI.
> 
> Take the following scenario in non-NAPI. 
> -packet 1 arrives 
> -interupt happens, NIC bound to CPU0
> - in the meantime packets 2,3 arrive
> - 3 packets put on queue for CPU0
> - interupt processing done
> 
> - packet 4 arrives, interupt, CPU1 is bound to NIC
> - in the meantime packets 5,6 arrive
> - CPU1 backlog queue used.
> - interupt processing done
> 
> Assume CPU0 is overloaded with other systenm work and CPU1 rx processing
> kicks in first ... 
> TCP sees packet 4, 5, 6 before 1, 2, 3 ..

I "never" see that because I always bind a NIC to a specific CPU :)  Just about 
every networking-intensive benchmark report I've seen has done the same.
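
For reference, on Linux that binding is normally done by writing a CPU
mask to /proc/irq/<n>/smp_affinity; a small example, where the IRQ
number 24 and the mask are only placeholders:

#include <stdio.h>

/* Example only: pin IRQ 24 (find the NIC's real IRQ number in
 * /proc/interrupts) to CPU0 by writing a hex CPU mask.  Run as root.
 */
int main(void)
{
	FILE *f = fopen("/proc/irq/24/smp_affinity", "w");

	if (!f) {
		perror("smp_affinity");
		return 1;
	}
	fprintf(f, "1\n");	/* bitmask: CPU0 only */
	fclose(f);
	return 0;
}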

> Note Linux is quiet resilient to reordering compared to other OSes (as
> you may know) but avoiding this is a better approach - hence my
> suggestion to use NAPI when you want to do serious TCP.

Would the same apply to NIC->CPU interrupt assignments? That is, bind the NIC to 
a single CPU.

>> HP-UX 10.0 
>>did just that and it was quite nasty even at low CPU counts (<=4).  It was 
>>changed by HP-UX 10.20 (ca 1995) to per-CPU queues with queue selection computed 
>>from packet headers (hash the IP and TCP/UDP header to pick a CPU) It was called 
>>IPS for Inbound Packet Scheduling.  11.0 (ca 1998) later changed that to "find 
>>where the connection last ran and queue to that CPU" That was called TOPS - 
>>Thread Optimized Packet Scheduling.
>>
> 
> 
> Dont think we can do that unfortunately: We are screwed by the APIC
> architecture on x86.

The IPS and TOPS stuff was/is post-NIC-interrupt. Low-level driver processing 
still happened/s on a specific CPU; it is the higher-level processing which is 
done on another CPU.  The idea - with TOPS at least - is to try to access the ULP 
(TCP, UDP, etc.) structures on the same CPU as last accessed by the app, to 
minimize that cache-to-cache migration.
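
A rough sketch of the TOPS idea (made-up names, not HP-UX or Linux
code): the connection remembers which CPU the application last ran on,
and incoming packets are queued toward that CPU.

/* Hypothetical: per-connection record of where the app last ran; the
 * interrupt-level code just enqueues to that CPU's protocol queue.
 */
struct conn {
	int last_cpu;	/* updated whenever the app touches the socket */
	/* ... protocol state ... */
};

static int tops_pick_cpu(const struct conn *c, int irq_cpu)
{
	/* Fall back to the interrupting CPU for a brand-new connection. */
	return (c && c->last_cpu >= 0) ? c->last_cpu : irq_cpu;
}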

rick jones

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 22:42         ` Rick Jones
@ 2005-03-31 23:03           ` Nivedita Singhvi
  2005-03-31 23:28             ` Rick Jones
  2005-03-31 23:36           ` jamal
  1 sibling, 1 reply; 24+ messages in thread
From: Nivedita Singhvi @ 2005-03-31 23:03 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

Rick Jones wrote:

>> Take the following scenario in non-NAPI. -packet 1 arrives -interupt 
>> happens, NIC bound to CPU0
>> - in the meantime packets 2,3 arrive
>> - 3 packets put on queue for CPU0
>> - interupt processing done
>>
>> - packet 4 arrives, interupt, CPU1 is bound to NIC
>> - in the meantime packets 5,6 arrive
>> - CPU1 backlog queue used.
>> - interupt processing done
>>
>> Assume CPU0 is overloaded with other systenm work and CPU1 rx processing
>> kicks in first ... TCP sees packet 4, 5, 6 before 1, 2, 3 ..
> 
> 
> I "never" see that because I always bind a NIC to a specific CPU :)  
> Just about every networking-intensive benchmark report I've seen has 
> done the same.

Just a reminder that the networking-benchmark world and
the real networking deployment world have a less than desirable
intersection (which I know you know only too well, Rick ;)).
How often do people use affinity? How often do they really tune
the system for their workloads? How often do they turn off things
like SACK etc? Not very often in the real world. Designing OSs to
do better at benchmarks is a different proposition than designing
OSs to do well in the real world.

>> Note Linux is quiet resilient to reordering compared to other OSes (as
>> you may know) but avoiding this is a better approach - hence my
>> suggestion to use NAPI when you want to do serious TCP.

The real killer for TCP is triggering fast retransmit
unnecessarily - and while we have some reordering detection
and safeguards for that, for other situations, like apps
running over UDP that are unable to cope with reordering
(yes, there are dunderheads like that), there is not much
you can do. It does help them to avoid the reordering to
begin with.

thanks,
Nivedita

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 23:03           ` Nivedita Singhvi
@ 2005-03-31 23:28             ` Rick Jones
  2005-04-01  0:10               ` Stephen Hemminger
  2005-04-01  0:30               ` Nivedita Singhvi
  0 siblings, 2 replies; 24+ messages in thread
From: Rick Jones @ 2005-03-31 23:28 UTC (permalink / raw)
  To: netdev

>> I "never" see that because I always bind a NIC to a specific CPU :)  
>> Just about every networking-intensive benchmark report I've seen has 
>> done the same.
> 
> 
> Just a reminder that the networking-benchmark world and
> the real networking deployment world have a less than desirable
> intersection (which I know you know only too well, Rick ;)).

Touche :)

> How often do people use affinity? How often do they really tune
> the system for their workloads? 

Not as often as they should.

 > How often do they turn off things like SACK etc?

Well, I'm in an email discussion with someone who seems to bump their TCP 
windows quite large, and disable timestamps...

> Not very often in the real world. Designing OSs to
> do better at benchmarks is a different proposition than designing
> OSs to do well in the real world.

BTW, what is the real-world purpose of having multiple-CPU affinity for NIC 
interrupts?  I have to admit it seems rather alien to me (in the context of no 
onboard NIC smarts being involved, that is).

>>> Note Linux is quiet resilient to reordering compared to other OSes (as
>>> you may know) but avoiding this is a better approach - hence my
>>> suggestion to use NAPI when you want to do serious TCP.
> 
> 
> The real killer for TCP is triggering fast retransmit
> unnecessarily

Agreed.  That is doubleplusungood.

rick

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 22:42         ` Rick Jones
  2005-03-31 23:03           ` Nivedita Singhvi
@ 2005-03-31 23:36           ` jamal
  2005-04-01  0:07             ` Rick Jones
  1 sibling, 1 reply; 24+ messages in thread
From: jamal @ 2005-03-31 23:36 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

On Thu, 2005-03-31 at 17:42, Rick Jones wrote:

> I "never" see that because I always bind a NIC to a specific CPU :)  Just about 
> every networking-intensive benchmark report I've seen has done the same.
> 

Do you have to be so clever? ;->

> > Note Linux is quiet resilient to reordering compared to other OSes (as
> > you may know) but avoiding this is a better approach - hence my
> > suggestion to use NAPI when you want to do serious TCP.
> 
> Would the same apply to NIC->CPU interrupt assignments? That is, bind the NIC to 
> a single CPU.
> 

No reordering there.


> > Dont think we can do that unfortunately: We are screwed by the APIC
> > architecture on x86.
> 
> The IPS and TOPS stuff was/is post-NIC-interrupt. Low-level driver processing 
> still happened/s on a specific CPU, it is the higher-level processing which is 
> done on another CPU.  The idea - with TOPS at least, is to try to access the ULP 
> (TCP, UDP etc) structures on the same CPU as last accessed by the app to 
> minimize that cache to cache migration.
> 

But if the interrupt happens on the "wrong" CPU and you decide higher-level
processing is to be done on the "right" CPU (I assume queueing on some
per-CPU queue), then isn't that expensive? Perhaps even IPIs involved?

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 23:36           ` jamal
@ 2005-04-01  0:07             ` Rick Jones
  2005-04-01  1:17               ` jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Rick Jones @ 2005-04-01  0:07 UTC (permalink / raw)
  To: netdev

>>>Note Linux is quiet resilient to reordering compared to other OSes (as
>>>you may know) but avoiding this is a better approach - hence my
>>>suggestion to use NAPI when you want to do serious TCP.
>>
>>Would the same apply to NIC->CPU interrupt assignments? That is, bind the NIC to 
>>a single CPU.
>>
> 
> 
> No reordering there.

Ah, I wasn't clear - would someone doing serious TCP want to have the interrupts 
of a NIC go to a specific CPU?

>>>Dont think we can do that unfortunately: We are screwed by the APIC
>>>architecture on x86.
>>
>>The IPS and TOPS stuff was/is post-NIC-interrupt. Low-level driver processing 
>>still happened/s on a specific CPU, it is the higher-level processing which is 
>>done on another CPU.  The idea - with TOPS at least, is to try to access the ULP 
>>(TCP, UDP etc) structures on the same CPU as last accessed by the app to 
>>minimize that cache to cache migration.
>>
> 
> 
> But if interupt happens on "wrong" cpu - and you decide higher level
> processing is to be done on the "right" cpu (i assume queueing on some
> per CPU queue); then isnt that expensive? Perhaps IPIs involved even?

More expensive than if one were lucky enough to have the interrupt on the 
"right" CPU in the first place, but as the CPU count goes-up, the chances of 
that go down.  The main idea behind TOPS and prior to that IPS was to spread-out 
the processing of packets across as many CPUs as we could, as "correctly" as we 
could.  Lots of small packets meant/means that a NIC could saturate its 
interrupt CPU before the NIC was saturated.  You don't necessarily see that on 
say single-instance netperf TCP_STREAM (or basic FTP) testing, but certainly can 
on aggregate netperf TCP_RR testing.

IPS, being driven by the packet header info, was good enough for simple 
benchmarking, but once you had more than one connection per process/thread that 
wasn't going to cut it, and even with one connection per process telling the 
process where it should run wasn't terribly easy :)   It wasn't _that_ much more 
expensive than the queueing already happening - IPS was when HP-UX networking 
was BSDish and it was done when things were being queued to the netisr queue(s).

TOPS lets the process (I suppose the scheduler really) decide where some of the 
processing for the packet will happen - the part after the handoff.

rick

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 23:28             ` Rick Jones
@ 2005-04-01  0:10               ` Stephen Hemminger
  2005-04-01  0:42                 ` Rick Jones
  2005-04-01  0:30               ` Nivedita Singhvi
  1 sibling, 1 reply; 24+ messages in thread
From: Stephen Hemminger @ 2005-04-01  0:10 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

On Thu, 31 Mar 2005 15:28:16 -0800
Rick Jones <rick.jones2@hp.com> wrote:

> >> I "never" see that because I always bind a NIC to a specific CPU :)  
> >> Just about every networking-intensive benchmark report I've seen has 
> >> done the same.
> > 
> > 
> > Just a reminder that the networking-benchmark world and
> > the real networking deployment world have a less than desirable
> > intersection (which I know you know only too well, Rick ;)).
> 
> Touche :)
> 
> > How often do people use affinity? How often do they really tune
> > the system for their workloads? 
> 
> Not as often as they should.
> 
>  > How often do they turn off things like SACK etc?
> 
> Well, I'm in an email discussion with someone who seems to bump their TCP 
> windows quite large, and disable timestamps...

And do they like the resulting data corruption?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 23:28             ` Rick Jones
  2005-04-01  0:10               ` Stephen Hemminger
@ 2005-04-01  0:30               ` Nivedita Singhvi
  1 sibling, 0 replies; 24+ messages in thread
From: Nivedita Singhvi @ 2005-04-01  0:30 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

Rick Jones wrote:

> Well, I'm in an email discussion with someone who seems to bump their 
> TCP windows quite large, and disable timestamps...

Ah, an oldie but a goodie :), disabling route inheritance
as a result, bumping up their default rto back to 300ms, just
so many things that could go wrong there...

> BTW, what is the real-world purpose of having the multiple-CPU affinity 
> of NIC interrupts?  I have to admit it seems rather alien to me.  (In 
> the context of no onboard NIC smarts being involved, that is.)

Featuritis Bloatis ;). It's a marketing requirement :).


thanks,
Nivedita

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-04-01  0:10               ` Stephen Hemminger
@ 2005-04-01  0:42                 ` Rick Jones
  0 siblings, 0 replies; 24+ messages in thread
From: Rick Jones @ 2005-04-01  0:42 UTC (permalink / raw)
  To: netdev

>>Well, I'm in an email discussion with someone who seems to bump their TCP 
>>windows quite large, and disable timestamps...
> 
> 
> And do they like the resulting data corruption.

Minor nit - potential data corruption, perhaps even probable, but I don't think 
they are all that concerned yet - feeling secure in their belief that 2*MSL on a 
LAN is rather short indeed, and perhaps even on WANs when using 1GB TCP windows 
(although I may have mixed too much together there).
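
Back-of-the-envelope for why that belief gets shaky at today's speeds: the 
32-bit sequence space is 4 GB, so at 1 Gb/s (~125 MB/s) it wraps in roughly 
2^32 / 125e6 ~= 34 seconds, and at 10 Gb/s in about 3.4 seconds - well inside 
a conventional 2*MSL of 60-240 seconds.  That is exactly the interval in which 
an old duplicate can be mistaken for current data unless timestamps (PAWS) are 
there to reject it.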

Of course, if we believe that stacks should be smart enough to limit the initial 
receive windows (or does a setsockopt() actually override that?), and grow them 
over time based on what the transfer rates might be and the like, perhaps the 
stack should have a hard interlock between a TCP window >= 65535 and the 
timestamp option being on.  No timestamps, no window > 65535 bytes.  At present, 
it seems possible to have one without the other.  That is, of course, unless one 
is indeed on a "LAN" and _knows_ (somehow, given the existence of remote 
bridges) that it is a LAN.
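
A minimal sketch of that interlock (made-up names, not what any stack actually 
does today): when timestamps were not negotiated, clamp whatever window the 
application or autotuning asked for back to the unscaled 16-bit maximum.

	/* Hypothetical sketch of the suggested interlock: without the
	 * timestamp option negotiated, never advertise a receive window
	 * larger than the unscaled 16-bit maximum, no matter what
	 * setsockopt(SO_RCVBUF) or autotuning asked for. */
	#define TCP_MAX_UNSCALED_WINDOW	65535U

	static unsigned int clamp_advertised_window(unsigned int wanted,
						    int timestamps_negotiated)
	{
		if (!timestamps_negotiated && wanted > TCP_MAX_UNSCALED_WINDOW)
			return TCP_MAX_UNSCALED_WINDOW;	/* no timestamps, no big window */
		return wanted;
	}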

rick jones

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-04-01  0:07             ` Rick Jones
@ 2005-04-01  1:17               ` jamal
  2005-04-01 18:22                 ` Rick Jones
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2005-04-01  1:17 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

On Thu, 2005-03-31 at 19:07, Rick Jones wrote:

> Ah, I wasn't clear - would someone doing serious TCP want to have the interrupts 
> of a NIC go to a specific CPU?
> 

Not sure I followed:
Your TCP app (a server, probably) is running on CPU X;
you therefore want to tie the NIC it goes out on to the same CPU X?

AFAIK, the Linux scheduler will reschedule a process on the last CPU it was
running on if possible - so if you bind a NIC to some CPU, it is likely
that that CPU will also run the process.  Just handwaving - never tried to
observe it.
You could bind processes to CPUs (process affinity) from user space, but
then also make sure you bind the NIC to that CPU statically.
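
For what it's worth, a minimal user-space sketch of doing both halves by hand 
(the CPU number and IRQ number below are just placeholders): pin the process 
with sched_setaffinity(), and pin the NIC's interrupt by writing a CPU mask 
into /proc/irq/<irq>/smp_affinity.

	/* Sketch of binding both halves by hand (CPU 2 and IRQ 16 are
	 * placeholders): pin this process to CPU 2; the NIC's interrupt is
	 * pinned separately via /proc/irq/<irq>/smp_affinity. */
	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t mask;

		CPU_ZERO(&mask);
		CPU_SET(2, &mask);		/* run only on CPU 2 */
		if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
			perror("sched_setaffinity");

		/* the interrupt side is done from a shell, for example:
		 *   echo 4 > /proc/irq/16/smp_affinity    (mask 0x4 = CPU 2)
		 * so the interrupt, softirq work and the process share a CPU */
		return 0;
	}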

> More expensive than if one were lucky enough to have the interrupt on the 
> "right" CPU in the first place, but as the CPU count goes up, the chances of 
> that go down.

Indeed.

> The main idea behind TOPS and prior to that IPS was to spread out 
> the processing of packets across as many CPUs as we could, as "correctly" as we 
> could.

Very, very hard to do.  Isn't MSI supposed to give you the ability for a
NIC to pick a CPU to interrupt?  That would help in a small way.

>   Lots of small packets meant/means that a NIC could saturate its 
> interrupt CPU before the NIC was saturated.  You don't necessarily see that on 
> say single-instance netperf TCP_STREAM (or basic FTP) testing, but certainly can 
> on aggregate netperf TCP_RR testing.
> 
> IPS, being driven by the packet header info, was good enough for simple 
> benchmarking, but once you had more than one connection per process/thread that 
> wasn't going to cut it, and even with one connection per process telling the 
> process where it should run wasn't terribly easy :)   It wasn't _that_ much more 
> expensive than the queueing already happening - IPS was when HP-UX networking 
> was BSDish and it was done when things were being queued to the netisr queue(s).
> 
> TOPS lets the process (I suppose the scheduler really) decide where some of the 
> processing for the packet will happen - the part after the handoff.
> 

I think this last part should be easy to do - but perhaps the expense of
landing on the wrong CPU may outweigh any perceived benefits.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-03-31 21:24     ` Rick Jones
  2005-03-31 21:38       ` jamal
@ 2005-04-01 16:40       ` Andi Kleen
  1 sibling, 0 replies; 24+ messages in thread
From: Andi Kleen @ 2005-04-01 16:40 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

Rick Jones <rick.jones2@hp.com> writes:

> At the risk of again chewing on my toes (yum), if multiple CPUs are
> pulling packets from the per-device queue there will be packet
> reordering.  HP-UX 10.0 did just that and it was quite nasty even at
> low CPU counts (<=4).  It was changed by HP-UX 10.20 (ca 1995) to
> per-CPU queues with queue selection computed from packet headers (hash
> the IP and TCP/UDP header to pick a CPU) It was called IPS for Inbound
> Packet Scheduling.  11.0 (ca 1998) later changed that to "find where
> the connection last ran and queue to that CPU" That was called TOPS -
> Thread Optimized Packet Scheduling.

We went over this a lot several years ago when Linux got multi-threaded RX
with softnet in 2.1.  You might want to go over the archives.

Some things that came out of it were a sender-side TCP optimization
to tolerate reordering without slowing down (works great with other
Linux peers) and NAPI-style polling mode (which was mostly designed for routing
and still seems to have regressions for the client/server case :/).

Something like TOPS was discussed, but AFAIK nobody ever implemented
it.  Of course, benchmark guys do it manually by setting interrupt
and scheduler affinity.

-Andi

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC] netif_rx: receive path optimization
  2005-04-01  1:17               ` jamal
@ 2005-04-01 18:22                 ` Rick Jones
  0 siblings, 0 replies; 24+ messages in thread
From: Rick Jones @ 2005-04-01 18:22 UTC (permalink / raw)
  To: netdev

>>The main idea behind TOPS and prior to that IPS was to spread out 
>>the processing of packets across as many CPUs as we could, as "correctly" as we 
>>could.
> 
> 
> Very very hard to do. 

Why do you say that?  "Correct" can be defined as either the same CPU for each 
packet in a given flow (IPS) or the same CPU that last accessed the endpoint (TOPS).

> Isnt MSI supposed to give you ability such that a 
> NIC can pick a CPU to interupt? That would help in a small way

That gives the NIC the knowledge of how to direct to a CPU, but as you know it does 
not tell it how to decide where.  Since I doubt that the NIC wants to reach out 
and touch connection state in the host (nor, I suppose, do we want it to), the 
best a NIC with MSI could do would be IPS.

>>TOPS lets the process (I suppose the scheduler really) decide where some of the 
>>processing for the packet will happen - the part after the handoff.
>>
> 
> I think this last part should be easy to do - but perhaps the expense of
> landing on the wrong CPU may override any benefits perceived.

Unless one has a scheduler that likes to migrate processes, the chances of 
landing on the wrong CPU are minimal and short-lived, and overall the chances of 
being right are greater than if one does nothing and sticks with the 
interrupt CPU.  (Handwaving based on experience-driven intuition and a bit of 
math as one increases the CPU count.)  This is all on the premise that one is 
running with numNIC << numCPU.  With numNIC == numCPU one does things as seen in 
certain networking-intensive benchmarks :)

rick jones

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2005-04-01 18:22 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-03-30 21:28 [PATCH] netif_rx: receive path optimization Stephen Hemminger
2005-03-30 21:57 ` jamal
2005-03-30 22:08   ` jamal
2005-03-30 23:53   ` Stephen Hemminger
2005-03-31  3:16     ` jamal
2005-03-31 20:04 ` [RFC] " Stephen Hemminger
2005-03-31 21:10   ` Jamal Hadi Salim
2005-03-31 21:17     ` Stephen Hemminger
2005-03-31 21:25       ` Jamal Hadi Salim
2005-03-31 21:43       ` Eric Lemoine
2005-03-31 22:02         ` Stephen Hemminger
2005-03-31 21:24     ` Rick Jones
2005-03-31 21:38       ` jamal
2005-03-31 22:42         ` Rick Jones
2005-03-31 23:03           ` Nivedita Singhvi
2005-03-31 23:28             ` Rick Jones
2005-04-01  0:10               ` Stephen Hemminger
2005-04-01  0:42                 ` Rick Jones
2005-04-01  0:30               ` Nivedita Singhvi
2005-03-31 23:36           ` jamal
2005-04-01  0:07             ` Rick Jones
2005-04-01  1:17               ` jamal
2005-04-01 18:22                 ` Rick Jones
2005-04-01 16:40       ` Andi Kleen
