* [PATCH] netif_rx: receive path optimization
@ 2005-03-30 21:28 Stephen Hemminger
From: Stephen Hemminger @ 2005-03-30 21:28 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
This patch cleans up the netif_rx and related code in the network
receive core.
- Eliminate vestiges of fastroute.
  The leftover statistics are no longer needed.

- Get rid of the high/med/low threshold return from netif_rx.
  Drivers rarely check the return value of netif_rx, and those
  that do can handle the DROP vs. SUCCESS return.

- Remove dead code for RAND_LIE and OFFLINE_SAMPLE.

- Get rid of weight_p since setting the sysctl has no effect.
  Increase the default weight of the netif_rx path because it can receive
  packets from multiple devices and loopback.

- Separate out max packets per softirq vs. max queued packets.
  Today, netdev_max_backlog is used for both. Add a new parameter
  for the per-cpu maximum of queued packets.

- Increase queue defaults to match modern CPU speeds.
  Make max_backlog be about 1 ms of packets, and max_queue about 10 ms.

- Switch to pure drop tail when the queue fills.
  It is better for TCP performance under load to drop a few packets
  than to go into full discard mode.

This needs more testing on a range of hardware before possible inclusion
in 2.6.13. I will split it out into finer-grained patches then.
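
For reference, the "about 1 ms" and "about 10 ms" figures map onto the new
defaults if one assumes the roughly one-packet-per-microsecond rate stated in
the patch comment below. A small stand-alone sketch of that arithmetic
(illustration only, not part of the patch):

/* Sizing sketch: assumes ~1 packet per microsecond, as the patch states;
 * the numbers, not the code, are the point.
 */
#include <stdio.h>

int main(void)
{
	const long pkts_per_sec = 1000 * 1000;	/* ~1 packet/us assumption */
	long max_backlog = pkts_per_sec / 1000;	/* ~1 ms  -> 1000  */
	long max_queue   = pkts_per_sec / 100;	/* ~10 ms -> 10000 */

	printf("netdev_max_backlog ~ %ld\n", max_backlog);
	printf("netdev_max_queue   ~ %ld\n", max_queue);
	return 0;
}
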
diff -Nru a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h 2005-03-30 13:17:14 -08:00
+++ b/include/linux/netdevice.h 2005-03-30 13:17:14 -08:00
@@ -164,12 +164,6 @@
unsigned total;
unsigned dropped;
unsigned time_squeeze;
- unsigned throttled;
- unsigned fastroute_hit;
- unsigned fastroute_success;
- unsigned fastroute_defer;
- unsigned fastroute_deferred_out;
- unsigned fastroute_latency_reduction;
unsigned cpu_collision;
};
diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h 2005-03-30 13:17:14 -08:00
+++ b/include/linux/sysctl.h 2005-03-30 13:17:14 -08:00
@@ -242,6 +242,7 @@
NET_CORE_MOD_CONG=16,
NET_CORE_DEV_WEIGHT=17,
NET_CORE_SOMAXCONN=18,
+ NET_CORE_MAX_QUEUE=19,
};
/* /proc/sys/net/ethernet */
diff -Nru a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c 2005-03-30 13:17:14 -08:00
+++ b/net/core/dev.c 2005-03-30 13:17:14 -08:00
@@ -115,18 +115,6 @@
#endif /* CONFIG_NET_RADIO */
#include <asm/current.h>
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate. It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -159,11 +147,6 @@
static struct list_head ptype_base[16]; /* 16 way hashed list */
static struct list_head ptype_all; /* Taps */
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
/*
* The @dev_base list is protected by @dev_base_lock and the rtln
* semaphore.
@@ -215,7 +198,7 @@
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data);
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
@@ -1338,70 +1321,11 @@
Receiver routines
=======================================================================*/
-int netdev_max_backlog = 300;
-int weight_p = 64; /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
-
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
-
-
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
- unsigned long rd;
- int rq;
-#endif
- struct softnet_data *sd = &per_cpu(softnet_data, cpu);
- int blog = sd->input_pkt_queue.qlen;
- int avg_blog = sd->avg_blog;
-
- avg_blog = (avg_blog >> 1) + (blog >> 1);
-
- if (avg_blog > mod_cong) {
- /* Above moderate congestion levels. */
- sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
- rd = net_random();
- rq = rd % netdev_max_backlog;
- if (rq < avg_blog) /* unlucky bastard */
- sd->cng_level = NET_RX_DROP;
-#endif
- } else if (avg_blog > lo_cong) {
- sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
- rd = net_random();
- rq = rd % netdev_max_backlog;
- if (rq < avg_blog) /* unlucky bastard */
- sd->cng_level = NET_RX_CN_HIGH;
-#endif
- } else if (avg_blog > no_cong)
- sd->cng_level = NET_RX_CN_LOW;
- else /* no congestion */
- sd->cng_level = NET_RX_SUCCESS;
-
- sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
- int next_tick = 1;
- int cpu = smp_processor_id();
-
- get_sample_stats(cpu);
- next_tick += jiffies;
- mod_timer(&samp_timer, next_tick);
-}
-#endif
+/* Reasonably fast CPU can process 1 packet per us */
+int netdev_max_backlog = 1000;
+int netdev_max_queue = 10000;
+DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
/**
* netif_rx - post buffer to the network code
@@ -1414,16 +1338,12 @@
*
* return values:
* NET_RX_SUCCESS (no congestion)
- * NET_RX_CN_LOW (low congestion)
- * NET_RX_CN_MOD (moderate congestion)
- * NET_RX_CN_HIGH (high congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
- int this_cpu;
struct softnet_data *queue;
unsigned long flags;
@@ -1439,43 +1359,25 @@
* short when CPU is congested, but is still operating.
*/
local_irq_save(flags);
- this_cpu = smp_processor_id();
queue = &__get_cpu_var(softnet_data);
__get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
- if (queue->throttle)
- goto drop;
-
-enqueue:
- dev_hold(skb->dev);
- __skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
- get_sample_stats(this_cpu);
-#endif
- local_irq_restore(flags);
- return queue->cng_level;
- }
+ if (likely(queue->input_pkt_queue.qlen <= netdev_max_queue)) {
+ if (unlikely(queue->input_pkt_queue.qlen == 0))
+ netif_rx_schedule(&queue->backlog_dev);
+
+ dev_hold(skb->dev);
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ local_irq_restore(flags);
- if (queue->throttle)
- queue->throttle = 0;
-
- netif_rx_schedule(&queue->backlog_dev);
- goto enqueue;
- }
+ return NET_RX_SUCCESS;
+ } else {
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
- if (!queue->throttle) {
- queue->throttle = 1;
- __get_cpu_var(netdev_rx_stat).throttled++;
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
-
-drop:
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
-
- kfree_skb(skb);
- return NET_RX_DROP;
}
int netif_rx_ni(struct sk_buff *skb)
@@ -1754,8 +1656,6 @@
smp_mb__before_clear_bit();
netif_poll_enable(backlog_dev);
- if (queue->throttle)
- queue->throttle = 0;
local_irq_enable();
return 0;
}
@@ -2024,20 +1924,18 @@
{
}
+/* Output softnet statistics.
+ * For compatibility include zeros for old deprecated values
+ * for throttling and fastroute statistics.
+ */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct netif_rx_stats *s = v;
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- s->total, s->dropped, s->time_squeeze, s->throttled,
- s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
- s->fastroute_deferred_out,
-#if 0
- s->fastroute_latency_reduction
-#else
- s->cpu_collision
-#endif
- );
+ s->total, s->dropped, s->time_squeeze,
+ 0, 0, 0, 0, 0,
+ s->cpu_collision);
return 0;
}
@@ -3279,21 +3177,13 @@
queue = &per_cpu(softnet_data, i);
skb_queue_head_init(&queue->input_pkt_queue);
- queue->throttle = 0;
- queue->cng_level = 0;
- queue->avg_blog = 10; /* arbitrary non-zero */
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
- queue->backlog_dev.weight = weight_p;
+ queue->backlog_dev.weight = 128;
queue->backlog_dev.poll = process_backlog;
atomic_set(&queue->backlog_dev.refcnt, 1);
}
-
-#ifdef OFFLINE_SAMPLE
- samp_timer.expires = jiffies + (10 * HZ);
- add_timer(&samp_timer);
-#endif
dev_boot_phase = 0;
diff -Nru a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
--- a/net/core/sysctl_net_core.c 2005-03-30 13:17:14 -08:00
+++ b/net/core/sysctl_net_core.c 2005-03-30 13:17:14 -08:00
@@ -13,12 +13,8 @@
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
-extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
+extern int netdev_max_queue;
+
extern int net_msg_cost;
extern int net_msg_burst;
@@ -27,7 +23,6 @@
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
-extern int sysctl_core_destroy_delay;
extern int sysctl_optmem_max;
extern int sysctl_somaxconn;
@@ -83,14 +78,6 @@
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_CORE_DEV_WEIGHT,
- .procname = "dev_weight",
- .data = &weight_p,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
.ctl_name = NET_CORE_MAX_BACKLOG,
.procname = "netdev_max_backlog",
.data = &netdev_max_backlog,
@@ -99,33 +86,9 @@
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_CORE_NO_CONG_THRESH,
- .procname = "no_cong_thresh",
- .data = &no_cong_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_NO_CONG,
- .procname = "no_cong",
- .data = &no_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_LO_CONG,
- .procname = "lo_cong",
- .data = &lo_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_MOD_CONG,
- .procname = "mod_cong",
- .data = &mod_cong,
+ .ctl_name = NET_CORE_MAX_QUEUE,
+ .procname = "netdev_max_queue",
+ .data = &netdev_max_queue,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH] netif_rx: receive path optimization 2005-03-30 21:28 [PATCH] netif_rx: receive path optimization Stephen Hemminger @ 2005-03-30 21:57 ` jamal 2005-03-30 22:08 ` jamal 2005-03-30 23:53 ` Stephen Hemminger 2005-03-31 20:04 ` [RFC] " Stephen Hemminger 1 sibling, 2 replies; 24+ messages in thread From: jamal @ 2005-03-30 21:57 UTC (permalink / raw) To: Stephen Hemminger; +Cc: David S. Miller, netdev On Wed, 2005-03-30 at 16:28, Stephen Hemminger wrote: > This patch cleans up the netif_rx and related code in the network > receive core. > > - Eliminate vestiges of fastroute. > The leftover statistics no longer needed. > > - Get rid of high/med/low threshold return from netif_rx. > Drivers rarely check return value of netif_rx, and those > that do can handle the DROP vs SUCCESS return > Please leave this feature in. Drivers that used it have moved on to a better life under NAPI; however, it is still useful for anyone who wants to take heed of congestion. And infact it is highly advisable for anyone not using NAPI to using it. In other words: the work should be to convert users of netif_rx and not to get rid of this feature. > - Remove dead code for RAND_LINE and OFFLINE_SAMPLE > OFLINE SAMPLE can go. The other refer to my comments above. > - Get rid of weight_p since setting sysctl has no effect. > Increase default weight of netif_rx path because it can receive > packets from multiple devices and loopback. > Same here. > - Separate out max packets per softirq vs. max queued packets. > Today, netdev_max_burst is used for both. Add new parameter > that is for the per-cpu max queued packets. > > - Increase queue defaults to meet modern CPU speeds. > Make max_backlog be about 1ms, and max_queue be about 10ms > kind of hard to compute what 1 or 10 ms in packet count. But probably justfied to make the default larger. > - Switch to pure drop tail when queue fills. > Better for TCP performance under load to drop a few packets > then go into full discard mode. > Like discussed in that thread with person who enhanced the SACK queue traversal that for a serious use a TCP user really oughta migrate to a NAPI driver. cheers, jamal ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] netif_rx: receive path optimization
From: jamal @ 2005-03-30 22:08 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Wed, 2005-03-30 at 16:57, jamal wrote:
> As discussed in the thread with the person who enhanced the SACK queue
> traversal: for serious use, a TCP user really ought to migrate to a NAPI
> driver.
>

I think I wasn't clear: the change is fine - I am just saying that anyone
serious about benchmark numbers should probably not use that interface.

cheers,
jamal

* Re: [PATCH] netif_rx: receive path optimization
From: Stephen Hemminger @ 2005-03-30 23:53 UTC (permalink / raw)
To: hadi; +Cc: David S. Miller, netdev

On 30 Mar 2005 16:57:29 -0500
jamal <hadi@cyberus.ca> wrote:

> On Wed, 2005-03-30 at 16:28, Stephen Hemminger wrote:
> > This patch cleans up the netif_rx and related code in the network
> > receive core.
> >
> > - Eliminate vestiges of fastroute.
> >   The leftover statistics are no longer needed.
> >
> > - Get rid of the high/med/low threshold return from netif_rx.
> >   Drivers rarely check the return value of netif_rx, and those
> >   that do can handle the DROP vs. SUCCESS return.
> >
>
> Please leave this feature in. Drivers that used it have moved on to a
> better life under NAPI; however, it is still useful for anyone who wants
> to take heed of congestion. In fact it is highly advisable for anyone not
> using NAPI to use it.
> In other words: the work should be to convert users of netif_rx, not to
> get rid of this feature.

How about percentages instead of multiple sysctl values? Or some
relationship of max_queue and max_backlog:

	success  qlen <  max_backlog
	low      qlen >  max_backlog
	medium   qlen >  max_queue/2
	high     qlen >  max_queue - max_backlog
	drop     qlen >  max_queue

Also, RAND_LIE (dead code) is kind of confusing because I expected it to
be a receive version of Random Drop, but it really just lies back to the
caller (and keeps the packet).

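As a reading aid (an illustration added here, not something posted in the
thread), the proposal above boils down to deriving the congestion level
from the instantaneous queue length and the two sysctls from the patch:

#include <linux/netdevice.h>

extern int netdev_max_backlog;	/* per-softirq budget (patch) */
extern int netdev_max_queue;	/* max queued packets (patch) */

/* Map the instantaneous queue length onto the proposed levels. */
static int congestion_level(int qlen)
{
	if (qlen >= netdev_max_queue)
		return NET_RX_DROP;
	if (qlen > netdev_max_queue - netdev_max_backlog)
		return NET_RX_CN_HIGH;
	if (qlen > netdev_max_queue / 2)
		return NET_RX_CN_MOD;
	if (qlen > netdev_max_backlog)
		return NET_RX_CN_LOW;
	return NET_RX_SUCCESS;
}
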
* Re: [PATCH] netif_rx: receive path optimization
From: jamal @ 2005-03-31 3:16 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Wed, 2005-03-30 at 18:53, Stephen Hemminger wrote:
> How about percentages instead of multiple sysctl values? Or some
> relationship of max_queue and max_backlog:
>
> 	success  qlen <  max_backlog
> 	low      qlen >  max_backlog
> 	medium   qlen >  max_queue/2
> 	high     qlen >  max_queue - max_backlog
> 	drop     qlen >  max_queue
>

Well, you still need the moving window average computation to detect the
second-order effect (of an oncoming tsunami). Unless I misunderstood, you
are suggesting looking at instantaneous values.

> Also, RAND_LIE (dead code) is kind of confusing because I expected it to
> be a receive version of Random Drop, but it really just lies back to the
> caller (and keeps the packet).

If you have drivers which look at the feedback value, then they will back
off. We don't drop, but we do hope that by telling the driver to back off
it listens to us. Clearly, if you have drivers that don't listen, then
it's like running UDP on a wire with everyone trying to shove packets.

For some numbers and experiments on this stuff circa 1999/2000 look at:
http://robur.slu.se/Linux/net-development/jamal/FF-html/

You may find at least one entertaining:
http://robur.slu.se/Linux/net-development/jamal/FF-html/img20.htm

cheers,
jamal

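For reference, the "moving window average" mentioned above is the one in
the get_sample_stats() code this patch removes; condensed into a small
sketch (illustration only, with the RAND_LIE branches omitted and the
pre-patch softnet_data fields assumed), it is roughly:

#include <linux/netdevice.h>

extern int no_cong, lo_cong, mod_cong;	/* pre-patch thresholds */

/* Half old average, half current backlog, as in get_sample_stats(). */
static int sample_congestion(struct softnet_data *sd)
{
	int blog = sd->input_pkt_queue.qlen;
	int avg_blog = (sd->avg_blog >> 1) + (blog >> 1);

	sd->avg_blog = avg_blog;

	if (avg_blog > mod_cong)
		return NET_RX_CN_HIGH;
	if (avg_blog > lo_cong)
		return NET_RX_CN_MOD;
	if (avg_blog > no_cong)
		return NET_RX_CN_LOW;
	return NET_RX_SUCCESS;
}
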
* [RFC] netif_rx: receive path optimization
From: Stephen Hemminger @ 2005-03-31 20:04 UTC (permalink / raw)
To: David S. Miller; +Cc: Jamal Hadi Salim, netdev

Here is another alternative that seems better than the earlier posting.
It uses a per-device receive queue for non-NAPI devices. The only issue
is that we then lose the per-cpu queues, and that could impact loopback
device performance. If that is really an issue, then the per-cpu magic
should be moved to the loopback device.

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/03/31 11:51:14-08:00 shemminger@linux.site
#   Use per-device rx_queue for non NAPI devices.
#
# net/core/dev.c
#   2005/03/31 11:51:00-08:00 shemminger@linux.site +28 -57
#   Use per-device rx_queue for non NAPI devices.
#
# include/linux/netdevice.h
#   2005/03/31 11:51:00-08:00 shemminger@linux.site +2 -7
#   Use per-device rx_queue for non NAPI devices.
#
# ChangeSet
#   2005/03/30 12:02:44-08:00 shemminger@linux.site
#   netif_rx redux:
#   - eliminate vestiges of fastroute
#   - get rid of high/med/low return never used
#   - get rid of weight_p since setting sysctl has no effect
#   - separate out max packets per softirq vs. max queued packets
#   - increase queue defaults to meet modern CPU speeds
#   - switch to pure drop tail when queue fills
#
# net/core/sysctl_net_core.c
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +5 -42
#   update net_core_sysctl
#
# net/core/dev.c
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +26 -136
#   cleanup of netif_rx path.
#
# include/linux/sysctl.h
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +1 -0
#   add max queue sysctl
#
# include/linux/netdevice.h
#   2005/03/30 12:02:30-08:00 shemminger@linux.site +0 -6
#   Get rid of unused statistics
#
diff -Nru a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h 2005-03-31 11:52:39 -08:00
+++ b/include/linux/netdevice.h 2005-03-31 11:52:39 -08:00
@@ -164,12 +164,6 @@
unsigned total;
unsigned dropped;
unsigned time_squeeze;
- unsigned throttled;
- unsigned fastroute_hit;
- unsigned fastroute_success;
- unsigned fastroute_defer;
- unsigned fastroute_deferred_out;
- unsigned fastroute_latency_reduction;
unsigned cpu_collision;
};
@@ -362,6 +356,7 @@
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
+ struct sk_buff_head rx_queue; /* Receive queue (non NAPI) */
struct list_head poll_list; /* Link to poll list */
int quota;
int weight;
@@ -562,15 +557,9 @@
struct softnet_data
{
- int throttle;
- int cng_level;
- int avg_blog;
- struct sk_buff_head input_pkt_queue;
- struct list_head poll_list;
struct net_device *output_queue;
+ struct list_head poll_list;
struct sk_buff *completion_queue;
-
- struct net_device backlog_dev; /* Sorry. 8) */
};
DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h 2005-03-31 11:52:39 -08:00
+++ b/include/linux/sysctl.h 2005-03-31 11:52:39 -08:00
@@ -242,6 +242,7 @@
NET_CORE_MOD_CONG=16,
NET_CORE_DEV_WEIGHT=17,
NET_CORE_SOMAXCONN=18,
+ NET_CORE_MAX_QUEUE=19,
};
/* /proc/sys/net/ethernet */
diff -Nru a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c 2005-03-31 11:52:39 -08:00
+++ b/net/core/dev.c 2005-03-31 11:52:39 -08:00
@@ -115,18 +115,6 @@
#endif /* CONFIG_NET_RADIO */
#include <asm/current.h>
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate. It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -159,11 +147,6 @@
static struct list_head ptype_base[16]; /* 16 way hashed list */
static struct list_head ptype_all; /* Taps */
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
/*
* The @dev_base list is protected by @dev_base_lock and the rtln
* semaphore.
@@ -215,7 +198,7 @@
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
@@ -1338,70 +1321,11 @@
Receiver routines
=======================================================================*/
-int netdev_max_backlog = 300;
-int weight_p = 64; /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
-
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
-
-
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
- unsigned long rd;
- int rq;
-#endif
- struct softnet_data *sd = &per_cpu(softnet_data, cpu);
- int blog = sd->input_pkt_queue.qlen;
- int avg_blog = sd->avg_blog;
-
- avg_blog = (avg_blog >> 1) + (blog >> 1);
-
- if (avg_blog > mod_cong) {
- /* Above moderate congestion levels. */
- sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
- rd = net_random();
- rq = rd % netdev_max_backlog;
- if (rq < avg_blog) /* unlucky bastard */
- sd->cng_level = NET_RX_DROP;
-#endif
- } else if (avg_blog > lo_cong) {
- sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
- rd = net_random();
- rq = rd % netdev_max_backlog;
- if (rq < avg_blog) /* unlucky bastard */
- sd->cng_level = NET_RX_CN_HIGH;
-#endif
- } else if (avg_blog > no_cong)
- sd->cng_level = NET_RX_CN_LOW;
- else /* no congestion */
- sd->cng_level = NET_RX_SUCCESS;
-
- sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
- int next_tick = 1;
- int cpu = smp_processor_id();
-
- get_sample_stats(cpu);
- next_tick += jiffies;
- mod_timer(&samp_timer, next_tick);
-}
-#endif
+/* Reasonably fast CPU can process 1 packet per us */
+int netdev_max_backlog = 1000;
+int netdev_max_queue = 10000;
+DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
/**
* netif_rx - post buffer to the network code
@@ -1414,18 +1338,13 @@
*
* return values:
* NET_RX_SUCCESS (no congestion)
- * NET_RX_CN_LOW (low congestion)
- * NET_RX_CN_MOD (moderate congestion)
- * NET_RX_CN_HIGH (high congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
- int this_cpu;
- struct softnet_data *queue;
- unsigned long flags;
+ struct net_device *dev = skb->dev;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -1434,48 +1353,20 @@
if (!skb->stamp.tv_sec)
net_timestamp(&skb->stamp);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- this_cpu = smp_processor_id();
- queue = &__get_cpu_var(softnet_data);
-
__get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
- if (queue->throttle)
- goto drop;
-
-enqueue:
- dev_hold(skb->dev);
- __skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
- get_sample_stats(this_cpu);
-#endif
- local_irq_restore(flags);
- return queue->cng_level;
- }
+ if (likely(skb_queue_len(&dev->rx_queue) <= netdev_max_queue)) {
+ dev_hold(skb->dev);
+ skb_queue_tail(&dev->rx_queue, skb);
- if (queue->throttle)
- queue->throttle = 0;
+ if (!test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state))
+ __netif_rx_schedule(dev);
- netif_rx_schedule(&queue->backlog_dev);
- goto enqueue;
- }
-
- if (!queue->throttle) {
- queue->throttle = 1;
- __get_cpu_var(netdev_rx_stat).throttled++;
+ return NET_RX_SUCCESS;
+ } else {
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
-
-drop:
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
-
- kfree_skb(skb);
- return NET_RX_DROP;
}
int netif_rx_ni(struct sk_buff *skb)
@@ -1712,51 +1603,30 @@
return ret;
}
-static int process_backlog(struct net_device *backlog_dev, int *budget)
+static int netrx_nonapi_poll(struct net_device *dev, int *budget)
{
+ struct sk_buff *skb;
int work = 0;
- int quota = min(backlog_dev->quota, *budget);
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
+ int quota = min(dev->quota, *budget);
unsigned long start_time = jiffies;
- for (;;) {
- struct sk_buff *skb;
- struct net_device *dev;
-
- local_irq_disable();
- skb = __skb_dequeue(&queue->input_pkt_queue);
- if (!skb)
- goto job_done;
- local_irq_enable();
-
- dev = skb->dev;
-
+ while ((skb = skb_dequeue(&dev->rx_queue)) != NULL) {
netif_receive_skb(skb);
dev_put(dev);
work++;
- if (work >= quota || jiffies - start_time > 1)
- break;
-
+ if (work >= quota || jiffies - start_time > 1) {
+ dev->quota -= work;
+ *budget -= work;
+ return 1; /* not done */
+ }
}
- backlog_dev->quota -= work;
+ dev->quota -= work;
*budget -= work;
- return -1;
-
-job_done:
- backlog_dev->quota -= work;
- *budget -= work;
-
- list_del(&backlog_dev->poll_list);
- smp_mb__before_clear_bit();
- netif_poll_enable(backlog_dev);
-
- if (queue->throttle)
- queue->throttle = 0;
- local_irq_enable();
+ netif_rx_complete(dev);
return 0;
}
@@ -2024,20 +1894,18 @@
{
}
+/* Output softnet statistics.
+ * For compatibility include zeros for old deprecated values
+ * for throttling and fastroute statistics.
+ */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct netif_rx_stats *s = v;
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- s->total, s->dropped, s->time_squeeze, s->throttled,
- s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
- s->fastroute_deferred_out,
-#if 0
- s->fastroute_latency_reduction
-#else
- s->cpu_collision
-#endif
- );
+ s->total, s->dropped, s->time_squeeze,
+ 0, 0, 0, 0, 0,
+ s->cpu_collision);
return 0;
}
@@ -2722,6 +2590,7 @@
spin_lock_init(&dev->queue_lock);
spin_lock_init(&dev->xmit_lock);
+ skb_queue_head_init(&dev->rx_queue);
dev->xmit_lock_owner = -1;
#ifdef CONFIG_NET_CLS_ACT
spin_lock_init(&dev->ingress_lock);
@@ -2790,6 +2659,14 @@
dev->rebuild_header = default_rebuild_header;
/*
+ * Simulate NAPI for non-NAPI devices
+ */
+ if (!dev->poll) {
+ dev->weight = 64;
+ dev->poll = netrx_nonapi_poll;
+ }
+
+ /*
* Default initial state at registry is that the
* device is present.
*/
@@ -3275,25 +3152,9 @@
*/
for (i = 0; i < NR_CPUS; i++) {
- struct softnet_data *queue;
-
- queue = &per_cpu(softnet_data, i);
- skb_queue_head_init(&queue->input_pkt_queue);
- queue->throttle = 0;
- queue->cng_level = 0;
- queue->avg_blog = 10; /* arbitrary non-zero */
- queue->completion_queue = NULL;
+ struct softnet_data *queue = &per_cpu(softnet_data, i);
INIT_LIST_HEAD(&queue->poll_list);
- set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
- queue->backlog_dev.weight = weight_p;
- queue->backlog_dev.poll = process_backlog;
- atomic_set(&queue->backlog_dev.refcnt, 1);
}
-
-#ifdef OFFLINE_SAMPLE
- samp_timer.expires = jiffies + (10 * HZ);
- add_timer(&samp_timer);
-#endif
dev_boot_phase = 0;
diff -Nru a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
--- a/net/core/sysctl_net_core.c 2005-03-31 11:52:39 -08:00
+++ b/net/core/sysctl_net_core.c 2005-03-31 11:52:39 -08:00
@@ -13,12 +13,8 @@
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
-extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
+extern int netdev_max_queue;
+
extern int net_msg_cost;
extern int net_msg_burst;
@@ -27,7 +23,6 @@
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
-extern int sysctl_core_destroy_delay;
extern int sysctl_optmem_max;
extern int sysctl_somaxconn;
@@ -83,14 +78,6 @@
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_CORE_DEV_WEIGHT,
- .procname = "dev_weight",
- .data = &weight_p,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
.ctl_name = NET_CORE_MAX_BACKLOG,
.procname = "netdev_max_backlog",
.data = &netdev_max_backlog,
@@ -99,33 +86,9 @@
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_CORE_NO_CONG_THRESH,
- .procname = "no_cong_thresh",
- .data = &no_cong_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_NO_CONG,
- .procname = "no_cong",
- .data = &no_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_LO_CONG,
- .procname = "lo_cong",
- .data = &lo_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_MOD_CONG,
- .procname = "mod_cong",
- .data = &mod_cong,
+ .ctl_name = NET_CORE_MAX_QUEUE,
+ .procname = "netdev_max_queue",
+ .data = &netdev_max_queue,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec

* Re: [RFC] netif_rx: receive path optimization
From: Jamal Hadi Salim @ 2005-03-31 21:10 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Thu, 2005-03-31 at 15:04, Stephen Hemminger wrote:
> Here is another alternative that seems better than the earlier posting.
> It uses a per-device receive queue for non-NAPI devices. The only issue
> is that we then lose the per-cpu queues, and that could impact loopback
> device performance. If that is really an issue, then the per-cpu magic
> should be moved to the loopback device.
>

The repercussions of going from a per-CPU-for-all-devices queue
(introduced by softnet) to per-device-for-all-CPUs may be huge in my
opinion, especially on SMP. A closer fit to what is there now may be a
per-device-per-CPU backlog queue.

I think performance will be impacted in all devices. IMO, whatever needs
to go in needs to have some experimental data to back it.

cheers,
jamal

* Re: [RFC] netif_rx: receive path optimization
From: Stephen Hemminger @ 2005-03-31 21:17 UTC (permalink / raw)
To: hadi; +Cc: David S. Miller, netdev

On 31 Mar 2005 16:10:32 -0500
Jamal Hadi Salim <hadi@znyx.com> wrote:

> On Thu, 2005-03-31 at 15:04, Stephen Hemminger wrote:
> > Here is another alternative that seems better than the earlier posting.
> > It uses a per-device receive queue for non-NAPI devices. The only issue
> > is that we then lose the per-cpu queues, and that could impact loopback
> > device performance. If that is really an issue, then the per-cpu magic
> > should be moved to the loopback device.
> >
>
> The repercussions of going from a per-CPU-for-all-devices queue
> (introduced by softnet) to per-device-for-all-CPUs may be huge in my
> opinion, especially on SMP. A closer fit to what is there now may be a
> per-device-per-CPU backlog queue.

Any real hardware only has a single receive packet source (the interrupt
routine), and the only collision would be in the case of interrupt
migration. So having per-device-per-CPU queues would be overkill and more
complex, because the NAPI scheduling is per-netdevice rather than
per-queue (though that could be fixed).

> I think performance will be impacted in all devices. IMO, whatever needs
> to go in needs to have some experimental data to back it.

Experiment with what? Proving an absolute negative is impossible.
I will test loopback and non-NAPI versions of a couple of gigabit drivers
to see.

* Re: [RFC] netif_rx: receive path optimization
From: Jamal Hadi Salim @ 2005-03-31 21:25 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Thu, 2005-03-31 at 16:17, Stephen Hemminger wrote:
> Any real hardware only has a single receive packet source (the interrupt
> routine), and the only collision would be in the case of interrupt
> migration. So having per-device-per-CPU queues would be overkill and more
> complex, because the NAPI scheduling is per-netdevice rather than
> per-queue (though that could be fixed).

The idea behind the current per-CPU queues is to avoid cache ping-ponging;
the same queue shared across multiple CPUs with round-robin interrupts
will get expensive. In other words, these non-NAPI devices will be
migrating across CPUs based on interrupts a lot more under heavy traffic.
In the case of NAPI, the issue doesn't exist: a device stays on the same
queue until all packets are offloaded from it. Depending on CPU capacity
it could stay forever on the same CPU.
So my suggestion to do per-CPU queues for these devices is to avoid that.

> > I think performance will be impacted in all devices. IMO, whatever needs
> > to go in needs to have some experimental data to back it.
>
> Experiment with what? Proving an absolute negative is impossible.
> I will test loopback and non-NAPI versions of a couple of gigabit drivers
> to see.

I think that will do. I don't know how much traffic you can pound it
with. Collecting and comparing some profiles between the two schemes will
help.

cheers,
jamal

* Re: [RFC] netif_rx: receive path optimization
From: Eric Lemoine @ 2005-03-31 21:43 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: hadi, David S. Miller, netdev

> > > Here is another alternative that seems better than the earlier posting.
> > > It uses a per-device receive queue for non-NAPI devices. The only issue
> > > is that we then lose the per-cpu queues, and that could impact loopback
> > > device performance. If that is really an issue, then the per-cpu magic
> > > should be moved to the loopback device.
> >
> > The repercussions of going from a per-CPU-for-all-devices queue
> > (introduced by softnet) to per-device-for-all-CPUs may be huge in my
> > opinion, especially on SMP. A closer fit to what is there now may be a
> > per-device-per-CPU backlog queue.
>
> Any real hardware only has a single receive packet source (the interrupt
> routine), and the only collision would be in the case of interrupt
> migration. So having per-device-per-CPU queues would be overkill and more
> complex, because the NAPI scheduling is per-netdevice rather than
> per-queue (though that could be fixed).
>
> > I think performance will be impacted in all devices. IMO, whatever needs
> > to go in needs to have some experimental data to back it.
>
> Experiment with what? Proving an absolute negative is impossible.
> I will test loopback and non-NAPI versions of a couple of gigabit drivers
> to see.

Just a naive question: why try to accelerate netif_rx at all? Isn't NAPI
the best choice for high-performance rx anyway?

--
Eric

* Re: [RFC] netif_rx: receive path optimization
From: Stephen Hemminger @ 2005-03-31 22:02 UTC (permalink / raw)
To: Eric Lemoine; +Cc: hadi, David S. Miller, netdev

On Thu, 31 Mar 2005 23:43:27 +0200
Eric Lemoine <eric.lemoine@gmail.com> wrote:

> > > > Here is another alternative that seems better than the earlier posting.
> > > > It uses a per-device receive queue for non-NAPI devices. The only issue
> > > > is that we then lose the per-cpu queues, and that could impact loopback
> > > > device performance. If that is really an issue, then the per-cpu magic
> > > > should be moved to the loopback device.
> > >
> > > The repercussions of going from a per-CPU-for-all-devices queue
> > > (introduced by softnet) to per-device-for-all-CPUs may be huge in my
> > > opinion, especially on SMP. A closer fit to what is there now may be a
> > > per-device-per-CPU backlog queue.
> >
> > Any real hardware only has a single receive packet source (the interrupt
> > routine), and the only collision would be in the case of interrupt
> > migration. So having per-device-per-CPU queues would be overkill and more
> > complex, because the NAPI scheduling is per-netdevice rather than
> > per-queue (though that could be fixed).
> >
> > > I think performance will be impacted in all devices. IMO, whatever needs
> > > to go in needs to have some experimental data to back it.
> >
> > Experiment with what? Proving an absolute negative is impossible.
> > I will test loopback and non-NAPI versions of a couple of gigabit drivers
> > to see.
>
> Just a naive question: why try to accelerate netif_rx at all? Isn't NAPI
> the best choice for high-performance rx anyway?

It was a side effect of trying to clean out the dead code left over from
the removal of fastroute and other old experiments. netif_rx still gets
used for loopback, and people do benchmarks on loopback.

Also, NAPI doesn't work for all cases, like layered devices and devices
like wifi that need to receive packets when the network device is "not
running".

* Re: [RFC] netif_rx: receive path optimization
From: Rick Jones @ 2005-03-31 21:24 UTC (permalink / raw)
To: netdev

> The repercussions of going from a per-CPU-for-all-devices queue
> (introduced by softnet) to per-device-for-all-CPUs may be huge in my
> opinion, especially on SMP. A closer fit to what is there now may be a
> per-device-per-CPU backlog queue.
> I think performance will be impacted in all devices. IMO, whatever needs
> to go in needs to have some experimental data to back it.

Indeed.

At the risk of again chewing on my toes (yum), if multiple CPUs are
pulling packets from the per-device queue there will be packet
reordering. HP-UX 10.0 did just that and it was quite nasty even at low
CPU counts (<=4). It was changed by HP-UX 10.20 (ca. 1995) to per-CPU
queues with queue selection computed from packet headers (hash the IP and
TCP/UDP header to pick a CPU); that was called IPS, for Inbound Packet
Scheduling. 11.0 (ca. 1998) later changed that to "find where the
connection last ran and queue to that CPU"; that was called TOPS - Thread
Optimized Packet Scheduling.

fwiw,

rick jones

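As an aside for readers unfamiliar with the scheme, a tiny sketch of the
IPS idea described above (an illustration added here; neither HP-UX nor
Linux code): hash the flow identifiers so all packets of a connection go
to the same per-CPU input queue, which spreads flows across CPUs without
reordering any single flow.

#include <stdint.h>

/* Pick a CPU queue from the 4-tuple; same flow -> same CPU. */
static unsigned int ips_pick_cpu(uint32_t saddr, uint32_t daddr,
				 uint16_t sport, uint16_t dport,
				 unsigned int nr_cpus)
{
	uint32_t h = saddr ^ daddr ^ (((uint32_t)sport << 16) | dport);

	h ^= h >> 16;		/* fold the hash down a bit */
	return h % nr_cpus;
}
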
* Re: [RFC] netif_rx: receive path optimization
From: jamal @ 2005-03-31 21:38 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

On Thu, 2005-03-31 at 16:24, Rick Jones wrote:
> > The repercussions of going from a per-CPU-for-all-devices queue
> > (introduced by softnet) to per-device-for-all-CPUs may be huge in my
> > opinion, especially on SMP. A closer fit to what is there now may be a
> > per-device-per-CPU backlog queue.
> > I think performance will be impacted in all devices. IMO, whatever needs
> > to go in needs to have some experimental data to back it.
>
> Indeed.
>
> At the risk of again chewing on my toes (yum), if multiple CPUs are
> pulling packets from the per-device queue there will be packet
> reordering.

;-> This happens already _today_ on Linux on non-NAPI.

Take the following scenario in non-NAPI:
- packet 1 arrives
- interrupt happens, NIC bound to CPU0
- in the meantime packets 2, 3 arrive
- 3 packets put on queue for CPU0
- interrupt processing done

- packet 4 arrives, interrupt, CPU1 is bound to the NIC
- in the meantime packets 5, 6 arrive
- CPU1 backlog queue used
- interrupt processing done

Assume CPU0 is overloaded with other system work and CPU1 rx processing
kicks in first ... TCP sees packets 4, 5, 6 before 1, 2, 3.

Note Linux is quite resilient to reordering compared to other OSes (as
you may know), but avoiding this is a better approach - hence my
suggestion to use NAPI when you want to do serious TCP. Of course NAPI is
not a total panacea, eating a little more CPU under low traffic (but if
you have CPU issues under low load you are in some other deep shit).

> HP-UX 10.0
> did just that and it was quite nasty even at low CPU counts (<=4). It was
> changed by HP-UX 10.20 (ca. 1995) to per-CPU queues with queue selection
> computed from packet headers (hash the IP and TCP/UDP header to pick a
> CPU); that was called IPS, for Inbound Packet Scheduling. 11.0 (ca. 1998)
> later changed that to "find where the connection last ran and queue to
> that CPU"; that was called TOPS - Thread Optimized Packet Scheduling.

Don't think we can do that, unfortunately: we are screwed by the APIC
architecture on x86.

cheers,
jamal

* Re: [RFC] netif_rx: receive path optimization
From: Rick Jones @ 2005-03-31 22:42 UTC (permalink / raw)
To: netdev

> > At the risk of again chewing on my toes (yum), if multiple CPUs are
> > pulling packets from the per-device queue there will be packet
> > reordering.
>
> ;-> This happens already _today_ on Linux on non-NAPI.
>
> Take the following scenario in non-NAPI:
> - packet 1 arrives
> - interrupt happens, NIC bound to CPU0
> - in the meantime packets 2, 3 arrive
> - 3 packets put on queue for CPU0
> - interrupt processing done
>
> - packet 4 arrives, interrupt, CPU1 is bound to the NIC
> - in the meantime packets 5, 6 arrive
> - CPU1 backlog queue used
> - interrupt processing done
>
> Assume CPU0 is overloaded with other system work and CPU1 rx processing
> kicks in first ... TCP sees packets 4, 5, 6 before 1, 2, 3.

I "never" see that because I always bind a NIC to a specific CPU :)
Just about every networking-intensive benchmark report I've seen has done
the same.

> Note Linux is quite resilient to reordering compared to other OSes (as
> you may know), but avoiding this is a better approach - hence my
> suggestion to use NAPI when you want to do serious TCP.

Would the same apply to NIC->CPU interrupt assignments? That is, bind the
NIC to a single CPU.

> > HP-UX 10.0
> > did just that and it was quite nasty even at low CPU counts (<=4). It was
> > changed by HP-UX 10.20 (ca. 1995) to per-CPU queues with queue selection
> > computed from packet headers (hash the IP and TCP/UDP header to pick a
> > CPU); that was called IPS, for Inbound Packet Scheduling. 11.0 (ca. 1998)
> > later changed that to "find where the connection last ran and queue to
> > that CPU"; that was called TOPS - Thread Optimized Packet Scheduling.
>
> Don't think we can do that, unfortunately: we are screwed by the APIC
> architecture on x86.

The IPS and TOPS stuff was/is post-NIC-interrupt. Low-level driver
processing still happened/s on a specific CPU; it is the higher-level
processing which is done on another CPU. The idea - with TOPS at least -
is to try to access the ULP (TCP, UDP, etc.) structures on the same CPU
as last accessed by the app, to minimize that cache-to-cache migration.

rick jones

* Re: [RFC] netif_rx: receive path optimization
From: Nivedita Singhvi @ 2005-03-31 23:03 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

Rick Jones wrote:

>> Take the following scenario in non-NAPI:
>> - packet 1 arrives
>> - interrupt happens, NIC bound to CPU0
>> - in the meantime packets 2, 3 arrive
>> - 3 packets put on queue for CPU0
>> - interrupt processing done
>>
>> - packet 4 arrives, interrupt, CPU1 is bound to the NIC
>> - in the meantime packets 5, 6 arrive
>> - CPU1 backlog queue used
>> - interrupt processing done
>>
>> Assume CPU0 is overloaded with other system work and CPU1 rx processing
>> kicks in first ... TCP sees packets 4, 5, 6 before 1, 2, 3.
>
> I "never" see that because I always bind a NIC to a specific CPU :)
> Just about every networking-intensive benchmark report I've seen has
> done the same.

Just a reminder that the networking-benchmark world and the real
networking deployment world have a less than desirable intersection
(which I know you know only too well, Rick ;)).

How often do people use affinity? How often do they really tune the
system for their workloads? How often do they turn off things like SACK,
etc.? Not very often in the real world. Designing OSs to do better at
benchmarks is a different proposition than designing OSs to do well in
the real world.

>> Note Linux is quite resilient to reordering compared to other OSes (as
>> you may know), but avoiding this is a better approach - hence my
>> suggestion to use NAPI when you want to do serious TCP.

The real killer for TCP is triggering fast retransmit unnecessarily - and
while we have some reordering detection and safeguards for that - for
other situations, like apps running over UDP and being unable to cope
with reordering (yes, there are dunderheads like that), there is not much
you can do. It does help them to avoid the reordering to begin with.

thanks,
Nivedita

* Re: [RFC] netif_rx: receive path optimization
From: Rick Jones @ 2005-03-31 23:28 UTC (permalink / raw)
To: netdev

>> I "never" see that because I always bind a NIC to a specific CPU :)
>> Just about every networking-intensive benchmark report I've seen has
>> done the same.
>
> Just a reminder that the networking-benchmark world and the real
> networking deployment world have a less than desirable intersection
> (which I know you know only too well, Rick ;)).

Touche :)

> How often do people use affinity? How often do they really tune the
> system for their workloads?

Not as often as they should.

> How often do they turn off things like SACK, etc.?

Well, I'm in an email discussion with someone who seems to bump their TCP
windows quite large, and disable timestamps...

> Not very often in the real world. Designing OSs to do better at
> benchmarks is a different proposition than designing OSs to do well in
> the real world.

BTW, what is the real-world purpose of having multiple-CPU affinity for
NIC interrupts? I have to admit it seems rather alien to me. (In the
context of no onboard NIC smarts being involved, that is.)

>>> Note Linux is quite resilient to reordering compared to other OSes (as
>>> you may know), but avoiding this is a better approach - hence my
>>> suggestion to use NAPI when you want to do serious TCP.
>
> The real killer for TCP is triggering fast retransmit unnecessarily

Agreed. That is doubleplusungood.

rick

* Re: [RFC] netif_rx: receive path optimization
From: Stephen Hemminger @ 2005-04-01 0:10 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

On Thu, 31 Mar 2005 15:28:16 -0800
Rick Jones <rick.jones2@hp.com> wrote:

> >> I "never" see that because I always bind a NIC to a specific CPU :)
> >> Just about every networking-intensive benchmark report I've seen has
> >> done the same.
> >
> > Just a reminder that the networking-benchmark world and the real
> > networking deployment world have a less than desirable intersection
> > (which I know you know only too well, Rick ;)).
>
> Touche :)
>
> > How often do people use affinity? How often do they really tune the
> > system for their workloads?
>
> Not as often as they should.
>
> > How often do they turn off things like SACK, etc.?
>
> Well, I'm in an email discussion with someone who seems to bump their TCP
> windows quite large, and disable timestamps...

And do they like the resulting data corruption?

* Re: [RFC] netif_rx: receive path optimization
From: Rick Jones @ 2005-04-01 0:42 UTC (permalink / raw)
To: netdev

>> Well, I'm in an email discussion with someone who seems to bump their TCP
>> windows quite large, and disable timestamps...
>
> And do they like the resulting data corruption?

Minor nit - potential data corruption, perhaps even probable, but I don't
think they are all that concerned yet - feeling secure in their belief
that 2*MSL on a LAN is rather short indeed, and perhaps even in WANs when
using 1GB TCP windows (although I may have mixed too much together
there).

Of course, if we believe that stacks should be smart enough to limit the
initial receive windows (or does a setsockopt() actually override that?),
and grow them over time based on what the transfer rates might be and the
like, perhaps the stack should have a hard interlock between a TCP window
larger than 65535 and the timestamp option being on. No timestamps, no
window > 65535 bytes. At present, it seems possible to have one without
the other. Of course, that is unless one is indeed on a "LAN" and _knows_
(somehow, given the existence of remote bridges) that it is a LAN.

rick jones

* Re: [RFC] netif_rx: receive path optimization
From: Nivedita Singhvi @ 2005-04-01 0:30 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

Rick Jones wrote:
> Well, I'm in an email discussion with someone who seems to bump their TCP
> windows quite large, and disable timestamps...

Ah, an oldie but a goodie :), disabling route inheritance as a result,
bumping their default rto back up to 300 ms, just so many things that
could go wrong there...

> BTW, what is the real-world purpose of having multiple-CPU affinity for
> NIC interrupts? I have to admit it seems rather alien to me. (In the
> context of no onboard NIC smarts being involved, that is.)

Featuritis Bloatis ;). It's a marketing requirement :).

thanks,
Nivedita

* Re: [RFC] netif_rx: receive path optimization
From: jamal @ 2005-03-31 23:36 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

On Thu, 2005-03-31 at 17:42, Rick Jones wrote:
> I "never" see that because I always bind a NIC to a specific CPU :)
> Just about every networking-intensive benchmark report I've seen has
> done the same.

Do you have to be so clever? ;->

> > Note Linux is quite resilient to reordering compared to other OSes (as
> > you may know), but avoiding this is a better approach - hence my
> > suggestion to use NAPI when you want to do serious TCP.
>
> Would the same apply to NIC->CPU interrupt assignments? That is, bind the
> NIC to a single CPU.

No reordering there.

> > Don't think we can do that, unfortunately: we are screwed by the APIC
> > architecture on x86.
>
> The IPS and TOPS stuff was/is post-NIC-interrupt. Low-level driver
> processing still happened/s on a specific CPU; it is the higher-level
> processing which is done on another CPU. The idea - with TOPS at least -
> is to try to access the ULP (TCP, UDP, etc.) structures on the same CPU
> as last accessed by the app, to minimize that cache-to-cache migration.

But if the interrupt happens on the "wrong" CPU and you decide
higher-level processing is to be done on the "right" CPU (I assume
queueing on some per-CPU queue), then isn't that expensive? Perhaps there
are even IPIs involved?

cheers,
jamal

* Re: [RFC] netif_rx: receive path optimization
From: Rick Jones @ 2005-04-01 0:07 UTC (permalink / raw)
To: netdev

>>> Note Linux is quite resilient to reordering compared to other OSes (as
>>> you may know), but avoiding this is a better approach - hence my
>>> suggestion to use NAPI when you want to do serious TCP.
>>
>> Would the same apply to NIC->CPU interrupt assignments? That is, bind
>> the NIC to a single CPU.
>
> No reordering there.

Ah, I wasn't clear - would someone doing serious TCP want to have the
interrupts of a NIC go to a specific CPU?

>>> Don't think we can do that, unfortunately: we are screwed by the APIC
>>> architecture on x86.
>>
>> The IPS and TOPS stuff was/is post-NIC-interrupt. Low-level driver
>> processing still happened/s on a specific CPU; it is the higher-level
>> processing which is done on another CPU. The idea - with TOPS at least -
>> is to try to access the ULP (TCP, UDP, etc.) structures on the same CPU
>> as last accessed by the app, to minimize that cache-to-cache migration.
>
> But if the interrupt happens on the "wrong" CPU and you decide
> higher-level processing is to be done on the "right" CPU (I assume
> queueing on some per-CPU queue), then isn't that expensive? Perhaps there
> are even IPIs involved?

More expensive than if one were lucky enough to have the interrupt on the
"right" CPU in the first place, but as the CPU count goes up, the chances
of that go down. The main idea behind TOPS, and prior to that IPS, was to
spread out the processing of packets across as many CPUs as we could, as
"correctly" as we could. Lots of small packets meant/means that a NIC
could saturate its interrupt CPU before the NIC was saturated. You don't
necessarily see that on, say, single-instance netperf TCP_STREAM (or
basic FTP) testing, but certainly can on aggregate netperf TCP_RR
testing.

IPS, being driven by the packet header info, was good enough for simple
benchmarking, but once you had more than one connection per
process/thread that wasn't going to cut it, and even with one connection
per process, telling the process where it should run wasn't terribly
easy :) It wasn't _that_ much more expensive than the queueing already
happening - IPS came in when HP-UX networking was BSDish, and it was done
when things were being queued to the netisr queue(s).

TOPS lets the process (I suppose the scheduler, really) decide where some
of the processing for the packet will happen - the part after the
handoff.

rick

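As a reading aid (an illustration added here with hypothetical types, not
HP-UX source), the TOPS handoff described above amounts to remembering
which CPU the application last touched an endpoint on and steering the
post-interrupt protocol processing there:

/* Each endpoint records the CPU of its last socket-level access. */
struct tops_endpoint {
	int last_cpu;	/* -1 until the application touches it */
};

/* Pick the CPU whose protocol queue should handle this packet. */
static int tops_pick_cpu(const struct tops_endpoint *ep, int interrupt_cpu)
{
	if (ep && ep->last_cpu >= 0)
		return ep->last_cpu;	/* follow the application */
	return interrupt_cpu;		/* no endpoint yet: stay put */
}
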
* Re: [RFC] netif_rx: receive path optimization
From: jamal @ 2005-04-01 1:17 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

On Thu, 2005-03-31 at 19:07, Rick Jones wrote:
> Ah, I wasn't clear - would someone doing serious TCP want to have the
> interrupts of a NIC go to a specific CPU?

Not sure I followed: your TCP app (a server, probably) is running on CPU
X; you therefore want to tie the NIC it goes out on to the same CPU X?
AFAIK, the Linux scheduler will reschedule a process on the last CPU it
was running on if possible - so if you bind a NIC to some CPU, it is
likely that that CPU will also run the process. Just handwaving - I never
tried to observe it. You could bind processes to CPUs (process affinity)
from user space, but then also make sure you bind the CPU-NIC mapping
statically.

> More expensive than if one were lucky enough to have the interrupt on the
> "right" CPU in the first place, but as the CPU count goes up, the chances
> of that go down.

Indeed.

> The main idea behind TOPS, and prior to that IPS, was to spread out the
> processing of packets across as many CPUs as we could, as "correctly" as
> we could.

Very, very hard to do. Isn't MSI supposed to give you the ability for a
NIC to pick a CPU to interrupt? That would help in a small way.

> Lots of small packets meant/means that a NIC could saturate its interrupt
> CPU before the NIC was saturated. You don't necessarily see that on, say,
> single-instance netperf TCP_STREAM (or basic FTP) testing, but certainly
> can on aggregate netperf TCP_RR testing.
>
> IPS, being driven by the packet header info, was good enough for simple
> benchmarking, but once you had more than one connection per
> process/thread that wasn't going to cut it, and even with one connection
> per process, telling the process where it should run wasn't terribly
> easy :) It wasn't _that_ much more expensive than the queueing already
> happening - IPS came in when HP-UX networking was BSDish, and it was done
> when things were being queued to the netisr queue(s).
>
> TOPS lets the process (I suppose the scheduler, really) decide where some
> of the processing for the packet will happen - the part after the
> handoff.

I think this last part should be easy to do - but perhaps the expense of
landing on the wrong CPU may override any perceived benefits.

cheers,
jamal

* Re: [RFC] netif_rx: receive path optimization
From: Rick Jones @ 2005-04-01 18:22 UTC (permalink / raw)
To: netdev

>> The main idea behind TOPS, and prior to that IPS, was to spread out the
>> processing of packets across as many CPUs as we could, as "correctly" as
>> we could.
>
> Very, very hard to do.

Why do you say that? "Correct" can be defined as either the same CPU for
each packet in a given flow (IPS) or the same CPU as last accessed the
endpoint (TOPS).

> Isn't MSI supposed to give you the ability for a NIC to pick a CPU to
> interrupt? That would help in a small way.

That gives the NIC the knowledge of how to direct to a CPU, but as you
know it does not tell it how to decide where. Since I doubt that the NIC
wants to reach out and touch connection state in the host (nor, I
suppose, do we want it to either), the best a NIC with MSI could do would
be IPS.

>> TOPS lets the process (I suppose the scheduler, really) decide where some
>> of the processing for the packet will happen - the part after the
>> handoff.
>
> I think this last part should be easy to do - but perhaps the expense of
> landing on the wrong CPU may override any perceived benefits.

Unless one has a scheduler that likes to migrate processes, the chances
of landing on the wrong CPU are minimal and short-lived, and overall the
chances of being right are greater than if one did nothing and stuck with
the interrupt CPU. (Handwaving based on experience-driven intuition and a
bit of math as one increases the CPU count.)

This is all on the premise that one is running with numNIC << numCPU.
With numNIC == numCPU, one does things as seen in certain
networking-intensive benchmarks :)

rick jones

* Re: [RFC] netif_rx: receive path optimization
From: Andi Kleen @ 2005-04-01 16:40 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev

Rick Jones <rick.jones2@hp.com> writes:

> At the risk of again chewing on my toes (yum), if multiple CPUs are
> pulling packets from the per-device queue there will be packet
> reordering. HP-UX 10.0 did just that and it was quite nasty even at low
> CPU counts (<=4). It was changed by HP-UX 10.20 (ca. 1995) to per-CPU
> queues with queue selection computed from packet headers (hash the IP and
> TCP/UDP header to pick a CPU); that was called IPS, for Inbound Packet
> Scheduling. 11.0 (ca. 1998) later changed that to "find where the
> connection last ran and queue to that CPU"; that was called TOPS - Thread
> Optimized Packet Scheduling.

We went over this a lot several years ago when Linux got multi-threaded
RX with softnet in 2.1. You might want to go over the archives.

Some things that came out of it were a sender-side TCP optimization to
tolerate reordering without slowing down (works great with other Linux
peers) and the NAPI-style polling mode (which was mostly designed for
routing and still seems to have regressions for the client/server
case :/).

Something like TOPS was discussed, but AFAIK nobody ever implemented it.
Of course the benchmark guys do it manually by setting interrupt and
scheduler affinity.

-Andi
