* [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
@ 2010-04-23  8:12 Changli Gao
  2010-04-23  9:27 ` Eric Dumazet
  2010-04-23 10:26 ` Eric Dumazet
  0 siblings, 2 replies; 108+ messages in thread

From: Changli Gao @ 2010-04-23 8:12 UTC (permalink / raw)
To: David S. Miller
Cc: jamal, Tom Herbert, Eric Dumazet, Stephen Hemminger, netdev, Changli Gao

batch skb dequeueing from softnet input_pkt_queue.

Batch skb dequeueing from softnet input_pkt_queue to reduce potential
lock contention when RPS is enabled.

Note: in the worst case, the number of packets in a softnet_data may be
twice netdev_max_backlog.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 include/linux/netdevice.h |    6 +++--
 net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..6ae9f2b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,7 @@ struct softnet_data {
     struct Qdisc *output_queue;
     struct list_head poll_list;
     struct sk_buff *completion_queue;
+    struct sk_buff_head process_queue;

 #ifdef CONFIG_RPS
     struct softnet_data *rps_ipi_list;
@@ -1401,10 +1402,11 @@ struct softnet_data {
     struct napi_struct backlog;
 };

-static inline void input_queue_head_incr(struct softnet_data *sd)
+static inline void input_queue_head_add(struct softnet_data *sd,
+                                        unsigned int len)
 {
 #ifdef CONFIG_RPS
-    sd->input_queue_head++;
+    sd->input_queue_head += len;
 #endif
 }

diff --git a/net/core/dev.c b/net/core/dev.c
index a4a7c36..c1585f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
     __get_cpu_var(netdev_rx_stat).total++;

     rps_lock(sd);
-    if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
-        if (sd->input_pkt_queue.qlen) {
+    if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+        if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
             __skb_queue_tail(&sd->input_pkt_queue, skb);
 #ifdef CONFIG_RPS
-            *qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
+            *qtail = sd->input_queue_head +
+                     skb_queue_len(&sd->input_pkt_queue);
 #endif
             rps_unlock(sd);
             local_irq_restore(flags);
@@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg)
     struct sk_buff *skb, *tmp;

     rps_lock(sd);
-    skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
+    skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
         if (skb->dev == dev) {
             __skb_unlink(skb, &sd->input_pkt_queue);
             kfree_skb(skb);
-            input_queue_head_incr(sd);
+            input_queue_head_add(sd, 1);
         }
+    }
     rps_unlock(sd);
+
+    skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+        if (skb->dev == dev) {
+            __skb_unlink(skb, &sd->process_queue);
+            kfree_skb(skb);
+        }
+    }
 }

 static int napi_gro_complete(struct sk_buff *skb)
@@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota)
     }
 #endif
     napi->weight = weight_p;
-    do {
+    local_irq_disable();
+    while (1) {
         struct sk_buff *skb;

-        local_irq_disable();
+        while ((skb = __skb_dequeue(&sd->process_queue))) {
+            local_irq_enable();
+            __netif_receive_skb(skb);
+            if (++work >= quota)
+                return work;
+            local_irq_disable();
+        }
+
         rps_lock(sd);
-        skb = __skb_dequeue(&sd->input_pkt_queue);
-        if (!skb) {
+        input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
+        skb_queue_splice_tail_init(&sd->input_pkt_queue,
+                                   &sd->process_queue);
+        if (skb_queue_empty(&sd->process_queue)) {
             __napi_complete(napi);
             rps_unlock(sd);
-            local_irq_enable();
             break;
         }
-        input_queue_head_incr(sd);
         rps_unlock(sd);
-        local_irq_enable();
-
-        __netif_receive_skb(skb);
-    } while (++work < quota);
+    }
+    local_irq_enable();

     return work;
 }
@@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
     /* Process offline CPU's input_pkt_queue */
     while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
         netif_rx(skb);
-        input_queue_head_incr(oldsd);
+        input_queue_head_add(oldsd, 1);
     }
+    while ((skb = __skb_dequeue(&oldsd->process_queue)))
+        netif_rx(skb);

     return NOTIFY_OK;
 }
@@ -5851,6 +5868,7 @@ static int __init net_dev_init(void)
         struct softnet_data *sd = &per_cpu(softnet_data, i);

         skb_queue_head_init(&sd->input_pkt_queue);
+        skb_queue_head_init(&sd->process_queue);
         sd->completion_queue = NULL;
         INIT_LIST_HEAD(&sd->poll_list);

^ permalink raw reply related	[flat|nested] 108+ messages in thread
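The core of the patch is the process_backlog() change: instead of taking
rps_lock() once per packet, it splices the whole shared input_pkt_queue
into the CPU-private process_queue in one locked operation and then runs
__netif_receive_skb() without the lock. The same pattern can be shown in a
self-contained userspace sketch (pthreads; all names are illustrative, not
kernel API):

/*
 * Userspace analogue of the input_pkt_queue -> process_queue batching
 * above: the consumer takes the shared lock once per batch, splices the
 * whole input list into a private list, then processes it lock-free.
 *
 * Build: cc -pthread batch.c
 */
#include <pthread.h>
#include <stdio.h>

struct pkt {
        struct pkt *next;
        int id;
};

struct queue {
        pthread_mutex_t lock;
        struct pkt *head, *tail;        /* shared "input_pkt_queue" */
};

static void enqueue(struct queue *q, struct pkt *p)     /* producer side */
{
        pthread_mutex_lock(&q->lock);
        p->next = NULL;
        if (q->tail)
                q->tail->next = p;
        else
                q->head = p;
        q->tail = p;
        pthread_mutex_unlock(&q->lock);
}

/* One lock round-trip per batch instead of one per packet, like
 * skb_queue_splice_tail_init() under rps_lock() in the patch. */
static struct pkt *splice_all(struct queue *q)
{
        struct pkt *batch;

        pthread_mutex_lock(&q->lock);
        batch = q->head;
        q->head = q->tail = NULL;
        pthread_mutex_unlock(&q->lock);
        return batch;                   /* private "process_queue" */
}

int main(void)
{
        struct queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct pkt pkts[8], *p;
        int i;

        for (i = 0; i < 8; i++) {
                pkts[i].id = i;
                enqueue(&q, &pkts[i]);
        }
        for (p = splice_all(&q); p; p = p->next)        /* lock-free walk */
                printf("processing pkt %d\n", p->id);
        return 0;
}

The win is one lock round-trip (and one contended cache-line bounce) per
batch rather than per packet, which is exactly the RPS lock contention the
changelog targets.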
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-04-23 8:12 [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Changli Gao @ 2010-04-23 9:27 ` Eric Dumazet 2010-04-23 22:02 ` jamal 2010-04-23 10:26 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-04-23 9:27 UTC (permalink / raw) To: Changli Gao Cc: David S. Miller, jamal, Tom Herbert, Stephen Hemminger, netdev Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit : > batch skb dequeueing from softnet input_pkt_queue. > > batch skb dequeueing from softnet input_pkt_queue to reduce potential lock > contention when RPS is enabled. > > Note: in the worst case, the number of packets in a softnet_data may be double > of netdev_max_backlog. > > Signed-off-by: Changli Gao <xiaosuo@gmail.com> Very good patch Changli, thanks ! Lets see how it improves thing for Jamal benchs ;) Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> > ---- > include/linux/netdevice.h | 6 +++-- > net/core/dev.c | 50 +++++++++++++++++++++++++++++++--------------- > 2 files changed, 38 insertions(+), 18 deletions(-) > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 3c5ed5f..6ae9f2b 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -1387,6 +1387,7 @@ struct softnet_data { > struct Qdisc *output_queue; > struct list_head poll_list; > struct sk_buff *completion_queue; > + struct sk_buff_head process_queue; > > #ifdef CONFIG_RPS > struct softnet_data *rps_ipi_list; > @@ -1401,10 +1402,11 @@ struct softnet_data { > struct napi_struct backlog; > }; > > -static inline void input_queue_head_incr(struct softnet_data *sd) > +static inline void input_queue_head_add(struct softnet_data *sd, > + unsigned int len) > { > #ifdef CONFIG_RPS > - sd->input_queue_head++; > + sd->input_queue_head += len; > #endif > } > > diff --git a/net/core/dev.c b/net/core/dev.c > index a4a7c36..c1585f9 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > __get_cpu_var(netdev_rx_stat).total++; > > rps_lock(sd); > - if (sd->input_pkt_queue.qlen <= netdev_max_backlog) { > - if (sd->input_pkt_queue.qlen) { > + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { > + if (skb_queue_len(&sd->input_pkt_queue)) { > enqueue: > __skb_queue_tail(&sd->input_pkt_queue, skb); > #ifdef CONFIG_RPS > - *qtail = sd->input_queue_head + sd->input_pkt_queue.qlen; > + *qtail = sd->input_queue_head + > + skb_queue_len(&sd->input_pkt_queue); > #endif > rps_unlock(sd); > local_irq_restore(flags); > @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg) > struct sk_buff *skb, *tmp; > > rps_lock(sd); > - skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) > + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { > if (skb->dev == dev) { > __skb_unlink(skb, &sd->input_pkt_queue); > kfree_skb(skb); > - input_queue_head_incr(sd); > + input_queue_head_add(sd, 1); > } > + } > rps_unlock(sd); > + > + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { > + if (skb->dev == dev) { > + __skb_unlink(skb, &sd->process_queue); > + kfree_skb(skb); > + } > + } > } > > static int napi_gro_complete(struct sk_buff *skb) > @@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota) > } > #endif > napi->weight = weight_p; > - do { > + local_irq_disable(); > + while (1) { > struct sk_buff *skb; > > - local_irq_disable(); > + while ((skb = __skb_dequeue(&sd->process_queue))) { > + 
local_irq_enable(); > + __netif_receive_skb(skb); > + if (++work >= quota) > + return work; > + local_irq_disable(); > + } > + > rps_lock(sd); > - skb = __skb_dequeue(&sd->input_pkt_queue); > - if (!skb) { > + input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue)); > + skb_queue_splice_tail_init(&sd->input_pkt_queue, > + &sd->process_queue); > + if (skb_queue_empty(&sd->process_queue)) { > __napi_complete(napi); > rps_unlock(sd); > - local_irq_enable(); > break; > } > - input_queue_head_incr(sd); > rps_unlock(sd); > - local_irq_enable(); > - > - __netif_receive_skb(skb); > - } while (++work < quota); > + } > + local_irq_enable(); > > return work; > } > @@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb, > /* Process offline CPU's input_pkt_queue */ > while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { > netif_rx(skb); > - input_queue_head_incr(oldsd); > + input_queue_head_add(oldsd, 1); > } > + while ((skb = __skb_dequeue(&oldsd->process_queue))) > + netif_rx(skb); > > return NOTIFY_OK; > } > @@ -5851,6 +5868,7 @@ static int __init net_dev_init(void) > struct softnet_data *sd = &per_cpu(softnet_data, i); > > skb_queue_head_init(&sd->input_pkt_queue); > + skb_queue_head_init(&sd->process_queue); > sd->completion_queue = NULL; > INIT_LIST_HEAD(&sd->poll_list); > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23  9:27 ` Eric Dumazet
@ 2010-04-23 22:02   ` jamal
  2010-04-24 14:10     ` jamal
  0 siblings, 1 reply; 108+ messages in thread

From: jamal @ 2010-04-23 22:02 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

On Fri, 2010-04-23 at 11:27 +0200, Eric Dumazet wrote:
>
> Lets see how it improves thing for Jamal benchs ;)

I've done a setup with the last patch from Changli + net-next - I will
post test results tomorrow AM.

cheers,
jamal

^ permalink raw reply	[flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23 22:02 ` jamal
@ 2010-04-24 14:10   ` jamal
  2010-04-26 14:03     ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread

From: jamal @ 2010-04-24 14:10 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 203 bytes --]

On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:

> I've done a setup with the last patch from Changli + net-next - I will
> post test results tomorrow AM.

ok, annotated results attached.

cheers,
jamal

[-- Attachment #2: summary-apr23.txt --]
[-- Type: text/plain, Size: 45513 bytes --]

                sink     cpu all   cpuint   cpuapp
nn-standalone   93.95%   84.5%     99.8%    79.8%
nn-rps          96.41%   85.4%     95.5%    82.5%
nn-cl           97.29%   84.0%     99.9%    79.6%
nn-cl-rps       97.76%   86.5%     96.5%    84.8%

nn-standalone: Basic net-next from Apr23
nn-rps:        Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0
nn-cl:         Basic net-next from Apr23 + Changli patch
nn-cl-rps:     Basic net-next from Apr23 + Changli patch + rps mask ee, irq aff cpu0

sink:    the amount of traffic the system was able to sink in
cpu all: avg % system cpu consumed in test
cpuint:  avg % cpu consumed by the cpu where interrupts happened
cpuapp:  avg % cpu consumed by a sample cpu which did app processing

Testing was as previously explained. I repeated each test 4-5 times and
took averages. It seems the non-rps case has improved dramatically since
the last net-next I tested. The rps case has also improved, but the gap
between rps and non-rps is smaller.

[There are just too many variables for me to pinpoint one item as the
contributor. For example, the sky2 driver may have become worse (consumes
more cycles), but I can't quantify it yet (I just see sky2_rx_submit
showing up higher in profiles than before). Also,
call_function_single_interrupt shows up prominently on application
processing CPUs, but improved with Changli's changes.]

After doing the math, I don't trust my results after applying Changli's
patch. It seems both the rps and non-rps cases have gotten better (and I
don't see Changli's contribution to non-rps). It also seems that the gap
between rps and non-rps is non-existent now. In other words, there is no
benefit to using rps (it consumes more cpu for the same throughput). So
it is likely that I need to repeat these tests; maybe I did something
wrong in my setup...

And here are the profiles:
--------------------------

cpu0 always received all the interrupts regardless of the tests.
cpu1, 7 etc were processing apps..
I could not spot much difference between before and after Changli's I: Test setup : nn-standalone: Basic net-next from Apr23 All cpus ------------------------------------------------------------------------------- PerfTop: 3784 irqs/sec kernel:84.2% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 3254.00 10.3% sky2_poll [sky2] 1853.00 5.9% _raw_spin_lock_irqsave [kernel] 872.00 2.8% fget [kernel] 870.00 2.8% copy_user_generic_string [kernel] 819.00 2.6% _raw_spin_unlock_irqrestore [kernel] 729.00 2.3% sys_epoll_ctl [kernel] 701.00 2.2% datagram_poll [kernel] 615.00 2.0% udp_recvmsg [kernel] 602.00 1.9% _raw_spin_lock_bh [kernel] 595.00 1.9% system_call [kernel] 592.00 1.9% kmem_cache_free [kernel] 574.00 1.8% schedule [kernel] 568.00 1.8% _raw_spin_lock [kernel] ------------------------------------------------------------------------------- PerfTop: 3574 irqs/sec kernel:85.1% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 5023.00 10.9% sky2_poll [sky2] 2762.00 6.0% _raw_spin_lock_irqsave [kernel] 1319.00 2.9% copy_user_generic_string [kernel] 1306.00 2.8% fget [kernel] 1198.00 2.6% _raw_spin_unlock_irqrestore [kernel] 1071.00 2.3% datagram_poll [kernel] 1061.00 2.3% sys_epoll_ctl [kernel] 927.00 2.0% _raw_spin_lock_bh [kernel] 917.00 2.0% system_call [kernel] 901.00 1.9% udp_recvmsg [kernel] 895.00 1.9% kmem_cache_free [kernel] 819.00 1.8% _raw_spin_lock [kernel] 802.00 1.7% schedule [kernel] 774.00 1.7% sys_epoll_wait [kernel] 720.00 1.6% kmem_cache_alloc [kernel] ------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 751.00 36.1% sky2_poll [sky2] 108.00 5.2% __udp4_lib_lookup [kernel] 95.00 4.6% ip_route_input [kernel] 83.00 4.0% _raw_spin_lock [kernel] 79.00 3.8% _raw_spin_lock_irqsave [kernel] 77.00 3.7% __netif_receive_skb [kernel] 77.00 3.7% __alloc_skb [kernel] 66.00 3.2% ip_rcv [kernel] 60.00 2.9% __udp4_lib_rcv [kernel] 54.00 2.6% sock_queue_rcv_skb [kernel] 45.00 2.2% sky2_rx_submit [sky2] 42.00 2.0% __wake_up_common [kernel] 40.00 1.9% __kmalloc [kernel] 39.00 1.9% sock_def_readable [kernel] 30.00 1.4% ep_poll_callback [kernel] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:99.8% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 3511.00 36.7% sky2_poll [sky2] 519.00 5.4% __udp4_lib_lookup [kernel] 431.00 4.5% ip_route_input [kernel] 353.00 3.7% _raw_spin_lock_irqsave [kernel] 351.00 3.7% __alloc_skb [kernel] 338.00 3.5% __netif_receive_skb [kernel] 337.00 3.5% _raw_spin_lock [kernel] 307.00 3.2% ip_rcv [kernel] 264.00 2.8% sky2_rx_submit [sky2] 254.00 2.7% sock_queue_rcv_skb [kernel] 246.00 2.6% __udp4_lib_rcv [kernel] 206.00 2.2% sock_def_readable [kernel] 177.00 1.9% __wake_up_common [kernel] 168.00 1.8% __kmalloc [kernel] ------------------------------------------------------------------------------- PerfTop: 908 irqs/sec kernel:80.0% [1000Hz cycles], (all, cpu: 1) 
------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 177.00 6.7% _raw_spin_lock_irqsave [kernel] 120.00 4.5% copy_user_generic_string [kernel] 110.00 4.2% fget [kernel] 108.00 4.1% datagram_poll [kernel] 98.00 3.7% _raw_spin_lock_bh [kernel] 91.00 3.4% sys_epoll_ctl [kernel] 89.00 3.4% kmem_cache_free [kernel] 77.00 2.9% system_call [kernel] 76.00 2.9% schedule [kernel] 76.00 2.9% _raw_spin_unlock_irqrestore [kernel] 63.00 2.4% fput [kernel] 61.00 2.3% sys_epoll_wait [kernel] 61.00 2.3% udp_recvmsg [kernel] 49.00 1.8% process_recv mcpudp ------------------------------------------------------------------------------- PerfTop: 815 irqs/sec kernel:79.8% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ _________________ 491.00 8.0% _raw_spin_lock_irqsave [kernel.kallsyms] 285.00 4.7% copy_user_generic_string [kernel.kallsyms] 252.00 4.1% fget [kernel.kallsyms] 215.00 3.5% datagram_poll [kernel.kallsyms] 206.00 3.4% _raw_spin_unlock_irqrestore [kernel.kallsyms] 204.00 3.3% sys_epoll_ctl [kernel.kallsyms] 196.00 3.2% _raw_spin_lock_bh [kernel.kallsyms] 184.00 3.0% udp_recvmsg [kernel.kallsyms] 184.00 3.0% kmem_cache_free [kernel.kallsyms] 180.00 2.9% system_call [kernel.kallsyms] 168.00 2.7% sys_epoll_wait [kernel.kallsyms] 159.00 2.6% schedule [kernel.kallsyms] 144.00 2.4% fput [kernel.kallsyms] II: Test setup nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0 ------------------------------------------------------------------------------- PerfTop: 3558 irqs/sec kernel:85.0% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ________ 3519.00 15.9% sky2_poll [sky2] 865.00 3.9% _raw_spin_lock_irqsave [kernel] 568.00 2.6% _raw_spin_unlock_irqrestore [kernel] 526.00 2.4% sky2_intr [sky2] 493.00 2.2% __netif_receive_skb [kernel] 477.00 2.2% _raw_spin_lock [kernel] 470.00 2.1% ip_rcv [kernel] 456.00 2.1% fget [kernel] 447.00 2.0% sys_epoll_ctl [kernel] 420.00 1.9% copy_user_generic_string [kernel] 387.00 1.8% ip_route_input [kernel] 359.00 1.6% system_call [kernel] 334.00 1.5% kmem_cache_free [kernel] 310.00 1.4% kmem_cache_alloc [kernel] 302.00 1.4% call_function_single_interrupt [kernel] ------------------------------------------------------------------------------- PerfTop: 3546 irqs/sec kernel:85.8% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ________ 6592.00 16.2% sky2_poll [sky2] 1540.00 3.8% _raw_spin_lock_irqsave [kernel] 1014.00 2.5% _raw_spin_unlock_irqrestore [kernel] 885.00 2.2% fget [kernel] 881.00 2.2% _raw_spin_lock [kernel] 880.00 2.2% sky2_intr [sky2] 872.00 2.1% __netif_receive_skb [kernel] 858.00 2.1% ip_rcv [kernel] 802.00 2.0% sys_epoll_ctl [kernel] 710.00 1.7% copy_user_generic_string [kernel] 696.00 1.7% system_call [kernel] 692.00 1.7% ip_route_input [kernel] 634.00 1.6% schedule [kernel] 618.00 1.5% kmem_cache_free [kernel] 605.00 1.5% call_function_single_interrupt [kernel] cpu0 ------------------------------------------------------------------------------- PerfTop: 971 irqs/sec kernel:96.5% [1000Hz cycles], (all, cpu: 0) 
------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 4222.00 58.2% sky2_poll [sky2] 668.00 9.2% sky2_intr [sky2] 228.00 3.1% __alloc_skb [kernel] 183.00 2.5% get_rps_cpu [kernel] 138.00 1.9% sky2_rx_submit [sky2] 124.00 1.7% enqueue_to_backlog [kernel] 119.00 1.6% __kmalloc [kernel] 103.00 1.4% kmem_cache_alloc [kernel] 91.00 1.3% _raw_spin_lock [kernel] 90.00 1.2% _raw_spin_lock_irqsave [kernel] 73.00 1.0% swiotlb_sync_single [kernel] 72.00 1.0% irq_entries_start [kernel] 55.00 0.8% copy_user_generic_string [kernel] 53.00 0.7% _raw_spin_unlock_irqrestore [kernel] 48.00 0.7% fget [kernel] ------------------------------------------------------------------------------- PerfTop: 998 irqs/sec kernel:94.8% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 6745.00 58.5% sky2_poll [sky2] 831.00 7.2% sky2_intr [sky2] 352.00 3.1% __alloc_skb [kernel] 281.00 2.4% get_rps_cpu [kernel] 226.00 2.0% sky2_rx_submit [sky2] 186.00 1.6% __kmalloc [kernel] 181.00 1.6% enqueue_to_backlog [kernel] 173.00 1.5% _raw_spin_lock_irqsave [kernel] 166.00 1.4% kmem_cache_alloc [kernel] 162.00 1.4% _raw_spin_lock [kernel] 99.00 0.9% swiotlb_sync_single [kernel] 98.00 0.9% irq_entries_start [kernel] 94.00 0.8% fget [kernel] 92.00 0.8% _raw_spin_unlock_irqrestore [kernel] 80.00 0.7% system_call [kernel] cpu1 ------------------------------------------------------------------------------- PerfTop: 724 irqs/sec kernel:82.0% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _________________ 204.00 5.3% _raw_spin_lock_irqsave [kernel.kallsyms] 153.00 4.0% _raw_spin_unlock_irqrestore [kernel.kallsyms] 147.00 3.8% call_function_single_interrupt [kernel.kallsyms] 139.00 3.6% __netif_receive_skb [kernel.kallsyms] 135.00 3.5% sys_epoll_ctl [kernel.kallsyms] 132.00 3.4% ip_rcv [kernel.kallsyms] 129.00 3.3% fget [kernel.kallsyms] 128.00 3.3% _raw_spin_lock [kernel.kallsyms] 122.00 3.2% system_call [kernel.kallsyms] 118.00 3.1% ip_route_input [kernel.kallsyms] 109.00 2.8% kmem_cache_free [kernel.kallsyms] 108.00 2.8% copy_user_generic_string [kernel.kallsyms] 90.00 2.3% schedule [kernel.kallsyms] 85.00 2.2% fput [kernel.kallsyms] ------------------------------------------------------------------------------- PerfTop: 763 irqs/sec kernel:83.0% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _________________ 428.00 6.2% _raw_spin_lock_irqsave [kernel.kallsyms] 302.00 4.4% _raw_spin_unlock_irqrestore [kernel.kallsyms] 269.00 3.9% __netif_receive_skb [kernel.kallsyms] 258.00 3.7% call_function_single_interrupt [kernel.kallsyms] 254.00 3.7% fget [kernel.kallsyms] 238.00 3.4% ip_rcv [kernel.kallsyms] 230.00 3.3% sys_epoll_ctl [kernel.kallsyms] 222.00 3.2% _raw_spin_lock [kernel.kallsyms] 220.00 3.2% ip_route_input [kernel.kallsyms] 197.00 2.9% system_call [kernel.kallsyms] 189.00 2.7% kmem_cache_free [kernel.kallsyms] 184.00 2.7% copy_user_generic_string [kernel.kallsyms] 144.00 2.1% ep_remove [kernel.kallsyms] 140.00 2.0% schedule [kernel.kallsyms] ------------------------------------------------------------------------------- PerfTop: 
546 irqs/sec kernel:83.3% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _________________ 346.00 5.7% _raw_spin_lock_irqsave [kernel.kallsyms] 275.00 4.6% _raw_spin_unlock_irqrestore [kernel.kallsyms] 238.00 3.9% call_function_single_interrupt [kernel.kallsyms] 228.00 3.8% fget [kernel.kallsyms] 222.00 3.7% __netif_receive_skb [kernel.kallsyms] 219.00 3.6% sys_epoll_ctl [kernel.kallsyms] 209.00 3.5% _raw_spin_lock [kernel.kallsyms] 205.00 3.4% ip_rcv [kernel.kallsyms] 199.00 3.3% ip_route_input [kernel.kallsyms] 173.00 2.9% system_call [kernel.kallsyms] 170.00 2.8% copy_user_generic_string [kernel.kallsyms] 167.00 2.8% kmem_cache_free [kernel.kallsyms] 127.00 2.1% ep_remove [kernel.kallsyms] 123.00 2.0% dst_release [kernel.kalls III: Test setup nn-cl: Basic net-next from Apr23 + Changli patch ------------------------------------------------------------------------------- PerfTop: 3789 irqs/sec kernel:84.1% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 3514.00 10.2% sky2_poll [sky2] 1862.00 5.4% _raw_spin_lock_irqsave [kernel] 1274.00 3.7% system_call [kernel] 926.00 2.7% fget [kernel] 872.00 2.5% _raw_spin_unlock_irqrestore [kernel] 862.00 2.5% copy_user_generic_string [kernel] 766.00 2.2% sys_epoll_ctl [kernel] 765.00 2.2% datagram_poll [kernel] 671.00 2.0% _raw_spin_lock_bh [kernel] 668.00 1.9% kmem_cache_free [kernel] 602.00 1.8% udp_recvmsg [kernel] 586.00 1.7% _raw_spin_lock [kernel] 585.00 1.7% vread_tsc [kernel].vsyscall_fn ------------------------------------------------------------------------------- PerfTop: 3794 irqs/sec kernel:83.6% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 4756.00 9.8% sky2_poll [sky2] 2742.00 5.7% _raw_spin_lock_irqsave [kernel] 1826.00 3.8% system_call [kernel] 1285.00 2.7% fget [kernel] 1284.00 2.7% copy_user_generic_string [kernel] 1235.00 2.6% _raw_spin_unlock_irqrestore [kernel] 1096.00 2.3% sys_epoll_ctl [kernel] 1071.00 2.2% datagram_poll [kernel] 954.00 2.0% kmem_cache_free [kernel] 925.00 1.9% _raw_spin_lock_bh [kernel] 888.00 1.8% vread_tsc [kernel].vsyscall_fn 880.00 1.8% udp_recvmsg [kernel] 793.00 1.6% _raw_spin_lock [kernel] 790.00 1.6% schedule [kernel] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:99.9% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 675.00 32.6% sky2_poll [sky2] 116.00 5.6% __udp4_lib_lookup [kernel] 111.00 5.4% ip_route_input [kernel] 81.00 3.9% _raw_spin_lock_irqsave [kernel] 81.00 3.9% _raw_spin_lock [kernel] 70.00 3.4% __alloc_skb [kernel] 67.00 3.2% ip_rcv [kernel] 66.00 3.2% __netif_receive_skb [kernel] 61.00 2.9% __udp4_lib_rcv [kernel] 57.00 2.8% sock_queue_rcv_skb [kernel] 47.00 2.3% sock_def_readable [kernel] 42.00 2.0% __kmalloc [kernel] 42.00 2.0% __wake_up_common [kernel] 38.00 1.8% sky2_rx_submit [sky2] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) 
------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 2526.00 32.8% sky2_poll [sky2] 406.00 5.3% ip_route_input [kernel] 399.00 5.2% __udp4_lib_lookup [kernel] 328.00 4.3% _raw_spin_lock_irqsave [kernel] 307.00 4.0% _raw_spin_lock [kernel] 296.00 3.8% ip_rcv [kernel] 287.00 3.7% __alloc_skb [kernel] 272.00 3.5% sock_queue_rcv_skb [kernel] 224.00 2.9% __udp4_lib_rcv [kernel] 224.00 2.9% __netif_receive_skb [kernel] 182.00 2.4% sock_def_readable [kernel] 163.00 2.1% __wake_up_common [kernel] 140.00 1.8% sky2_rx_submit [sky2] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 4445.00 33.4% sky2_poll [sky2] 707.00 5.3% __udp4_lib_lookup [kernel] 662.00 5.0% ip_route_input [kernel] 567.00 4.3% _raw_spin_lock_irqsave [kernel] 512.00 3.8% __alloc_skb [kernel] 506.00 3.8% ip_rcv [kernel] 476.00 3.6% sock_queue_rcv_skb [kernel] 473.00 3.6% _raw_spin_lock [kernel] 415.00 3.1% __udp4_lib_rcv [kernel] 408.00 3.1% __netif_receive_skb [kernel] 306.00 2.3% sock_def_readable [kernel] 272.00 2.0% __wake_up_common [kernel] 260.00 2.0% __kmalloc [kernel] 216.00 1.6% _raw_read_lock [kernel] 214.00 1.6% sky2_rx_submit [sky2] ------------------------------------------------------------------------------- PerfTop: 748 irqs/sec kernel:80.9% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 244.00 7.4% _raw_spin_lock_irqsave [kernel] 207.00 6.2% system_call [kernel] 127.00 3.8% _raw_spin_unlock_irqrestore [kernel] 124.00 3.7% copy_user_generic_string [kernel] 122.00 3.7% sys_epoll_ctl [kernel] 120.00 3.6% fget [kernel] 118.00 3.6% datagram_poll [kernel] 96.00 2.9% schedule [kernel] 94.00 2.8% _raw_spin_lock_bh [kernel] 86.00 2.6% vread_tsc [kernel].vsyscall_fn 82.00 2.5% udp_recvmsg [kernel] 76.00 2.3% fput [kernel] 73.00 2.2% kmem_cache_free [kernel] 67.00 2.0% sys_epoll_wait [kernel] ------------------------------------------------------------------------------- PerfTop: 625 irqs/sec kernel:78.6% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 488.00 7.5% _raw_spin_lock_irqsave [kernel] 380.00 5.9% system_call [kernel] 274.00 4.2% copy_user_generic_string [kernel] 252.00 3.9% fget [kernel] 244.00 3.8% datagram_poll [kernel] 217.00 3.3% _raw_spin_unlock_irqrestore [kernel] 211.00 3.3% sys_epoll_ctl [kernel] 186.00 2.9% schedule [kernel] 185.00 2.9% _raw_spin_lock_bh [kernel] 173.00 2.7% udp_recvmsg [kernel] 169.00 2.6% vread_tsc [kernel].vsyscall_fn 164.00 2.5% kmem_cache_free [kernel] 143.00 2.2% fput [kernel] 133.00 2.1% sys_epoll_wait [kernel] IV: Test setup nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff -------------------------------------------------------------------------- PerfTop: 3043 irqs/sec kernel:87.5% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 2240.00 20.4% sky2_poll [sky2] 375.00 3.4% 
_raw_spin_lock_irqsave [kernel] 335.00 3.0% sky2_intr [sky2] 326.00 3.0% system_call [kernel] 239.00 2.2% _raw_spin_unlock_irqrestor [kernel] 224.00 2.0% ip_rcv [kernel] 201.00 1.8% __netif_receive_skb [kernel] 198.00 1.8% sys_epoll_ctl [kernel] 190.00 1.7% _raw_spin_lock [kernel] 182.00 1.7% fget [kernel] 169.00 1.5% copy_user_generic_string [kernel] 165.00 1.5% kmem_cache_free [kernel] 149.00 1.4% load_balance [kernel] 146.00 1.3% ip_route_input [kernel] -------------------------------------------------------------------------- PerfTop: 3210 irqs/sec kernel:85.8% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 6539.00 20.4% sky2_poll [sky2] 1106.00 3.4% _raw_spin_lock_irqsave [kernel] 1014.00 3.2% sky2_intr [sky2] 976.00 3.0% system_call [kernel] 684.00 2.1% _raw_spin_unlock_irqrestor [kernel] 611.00 1.9% ip_rcv [kernel] 601.00 1.9% fget [kernel] 593.00 1.8% _raw_spin_lock [kernel] 592.00 1.8% sys_epoll_ctl [kernel] 574.00 1.8% __netif_receive_skb [kernel] 526.00 1.6% copy_user_generic_string [kernel] 482.00 1.5% kmem_cache_free [kernel] 480.00 1.5% ip_route_input [kernel] 425.00 1.3% vread_tsc [kernel].vsyscall_fn 410.00 1.3% kmem_cache_alloc [kernel] -------------------------------------------------------------------------- PerfTop: 999 irqs/sec kernel:97.2% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 2035.00 60.5% sky2_poll [sky2] 302.00 9.0% sky2_intr [sky2] 109.00 3.2% __alloc_skb [kernel] 57.00 1.7% _raw_spin_lock [kernel] 57.00 1.7% get_rps_cpu [kernel] 52.00 1.5% __kmalloc [kernel] 51.00 1.5% enqueue_to_backlog [kernel] 49.00 1.5% _raw_spin_lock_irqsave [kernel] 44.00 1.3% kmem_cache_alloc [kernel] 34.00 1.0% sky2_rx_submit [sky2] 33.00 1.0% swiotlb_sync_single [kernel] 31.00 0.9% system_call [kernel] 28.00 0.8% irq_entries_start [kernel] 22.00 0.7% _raw_spin_unlock_irqrestore [kernel] 21.00 0.6% sky2_remove [sky2] -------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:96.2% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 5493.00 60.1% sky2_poll [sky2] 803.00 8.8% sky2_intr [sky2] 281.00 3.1% __alloc_skb [kernel] 233.00 2.6% get_rps_cpu [kernel] 136.00 1.5% enqueue_to_backlog [kernel] 132.00 1.4% __kmalloc [kernel] 126.00 1.4% _raw_spin_lock [kernel] 122.00 1.3% kmem_cache_alloc [kernel] 122.00 1.3% _raw_spin_lock_irqsave [kernel] 102.00 1.1% swiotlb_sync_single [kernel] 88.00 1.0% sky2_rx_submit [sky2] 77.00 0.8% system_call [kernel] 69.00 0.8% irq_entries_start [kernel] 55.00 0.6% _raw_spin_unlock_irqrestore [kernel] 54.00 0.6% copy_user_generic_string [kernel] -------------------------------------------------------------------------- PerfTop: 999 irqs/sec kernel:97.5% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 6699.00 60.1% sky2_poll [sky2] 988.00 8.9% sky2_intr [sky2] 327.00 2.9% __alloc_skb [kernel] 261.00 2.3% get_rps_cpu [kernel] 168.00 1.5% __kmalloc [kernel] 161.00 1.4% kmem_cache_alloc [kernel] 160.00 1.4% enqueue_to_backlog [kernel] 157.00 1.4% _raw_spin_lock [kernel] 
125.00 1.1% _raw_spin_lock_irqsave [kernel] 122.00 1.1% swiotlb_sync_single [kernel] 114.00 1.0% sky2_rx_submit [sky2] 96.00 0.9% system_call [kernel] 85.00 0.8% irq_entries_start [kernel] 66.00 0.6% sky2_remove [sky2] 64.00 0.6% _raw_spin_unlock_irqrestore [kernel] -------------------------------------------------------------------------- PerfTop: 420 irqs/sec kernel:84.8% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 188.00 4.8% _raw_spin_lock_irqsave [kernel] 175.00 4.5% system_call [kernel] 155.00 4.0% _raw_spin_unlock_irqrestor [kernel] 143.00 3.7% __netif_receive_skb [kernel] 124.00 3.2% ip_route_input [kernel] 122.00 3.1% fget [kernel] 118.00 3.0% ip_rcv [kernel] 115.00 2.9% sys_epoll_ctl [kernel] 107.00 2.7% call_function_single_inter [kernel] 98.00 2.5% vread_tsc [kernel].vsyscall_fn 97.00 2.5% _raw_spin_lock [kernel] 89.00 2.3% copy_user_generic_string [kernel] -------------------------------------------------------------------------- PerfTop: 372 irqs/sec kernel:87.9% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 212.00 4.6% _raw_spin_lock_irqsave [kernel] 192.00 4.2% system_call [kernel] 187.00 4.1% __netif_receive_skb [kernel] 184.00 4.0% ip_rcv [kernel] 174.00 3.8% ip_route_input [kernel] 165.00 3.6% _raw_spin_unlock_irqrestor [kernel] 143.00 3.1% call_function_single_inter [kernel] 135.00 3.0% fget [kernel] 133.00 2.9% sys_epoll_ctl [kernel] 122.00 2.7% _raw_spin_lock [kernel] 112.00 2.5% __udp4_lib_lookup [kernel] 99.00 2.2% copy_user_generic_string [kernel] 93.00 2.0% vread_tsc [kernel].vsyscall_fn 90.00 2.0% kmem_cache_free [kernel] 89.00 1.9% ep_remove [kernel] o -------------------------------------------------------------------------- PerfTop: 269 irqs/sec kernel:85.1% [1000Hz cycles], (all, cpu: 7) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 23.00 4.6% _raw_spin_lock_irqsave [kernel] 21.00 4.2% system_call [kernel] 19.00 3.8% _raw_spin_unlock_irqrestor [kernel] 17.00 3.4% fget [kernel] 15.00 3.0% __netif_receive_skb [kernel] 14.00 2.8% dst_release [kernel] 13.00 2.6% call_function_single_inter [kernel] 11.00 2.2% kmem_cache_free [kernel] 10.00 2.0% vread_tsc [kernel].vsyscall_fn 10.00 2.0% copy_user_generic_string [kernel] 10.00 2.0% ktime_get [kernel] 10.00 2.0% ip_route_input [kernel] 10.00 2.0% schedule [kernel] -------------------------------------------------------------------------- PerfTop: 253 irqs/sec kernel:84.6% [1000Hz cycles], (all, cpu: 7) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 109.00 4.9% system_call [kernel] 104.00 4.6% _raw_spin_lock_irqsave [kernel] 79.00 3.5% ip_rcv [kernel] 74.00 3.3% _raw_spin_unlock_irqrestor [kernel] 71.00 3.2% fget [kernel] 68.00 3.0% sys_epoll_ctl [kernel] 66.00 2.9% ip_route_input [kernel] 58.00 2.6% call_function_single_inter [kernel] 55.00 2.4% _raw_spin_lock [kernel] 54.00 2.4% copy_user_generic_string [kernel] 53.00 2.4% __netif_receive_skb [kernel] 51.00 2.3% schedule [kernel] 51.00 2.3% kmem_cache_free [kernel] 43.00 1.9% vread_tsc [kernel].vsyscall_fn 38.00 1.7% __udp4_lib_lookup [kernel] 
-------------------------------------------------------------------------- PerfTop: 236 irqs/sec kernel:84.3% [1000Hz cycles], (all, cpu: 7) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 131.00 4.9% _raw_spin_lock_irqsave [kernel] 128.00 4.8% system_call [kernel] 101.00 3.8% _raw_spin_unlock_irqrestor [kernel] 89.00 3.3% fget [kernel] 85.00 3.2% sys_epoll_ctl [kernel] 81.00 3.0% ip_rcv [kernel] 76.00 2.8% ip_route_input [kernel] 66.00 2.5% call_function_single_inter [kernel] 65.00 2.4% _raw_spin_lock [kernel] 65.00 2.4% kmem_cache_free [kernel] 64.00 2.4% copy_user_generic_string [kernel] 57.00 2.1% __netif_receive_skb [kernel] 47.00 1.8% schedule [kernel] 45.00 1.7% vread_tsc [kernel].vsyscall_fn -------------------------------------------------------------------------- PerfTop: 478 irqs/sec kernel:82.2% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 319.00 5.2% _raw_spin_lock_irqsave [kernel] 289.00 4.7% system_call [kernel] 246.00 4.0% _raw_spin_unlock_irqrestor [kernel] 199.00 3.2% ip_route_input [kernel] 198.00 3.2% __netif_receive_skb [kernel] 197.00 3.2% sys_epoll_ctl [kernel] 183.00 3.0% ip_rcv [kernel] 182.00 2.9% fget [kernel] 166.00 2.7% call_function_single_inter [kernel] 157.00 2.5% copy_user_generic_string [kernel] 149.00 2.4% kmem_cache_free [kernel] 146.00 2.4% vread_tsc [kernel].vsyscall_fn 133.00 2.1% _raw_spin_lock [kernel] 118.00 1.9% schedule [kernel] 112.00 1.8% __udp4_lib_lookup [kernel] -------------------------------------------------------------------------- PerfTop: 535 irqs/sec kernel:83.0% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 345.00 5.2% _raw_spin_lock_irqsave [kernel] 291.00 4.4% system_call [kernel] 255.00 3.9% _raw_spin_unlock_irqrestor [kernel] 218.00 3.3% fget [kernel] 201.00 3.0% ip_route_input [kernel] 193.00 2.9% __netif_receive_skb [kernel] 193.00 2.9% sys_epoll_ctl [kernel] 180.00 2.7% ip_rcv [kernel] 173.00 2.6% call_function_single_inter [kernel] 163.00 2.5% copy_user_generic_string [kernel] 152.00 2.3% kmem_cache_free [kernel] 151.00 2.3% vread_tsc [kernel].vsyscall_fn 142.00 2.1% _raw_spin_lock [kernel] 131.00 2.0% schedule [kernel] ^ permalink raw reply [flat|nested] 108+ messages in thread
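For reference, the "rps mask ee, irq affinity to cpu0" configuration jamal
describes above corresponds to sysfs/procfs writes along the following
lines; the interface name, queue index, and IRQ number are assumptions
about the test box, not taken from the thread:

# RPS: let CPUs 1-3 and 5-7 (mask 0xee) process eth0's rx queue 0
echo ee > /sys/class/net/eth0/queues/rx-0/rps_cpus

# Pin the NIC interrupt to cpu0 (mask 0x1); IRQ 24 is hypothetical
echo 1 > /proc/irq/24/smp_affinity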
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-04-24 14:10 ` jamal @ 2010-04-26 14:03 ` Eric Dumazet 2010-04-26 14:55 ` Eric Dumazet 2010-04-26 21:03 ` jamal 0 siblings, 2 replies; 108+ messages in thread From: Eric Dumazet @ 2010-04-26 14:03 UTC (permalink / raw) To: hadi; +Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit : > On Fri, 2010-04-23 at 18:02 -0400, jamal wrote: > > > Ive done a setup with the last patch from Changli + net-next - I will > > post test results tomorrow AM. > > ok, annotated results attached. > > cheers, > jamal Jamal, I have a Nehalem setup now, and I can see _raw_spin_lock_irqsave() abuse is not coming from network tree, but from clockevents_notify() My pktgen sends 1040989pps : # Samples: 389707198131 # # Overhead Command Shared Object Symbol # ........ .............. ............................ ...... # 23.52% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave | --- _raw_spin_lock_irqsave | |--94.74%-- clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--4.10%-- tick_broadcast_oneshot_control | tick_notify | notifier_call_chain | __raw_notifier_call_chain | raw_notifier_call_chain | clockevents_do_notify | clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--0.58%-- lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary --0.58%-- [...] 8.94% init [kernel.kallsyms] [k] acpi_os_read_port | --- acpi_os_read_port | |--99.55%-- acpi_hw_read_port | acpi_hw_read | acpi_hw_read_multiple | acpi_hw_register_read | acpi_read_bit_register # Samples: 389233082962 # # Overhead Command Shared Object Symbol # ........ .............. ............................ ...... # 23.25% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave 8.90% init [kernel.kallsyms] [k] acpi_os_read_port 2.93% init [kernel.kallsyms] [k] mwait_idle_with_hints 1.99% init [kernel.kallsyms] [k] schedule 1.94% udpsink [kernel.kallsyms] [k] schedule 1.73% swapper [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.48% init [kernel.kallsyms] [k] bnx2x_rx_int 1.47% init [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 1.44% init [kernel.kallsyms] [k] _raw_spin_lock 1.36% udpsink [kernel.kallsyms] [k] udp_recvmsg 1.05% udpsink [kernel.kallsyms] [k] __skb_recv_datagram 1.05% init [kernel.kallsyms] [k] __udp4_lib_lookup 1.04% udpsink [kernel.kallsyms] [k] copy_user_generic_string 1.04% udpsink [kernel.kallsyms] [k] __slab_free 0.99% init [kernel.kallsyms] [k] select_task_rq_fair 0.99% init [kernel.kallsyms] [k] try_to_wake_up 0.98% init [kernel.kallsyms] [k] task_rq_lock 0.93% init [kernel.kallsyms] [k] tick_broadcast_oneshot_control 0.89% init [kernel.kallsyms] [k] sock_queue_rcv_skb 0.89% udpsink [kernel.kallsyms] [k] sock_recv_ts_and_drops 0.88% udpsink [kernel.kallsyms] [k] kfree 0.79% swapper [kernel.kallsyms] [k] acpi_os_read_port 0.76% udpsink [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.73% udpsink [kernel.kallsyms] [k] inet_recvmsg 0.71% udpsink [vdso] [.] 
0x000000ffffe431 0.65% udpsink [kernel.kallsyms] [k] sock_recvmsg 0.62% init [kernel.kallsyms] [k] gs_change 0.61% init [kernel.kallsyms] [k] enqueue_task_fair 0.61% init [kernel.kallsyms] [k] eth_type_trans 0.61% init [kernel.kallsyms] [k] sock_def_readable 0.60% udpsink [kernel.kallsyms] [k] _raw_spin_lock_bh 0.59% init [kernel.kallsyms] [k] ip_route_input 0.59% udpsink libpthread-2.3.4.so [.] __pthread_disable_asynccancel 0.56% init [kernel.kallsyms] [k] bnx2x_poll 0.56% udpsink [kernel.kallsyms] [k] __get_user_4 ^ permalink raw reply [flat|nested] 108+ messages in thread
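For context, profiles like the two above come from system-wide perf cycle
sampling; an invocation along these lines (the duration and exact options
are assumptions, not taken from the thread) produces both the flat table
and the call-graph view:

perf record -a -g sleep 30    # sample all CPUs, record call graphs
perf report                   # flat table plus expandable call chains
perf top                      # the live "PerfTop" view quoted earlier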
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-26 14:03 ` Eric Dumazet
@ 2010-04-26 14:55   ` Eric Dumazet
  2010-04-26 21:06     ` jamal
  [not found]          ` <20100429174056.GA8044@gargoyle.fritz.box>
  1 sibling, 2 replies; 108+ messages in thread

From: Eric Dumazet @ 2010-04-26 14:55 UTC (permalink / raw)
To: hadi
Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger,
    netdev, Andi Kleen

On Monday 26 April 2010 at 16:03 +0200, Eric Dumazet wrote:
> On Saturday 24 April 2010 at 10:10 -0400, jamal wrote:
> > On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:
> >
> > > I've done a setup with the last patch from Changli + net-next - I will
> > > post test results tomorrow AM.
> >
> > ok, annotated results attached.
> >
> > cheers,
> > jamal
>
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()
>

Another interesting finding:

- if all packets are received on a single queue, max speed seems to be
1.200.000 packets per second on my machine :-(

And in the profile of the receiving cpu (RPS enabled, packets sent to 15
other cpus), we can see default_send_IPI_mask_sequence_phys() is the
slow thing...

Andi, what do you think of this one?
Don't we have a function to send an IPI to an individual cpu instead?

void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
    unsigned long query_cpu;
    unsigned long flags;

    /*
     * Hack. The clustered APIC addressing mode doesn't allow us to send
     * to an arbitrary mask, so I do a unicast to each CPU instead.
     * - mbligh
     */
    local_irq_save(flags);
    for_each_cpu(query_cpu, mask) {
        __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
                query_cpu), vector, APIC_DEST_PHYSICAL);
    }
    local_irq_restore(flags);
}

-------------------------------------------------------------------------------
   PerfTop: 1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 7)
-------------------------------------------------------------------------------

   samples  pcnt function                             DSO
   _______ _____ ____________________________________ _______

    668.00 17.7% default_send_IPI_mask_sequence_phys  vmlinux
    363.00  9.6% bnx2x_rx_int                         vmlinux
    354.00  9.4% eth_type_trans                       vmlinux
    332.00  8.8% kmem_cache_alloc_node                vmlinux
    285.00  7.6% __kmalloc_node_track_caller          vmlinux
    278.00  7.4% _raw_spin_lock                       vmlinux
    166.00  4.4% __slab_alloc                         vmlinux
    147.00  3.9% __memset                             vmlinux
    136.00  3.6% list_del                             vmlinux
    132.00  3.5% get_partial_node                     vmlinux
    131.00  3.5% get_rps_cpu                          vmlinux
    102.00  2.7% enqueue_to_backlog                   vmlinux
     95.00  2.5% unmap_single                         vmlinux
     94.00  2.5% __alloc_skb                          vmlinux
     74.00  2.0% vlan_gro_common                      vmlinux
     52.00  1.4% __phys_addr                          vmlinux
     48.00  1.3% dev_gro_receive                      vmlinux
     39.00  1.0% swiotlb_dma_mapping_error            vmlinux
     36.00  1.0% swiotlb_map_page                     vmlinux
     34.00  0.9% skb_put                              vmlinux
     27.00  0.7% is_swiotlb_buffer                    vmlinux
     23.00  0.6% deactivate_slab                      vmlinux
     20.00  0.5% vlan_gro_receive                     vmlinux
     17.00  0.5% __skb_bond_should_drop               vmlinux
     14.00  0.4% netif_receive_skb                    vmlinux
     14.00  0.4% __netdev_alloc_skb                   vmlinux
     12.00  0.3% skb_gro_reset_offset                 vmlinux
     12.00  0.3% get_slab                             vmlinux
     11.00  0.3% napi_skb_finish                      vmlinux

^ permalink raw reply	[flat|nested] 108+ messages in thread
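A common way to keep per-packet IPI cost down - and why a burst of RPS
enqueues need not mean one IPI per packet - is to notify the remote CPU
only on the empty-to-busy transition of its queue. A self-contained sketch
of that edge-triggered pattern follows (illustrative only, not the
kernel's actual RPS code):

/*
 * Edge-triggered notification: signal the consumer only when its queue
 * goes from idle to busy, so a burst of enqueues costs one IPI/wakeup
 * instead of one per packet.
 *
 * Build: cc -std=c11 mitigate.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool consumer_scheduled;

static void send_ipi(int cpu)
{
        printf("IPI -> cpu %d\n", cpu); /* stand-in for the real IPI */
}

static void enqueue_and_notify(int cpu)
{
        /* ... enqueue the packet on cpu's backlog under its lock ... */
        bool expected = false;

        /* Only the enqueue that makes the consumer runnable sends an IPI. */
        if (atomic_compare_exchange_strong(&consumer_scheduled, &expected, true))
                send_ipi(cpu);
}

static void consumer_done(void)
{
        /* The consumer re-arms notification once its queue is drained. */
        atomic_store(&consumer_scheduled, false);
}

int main(void)
{
        enqueue_and_notify(1);  /* first packet of a burst: one IPI */
        enqueue_and_notify(1);  /* rest of the burst: no IPIs */
        enqueue_and_notify(1);
        consumer_done();
        enqueue_and_notify(1);  /* next burst: one IPI again */
        return 0;
}

Each burst then costs one IPI regardless of its length, which is the
mitigation behavior Eric alludes to below.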
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-26 14:55 ` Eric Dumazet
@ 2010-04-26 21:06   ` jamal
  [not found]          ` <20100429174056.GA8044@gargoyle.fritz.box>
  1 sibling, 0 replies; 108+ messages in thread

From: jamal @ 2010-04-26 21:06 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger,
    netdev, Andi Kleen

On Mon, 2010-04-26 at 16:55 +0200, Eric Dumazet wrote:

> Another interesting finding:
>
> - if all packets are received on a single queue, max speed seems to be
> 1.200.000 packets per second on my machine :-(

Well, if it is any consolation, it is not as bad as sky2 hardware ;->
I can't do more than 750Kpps. Also, it seems you use VLANs - max pps
will be lower than without VLANs, probably by 6-70Kpps (which doesn't
explain the 1.2Mpps, of course).

cheers,
jamal

^ permalink raw reply	[flat|nested] 108+ messages in thread
[parent not found: <20100429174056.GA8044@gargoyle.fritz.box>]
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  [not found] ` <20100429174056.GA8044@gargoyle.fritz.box>
@ 2010-04-29 17:56   ` Eric Dumazet
  2010-04-29 18:10     ` OFT - reserving CPU's for networking Stephen Hemminger
  [not found]          ` <20100429182347.GA8512@gargoyle.fritz.box>
  0 siblings, 2 replies; 108+ messages in thread

From: Eric Dumazet @ 2010-04-29 17:56 UTC (permalink / raw)
To: Andi Kleen
Cc: hadi, Changli Gao, David S. Miller, Tom Herbert,
    Stephen Hemminger, netdev, Andi Kleen

On Thursday 29 April 2010 at 19:42 +0200, Andi Kleen wrote:
> > Andi, what do you think of this one?
> > Don't we have a function to send an IPI to an individual cpu instead?
>
> That's what this function already does. You only set a single CPU
> in the target mask, right?
>
> IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> which is a bit faster for this, but that's not available in the lower
> end Nehalems. But even then it's not exactly fast.
>
> I don't think the IPI primitive can be optimized much. It's not a cheap
> operation.
>
> If it's a problem do it less often and batch IPIs.
>
> It's essentially the same problem as interrupt mitigation or NAPI
> are solving for NICs. I guess just need a suitable mitigation mechanism.
>
> Of course that would move more work to the sending CPU again, but
> perhaps there's no alternative. I guess you could make it cheaper it by
> minimizing access to packet data.
>
> -Andi

Well, IPIs are already batched, and the rate is auto-adaptive.

After various changes, it seems things are going better; maybe there is
something related to cache line thrashing.

I 'solved' it by using idle=poll, but you might take a look at
clockevents_notify (acpi_idle_enter_bm) abuse of a shared and highly
contended spinlock...

    23.52%  init  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
            |
            --- _raw_spin_lock_irqsave
               |
               |--94.74%-- clockevents_notify
               |          lapic_timer_state_broadcast
               |          acpi_idle_enter_bm
               |          cpuidle_idle_call
               |          cpu_idle
               |          start_secondary
               |
               |--4.10%-- tick_broadcast_oneshot_control
               |          tick_notify
               |          notifier_call_chain
               |          __raw_notifier_call_chain
               |          raw_notifier_call_chain
               |          clockevents_do_notify
               |          clockevents_notify
               |          lapic_timer_state_broadcast
               |          acpi_idle_enter_bm
               |          cpuidle_idle_call
               |          cpu_idle
               |          start_secondary
               |

^ permalink raw reply	[flat|nested] 108+ messages in thread
* OFT - reserving CPU's for networking 2010-04-29 17:56 ` Eric Dumazet @ 2010-04-29 18:10 ` Stephen Hemminger 2010-04-29 19:19 ` Thomas Gleixner [not found] ` <20100429182347.GA8512@gargoyle.fritz.box> 1 sibling, 1 reply; 108+ messages in thread From: Stephen Hemminger @ 2010-04-29 18:10 UTC (permalink / raw) To: Eric Dumazet, Thomas Gleixner; +Cc: Andi Kleen, netdev, Andi Kleen > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > > Andi, what do you think of this one ? > > > Dont we have a function to send an IPI to an individual cpu instead ? > > > > That's what this function already does. You only set a single CPU > > in the target mask, right? > > > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > > which is a bit faster for this, but that's not available in the lower > > end Nehalems. But even then it's not exactly fast. > > > > I don't think the IPI primitive can be optimized much. It's not a cheap > > operation. > > > > If it's a problem do it less often and batch IPIs. > > > > It's essentially the same problem as interrupt mitigation or NAPI > > are solving for NICs. I guess just need a suitable mitigation mechanism. > > > > Of course that would move more work to the sending CPU again, but > > perhaps there's no alternative. I guess you could make it cheaper it by > > minimizing access to packet data. > > > > -Andi > > Well, IPI are already batched, and rate is auto adaptative. > > After various changes, it seems things are going better, maybe there is > something related to cache line trashing. > > I 'solved' it by using idle=poll, but you might take a look at > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly > contended spinlock... > > > > > 23.52% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave > | > --- _raw_spin_lock_irqsave > | > |--94.74%-- clockevents_notify > | lapic_timer_state_broadcast > | acpi_idle_enter_bm > | cpuidle_idle_call > | cpu_idle > | start_secondary > | > |--4.10%-- tick_broadcast_oneshot_control > | tick_notify > | notifier_call_chain > | __raw_notifier_call_chain > | raw_notifier_call_chain > | clockevents_do_notify > | clockevents_notify > | lapic_timer_state_broadcast > | acpi_idle_enter_bm > | cpuidle_idle_call > | cpu_idle > | start_secondary > | > I keep getting asked about taking some core's away from clock and scheduler to be reserved just for network processing. Seeing this kind of stuff makes me wonder if maybe that isn't a half bad idea. -- ^ permalink raw reply [flat|nested] 108+ messages in thread
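For what it's worth, the closest mechanism that already existed for
"taking cores away" is boot-time isolation plus explicit affinities: the
scheduler stays off the isolated cores, while IRQ affinity and pinned
threads put the networking work there. A sketch (the CPU numbers, IRQ
number, and program name are assumptions):

# kernel command line: keep CPUs 2 and 3 away from the general scheduler
#   isolcpus=2,3

echo 4 > /proc/irq/24/smp_affinity   # NIC IRQ -> CPU 2 (mask 0x4); IRQ 24 hypothetical
taskset -c 3 ./udpsink               # pin the receiving app to CPU 3

As Thomas points out below, this still leaves such cores dependent on
timekeeping, which is the part the "magic offline core" proposals gloss
over.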
* Re: OFT - reserving CPU's for networking 2010-04-29 18:10 ` OFT - reserving CPU's for networking Stephen Hemminger @ 2010-04-29 19:19 ` Thomas Gleixner 2010-04-29 20:02 ` Eric Dumazet 2010-04-30 18:57 ` David Miller 0 siblings, 2 replies; 108+ messages in thread From: Thomas Gleixner @ 2010-04-29 19:19 UTC (permalink / raw) To: Stephen Hemminger Cc: Eric Dumazet, Andi Kleen, netdev, Andi Kleen, Peter Zijlstra [-- Attachment #1: Type: TEXT/PLAIN, Size: 2800 bytes --] On Thu, 29 Apr 2010, Stephen Hemminger wrote: > > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > > > Andi, what do you think of this one ? > > > > Dont we have a function to send an IPI to an individual cpu instead ? > > > > > > That's what this function already does. You only set a single CPU > > > in the target mask, right? > > > > > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > > > which is a bit faster for this, but that's not available in the lower > > > end Nehalems. But even then it's not exactly fast. > > > > > > I don't think the IPI primitive can be optimized much. It's not a cheap > > > operation. > > > > > > If it's a problem do it less often and batch IPIs. > > > > > > It's essentially the same problem as interrupt mitigation or NAPI > > > are solving for NICs. I guess just need a suitable mitigation mechanism. > > > > > > Of course that would move more work to the sending CPU again, but > > > perhaps there's no alternative. I guess you could make it cheaper it by > > > minimizing access to packet data. > > > > > > -Andi > > > > Well, IPI are already batched, and rate is auto adaptative. > > > > After various changes, it seems things are going better, maybe there is > > something related to cache line trashing. > > > > I 'solved' it by using idle=poll, but you might take a look at > > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly > > contended spinlock... Say thanks to Intel/AMD for providing us timers which stop in lower c-states. Not much we can do about the broadcast lock when several cores are going idle and we need to setup a global timer to work around the lapic timer stops in C2/C3 issue. Simply the C-state timer broadcasting does not scale. And it was never meant to scale. It's a workaround for laptops to have functional NOHZ. There are several ways to work around that on larger machines: - Restrict c-states - Disable NOHZ and highres timers - idle=poll is definitely the worst of all possible solutions > I keep getting asked about taking some core's away from clock and scheduler > to be reserved just for network processing. Seeing this kind of stuff > makes me wonder if maybe that isn't a half bad idea. This comes up every few month and we pointed out several times what needs to be done to make this work w/o these weird hacks which put a core offline and then start some magic undebugable binary blob on it. We have not seen anyone working on this, but the "set cores aside and let them do X" idea seems to stick in peoples heads. Seriously, that's not a solution. It's going to be some hacked up nightmare which is completely unmaintainable. Aside of that I seriously doubt that you can do networking w/o time and timers. Thanks, tglx ^ permalink raw reply [flat|nested] 108+ messages in thread
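In concrete terms, the workarounds Thomas lists map to standard kernel
boot parameters:

# Restrict C-states so the LAPIC timer never stops
# (avoids the timer-broadcast lock entirely):
#   processor.max_cstate=1
#
# Disable NOHZ and high-resolution timers:
#   nohz=off highres=off
#
# Poll instead of idling -- works, but burns power:
#   idle=poll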
* Re: OFT - reserving CPU's for networking
  2010-04-29 19:19 ` Thomas Gleixner
@ 2010-04-29 20:02   ` Eric Dumazet
  2010-04-30 18:15     ` Brian Bloniarz
  0 siblings, 1 reply; 108+ messages in thread

From: Eric Dumazet @ 2010-04-29 20:02 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Stephen Hemminger, Andi Kleen, netdev, Andi Kleen, Peter Zijlstra

On Thursday 29 April 2010 at 21:19 +0200, Thomas Gleixner wrote:

> Say thanks to Intel/AMD for providing us timers which stop in lower
> c-states.
>
> Not much we can do about the broadcast lock when several cores are
> going idle and we need to setup a global timer to work around the
> lapic timer stops in C2/C3 issue.
>
> Simply the C-state timer broadcasting does not scale. And it was never
> meant to scale. It's a workaround for laptops to have functional NOHZ.
>
> There are several ways to work around that on larger machines:
>
>  - Restrict c-states
>  - Disable NOHZ and highres timers
>  - idle=poll is definitely the worst of all possible solutions
>
> > I keep getting asked about taking some cores away from clock and scheduler
> > to be reserved just for network processing. Seeing this kind of stuff
> > makes me wonder if maybe that isn't a half bad idea.
>
> This comes up every few months and we pointed out several times what
> needs to be done to make this work w/o these weird hacks which put a
> core offline and then start some magic undebugable binary blob on it.
> We have not seen anyone working on this, but the "set cores aside and
> let them do X" idea seems to stick in peoples heads.
>
> Seriously, that's not a solution. It's going to be some hacked up
> nightmare which is completely unmaintainable.
>
> Aside of that I seriously doubt that you can do networking w/o time
> and timers.
>

Thanks a lot !

Booting with processor.max_cstate=1 solves the problem.

(I already had a CONFIG_NO_HZ=no conf, but highres timers enabled)

Even with a _carefully_ chosen crazy configuration (receiving a packet
on one cpu, then transferring it to another cpu, with a full 16x16
matrix involved), generating 700.000 IPIs per second on the machine
seems fine now.

^ permalink raw reply	[flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-04-29 20:02 ` Eric Dumazet @ 2010-04-30 18:15 ` Brian Bloniarz 0 siblings, 0 replies; 108+ messages in thread From: Brian Bloniarz @ 2010-04-30 18:15 UTC (permalink / raw) To: Eric Dumazet Cc: Thomas Gleixner, Stephen Hemminger, netdev, Andi Kleen, Peter Zijlstra Eric Dumazet wrote: > Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit : > >> Say thanks to Intel/AMD for providing us timers which stop in lower >> c-states. >> >> Not much we can do about the broadcast lock when several cores are >> going idle and we need to setup a global timer to work around the >> lapic timer stops in C2/C3 issue. >> >> Simply the C-state timer broadcasting does not scale. And it was never >> meant to scale. It's a workaround for laptops to have functional NOHZ. >> >> There are several ways to work around that on larger machines: >> >> - Restrict c-states >> - Disable NOHZ and highres timers >> - idle=poll is definitely the worst of all possible solutions >> >>> I keep getting asked about taking some core's away from clock and scheduler >>> to be reserved just for network processing. Seeing this kind of stuff >>> makes me wonder if maybe that isn't a half bad idea. >> This comes up every few month and we pointed out several times what >> needs to be done to make this work w/o these weird hacks which put a >> core offline and then start some magic undebugable binary blob on it. >> We have not seen anyone working on this, but the "set cores aside and >> let them do X" idea seems to stick in peoples heads. >> >> Seriously, that's not a solution. It's going to be some hacked up >> nightmare which is completely unmaintainable. >> >> Aside of that I seriously doubt that you can do networking w/o time >> and timers. >> > > Thanks a lot ! > > booting with processor.max_cstate=1 solves the problem > > (I already had a CONFIG_NO_HZ=no conf, but highres timer enabled) > > Even with _carefuly_ chosen crazy configuration (receiving a packet on a > cpu, then transfert it to another cpu, with a full 16x16 matrix > involved), generating 700.000 IPI per second on the machine seems fine > now. FYI you can also restrict c-states at runtime with PM QoS: Documentation/power/pm_qos_interface.txt On my machine, /sys/devices/system/cpu/cpu0/cpuidle/state2/latency is 205usec, so configuring a PM QoS request for < 205usec latency should prevent it from being entered: #!/usr/bin/python import os; import struct; import signal; latency_rec_usec = 100 f = os.open("/dev/cpu_dma_latency", os.O_WRONLY); os.write(f, struct.pack("=i", latency_rec_usec)); signal.pause(); ^ permalink raw reply [flat|nested] 108+ messages in thread
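For reference, the same PM QoS request can be made from C. A minimal sketch, assuming only the /dev/cpu_dma_latency semantics described in Documentation/power/pm_qos_interface.txt (the request is a 32-bit microsecond value and holds only while the file descriptor stays open; the 100usec figure mirrors Brian's script and is otherwise arbitrary):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int32_t latency_usec = 100;     /* requested C-state exit latency cap */
            int fd = open("/dev/cpu_dma_latency", O_WRONLY);

            if (fd < 0) {
                    perror("open /dev/cpu_dma_latency");
                    return 1;
            }
            if (write(fd, &latency_usec, sizeof(latency_usec)) != sizeof(latency_usec)) {
                    perror("write");
                    return 1;
            }
            pause();        /* keep the request active until the process is killed */
            close(fd);      /* closing the fd drops the QoS request */
            return 0;
    }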
* Re: OFT - reserving CPU's for networking 2010-04-29 19:19 ` Thomas Gleixner 2010-04-29 20:02 ` Eric Dumazet @ 2010-04-30 18:57 ` David Miller 2010-04-30 19:58 ` Thomas Gleixner ` (2 more replies) 1 sibling, 3 replies; 108+ messages in thread From: David Miller @ 2010-04-30 18:57 UTC (permalink / raw) To: tglx; +Cc: shemminger, eric.dumazet, ak, netdev, andi, peterz From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST) > Aside of that I seriously doubt that you can do networking w/o time > and timers. You're right that we need timestamps and the like. But only if we actually process the packets on these restricted cpus :-) If we use RPS and farm out all packets to other cpus, ie. just doing the driver work and the remote cpu dispatch on these "offline" cpus, it is doable. Then we can do cool tricks like having the cpu spin on a mwait() on the network device's status descriptor in memory. In any event I agree with you, it's a cool idea at best, and likely not really practical. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-04-30 18:57 ` David Miller @ 2010-04-30 19:58 ` Thomas Gleixner 2010-04-30 21:01 ` Andi Kleen 2010-05-01 20:31 ` Martin Josefsson 2 siblings, 0 replies; 108+ messages in thread From: Thomas Gleixner @ 2010-04-30 19:58 UTC (permalink / raw) To: David Miller; +Cc: shemminger, eric.dumazet, ak, netdev, andi, peterz Dave, On Fri, 30 Apr 2010, David Miller wrote: > From: Thomas Gleixner <tglx@linutronix.de> > Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST) > > > Aside of that I seriously doubt that you can do networking w/o time > > and timers. > > You're right that we need timestamps and the like. > > But only if we actually process the packets on these restricted cpus :-) > > If we use RPS and farm out all packets to other cpus, ie. just doing > the driver work and the remote cpu dispatch on these "offline" cpus, > it is doable. > > Then we can do cool tricks like having the cpu spin on a mwait() on the > network device's status descriptor in memory. > > In any event I agree with you, it's a cool idea at best, and likely > not really practical. Well, it might be worth experimenting with that once we get the basic infrastructure in place to "isolate" cores under full kernel control. It's not too hard to solve the problems, but it seems nobody has a free time slot to tackle them. Thanks tglx ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-04-30 18:57 ` David Miller 2010-04-30 19:58 ` Thomas Gleixner @ 2010-04-30 21:01 ` Andi Kleen 2010-04-30 22:30 ` David Miller 2 siblings, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-04-30 21:01 UTC (permalink / raw) To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz > Then we can do cool tricks like having the cpu spin on a mwait() on the > network device's status descriptor in memory. When you specify a deep C state in that mwait then it will also have the long wakeup latency in the idle case. When you don't, then you just kill higher Turbo mode on that socket and give away a lot of performance on the other cores. So you have to solve the idle state governor issue anyway, and then you likely don't need it anymore. Besides, it seems to me that dispatching is something the NIC should just do directly. An "RPS only CPU" would be essentially just an interrupt mitigation/flow redirection scheme that a lot of NICs do anyway. > In any event I agree with you, it's a cool idea at best, and likely > not really practical. s/cool// -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-04-30 21:01 ` Andi Kleen @ 2010-04-30 22:30 ` David Miller 2010-05-01 10:53 ` Andi Kleen 0 siblings, 1 reply; 108+ messages in thread From: David Miller @ 2010-04-30 22:30 UTC (permalink / raw) To: andi; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz From: Andi Kleen <andi@firstfloor.org> Date: Fri, 30 Apr 2010 23:01:31 +0200 > Besides it seems to me that dispatching is something the NIC should > just do directly. "RPS only CPU" would be essentially just an > interrupt mitigation/flow redirection scheme that a lot of NICs > do anyways. We've already established that the NIC can't do a complete job in all important cases; that's why we've integrated the RPS/RFS patches in the first place. And we don't want it to, because the decision mechanisms for steering that we're using now are starting to get into the stateful territory and that's verboten for NIC offload as far as we're concerned. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-04-30 22:30 ` David Miller @ 2010-05-01 10:53 ` Andi Kleen 2010-05-01 22:03 ` David Miller 0 siblings, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-01 10:53 UTC (permalink / raw) To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz > And we don't want it to, because the decision mechanisms for steering > that we using now are starting to get into the stateful territory and > that's verbotton for NIC offload as far as we're concerned. Huh? I thought full TCP offload was forbidden?[1] Stateful as in NIC (or someone else like netfilter) tracking flows is quite common and very far from full offload. AFAIK it doesn't have nearly all the problems full offload has. -Andi [1] although it seems to leak in more and more through the RDMA backdoor. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-05-01 10:53 ` Andi Kleen @ 2010-05-01 22:03 ` David Miller 2010-05-01 22:58 ` Andi Kleen 2010-05-01 23:44 ` Ben Hutchings 0 siblings, 2 replies; 108+ messages in thread From: David Miller @ 2010-05-01 22:03 UTC (permalink / raw) To: andi; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz From: Andi Kleen <andi@firstfloor.org> Date: Sat, 1 May 2010 12:53:04 +0200 >> And we don't want it to, because the decision mechanisms for steering >> that we using now are starting to get into the stateful territory and >> that's verbotton for NIC offload as far as we're concerned. > > Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC > (or someone else like netfilter) tracking flows is quite common and very far > from full offload. AFAIK it doesn't have near all the problems full > offload has. We're tracking flow cpu location state at the socket operations, like recvmsg() and sendmsg(), where it belongs. Would you like us to call into the card drivers and firmware at these spots instead? ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-05-01 22:03 ` David Miller @ 2010-05-01 22:58 ` Andi Kleen 2010-05-01 23:29 ` David Miller 2010-05-01 23:44 ` Ben Hutchings 1 sibling, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-01 22:58 UTC (permalink / raw) To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz > We're tracking flow cpu location state at the socket operations, like > recvmsg() and sendmsg(), where it belongs. > > Would you like us to call into the card drivers and firmware at these > spots instead? No, that's not needed for lazy flow tracking like in netfilter or some NICs, it doesn't need exact updates. It just works with seen network packets. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-05-01 22:58 ` Andi Kleen @ 2010-05-01 23:29 ` David Miller 0 siblings, 0 replies; 108+ messages in thread From: David Miller @ 2010-05-01 23:29 UTC (permalink / raw) To: andi; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz From: Andi Kleen <andi@firstfloor.org> Date: Sun, 2 May 2010 00:58:15 +0200 >> We're tracking flow cpu location state at the socket operations, like >> recvmsg() and sendmsg(), where it belongs. >> >> Would you like us to call into the card drivers and firmware at these >> spots instead? > > No, that's not needed for lazy flow tracking like in netfilter or > some NICs, it doesn't need exact updates. It just works with seen network > packets. Well what we need is exact flow updates so that we steer packets to where the applications actually are. Andi, this discussion is going in circles, can I just say "yeah you're right Andi" and this will satisfy your desire to be correct and we can be done with this? Thanks. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-05-01 22:03 ` David Miller 2010-05-01 22:58 ` Andi Kleen @ 2010-05-01 23:44 ` Ben Hutchings 1 sibling, 0 replies; 108+ messages in thread From: Ben Hutchings @ 2010-05-01 23:44 UTC (permalink / raw) To: David Miller; +Cc: andi, tglx, shemminger, eric.dumazet, netdev, peterz On Sat, 2010-05-01 at 15:03 -0700, David Miller wrote: > From: Andi Kleen <andi@firstfloor.org> > Date: Sat, 1 May 2010 12:53:04 +0200 > > >> And we don't want it to, because the decision mechanisms for steering > >> that we using now are starting to get into the stateful territory and > >> that's verbotton for NIC offload as far as we're concerned. > > > > Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC > > (or someone else like netfilter) tracking flows is quite common and very far > > from full offload. AFAIK it doesn't have near all the problems full > > offload has. > > We're tracking flow cpu location state at the socket operations, like > recvmsg() and sendmsg(), where it belongs. > > Would you like us to call into the card drivers and firmware at these > spots instead? I'm interested in experimenting with this at some point, since our hardware supports a fairly large number of filters that could be used for it. Ben. -- Ben Hutchings, Senior Software Engineer, Solarflare Communications Not speaking for my employer; that's the marketing department's job. They asked us to note that Solarflare product names are trademarked. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-04-30 18:57 ` David Miller 2010-04-30 19:58 ` Thomas Gleixner 2010-04-30 21:01 ` Andi Kleen @ 2010-05-01 20:31 ` Martin Josefsson 2010-05-01 22:13 ` David Miller 2 siblings, 1 reply; 108+ messages in thread From: Martin Josefsson @ 2010-05-01 20:31 UTC (permalink / raw) To: David Miller; +Cc: tglx, shemminger, eric.dumazet, ak, netdev, andi, peterz On Fri, 30 Apr 2010, David Miller wrote: > Then we can do cool tricks like having the cpu spin on a mwait() on the > network device's status descriptor in memory. Can you have mwait monitor multiple cachelines for stores? If not then it might be hard to do that when you have multiple nics and you actually need to use the status descriptors, otherwise you could possibly have them all written to the same cacheline. Or if the nic doesn't support updating a status descriptor in memory. If you just want to wake up quickly without using interrupts it might be possible to abuse MSI to wake up without actually using interrupts, set the address to the cacheline that is being monitored. /Martin ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: OFT - reserving CPU's for networking 2010-05-01 20:31 ` Martin Josefsson @ 2010-05-01 22:13 ` David Miller 0 siblings, 0 replies; 108+ messages in thread From: David Miller @ 2010-05-01 22:13 UTC (permalink / raw) To: gandalf; +Cc: tglx, shemminger, eric.dumazet, ak, netdev, andi, peterz From: Martin Josefsson <gandalf@mjufs.se> Date: Sat, 1 May 2010 22:31:05 +0200 (CEST) > On Fri, 30 Apr 2010, David Miller wrote: > >> Then we can do cool tricks like having the cpu spin on a mwait() on >> the >> network device's status descriptor in memory. > > Can you have mwait monitor multiple cachelines for stores? The idea is that if you have hundreds of cpu threads (several of my machines do, and it's not too long before these kinds of boxes will be common) in your machine you can spare one for each NIC. ^ permalink raw reply [flat|nested] 108+ messages in thread
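For context, a minimal sketch of the monitor/mwait polling being discussed, using the kernel's x86 __monitor()/__mwait() helpers. The device-writable status word and the mwait hint of 0 (shallowest state) are assumptions, not anything a driver does today; and as Martin notes, only one address range can be armed at a time, so multiple NICs would have to share the monitored cacheline:

    /* Sketch only: spin in mwait until *status changes from idle_val. */
    static void spin_mwait_on_status(u32 *status, u32 idle_val)
    {
            while (ACCESS_ONCE(*status) == idle_val) {
                    __monitor(status, 0, 0);
                    /* re-check after arming the monitor to avoid a lost wakeup */
                    if (ACCESS_ONCE(*status) != idle_val)
                            break;
                    __mwait(0, 0);  /* eax=0: shallowest C-state hint */
            }
    }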
[parent not found: <20100429182347.GA8512@gargoyle.fritz.box>]
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue [not found] ` <20100429182347.GA8512@gargoyle.fritz.box> @ 2010-04-29 19:12 ` Eric Dumazet [not found] ` <20100429214144.GA10663@gargoyle.fritz.box> 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-04-29 19:12 UTC (permalink / raw) To: Andi Kleen, Andi Kleen Cc: hadi, Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev, Andi Kleen, lenb, arjan Le jeudi 29 avril 2010 à 20:23 +0200, Andi Kleen a écrit : > On Thu, Apr 29, 2010 at 07:56:12PM +0200, Eric Dumazet wrote: > > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > > > Andi, what do you think of this one ? > > > > Dont we have a function to send an IPI to an individual cpu instead ? > > > > > > That's what this function already does. You only set a single CPU > > > in the target mask, right? > > > > > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > > > which is a bit faster for this, but that's not available in the lower > > > end Nehalems. But even then it's not exactly fast. > > > > > > I don't think the IPI primitive can be optimized much. It's not a cheap > > > operation. > > > > > > If it's a problem do it less often and batch IPIs. > > > > > > It's essentially the same problem as interrupt mitigation or NAPI > > > are solving for NICs. I guess just need a suitable mitigation mechanism. > > > > > > Of course that would move more work to the sending CPU again, but > > > perhaps there's no alternative. I guess you could make it cheaper it by > > > minimizing access to packet data. > > > > > > -Andi > > > > Well, IPI are already batched, and rate is auto adaptative. > > > > After various changes, it seems things are going better, maybe there is > > something related to cache line trashing. > > > > I 'solved' it by using idle=poll, but you might take a look at > > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly > > contended spinlock... > > acpi_idle_enter_bm should not be executed on a Nehalem, it's obsolete. > If it does on your system something is wrong. > > Ahh, that triggers a bell. There's one issue that if the remote CPU is in a very > deep idle state it could take a long time to wake it up. Nehalem has deeper > sleep states than earlier CPUs. When this happens the IPI sender will be slow > too I believe. > > Are the target CPUs idle? > Yes, mostly, but about 200.000 wakeups per second I would say... If a cpu in deep state receives an IPI, process a softirq, should it come back to deep state immediately, or should it wait for some milliseconds ? > Perhaps need to feed some information to cpuidle's governour to prevent this problem. > > idle=poll is very drastic, better to limit to C1 > How can I do this ? Thanks ! ^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <20100429214144.GA10663@gargoyle.fritz.box>]
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue [not found] ` <20100429214144.GA10663@gargoyle.fritz.box> @ 2010-04-30 5:25 ` Eric Dumazet 2010-04-30 23:38 ` David Miller 1 sibling, 0 replies; 108+ messages in thread From: Eric Dumazet @ 2010-04-30 5:25 UTC (permalink / raw) To: Andi Kleen Cc: Andi Kleen, hadi, Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev, lenb, arjan Le jeudi 29 avril 2010 à 23:41 +0200, Andi Kleen a écrit : > On Thu, Apr 29, 2010 at 09:12:27PM +0200, Eric Dumazet wrote: > > Yes, mostly, but about 200.000 wakeups per second I would say... > > > > If a cpu in deep state receives an IPI, process a softirq, should it > > come back to deep state immediately, or should it wait for some > > milliseconds ? > > In principle the cpuidle governour should detect this and not put the target into > the slow deep c states. One change that was done recently to fix a similar > problem for disk IO was to take processes that wait for IO into account > (see 69d25870). But it doesn't work for networking. > > Here's a untested patch that might help: tell the cpuidle governour > networking is waiting for IO. This will tell it to not go down the deeply. > > I might have missed some schedule() paths, feel free to add more. > > Actually it's probably too aggressive because it will avoid C states even for > a closed window on the other side which might be hours. Better would > be some heuristic to only do this when you're really expected IO shortly. > > Also does your workload even sleep at all? If not we would need to increase > the iowait counters in recvmsg() itself. > My workload yes, uses blocking recvmsg() calls, but Jamal one uses epoll() so I guess problem is more generic than that. We should have an estimate of the number of wakeups (IO or not...) per second (or sub-second) so that cpuidle can avoid these deep states ? > Anyways might be still worth a try. > > For routing we probably need some other solution though, there are no > schedules there. > > > > > > Perhaps need to feed some information to cpuidle's governour to prevent this problem. > > > > > > idle=poll is very drastic, better to limit to C1 > > > > > > > How can I do this ? > > processor.max_cstate=1 or using /dev/network_latency > (see Documentation/power/pm_qos_interface.txt) > > -Andi > Thanks, I'll play with this today ! > > > commit 810227a7c24ecae2bb4aac320490a7115ac33be8 > Author: Andi Kleen <ak@linux.intel.com> > Date: Thu Apr 29 23:33:18 2010 +0200 > > Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies > > XXX: probably too aggressive, some of these sleeps are not under high load. > > Based on a bug report from Eric Dumazet. > > Signed-off-by: Andi Kleen <ak@linux.intel.com> > > diff --git a/net/core/sock.c b/net/core/sock.c > index c5812bb..c246d6c 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -1402,7 +1402,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) > break; > if (sk->sk_err) > break; > - timeo = schedule_timeout(timeo); > + timeo = io_schedule_timeout(timeo); > } > finish_wait(sk->sk_sleep, &wait); > return timeo; > @@ -1512,7 +1512,7 @@ static void __lock_sock(struct sock *sk) > prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, > TASK_UNINTERRUPTIBLE); > spin_unlock_bh(&sk->sk_lock.slock); > - schedule(); > + io_schedule(); > spin_lock_bh(&sk->sk_lock.slock); > if (!sock_owned_by_user(sk)) > break; > > > > > Thanks ! > > > > ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue [not found] ` <20100429214144.GA10663@gargoyle.fritz.box> 2010-04-30 5:25 ` Eric Dumazet @ 2010-04-30 23:38 ` David Miller 2010-05-01 11:00 ` Andi Kleen 1 sibling, 1 reply; 108+ messages in thread From: David Miller @ 2010-04-30 23:38 UTC (permalink / raw) To: ak Cc: eric.dumazet, andi, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan From: Andi Kleen <ak@gargoyle.fritz.box> Date: Thu, 29 Apr 2010 23:41:44 +0200 > Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies > > XXX: probably too aggressive, some of these sleeps are not under high load. > > Based on a bug report from Eric Dumazet. > > Signed-off-by: Andi Kleen <ak@linux.intel.com> I like this, except that we probably don't want the delayacct_blkio_*() calls these things do. Probably the rest of what these things do should remain in the io_schedule*() functions and the block layer can call its own versions which add in the delayacct_blkio_*() bits. Or, if the delayacct stuff is useful for socket I/O too, then its interface names should have the "blk" stripped from them :-) ^ permalink raw reply [flat|nested] 108+ messages in thread
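For concreteness, a hedged sketch of the split David is suggesting, modeled on io_schedule_timeout() as it looked in kernel/sched.c at the time; blk_io_schedule_timeout() is a hypothetical name for the block-layer wrapper:

    /* Core helper: keep only the nr_iowait/in_iowait accounting that
     * the scheduler and the cpuidle governor look at. */
    long __sched io_schedule_timeout(long timeout)
    {
            struct rq *rq = raw_rq();
            long ret;

            atomic_inc(&rq->nr_iowait);
            current->in_iowait = 1;
            ret = schedule_timeout(timeout);
            current->in_iowait = 0;
            atomic_dec(&rq->nr_iowait);
            return ret;
    }

    /* Hypothetical block-layer wrapper: add the delay accounting back. */
    long __sched blk_io_schedule_timeout(long timeout)
    {
            long ret;

            delayacct_blkio_start();
            ret = io_schedule_timeout(timeout);
            delayacct_blkio_end();
            return ret;
    }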
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-04-30 23:38 ` David Miller @ 2010-05-01 11:00 ` Andi Kleen 2010-05-02 6:56 ` Eric Dumazet 0 siblings, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-01 11:00 UTC (permalink / raw) To: David Miller Cc: eric.dumazet, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote: > From: Andi Kleen <ak@gargoyle.fritz.box> > Date: Thu, 29 Apr 2010 23:41:44 +0200 > > > Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies > > > > XXX: probably too aggressive, some of these sleeps are not under high load. > > > > Based on a bug report from Eric Dumazet. > > > > Signed-off-by: Andi Kleen <ak@linux.intel.com> > > I like this, except that we probably don't want the delayacct_blkio_*() calls > these things do. Yes. It needs more work, please don't apply it yet, to handle the "long sleep" case. Still curious if it fixes Eric's test case. > > Probably the rest of what these things do should remain in the io_schedule*() > functions and the block layer can call it's own versions which add in the > delayacct_blkio_*() bits. Good point. > > Or, if the delacct stuff is useful for socket I/O too, then it's interfaces > names should have the "blk" stripped from them :-) Good question. I suspect it's actually useful for some cases, but just adding sockets might confuse some users. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-01 11:00 ` Andi Kleen @ 2010-05-02 6:56 ` Eric Dumazet 2010-05-02 9:20 ` Andi Kleen 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-05-02 6:56 UTC (permalink / raw) To: Andi Kleen Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan Le samedi 01 mai 2010 à 13:00 +0200, Andi Kleen a écrit : > On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote: > > From: Andi Kleen <ak@gargoyle.fritz.box> > > Date: Thu, 29 Apr 2010 23:41:44 +0200 > > > > > Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies > > > > > > XXX: probably too aggressive, some of these sleeps are not under high load. > > > > > > Based on a bug report from Eric Dumazet. > > > > > > Signed-off-by: Andi Kleen <ak@linux.intel.com> > > > > I like this, except that we probably don't want the delayacct_blkio_*() calls > > these things do. > > Yes. > > It needs more work, please don't apply it yet, to handle the "long sleep" case. > > Still curious if it fixes Eric's test case. > I tried it in the right spot (since my bench was only doing recvmsg() calls, I had to patch wait_for_packet() in net/core/datagram.c): udp_recvmsg -> __skb_recv_datagram -> wait_for_packet -> schedule_timeout Unfortunately, using io_schedule_timeout() did not solve the problem. Tell me if you need some traces or something. Thanks ! diff --git a/net/core/datagram.c b/net/core/datagram.c index 95b851f..051fd5b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -113,7 +113,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) goto interrupted; error = 0; - *timeo_p = schedule_timeout(*timeo_p); + *timeo_p = io_schedule_timeout(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); return error; ^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 6:56 ` Eric Dumazet @ 2010-05-02 9:20 ` Andi Kleen 2010-05-02 10:54 ` Eric Dumazet 0 siblings, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-02 9:20 UTC (permalink / raw) To: Eric Dumazet Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan > I tried it on the right spot (since my bench was only doing recvmsg() > calls, I had to patch wait_for_packet() in net/core/datagram.c > > udp_recvmsg -> __skb_recv_datagram -> wait_for_packet -> > schedule_timeout > > Unfortunatly, using io_schedule_timeout() did not solve the problem. Hmm, too bad. Weird. > > Tell me if you need some traces or something. I'll try to reproduce it and see what I can do. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 9:20 ` Andi Kleen @ 2010-05-02 10:54 ` Eric Dumazet 2010-05-02 14:13 ` Arjan van de Ven 2010-05-02 15:46 ` Andi Kleen 0 siblings, 2 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-02 10:54 UTC (permalink / raw) To: Andi Kleen Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan Le dimanche 02 mai 2010 à 11:20 +0200, Andi Kleen a écrit : > > I tried it on the right spot (since my bench was only doing recvmsg() > > calls, I had to patch wait_for_packet() in net/core/datagram.c > > > > udp_recvmsg -> __skb_recv_datagram -> wait_for_packet -> > > schedule_timeout > > > > Unfortunatly, using io_schedule_timeout() did not solve the problem. > > Hmm, too bad. Weird. > > > > > Tell me if you need some traces or something. > > I'll try to reproduce it and see what I can do. > Here the perf report on the latest test done, I confirm I am using io_schedule_timeout() in this kernel. In this test, all 16 queues of one BCM57711E NIC (1Gb link) delivers packets at about 1.300.000 pps to 16 cpus (one cpu per queue) and these packets are then redistributed by RPS to same 16 cpus, generating about 650.000 IPI per second. top says : Cpu(s): 3.0%us, 17.3%sy, 0.0%ni, 22.4%id, 28.2%wa, 0.0%hi, 29.1%si, 0.0%st # Samples: 321362570767 # # Overhead Command Shared Object Symbol # ........ .............. ............................ ...... # 25.08% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave | --- _raw_spin_lock_irqsave | |--93.47%-- clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--4.70%-- tick_broadcast_oneshot_control | tick_notify | notifier_call_chain | __raw_notifier_call_chain | raw_notifier_call_chain | clockevents_do_notify | clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--0.64%-- generic_exec_single | __smp_call_function_single | net_rps_action_and_irq_enable ... 9.72% init [kernel.kallsyms] [k] acpi_os_read_port | --- acpi_os_read_port | |--99.45%-- acpi_hw_read_port | acpi_hw_read | acpi_hw_read_multiple | acpi_hw_register_read | acpi_read_bit_register | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | --0.55%-- acpi_hw_read acpi_hw_read_multiple powertop says : PowerTOP version 1.11 (C) 2007 Intel Corporation Cn Avg residency P-states (frequencies) C0 (cpu running) (68.9%) 2.93 Ghz 46.5% polling 0.0ms ( 0.0%) 2.80 Ghz 5.1% C1 mwait 0.0ms ( 0.0%) 2.53 Ghz 3.0% C2 mwait 0.0ms (31.1%) 2.13 Ghz 2.8% 1.60 Ghz 38.2% Wakeups-from-idle per second : 45177.8 interval: 5.0s no ACPI power usage estimate available Top causes for wakeups: 9.9% (40863.0) <interrupt> : eth1-fp-7 9.9% (40861.0) <interrupt> : eth1-fp-8 9.9% (40858.0) <interrupt> : eth1-fp-5 9.9% (40855.2) <interrupt> : eth1-fp-10 9.9% (40847.6) <interrupt> : eth1-fp-14 9.9% (40847.2) <interrupt> : eth1-fp-12 9.9% (40835.0) <interrupt> : eth1-fp-1 9.9% (40834.2) <interrupt> : eth1-fp-3 9.9% (40834.0) <interrupt> : eth1-fp-6 9.9% (40829.6) <interrupt> : eth1-fp-4 1.0% (4002.0) <kernel core> : hrtimer_start_range_ns (tick_sched_timer) 0.4% (1725.6) <interrupt> : extra timer interrupt 0.0% ( 4.0) <kernel core> : usb_hcd_poll_rh_status (rh_timer_func) 0.0% ( 2.0) <kernel core> : clocksource_watchdog (clocksource_watchdog) 0.0% ( 2.0) snmpd : hrtimer_start_range_ns (hrtimer_wakeup) ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 10:54 ` Eric Dumazet @ 2010-05-02 14:13 ` Arjan van de Ven 2010-05-02 14:27 ` Eric Dumazet 2010-05-02 15:46 ` Andi Kleen 1 sibling, 1 reply; 108+ messages in thread From: Arjan van de Ven @ 2010-05-02 14:13 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb > > Cn Avg residency P-states (frequencies) > C0 (cpu running) (68.9%) 2.93 Ghz 46.5% > polling 0.0ms ( 0.0%) 2.80 Ghz 5.1% > C1 mwait 0.0ms ( 0.0%) 2.53 Ghz 3.0% > C2 mwait 0.0ms (31.1%) 2.13 Ghz 2.8% > 1.60 Ghz 38.2% I bet your system advertises C2 with the same latency as C1, but with lower power... which means Linux will pretty much never pick C1, no matter whether you take Andi's patch. This is a BIOS thing... and until we put in the patch to override the BIOS values (I can dust it off but it might need a bit of tweaking since it was against .31) Andi's patch alone won't cut it... you also need a non-lying BIOS ;) -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 14:13 ` Arjan van de Ven @ 2010-05-02 14:27 ` Eric Dumazet 2010-05-02 15:32 ` Eric Dumazet 2010-05-02 17:54 ` Arjan van de Ven 0 siblings, 2 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-02 14:27 UTC (permalink / raw) To: Arjan van de Ven Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb Le dimanche 02 mai 2010 à 07:13 -0700, Arjan van de Ven a écrit : > > > > Cn Avg residency P-states (frequencies) > > C0 (cpu running) (68.9%) 2.93 Ghz 46.5% > > polling 0.0ms ( 0.0%) 2.80 Ghz 5.1% > > C1 mwait 0.0ms ( 0.0%) 2.53 Ghz 3.0% > > C2 mwait 0.0ms (31.1%) 2.13 Ghz 2.8% > > 1.60 Ghz 38.2% > > I bet your system advertizes C2 with the same latency as C1, > but with lower power... which means Linux will pretty much never > pick C1.... no matter how much you take Andi's patch. > > this is a bios thing... and until we put in the patch to override the > bios values (I can dust it off but it might need a bit of tweaking > since it was against .31) Andi's patch alone won't cut it... you also > need a non-lying bios ;) > > > # pwd /sys/devices/system/cpu/cpu15/cpuidle # grep . */* state0/desc:CPUIDLE CORE POLL IDLE state0/latency:0 state0/name:C0 state0/power:4294967295 state0/time:0 state0/usage:0 state1/desc:ACPI FFH INTEL MWAIT 0x0 state1/latency:1 state1/name:C1 state1/power:1000 state1/time:433855186 state1/usage:126869 state2/desc:ACPI FFH INTEL MWAIT 0x10 state2/latency:64 state2/name:C2 state2/power:500 state2/time:198095020416 state2/usage:76287744 C2 latency seems to be 64 (us ?), while C1 seems to be 1 BIOS Information Vendor: HP Version: I24 Release Date: 10/01/2009 # powertop PowerTOP 1.11 (C) 2007, 2008 Intel Corporation Collecting data for 5 seconds Your CPU supports the following C-states : C1 C2 C3 Your BIOS reports the following C-states : C1 C2 C3 seems to be disabled in BIOS ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 14:27 ` Eric Dumazet @ 2010-05-02 15:32 ` Eric Dumazet 0 siblings, 0 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-02 15:32 UTC (permalink / raw) To: Arjan van de Ven Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb Le dimanche 02 mai 2010 à 16:27 +0200, Eric Dumazet a écrit : > Le dimanche 02 mai 2010 à 07:13 -0700, Arjan van de Ven a écrit : > > > > > > Cn Avg residency P-states (frequencies) > > > C0 (cpu running) (68.9%) 2.93 Ghz 46.5% > > > polling 0.0ms ( 0.0%) 2.80 Ghz 5.1% > > > C1 mwait 0.0ms ( 0.0%) 2.53 Ghz 3.0% > > > C2 mwait 0.0ms (31.1%) 2.13 Ghz 2.8% > > > 1.60 Ghz 38.2% > > > > I bet your system advertizes C2 with the same latency as C1, > > but with lower power... which means Linux will pretty much never > > pick C1.... no matter how much you take Andi's patch. > > > > this is a bios thing... and until we put in the patch to override the > > bios values (I can dust it off but it might need a bit of tweaking > > since it was against .31) Andi's patch alone won't cut it... you also > > need a non-lying bios ;) > > > > > > > # pwd > /sys/devices/system/cpu/cpu15/cpuidle > # grep . */* > state0/desc:CPUIDLE CORE POLL IDLE > state0/latency:0 > state0/name:C0 > state0/power:4294967295 > state0/time:0 > state0/usage:0 > state1/desc:ACPI FFH INTEL MWAIT 0x0 > state1/latency:1 > state1/name:C1 > state1/power:1000 > state1/time:433855186 > state1/usage:126869 > state2/desc:ACPI FFH INTEL MWAIT 0x10 > state2/latency:64 > state2/name:C2 > state2/power:500 > state2/time:198095020416 > state2/usage:76287744 > > C2 latency seems to be 64 (us ?), while C1 seems to be 1 > > BIOS Information > Vendor: HP > Version: I24 > Release Date: 10/01/2009 > > # powertop > PowerTOP 1.11 (C) 2007, 2008 Intel Corporation > > Collecting data for 5 seconds > > > Your CPU supports the following C-states : C1 C2 C3 > Your BIOS reports the following C-states : C1 C2 > > C3 seems to be disabled in BIOS > I took a look at BIOS settings and enabled the minimum sleep state to be C6 (instead of C3, the default). Now we see C3 being available... No changes, only more IPIs delivered during the test, and more overhead in clockevents_notify() # grep . */* state0/desc:CPUIDLE CORE POLL IDLE state0/latency:0 state0/name:C0 state0/power:4294967295 state0/time:0 state0/usage:0 state1/desc:ACPI FFH INTEL MWAIT 0x0 state1/latency:1 state1/name:C1 state1/power:1000 state1/time:39432 state1/usage:119 state2/desc:ACPI FFH INTEL MWAIT 0x10 state2/latency:64 state2/name:C2 state2/power:500 state2/time:3170745 state2/usage:11177 state3/desc:ACPI FFH INTEL MWAIT 0x20 state3/latency:96 state3/name:C3 state3/power:350 state3/time:1030987453 state3/usage:14047019 --------------------------------------------------------------------------------------------------------------------------- PerfTop: 15984 irqs/sec kernel:98.5% [1000Hz cycles], (all, 16 CPUs) --------------------------------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _______ 23822.00 40.2% _raw_spin_lock_irqsave vmlinux 4413.00 7.4% acpi_os_read_port vmlinux 1426.00 2.4% _raw_spin_lock vmlinux 1284.00 2.2% _raw_spin_unlock_irqrestore vmlinux 1247.00 2.1% schedule vmlinux 1137.00 1.9% bnx2x_rx_int vmlinux 643.00 1.1% tick_broadcast_oneshot_control vmlinux 597.00 1.0% copy_user_generic_string vmlinux 595.00 1.0% __napi_complete vmlinux 550.00 0.9% call_function_single_interrupt vmlinux 548.00 0.9% bnx2x_msix_fp_int vmlinux 486.00 0.8% __netif_receive_skb vmlinux 461.00 0.8% bnx2x_poll vmlinux 433.00 0.7% eth_type_trans vmlinux 428.00 0.7% acpi_idle_enter_bm vmlinux 422.00 0.7% sock_recv_ts_and_drops vmlinux 382.00 0.6% __udp4_lib_lookup vmlinux 369.00 0.6% __slab_free vmlinux 357.00 0.6% ip_route_input vmlinux 341.00 0.6% kfree vmlinux 335.00 0.6% ipt_do_table vmlinux 334.00 0.6% ip_rcv vmlinux 332.00 0.6% udp_recvmsg vmlinux 317.00 0.5% __kmalloc_node_track_caller vmlinux 37.46% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave | --- _raw_spin_lock_irqsave | |--95.58%-- clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--3.27%-- tick_broadcast_oneshot_control | tick_notify | notifier_call_chain | __raw_notifier_call_chain | raw_notifier_call_chain | clockevents_do_notify | clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 14:27 ` Eric Dumazet 2010-05-02 15:32 ` Eric Dumazet @ 2010-05-02 17:54 ` Arjan van de Ven 2010-05-02 19:22 ` Eric Dumazet 2010-05-02 21:30 ` Andi Kleen 1 sibling, 2 replies; 108+ messages in thread From: Arjan van de Ven @ 2010-05-02 17:54 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb On Sun, 02 May 2010 16:27:28 +0200 Eric Dumazet <eric.dumazet@gmail.com> wrote: > C2 latency seems to be 64 (us ?), while C1 seems to be 1 the processor_idle module has a "latency_factor" module parameter. The default is 2, but sometimes people think 6 is a better value... .. any chance you can try that value ? Also, I'm starting to wonder if Andi's patch to use io_schedule() needs to be replaced with a net_schedule() kind of thing. The cpuidle code currently has a weight factor for IO (based on measuring/experiments), and maybe networking really needs another factor... so just having a parallel concept with a different weight could be the right answer for that. > > Your CPU supports the following C-states : C1 C2 C3 > Your BIOS reports the following C-states : C1 C2 > > C3 seems to be disabled in BIOS btw this C2 == marketing name C3, and C3 == marketing name C6 (too many translations ;-) we'll fix powertop to report the marketing name soon. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 17:54 ` Arjan van de Ven @ 2010-05-02 19:22 ` Eric Dumazet 2010-05-02 22:06 ` Andi Kleen 2010-05-03 3:50 ` Arjan van de Ven 2010-05-02 21:30 ` Andi Kleen 1 sibling, 2 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-02 19:22 UTC (permalink / raw) To: Arjan van de Ven Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb Le dimanche 02 mai 2010 à 10:54 -0700, Arjan van de Ven a écrit : > On Sun, 02 May 2010 16:27:28 +0200 > Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > C2 latency seems to be 64 (us ?), while C1 seems to be 1 > > the processor_idle module has a "latency_factor" module parameter. > The default is 2, but sometimes people think 6 is a better value... > .. any chance you can try that value ? > I tried 6 and 20, nothing changed ;( > Also, I'm starting to wonder if Andi's patch to use io_schedule() needs > to be replaced with a net_schedule() kind of thing. The cpuidle code > currently has a weight factor for IO (based on measuring/experiments), > and maybe networking really needs another factor... so just having a > parallel concept with a different weight could be the right answer for > that. > But a task blocked on disk IO is probably blocked for a small amount of time, while on network, it can be for a long time. I am not sure its the right metric. I was expecting something based on recent history. Say if we have 20.000 wakeups per second, most likely we should not enter C2/C3 states... > > we'll fix powertop to report the marketing name soon. > > Ah, I see, thanks :) ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 19:22 ` Eric Dumazet @ 2010-05-02 22:06 ` Andi Kleen 2010-05-03 3:50 ` Arjan van de Ven 1 sibling, 0 replies; 108+ messages in thread From: Andi Kleen @ 2010-05-02 22:06 UTC (permalink / raw) To: Eric Dumazet Cc: Arjan van de Ven, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb > But a task blocked on disk IO is probably blocked for a small amount of > time, while on network, it can be for a long time. I am not sure its the > right metric. I think it needs a dynamic timeout. I agree the reference count as is will not work well for networking. > > I was expecting something based on recent history. > Say if we have 20.000 wakeups per second, most likely we should not > enter C2/C3 states... That's what the menu governour already does, it just doesn't work in some cases :/ -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 19:22 ` Eric Dumazet 2010-05-02 22:06 ` Andi Kleen @ 2010-05-03 3:50 ` Arjan van de Ven 2010-05-03 5:17 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: Arjan van de Ven @ 2010-05-03 3:50 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb > > Also, I'm starting to wonder if Andi's patch to use io_schedule() > > needs to be replaced with a net_schedule() kind of thing. The > > cpuidle code currently has a weight factor for IO (based on > > measuring/experiments), and maybe networking really needs another > > factor... so just having a parallel concept with a different weight > > could be the right answer for that. > > > > But a task blocked on disk IO is probably blocked for a small amount > of time, while on network, it can be for a long time. I am not sure > its the right metric. it's not so much about the duration, as it is about the performance sensitivity.... > I was expecting something based on recent history. > Say if we have 20.000 wakeups per second, most likely we should not > enter C2/C3 states... we effectively do that. The thing is that C2 is so low cost normally that it's still worth it even at 20k wakeups... this is where the bios tells us how "heavy" the states are.... and 64 usec... is just not very much. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 3:50 ` Arjan van de Ven @ 2010-05-03 5:17 ` Eric Dumazet 2010-05-03 10:22 ` Arjan van de Ven 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-05-03 5:17 UTC (permalink / raw) To: Arjan van de Ven Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb Le dimanche 02 mai 2010 à 20:50 -0700, Arjan van de Ven a écrit : > we effectively do that. The thing is that C2 is so low cost normally > that it's still worth it even at 20k wakeups... > > this is where the bios tells us how "heavy" the states are.... > and 64 usec... is just not very much. Maybe it's low cost (apparently it is, since I can reach ~900.000 IPIs on my 16-core machine), but multiply this by 16 or 32 or 64 cpus, and clockevents_notify() cost appears to be a killer: all cpus compete on a single lock. Maybe this notifier could use RCU ? ^ permalink raw reply [flat|nested] 108+ messages in thread
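For reference, the kernel already has an RCU-walked notifier flavor. Simplified from kernel/notifier.c (the real code goes through an __atomic_notifier_call_chain() helper), an atomic notifier chain takes its spinlock only for registration and traverses the chain under rcu_read_lock():

    int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                                   unsigned long val, void *v)
    {
            int ret;

            /* readers never touch the registration spinlock */
            rcu_read_lock();
            ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
            rcu_read_unlock();
            return ret;
    }

Whether that alone would help here is less clear: the contended lock in the clockevents path appears to guard more state than just the chain walk.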
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 5:17 ` Eric Dumazet @ 2010-05-03 10:22 ` Arjan van de Ven 2010-05-03 10:34 ` Andi Kleen 0 siblings, 1 reply; 108+ messages in thread From: Arjan van de Ven @ 2010-05-03 10:22 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb On Mon, 03 May 2010 07:17:14 +0200 Eric Dumazet <eric.dumazet@gmail.com> wrote: > Le dimanche 02 mai 2010 à 20:50 -0700, Arjan van de Ven a écrit : > > > we effectively do that. The thing is that C2 is so low cost normally > > that it's still worth it even at 20k wakeups... > > > > this is where the bios tells us how "heavy" the states are.... > > and 64 usec... is just not very much. > > Maybe its low cost, (apparently, it is, since I can reach ~900.000 > ipis on my 16 cores machine) but multiply this by 16 or 32 or 64 > cpus, and clockevents_notify() cost appears to be a killer, all cpus > compete on a single lock. > > Maybe this notifier could use RCU ? could this be an artifact of the local apic stopping in deeper C states? (which is finally fixed in the Westmere generation) -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 10:22 ` Arjan van de Ven @ 2010-05-03 10:34 ` Andi Kleen 2010-05-03 14:09 ` Arjan van de Ven 0 siblings, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-03 10:34 UTC (permalink / raw) To: Arjan van de Ven Cc: Eric Dumazet, Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb > > Maybe its low cost, (apparently, it is, since I can reach ~900.000 > > ipis on my 16 cores machine) but multiply this by 16 or 32 or 64 > > cpus, and clockevents_notify() cost appears to be a killer, all cpus > > compete on a single lock. > > > > Maybe this notifier could use RCU ? > > could this be an artifact of the local apic stopping in deeper C states? > (which is finally fixed in the Westmere generation) Yes it is I think. But I suspect Eric wants a solution for Nehalem. -Andi -- ak@linux.intel.com -- Speaking for myself only. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 10:34 ` Andi Kleen @ 2010-05-03 14:09 ` Arjan van de Ven 2010-05-03 14:45 ` Brian Bloniarz 2010-05-03 15:52 ` Andi Kleen 0 siblings, 2 replies; 108+ messages in thread From: Arjan van de Ven @ 2010-05-03 14:09 UTC (permalink / raw) To: Andi Kleen Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb On Mon, 3 May 2010 12:34:26 +0200 Andi Kleen <andi@firstfloor.org> wrote: > > > Maybe its low cost, (apparently, it is, since I can reach ~900.000 > > > ipis on my 16 cores machine) but multiply this by 16 or 32 or 64 > > > cpus, and clockevents_notify() cost appears to be a killer, all > > > cpus compete on a single lock. > > > > > > Maybe this notifier could use RCU ? > > > > could this be an artifact of the local apic stopping in deeper C > > states? (which is finally fixed in the Westmere generation) > > Yes it is I think. > > But I suspect Eric wants a solution for Nehalem. sure ;-) so the hard problem is that on going idle, the local timers need to be funneled to the external HPET. Afaik right now we use one channel of the hpet, with the result that we have one global lock for this. HPETs have more than one channel (2 or 3 historically, newer chipsets iirc have a few more), so in principle we can split this lock at least a little bit... if we can get to one hpet channel per level 3 cache domain we'd already make huge progress in terms of cost of the contention.... -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 14:09 ` Arjan van de Ven @ 2010-05-03 14:45 ` Brian Bloniarz 2010-05-04 1:10 ` Arjan van de Ven 2010-05-03 15:52 ` Andi Kleen 1 sibling, 1 reply; 108+ messages in thread From: Brian Bloniarz @ 2010-05-03 14:45 UTC (permalink / raw) To: Arjan van de Ven Cc: Andi Kleen, Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb Arjan van de Ven wrote: > On Mon, 3 May 2010 12:34:26 +0200 > Andi Kleen <andi@firstfloor.org> wrote: > >>>> Maybe its low cost, (apparently, it is, since I can reach ~900.000 >>>> ipis on my 16 cores machine) but multiply this by 16 or 32 or 64 >>>> cpus, and clockevents_notify() cost appears to be a killer, all >>>> cpus compete on a single lock. >>>> >>>> Maybe this notifier could use RCU ? >>> could this be an artifact of the local apic stopping in deeper C >>> states? (which is finally fixed in the Westmere generation) >> Yes it is I think. >> >> But I suspect Eric wants a solution for Nehalem. > > sure ;-) > > > so the hard problem is that on going idle, the local timers need to be > funneled to the external HPET. Afaik right now we use one channel of > the hpet, with the result that we have one global lock for this. Does the HPET only need to be programmed when going idle? That could mean that this isn't a big performance issue. Who cares if you spin for a while when you're about to sleep for at least 60usec? > HPETs have more than one channel (2 or 3 historically, newer chipsets > iirc have a few more), so in principle we can split this lock at least > a little bit... if we can get to one hpet channel per level 3 cache > domain we'd already make huge progress in terms of cost of the > contention.... Another possible approach: if a core needs the HPET and finds it locked, it could queue up its request to a backlog which the locking core will service. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 14:45 ` Brian Bloniarz @ 2010-05-04 1:10 ` Arjan van de Ven 0 siblings, 0 replies; 108+ messages in thread From: Arjan van de Ven @ 2010-05-04 1:10 UTC (permalink / raw) To: Brian Bloniarz Cc: Andi Kleen, Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb On Mon, 03 May 2010 10:45:07 -0400 Brian Bloniarz <bmb@athenacr > > so the hard problem is that on going idle, the local timers need to > > be funneled to the external HPET. Afaik right now we use one > > channel of the hpet, with the result that we have one global lock > > for this. > > Does the HPET only need to be programmed when going idle? correct; when going idle the per logical CPU timer value needs to be put in the global HPET (assuming 1 channel is in use). This "global" is where the lock comes in. > That could mean that this isn't a big performance issue. > cares if you spin for a while when you're about to sleep for > at least 60usec? depends on how long the sleep is ;-) -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 14:09 ` Arjan van de Ven 2010-05-03 14:45 ` Brian Bloniarz @ 2010-05-03 15:52 ` Andi Kleen 2010-05-04 1:11 ` Arjan van de Ven 1 sibling, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-03 15:52 UTC (permalink / raw) To: Arjan van de Ven Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb > so the hard problem is that on going idle, the local timers need to be > funneled to the external HPET. Afaik right now we use one channel of > the hpet, with the result that we have one global lock for this. > > HPETs have more than one channel (2 or 3 historically, newer chipsets > iirc have a few more), so in principle we can split this lock at least > a little bit... if we can get to one hpet channel per level 3 cache > domain we'd already make huge progress in terms of cost of the > contention.... I suggested the same thing a few emails up @) (great minds think alike etc.etc. @) . I'm not sure how difficult it would be to implement though. Potential issues: Some user applications use the hpet channels directly through the character device interface so there would be a potential compatibility issue (but maybe that should be just moved to be emulated with a hrtimer ?) And if multiple broadcast controllers are elected this might make it harder to become idle. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-03 15:52 ` Andi Kleen @ 2010-05-04 1:11 ` Arjan van de Ven 0 siblings, 0 replies; 108+ messages in thread From: Arjan van de Ven @ 2010-05-04 1:11 UTC (permalink / raw) To: Andi Kleen Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb On Mon, 3 May 2010 17:52:04 +0200 Andi Kleen <andi@firstfloor.org> wrote: > > HPETs have more than one channel (2 or 3 historically, newer > > chipsets iirc have a few more), so in principle we can split this > > lock at least a little bit... if we can get to one hpet channel per > > level 3 cache domain we'd already make huge progress in terms of > > cost of the contention.... > > I suggested the same thing a few emails up @) (great minds think > alike etc.etc. @) . > > I'm not sure how difficult it would be to implement though. the hardest part will be cases where the SMM code borrows higher HPET channels or something.. not sure if they do, but.. color me a bit afraid we'll find cases. > > Potential issues: > > Some user applications use the hpet channels directly through > the character device interface so there would be a potential > compatibility issue (but maybe that should be just moved > to be emulated with a hrtimer ?) we can and should just emulate this. Same for the rtc device I suspect. > And if multiple broadcast controllers are elected this might > make it harder to become idle. not quite, as long as you do a directed broadcast. As long as there's a predictable mapping for which cores group to which hpet channel.. won't be that bad since you only need to wake up your own local subset. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 17:54 ` Arjan van de Ven 2010-05-02 19:22 ` Eric Dumazet @ 2010-05-02 21:30 ` Andi Kleen 1 sibling, 0 replies; 108+ messages in thread From: Andi Kleen @ 2010-05-02 21:30 UTC (permalink / raw) To: Arjan van de Ven Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb On Sun, May 02, 2010 at 10:54:18AM -0700, Arjan van de Ven wrote: > On Sun, 02 May 2010 16:27:28 +0200 > Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > C2 latency seems to be 64 (us ?), while C1 seems to be 1 > > the processor_idle module has a "latency_factor" module parameter. > The default is 2, but sometimes people think 6 is a better value... > .. any chance you can try that value ? > > Also, I'm starting to wonder if Andi's patch to use io_schedule() needs > to be replaced with a net_schedule() kind of thing. The cpuidle code > currently has a weight factor for IO (based on measuring/experiments), > and maybe networking really needs another factor... so just having a > parallel concept with a different weight could be the right answer for > that. We definitely need a net_schedule() for other reasons too: to avoid the blkio wait code and then also because networking needs a short "fast idle" timeout because the delays are not bounded. Otherwise a sender that suddenly stops sending could break all your power saving. I think the reference count used in io_schedule is not the right model for this; it probably needs a per-cpu timeout ("be fast until this time"). Possibly a dynamic one fed by the measured input rate. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
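A minimal sketch of that per-cpu deadline idea (all names are hypothetical; this is not an existing kernel API):

    #include <linux/percpu.h>
    #include <linux/smp.h>
    #include <linux/types.h>

    static DEFINE_PER_CPU(u64, fast_idle_until);    /* ns; 0 = no request */

    /* rx path: ask this cpu to stay out of deep C-states for "window" ns */
    static void net_fast_idle_hint(u64 now, u64 window)
    {
            per_cpu(fast_idle_until, smp_processor_id()) = now + window;
    }

    /* consulted by the idle governor before it picks a deep state */
    static bool fast_idle_requested(u64 now)
    {
            return now < per_cpu(fast_idle_until, smp_processor_id());
    }

The window itself could then be adapted from the measured packet rate, which is the dynamic part Andi mentions.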
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 10:54 ` Eric Dumazet 2010-05-02 14:13 ` Arjan van de Ven @ 2010-05-02 15:46 ` Andi Kleen 2010-05-02 16:35 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-02 15:46 UTC (permalink / raw) To: Eric Dumazet Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan > In this test, all 16 queues of one BCM57711E NIC (1Gb link) delivers > packets at about 1.300.000 pps to 16 cpus (one cpu per queue) and these > packets are then redistributed by RPS to same 16 cpus, generating about > 650.000 IPI per second. BTW if rps was SMT aware it could avoid a lot of the IPIs in the first place. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 15:46 ` Andi Kleen @ 2010-05-02 16:35 ` Eric Dumazet 2010-05-02 17:43 ` Arjan van de Ven 2010-05-02 21:25 ` Andi Kleen 0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 16:35 UTC (permalink / raw)
To: Andi Kleen
Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 17:46 +0200, Andi Kleen a écrit :
> > In this test, all 16 queues of one BCM57711E NIC (1Gb link) delivers
> > packets at about 1.300.000 pps to 16 cpus (one cpu per queue) and these
> > packets are then redistributed by RPS to same 16 cpus, generating about
> > 650.000 IPI per second.
>
> BTW if rps was SMT aware it could avoid a lot of the IPIs in the first place.

RPS does what you want; just stick in a good cpumask, not an unaware one :)

In my test, I specifically do something 'stupid' like:

echo fffe >/sys/class/net/bond0.2240/queues/rx-0/rps_cpus
echo fffd >/sys/class/net/bond0.2240/queues/rx-1/rps_cpus
echo fffb >/sys/class/net/bond0.2240/queues/rx-2/rps_cpus
echo fff7 >/sys/class/net/bond0.2240/queues/rx-3/rps_cpus
echo ffef >/sys/class/net/bond0.2240/queues/rx-4/rps_cpus
echo ffdf >/sys/class/net/bond0.2240/queues/rx-5/rps_cpus
echo ffbf >/sys/class/net/bond0.2240/queues/rx-6/rps_cpus
echo ff7f >/sys/class/net/bond0.2240/queues/rx-7/rps_cpus
echo feff >/sys/class/net/bond0.2240/queues/rx-8/rps_cpus
echo fdff >/sys/class/net/bond0.2240/queues/rx-9/rps_cpus
echo fbff >/sys/class/net/bond0.2240/queues/rx-10/rps_cpus
echo f7ff >/sys/class/net/bond0.2240/queues/rx-11/rps_cpus
echo efff >/sys/class/net/bond0.2240/queues/rx-12/rps_cpus
echo dfff >/sys/class/net/bond0.2240/queues/rx-13/rps_cpus
echo bfff >/sys/class/net/bond0.2240/queues/rx-14/rps_cpus
echo 7fff >/sys/class/net/bond0.2240/queues/rx-15/rps_cpus

echo 0001 >/proc/irq/*/eth1-fp-0/../smp_affinity
echo 0002 >/proc/irq/*/eth1-fp-1/../smp_affinity
echo 0004 >/proc/irq/*/eth1-fp-2/../smp_affinity
echo 0008 >/proc/irq/*/eth1-fp-3/../smp_affinity
echo 0010 >/proc/irq/*/eth1-fp-4/../smp_affinity
echo 0020 >/proc/irq/*/eth1-fp-5/../smp_affinity
echo 0040 >/proc/irq/*/eth1-fp-6/../smp_affinity
echo 0080 >/proc/irq/*/eth1-fp-7/../smp_affinity
echo 0100 >/proc/irq/*/eth1-fp-8/../smp_affinity
echo 0200 >/proc/irq/*/eth1-fp-9/../smp_affinity
echo 0400 >/proc/irq/*/eth1-fp-10/../smp_affinity
echo 0800 >/proc/irq/*/eth1-fp-11/../smp_affinity
echo 1000 >/proc/irq/*/eth1-fp-12/../smp_affinity
echo 2000 >/proc/irq/*/eth1-fp-13/../smp_affinity
echo 4000 >/proc/irq/*/eth1-fp-14/../smp_affinity
echo 8000 >/proc/irq/*/eth1-fp-15/../smp_affinity

You mean we can wake up a thread with something other than an IPI?

^ permalink raw reply [flat|nested] 108+ messages in thread
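The pattern in those 32 echo lines is mechanical: queue N's IRQ is pinned to cpu N, and queue N's rps_cpus mask is all 16 cpus except cpu N. A throwaway user-space generator (illustrative only, not an existing tool) makes the intent explicit:

#include <stdio.h>

int main(void)
{
	unsigned int q, nr_cpus = 16;

	for (q = 0; q < nr_cpus; q++) {
		unsigned int irq_cpu = 1u << q;        /* eth1-fp-N -> cpu N */
		unsigned int rps = 0xffffu & ~irq_cpu; /* everyone but cpu N */

		printf("rx-%u: rps_cpus=%04x irq smp_affinity=%04x\n",
		       q, rps, irq_cpu);
	}
	return 0;
}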
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 16:35 ` Eric Dumazet @ 2010-05-02 17:43 ` Arjan van de Ven 2010-05-02 17:47 ` Eric Dumazet 2010-05-02 21:25 ` Andi Kleen 1 sibling, 1 reply; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-02 17:43 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb

On Sun, 02 May 2010 18:35:31 +0200 Eric Dumazet <eric.dumazet@gmail.com> wrote:

> You mean we can wake up a thread with something other than an IPI?

Actually we can.

mwait is not only "go idle", it is "go idle until someone writes to
<THIS> cacheline", where <THIS> is set up with a "monitor" instruction.
We don't need to send an IPI per se.. all we need is to write to the
right cacheline that we're monitoring.

--
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

^ permalink raw reply [flat|nested] 108+ messages in thread
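A minimal sketch of the monitor/mwait wakeup Arjan describes. The __monitor()/__mwait() helpers match the x86 ones in arch/x86/include/asm/processor.h, but the wait_for_kick()/kick_cpu() wrappers are hypothetical, and a real implementation would also have to deal with interrupts, C-state hints and false sharing on the monitored line:

#include <asm/processor.h>
#include <linux/compiler.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, wakeup_flag);

/* waiter: arm the monitor, re-check to close the race, then doze */
static void wait_for_kick(void)
{
	unsigned long *flag = &__get_cpu_var(wakeup_flag);

	while (!ACCESS_ONCE(*flag)) {
		__monitor(flag, 0, 0);	/* watch this cacheline */
		if (ACCESS_ONCE(*flag))
			break;
		__mwait(0, 0);		/* sleep until the line is written */
	}
	ACCESS_ONCE(*flag) = 0;
}

/* kicker: a plain store to the monitored line wakes the target cpu */
static void kick_cpu(int cpu)
{
	per_cpu(wakeup_flag, cpu) = 1;
}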
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 17:43 ` Arjan van de Ven @ 2010-05-02 17:47 ` Eric Dumazet 0 siblings, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 17:47 UTC (permalink / raw)
To: Arjan van de Ven
Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb

Le dimanche 02 mai 2010 à 10:43 -0700, Arjan van de Ven a écrit :
> On Sun, 02 May 2010 18:35:31 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > You mean we can wake up a thread with something other than an IPI?
>
> Actually we can.
>
> mwait is not only "go idle", it is "go idle until someone writes to
> <THIS> cacheline", where <THIS> is set up with a "monitor" instruction.
> We don't need to send an IPI per se.. all we need is to write to the
> right cacheline that we're monitoring.

That's a bit x86-specific, isn't it?

But we want to eventually send a 'signal' to a cpu, even if it is not
blocked in idle, so that it can run the following action:

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	__napi_schedule(&sd->backlog);
	__get_cpu_var(netdev_rx_stat).received_rps++;
}

And it also should be portable ;)

If something other than an IPI is available, please let us know!

Thanks

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 16:35 ` Eric Dumazet 2010-05-02 17:43 ` Arjan van de Ven @ 2010-05-02 21:25 ` Andi Kleen 2010-05-02 21:45 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-02 21:25 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

> You mean we can wake up a thread with something other than an IPI?

It's pointless to send an IPI to your thread sibling for this.
Everything it could do you can do yourself too with the same performance.

-Andi

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 21:25 ` Andi Kleen @ 2010-05-02 21:45 ` Eric Dumazet 2010-05-02 21:54 ` Andi Kleen 0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 21:45 UTC (permalink / raw)
To: Andi Kleen
Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 23:25 +0200, Andi Kleen a écrit :

> It's pointless to send an IPI to your thread sibling for this.
> Everything it could do you can do yourself too with the same performance.
>
> -Andi

Amen.

Tests just prove the reverse.

I have some colleagues who disable HyperThreading for the exact same
reasons. I wonder why Intel designed HT. Must be marketing, I guess.

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 21:45 ` Eric Dumazet @ 2010-05-02 21:54 ` Andi Kleen 2010-05-02 22:08 ` Eric Dumazet 0 siblings, 1 reply; 108+ messages in thread From: Andi Kleen @ 2010-05-02 21:54 UTC (permalink / raw) To: Eric Dumazet Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan On Sun, May 02, 2010 at 11:45:55PM +0200, Eric Dumazet wrote: > Le dimanche 02 mai 2010 à 23:25 +0200, Andi Kleen a écrit : > > > It's pointless to send an IPI to your thread sibling for this. > > Everything it could do you can do yourself too with the same performance. > > > > -Andi > > Amen That is in terms of cache locality. > > Tests just prove the reverse. What do you mean? > > I have some collegues that disable HyperThreading for exact same > reasons. I wonder why Intel designed HT. Should be marketing I guess. HT (especially Nehalem HT) is useful for a wide range of workloads. Just handling network interrupts for its thread sibling is not one of them. -Andi ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 21:54 ` Andi Kleen @ 2010-05-02 22:08 ` Eric Dumazet 2010-05-03 20:15 ` jamal 0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 22:08 UTC (permalink / raw)
To: Andi Kleen
Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 23:54 +0200, Andi Kleen a écrit :
> On Sun, May 02, 2010 at 11:45:55PM +0200, Eric Dumazet wrote:
> > Tests just prove the reverse.
>
> What do you mean?

A test I did this week with Jamal.

We first set an "ee" rps mask, because all NIC interrupts were handled
by CPU0, and Jamal thought, like you, that not using cpu4 would give
better performance.

But using the "fe" mask gave me a bonus, from ~700.000 pps to ~800.000 pps.

CPU: E5450 @ 3.00GHz
Two quad-core cpus in the machine, tg3 NIC.

With RPS, CPU0 does not do a lot of things: it just talks with the NIC,
brings a few cache lines per packet and dispatches it to a slave cpu.

> HT (especially Nehalem HT) is useful for a wide range of workloads.
> Just handling network interrupts for its thread sibling is not one of them.

That's the theory; in practice I see different results.

Of course, this might be related to hash distribution being different
and more uniform. I should redo the test with many more flows.

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-05-02 22:08 ` Eric Dumazet @ 2010-05-03 20:15 ` jamal 0 siblings, 0 replies; 108+ messages in thread
From: jamal @ 2010-05-03 20:15 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andi Kleen, David Miller, xiaosuo, therbert, shemminger, netdev, lenb, arjan

On Mon, 2010-05-03 at 00:08 +0200, Eric Dumazet wrote:
>
> A test I did this week with Jamal.
>
> We first set an "ee" rps mask, because all NIC interrupts were handled
> by CPU0, and Jamal thought, like you, that not using cpu4 would give
> better performance.
>
> But using the "fe" mask gave me a bonus, from ~700.000 pps to ~800.000 pps.

I am seeing the opposite with my machine (Nehalem): with ee I get 99.4%
and with fe I get 94.2%, whereas non-rps is about 98.1%.

cheers,
jamal

PS: sorry, I don't have time to collect a lot more data - tomorrow I
could do more.

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-04-26 14:03 ` Eric Dumazet 2010-04-26 14:55 ` Eric Dumazet @ 2010-04-26 21:03 ` jamal 1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-04-26 21:03 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

On Mon, 2010-04-26 at 16:03 +0200, Eric Dumazet wrote:
>
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()

Yikes. Thanks Eric - I should've been able to figure that one out. But
why is this thing expensive? I will run the test tomorrow and see if I
see the same thing.

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-04-23 8:12 [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Changli Gao 2010-04-23 9:27 ` Eric Dumazet @ 2010-04-23 10:26 ` Eric Dumazet 2010-04-27 22:08 ` David Miller 1 sibling, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-04-23 10:26 UTC (permalink / raw) To: Changli Gao Cc: David S. Miller, jamal, Tom Herbert, Stephen Hemminger, netdev Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit : > batch skb dequeueing from softnet input_pkt_queue. > > batch skb dequeueing from softnet input_pkt_queue to reduce potential lock > contention when RPS is enabled. > > Note: in the worst case, the number of packets in a softnet_data may be double > of netdev_max_backlog. > > Signed-off-by: Changli Gao <xiaosuo@gmail.com> > ---- Oops, reading it again, I found process_backlog() was still taking the lock twice, if only one packet is waiting in input_pkt_queue. Possible fix, on top of your patch : diff --git a/net/core/dev.c b/net/core/dev.c index 0eddd23..0569be7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3296,8 +3296,9 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (1) { + while (work < quota) { struct sk_buff *skb; + unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -3308,13 +3309,15 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue)); - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - if (skb_queue_empty(&sd->process_queue)) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen) { + input_queue_head_add(sd, qlen); + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + } + if (qlen < quota - work) { __napi_complete(napi); - rps_unlock(sd); - break; + quota = work + qlen; } rps_unlock(sd); } ^ permalink raw reply related [flat|nested] 108+ messages in thread
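For readers following along, combining Changli's batching with this fixup yields a loop shaped like the sketch below - a simplified paraphrase of the two diffs (CONFIG_RPS guards and stats elided), not the verbatim merged function:

static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data,
					       backlog);
	int work = 0;

	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* drain the private queue with no lock at all */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			if (++work >= quota)
				return work;
			local_irq_disable();
		}

		/* one locked splice moves a whole batch of skbs */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen) {
			input_queue_head_add(sd, qlen);
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
		}
		if (qlen < quota - work) {	/* we will drain everything */
			__napi_complete(napi);
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();
	return work;
}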
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue 2010-04-23 10:26 ` Eric Dumazet @ 2010-04-27 22:08 ` David Miller 2010-04-27 22:18 ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet 0 siblings, 1 reply; 108+ messages in thread From: David Miller @ 2010-04-27 22:08 UTC (permalink / raw) To: eric.dumazet; +Cc: xiaosuo, hadi, therbert, shemminger, netdev From: Eric Dumazet <eric.dumazet@gmail.com> Date: Fri, 23 Apr 2010 12:26:06 +0200 > Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit : >> batch skb dequeueing from softnet input_pkt_queue. >> >> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock >> contention when RPS is enabled. >> >> Note: in the worst case, the number of packets in a softnet_data may be double >> of netdev_max_backlog. >> >> Signed-off-by: Changli Gao <xiaosuo@gmail.com> >> ---- > > Oops, reading it again, I found process_backlog() was still taking the > lock twice, if only one packet is waiting in input_pkt_queue. > > Possible fix, on top of your patch : I've applied Changli's patch with this fixup added to it. If there are any follow-on changes necessary after further analysis, please send patches on top of this work. Thanks. ^ permalink raw reply [flat|nested] 108+ messages in thread
* [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-27 22:08 ` David Miller @ 2010-04-27 22:18 ` Eric Dumazet 2010-04-27 22:19 ` David Miller 2010-04-28 11:33 ` jamal 0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-27 22:18 UTC (permalink / raw)
To: David Miller
Cc: xiaosuo, hadi, therbert, shemminger, netdev, Eilon Greenstein

Le mardi 27 avril 2010 à 15:08 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 23 Apr 2010 12:26:06 +0200
>
> > Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> >> batch skb dequeueing from softnet input_pkt_queue.
> >>
> >> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> >> contention when RPS is enabled.
> >>
> >> Note: in the worst case, the number of packets in a softnet_data may be double
> >> of netdev_max_backlog.
> >>
> >> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> >> ----
> >
> > Oops, reading it again, I found process_backlog() was still taking the
> > lock twice, if only one packet is waiting in input_pkt_queue.
> >
> > Possible fix, on top of your patch :
>
> I've applied Changli's patch with this fixup added to it.
>
> If there are any follow-on changes necessary after further analysis,
> please send patches on top of this work.

Thanks David, I was about to resubmit the cumulative patch ;)

On my 'old' dev machine (two quad core), RPS is able to get a 300%
increase on a udpsink test with 20 flows. I have yet to run
routing/firewalling tests as well.

I also noticed the bnx2x driver has some strange prefetch() calls.

[PATCH net-next-2.6] bnx2x: Remove two prefetch()

1) Even on 64bit arches, sizeof(struct sk_buff) < 256
2) No need to prefetch same pointer twice.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
---
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 613f727..f706ed1 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1617,7 +1617,6 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 			rx_buf = &fp->rx_buf_ring[bd_cons];
 			skb = rx_buf->skb;
 			prefetch(skb);
-			prefetch((u8 *)skb + 256);
 			len = le16_to_cpu(cqe->fast_path_cqe.pkt_len);
 			pad = cqe->fast_path_cqe.placement_offset;
@@ -1668,7 +1667,6 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 					dma_unmap_addr(rx_buf, mapping),
 					pad + RX_COPY_THRESH,
 					DMA_FROM_DEVICE);
-				prefetch(skb);
 				prefetch(((char *)(skb)) + 128);
 			/* is this an error packet? */

^ permalink raw reply related [flat|nested] 108+ messages in thread
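An aside on point 1: the dropped prefetch((u8 *)skb + 256) could only have helped if struct sk_buff reached past 256 bytes, which it does not. If one wanted to pin that size assumption down, a compile-time assertion is the idiomatic tool; this guard is illustrative, not part of the applied patch:

#include <linux/kernel.h>
#include <linux/skbuff.h>

/* Fail the build if struct sk_buff ever grows to 256 bytes or more;
 * the cacheline at skb + 256 would then be inside the struct again
 * and a second prefetch might be worth re-evaluating. */
static inline void skb_size_assumption_check(void)
{
	BUILD_BUG_ON(sizeof(struct sk_buff) >= 256);
}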
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-27 22:18 ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet @ 2010-04-27 22:19 ` David Miller 2010-04-28 13:14 ` Eilon Greenstein 2010-04-28 11:33 ` jamal 1 sibling, 1 reply; 108+ messages in thread From: David Miller @ 2010-04-27 22:19 UTC (permalink / raw) To: eric.dumazet; +Cc: xiaosuo, hadi, therbert, shemminger, netdev, eilong From: Eric Dumazet <eric.dumazet@gmail.com> Date: Wed, 28 Apr 2010 00:18:13 +0200 > [PATCH net-next-2.6] bnx2x: Remove two prefetch() > > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256 > 2) No need to prefetch same pointer twice. > > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> > CC: Eilon Greenstein <eilong@broadcom.com> Eilon please review and ACK/NACK Thanks. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-27 22:19 ` David Miller @ 2010-04-28 13:14 ` Eilon Greenstein 2010-04-28 15:44 ` Eliezer Tamir ` (2 more replies) 0 siblings, 3 replies; 108+ messages in thread
From: Eilon Greenstein @ 2010-04-28 13:14 UTC (permalink / raw)
To: David Miller
Cc: vladz, eliezert, eric.dumazet@gmail.com, xiaosuo@gmail.com, hadi@cyberus.ca, therbert@google.com, shemminger@vyatta.com, netdev@vger.kernel.org

On Tue, 2010-04-27 at 15:19 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 28 Apr 2010 00:18:13 +0200
>
> > [PATCH net-next-2.6] bnx2x: Remove two prefetch()
> >
> > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256
> > 2) No need to prefetch same pointer twice.
> >
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > CC: Eilon Greenstein <eilong@broadcom.com>
>
> Eilon please review and ACK/NACK

Vlad ran a few benchmarks, and we couldn't find any justification for
those prefetch calls. After consulting with Eliezer Tamir (the original
author) we are glad to Ack this patch.

Thanks Eric!
Acked-by: <eilong@broadcom.com>

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-28 13:14 ` Eilon Greenstein @ 2010-04-28 15:44 ` Eliezer Tamir 2010-04-28 16:53 ` David Miller [not found] ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com> 2 siblings, 0 replies; 108+ messages in thread From: Eliezer Tamir @ 2010-04-28 15:44 UTC (permalink / raw) To: eilong Cc: David Miller, vladz, eric.dumazet@gmail.com, xiaosuo@gmail.com, hadi@cyberus.ca, therbert@google.com, shemminger@vyatta.com, netdev@vger.kernel.org On Wed, Apr 28, 2010 at 4:14 PM, Eilon Greenstein <eilong@broadcom.com> wrote: > > On Tue, 2010-04-27 at 15:19 -0700, David Miller wrote: > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 28 Apr 2010 00:18:13 +0200 > > > > > [PATCH net-next-2.6] bnx2x: Remove two prefetch() > > > > > > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256 > > > 2) No need to prefetch same pointer twice. > > > > > > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> > > > CC: Eilon Greenstein <eilong@broadcom.com> > > > > Eilon please review and ACK/NACK > > Vlad ran few benchmarks, and we couldn't find any justification for > those prefetch calls. After consulting with Eliezer Tamir (the original > author) we are glad to Ack this patch. > > Thanks Eric! > Acked-by: <eilong@broadcom.com> > > Normally, I would not have said anything but since Eilon asked. Acked-by: <eliezer@tamir.org.il> (this time in plain text) ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-28 13:14 ` Eilon Greenstein 2010-04-28 15:44 ` Eliezer Tamir @ 2010-04-28 16:53 ` David Miller [not found] ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com> 2 siblings, 0 replies; 108+ messages in thread From: David Miller @ 2010-04-28 16:53 UTC (permalink / raw) To: eilong Cc: vladz, eliezert, eric.dumazet, xiaosuo, hadi, therbert, shemminger, netdev From: "Eilon Greenstein" <eilong@broadcom.com> Date: Wed, 28 Apr 2010 16:14:15 +0300 > On Tue, 2010-04-27 at 15:19 -0700, David Miller wrote: >> From: Eric Dumazet <eric.dumazet@gmail.com> >> Date: Wed, 28 Apr 2010 00:18:13 +0200 >> >> > [PATCH net-next-2.6] bnx2x: Remove two prefetch() >> > >> > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256 >> > 2) No need to prefetch same pointer twice. >> > >> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> >> > CC: Eilon Greenstein <eilong@broadcom.com> >> >> Eilon please review and ACK/NACK > > Vlad ran few benchmarks, and we couldn't find any justification for > those prefetch calls. After consulting with Eliezer Tamir (the original > author) we are glad to Ack this patch. > > Thanks Eric! > Acked-by: <eilong@broadcom.com> Thanks, applied. Please put your full name as well as your email address in Acked-by: tags, just like you do for Signed-off-by: tags. ^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com>]
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() [not found] ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com> @ 2010-04-28 16:55 ` David Miller 0 siblings, 0 replies; 108+ messages in thread From: David Miller @ 2010-04-28 16:55 UTC (permalink / raw) To: eliezer Cc: eilong, vladz, eric.dumazet, xiaosuo, hadi, therbert, shemminger, netdev From: Eliezer Tamir <eliezer@tamir.org.il> Date: Wed, 28 Apr 2010 18:42:37 +0300 > Acked-by: <eliezer@tamir.org.il> Like I told Eilon, please specify your full name in future Acked-by: tags, just as you would for a Signed-off-by: tag. Thanks. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-27 22:18 ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet 2010-04-27 22:19 ` David Miller @ 2010-04-28 11:33 ` jamal 2010-04-28 12:33 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-28 11:33 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein

On Wed, 2010-04-28 at 00:18 +0200, Eric Dumazet wrote:
>
> Thanks David, I was about to resubmit the cumulative patch ;)

Hrm, I never got the email with your patch on top of Changli's (the
fscking ISP has creative ways of reordering, delaying and also
occasionally losing my emails). So all my tests from last week did not
include the extra patch. I will try to make time today to test with the
latest net-next, which seems to have some extra goodies. If there is
any other patch you want me to try, let me know...

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-28 11:33 ` jamal @ 2010-04-28 12:33 ` Eric Dumazet 2010-04-28 12:36 ` jamal 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-04-28 12:33 UTC (permalink / raw) To: hadi; +Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein Le mercredi 28 avril 2010 à 07:33 -0400, jamal a écrit : > On Wed, 2010-04-28 at 00:18 +0200, Eric Dumazet wrote: > > > Thanks David, I was about to resubmit the cumulative patch ;) > > Hrm, i never got the email with your patch on top of Changlis > (the fscking ISP has creative ways of reordering, delaying and also > occassionaly loosing my emails). So all my tests from last > week did not include the extra patch. I will try to make time today > to test with latest net-next which seems to have some extra goodies. > If there is any other patch you want me to try let me know... > > cheers, > jamal If you wait a bit, I have another patch to speedup udp receive path ;) ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch() 2010-04-28 12:33 ` Eric Dumazet @ 2010-04-28 12:36 ` jamal 2010-04-28 14:06 ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet 0 siblings, 1 reply; 108+ messages in thread From: jamal @ 2010-04-28 12:36 UTC (permalink / raw) To: Eric Dumazet Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein On Wed, 2010-04-28 at 14:33 +0200, Eric Dumazet wrote: > If you wait a bit, I have another patch to speedup udp receive path ;) Shoot whenever you are ready ;-> I will test with and without your patch.. cheers, jamal ^ permalink raw reply [flat|nested] 108+ messages in thread
* [PATCH net-next-2.6] net: speedup udp receive path 2010-04-28 12:36 ` jamal @ 2010-04-28 14:06 ` Eric Dumazet 2010-04-28 14:19 ` Eric Dumazet ` (2 more replies) 0 siblings, 3 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 14:06 UTC (permalink / raw)
To: hadi
Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

Le mercredi 28 avril 2010 à 08:36 -0400, jamal a écrit :
> On Wed, 2010-04-28 at 14:33 +0200, Eric Dumazet wrote:
>
> > If you wait a bit, I have another patch to speedup udp receive path ;)
>
> Shoot whenever you are ready ;-> I will test with and without your
> patch..

Here it is ;)

Thanks

[PATCH net-next-2.6] net: speedup udp receive path

Since commit 95766fff ([UDP]: Add memory accounting.), each received
packet needs one extra lock_sock()/release_sock() pair.

This added latency because of possible backlog handling. Then later,
ticket spinlocks added yet another latency source in case of DDOS.

This patch introduces lock_sock_bh() and unlock_sock_bh()
synchronization primitives, avoiding one atomic operation and backlog
processing.

skb_free_datagram_locked() uses them instead of full blown
lock_sock()/release_sock(). The skb is orphaned inside the locked
section for proper socket memory reclaim, and finally freed outside
of it.

The UDP receive path now takes the socket spinlock only once.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sock.h  |   10 ++++++++++
 net/core/datagram.c |   10 +++++++---
 net/ipv4/udp.c      |   12 ++++++------
 net/ipv6/udp.c      |    4 ++--
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index cf12b1e..d361c77 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1021,6 +1021,16 @@ extern void release_sock(struct sock *sk);
 				SINGLE_DEPTH_NESTING)
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
+static inline void lock_sock_bh(struct sock *sk)
+{
+	spin_lock_bh(&sk->sk_lock.slock);
+}
+
+static inline void unlock_sock_bh(struct sock *sk)
+{
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 extern struct sock	*sk_alloc(struct net *net, int family,
 				  gfp_t priority,
 				  struct proto *prot);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 5574a5d..95b851f 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -229,9 +229,13 @@ EXPORT_SYMBOL(skb_free_datagram);
 
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 {
-	lock_sock(sk);
-	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	lock_sock_bh(sk);
+	skb_orphan(skb);
+	sk_mem_reclaim_partial(sk);
+	unlock_sock_bh(sk);
+
+	/* skb is now orphaned, might be freed outside of locked section */
+	consume_skb(skb);
 }
 EXPORT_SYMBOL(skb_free_datagram_locked);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 63eb56b..1f86965 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1062,10 +1062,10 @@ static unsigned int first_packet_length(struct sock *sk)
 	spin_unlock_bh(&rcvq->lock);
 
 	if (!skb_queue_empty(&list_kill)) {
-		lock_sock(sk);
+		lock_sock_bh(sk);
 		__skb_queue_purge(&list_kill);
 		sk_mem_reclaim_partial(sk);
-		release_sock(sk);
+		unlock_sock_bh(sk);
 	}
 	return res;
 }
@@ -1196,10 +1196,10 @@ out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	lock_sock_bh(sk);
 	if (!skb_kill_datagram(sk, skb, flags))
 		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	release_sock(sk);
+	unlock_sock_bh(sk);
 
 	if (noblock)
 		return -EAGAIN;
@@ -1624,9 +1624,9 @@ int udp_rcv(struct sk_buff *skb)
 
 void udp_destroy_sock(struct sock *sk)
 {
-	lock_sock(sk);
+	lock_sock_bh(sk);
 	udp_flush_pending_frames(sk);
-	release_sock(sk);
+	unlock_sock_bh(sk);
 }
 
 /*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3ead20a..91c60f0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -424,7 +424,7 @@ out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	lock_sock_bh(sk);
 	if (!skb_kill_datagram(sk, skb, flags)) {
 		if (is_udp4)
 			UDP_INC_STATS_USER(sock_net(sk),
@@ -433,7 +433,7 @@ csum_copy_err:
 			UDP6_INC_STATS_USER(sock_net(sk),
 					UDP_MIB_INERRORS, is_udplite);
 	}
-	release_sock(sk);
+	unlock_sock_bh(sk);
 
 	if (flags & MSG_DONTWAIT)
 		return -EAGAIN;

^ permalink raw reply related [flat|nested] 108+ messages in thread
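A hedged illustration of how the new primitives are meant to be used, mirroring the skb_free_datagram_locked() hunk above; the function name udp_example_free is made up for this sketch:

#include <net/sock.h>
#include <linux/skbuff.h>

/* Free a received skb with a single spinlock round trip instead of a
 * full lock_sock()/release_sock() pair: no backlog processing and one
 * atomic operation saved. The skb is orphaned inside the locked
 * section so memory accounting stays exact, and freed outside it. */
static void udp_example_free(struct sock *sk, struct sk_buff *skb)
{
	lock_sock_bh(sk);		/* spin_lock_bh(&sk->sk_lock.slock) */
	skb_orphan(skb);		/* uncharge sk_rmem_alloc */
	sk_mem_reclaim_partial(sk);	/* return forward-alloc pages */
	unlock_sock_bh(sk);

	consume_skb(skb);		/* the actual free, lock not held */
}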
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-28 14:06 ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet @ 2010-04-28 14:19 ` Eric Dumazet 2010-04-28 14:34 ` Eric Dumazet 2010-04-28 21:36 ` David Miller 2010-04-28 23:44 ` [PATCH net-next-2.6] net: speedup udp receive path jamal 2 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-04-28 14:19 UTC (permalink / raw) To: hadi Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz Le mercredi 28 avril 2010 à 16:06 +0200, Eric Dumazet a écrit : > Le mercredi 28 avril 2010 à 08:36 -0400, jamal a écrit : > > On Wed, 2010-04-28 at 14:33 +0200, Eric Dumazet wrote: > > > > > If you wait a bit, I have another patch to speedup udp receive path ;) > > > > Shoot whenever you are ready ;-> I will test with and without your > > patch.. > > > > Here it is ;) > > Thanks I forgot to say that with my previous DDOS test/bench (16 cpus trying to feed one udp socket), my receiver can now process 420.000 pps instead of 200.000 ;) ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-28 14:19 ` Eric Dumazet @ 2010-04-28 14:34 ` Eric Dumazet 0 siblings, 0 replies; 108+ messages in thread From: Eric Dumazet @ 2010-04-28 14:34 UTC (permalink / raw) To: hadi Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz Le mercredi 28 avril 2010 à 16:19 +0200, Eric Dumazet a écrit : > I forgot to say that with my previous DDOS test/bench (16 cpus trying to > feed one udp socket), my receiver can now process 420.000 pps instead of > 200.000 ;) And perf top of the cpu dedicated to the thread doing the recvmsg() is : (after patch) ---------------------------------------------------------------------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:98.0% [1000Hz cycles], (all, cpu: 1) ---------------------------------------------------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ____________________________ 5463.00 45.5% _raw_spin_lock_bh vmlinux 761.00 6.3% copy_user_generic_string vmlinux 662.00 5.5% sock_recv_ts_and_drops vmlinux 645.00 5.4% kfree vmlinux 568.00 4.7% _raw_spin_lock vmlinux 494.00 4.1% __skb_recv_datagram vmlinux 488.00 4.1% skb_copy_datagram_iovec vmlinux 467.00 3.9% __slab_free vmlinux 176.00 1.5% udp_recvmsg vmlinux 168.00 1.4% ia32_sysenter_target vmlinux 161.00 1.3% kmem_cache_free vmlinux 161.00 1.3% _raw_spin_lock_irqsave vmlinux 151.00 1.3% memcpy_toiovec vmlinux 131.00 1.1% fget_light vmlinux 130.00 1.1% sock_rfree vmlinux 104.00 0.9% inet_recvmsg vmlinux 99.00 0.8% dst_release vmlinux 98.00 0.8% skb_release_head_state vmlinux 83.00 0.7% __sk_mem_reclaim vmlinux 75.00 0.6% sys_recvfrom vmlinux 61.00 0.5% sysexit_from_sys_call vmlinux 59.00 0.5% fput vmlinux 56.00 0.5% schedule vmlinux 56.00 0.5% sock_recvmsg vmlinux 54.00 0.4% move_addr_to_user vmlinux 51.00 0.4% compat_sys_socketcall vmlinux 48.00 0.4% _raw_spin_unlock_bh vmlinux ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-28 14:06 ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet 2010-04-28 14:19 ` Eric Dumazet @ 2010-04-28 21:36 ` David Miller 2010-04-28 22:22 ` [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper Eric Dumazet 2010-04-28 23:44 ` [PATCH net-next-2.6] net: speedup udp receive path jamal 2 siblings, 1 reply; 108+ messages in thread From: David Miller @ 2010-04-28 21:36 UTC (permalink / raw) To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb From: Eric Dumazet <eric.dumazet@gmail.com> Date: Wed, 28 Apr 2010 16:06:45 +0200 > [PATCH net-next-2.6] net: speedup udp receive path > > Since commit 95766fff ([UDP]: Add memory accounting.), > each received packet needs one extra sock_lock()/sock_release() pair. > > This added latency because of possible backlog handling. Then later, > ticket spinlocks added yet another latency source in case of DDOS. > > This patch introduces lock_sock_bh() and unlock_sock_bh() > synchronization primitives, avoiding one atomic operation and backlog > processing. > > skb_free_datagram_locked() uses them instead of full blown > lock_sock()/release_sock(). skb is orphaned inside locked section for > proper socket memory reclaim, and finally freed outside of it. > > UDP receive path now take the socket spinlock only once. > > Signed-off-by: Eric DUmazet <eric.dumazet@gmail.com> Clever, let's see what this breaks :-) Applied, thanks Eric. ^ permalink raw reply [flat|nested] 108+ messages in thread
* [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper 2010-04-28 21:36 ` David Miller @ 2010-04-28 22:22 ` Eric Dumazet 2010-04-28 22:39 ` David Miller 0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 22:22 UTC (permalink / raw)
To: David Miller; +Cc: netdev

Le mercredi 28 avril 2010 à 14:36 -0700, David Miller a écrit :
>
> Clever, let's see what this breaks :-)
>
> Applied, thanks Eric.

Thanks ;)

Let's respin some old work about dst, with a first small work unit.

The next patch will try to not touch the dst refcount in the input path
(previously attempted in July 2009).
Ref: http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753

[PATCH net-next-2.6] net: ip_queue_rcv_skb() helper

When queueing a skb to a socket, we can immediately release its dst if
the target socket does not use IP_CMSG_PKTINFO (the one cmsg that still
reads the route at recvmsg() time).

tcp_data_queue() can drop the dst too.

This benefits from a hot cache line and avoids having the receiver,
possibly on another cpu, dirty this cache line itself.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/ip.h       |    1 +
 net/ipv4/ip_sockglue.c |   16 ++++++++++++++++
 net/ipv4/raw.c         |    2 +-
 net/ipv4/tcp_input.c   |    1 +
 net/ipv4/udp.c         |    2 +-
 net/ipv6/raw.c         |    2 +-
 net/ipv6/udp.c         |    2 +-
 7 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index a84ceb6..8149b77 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -393,6 +393,7 @@ extern int ip_options_rcv_srr(struct sk_buff *skb);
  *	Functions provided by ip_sockglue.c
  */
 
+extern int	ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 extern void	ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
 extern int	ip_cmsg_send(struct net *net,
 			     struct msghdr *msg, struct ipcm_cookie *ipc);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b0aa054..ce23178 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -954,6 +954,22 @@ e_inval:
 	return -EINVAL;
 }
 
+/**
+ *	ip_queue_rcv_skb - Queue an skb into sock receive queue
+ *	@sk: socket
+ *	@skb: buffer
+ *
+ *	Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
+ *	is not set, we drop skb dst entry now, while dst cache line is hot.
+ */
+int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
+		skb_dst_drop(skb);
+	return sock_queue_rcv_skb(sk, skb);
+}
+EXPORT_SYMBOL(ip_queue_rcv_skb);
+
 int ip_setsockopt(struct sock *sk, int level,
 		int optname, char __user *optval, unsigned int optlen)
 {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index cc6f097..52ef5af 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -290,7 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
 {
 	/* Charge it to the socket. */
 
-	if (sock_queue_rcv_skb(sk, skb) < 0) {
+	if (ip_queue_rcv_skb(sk, skb) < 0) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ae3ec15..e82162c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4367,6 +4367,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
 		goto drop;
 
+	skb_dst_drop(skb);
 	__skb_pull(skb, th->doff * 4);
 
 	TCP_ECN_accept_cwr(tp, skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 63eb56b..8591398 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1264,7 +1264,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (inet_sk(sk)->inet_daddr)
 		sock_rps_save_rxhash(sk, skb->rxhash);
 
-	rc = sock_queue_rcv_skb(sk, skb);
+	rc = ip_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8562738..0e3d2dd 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -381,7 +381,7 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
 	}
 
 	/* Charge it to the socket. */
-	if (sock_queue_rcv_skb(sk, skb) < 0) {
+	if (ip_queue_rcv_skb(sk, skb) < 0) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3ead20a..aa0e47a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -514,7 +514,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 		goto drop;
 	}
 
-	if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
+	if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) {
 		/* Note that an ENOMEM error is charged twice */
 		if (rc == -ENOMEM)
 			UDP6_INC_STATS_BH(sock_net(sk),

^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper 2010-04-28 22:22 ` [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper Eric Dumazet @ 2010-04-28 22:39 ` David Miller 0 siblings, 0 replies; 108+ messages in thread From: David Miller @ 2010-04-28 22:39 UTC (permalink / raw) To: eric.dumazet; +Cc: netdev From: Eric Dumazet <eric.dumazet@gmail.com> Date: Thu, 29 Apr 2010 00:22:44 +0200 > Next patch will try to not touch dst refcount in input path (previously > attempted in July 2009) > Ref : http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753 Yes, I remember this. > [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper > > When queueing a skb to socket, we can immediately release its dst if > target socket do not use IP_CMSG_PKTINFO. > > tcp_data_queue() can drop dst too. > > This to benefit from a hot cache line and avoid the receiver, possibly > on another cpu, to dirty this cache line himself. > > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Pretty soon the whole receive path will be "read mostly" :-) Applied, thanks Eric. ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-28 14:06 ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet 2010-04-28 14:19 ` Eric Dumazet 2010-04-28 21:36 ` David Miller @ 2010-04-28 23:44 ` jamal 2010-04-29 0:00 ` jamal 2010-04-29 4:09 ` Eric Dumazet 2 siblings, 2 replies; 108+ messages in thread
From: jamal @ 2010-04-28 23:44 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 1188 bytes --]

On Wed, 2010-04-28 at 16:06 +0200, Eric Dumazet wrote:

> Here it is ;)

Sorry - things got a little hectic with TheMan. I am afraid I don't
have good news. Actually, I should say I don't have good news in
regards to rps. For my sample app, two things seem to be happening:

a) The overall performance has gotten better for both rps and non-rps.
b) non-rps is now performing relatively better.

This is just what I see in net-next, not related to your patch. It
seems the kernels I tested prior to April 23 showed rps doing better.
The one I tested on Apr 23 showed rps being about the same as non-rps.
As I stated in my last result posting, I thought I didn't test
properly, but I did it again today and saw the same thing. And now
non-rps is _consistently_ better. So some regression is going on...

Your patch has improved the performance of rps relative to what is in
net-next very slightly; but it has also improved the performance of
non-rps ;->

My traces look different for the app cpu than yours - likely because
the apps are different. At the moment I don't have time to dig deeper
into the code, but I could test as cycles show up. I am attaching the
profile traces and results.

cheers,
jamal

[-- Attachment #2: sum-apr23and28.txt --]
[-- Type: text/plain, Size: 1469 bytes --]

April 23 net-next

kernel      sink      cpu all   cpuint   cpuapp
---------------------------------------------------------
nn          93.95%    84.5%     99.8%    79.8%
nn-rps      96.41%    85.4%     95.5%    82.5%
nn-cl       97.29%    84.0%     99.9%    79.6%
nn-cl-rps   97.76%    86.5%     96.5%    84.8%

nn:        Basic net-next from Apr23
nn-rps:    Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0
nn-cl:     Basic net-next from Apr23 + Changli patch
nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee, irq aff cpu0

sink:      the amount of traffic the system was able to sink in.
cpu all: avg % system cpu consumed in test cpuint: avg %cpu consumed by the cpu where interrupts happened cpuapp: avg %cpu consumed by a sample cpu which did app processing Now repeat with Erics changes and kernel from Apr-28 kernel sink cpu all cpuint cpuapp --------------------------------------------------------- nn2 98.78% 83.6% 100.0% 82.8% nn2-rps 94.43% 84.2% 98.1% 82.0% nn2-ed 98.74% 83.2% 99.9% 81.6% nn2-ed-rps 95.15% 84.5% 97.3% 82.1% nn2: Basic net-next from Apr28 nn2-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0 nn2-ed: Basic net-next from Apr23 + Eric patch nn2-ed-rps: Basic net-next from Apr23 + Eric patch + rps mask ee,irq aff cpu0 [-- Attachment #3: nn-apr28-summary.txt --] [-- Type: text/plain, Size: 78977 bytes --] I: net-next Average udp sink: 98.78% -------------------------------------------------------------------------------------------------- PerfTop: 3632 irqs/sec kernel:83.7% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 2738.00 9.8% sky2_poll [sky2] 1543.00 5.5% _raw_spin_lock_irqsave [kernel] 1019.00 3.7% system_call [kernel] 740.00 2.7% copy_user_generic_string [kernel] 687.00 2.5% fget [kernel] 640.00 2.3% _raw_spin_unlock_irqrestore [kernel] 634.00 2.3% sys_epoll_ctl [kernel] 613.00 2.2% datagram_poll [kernel] 553.00 2.0% _raw_spin_lock_bh [kernel] 530.00 1.9% kmem_cache_free [kernel] 522.00 1.9% schedule [kernel] 487.00 1.7% vread_tsc [kernel].vsyscall_fn 467.00 1.7% _raw_spin_lock [kernel] 432.00 1.5% udp_recvmsg [kernel] 426.00 1.5% kmem_cache_alloc [kernel] 418.00 1.5% __udp4_lib_lookup [kernel] 417.00 1.5% sys_epoll_wait [kernel] 376.00 1.3% fput [kernel] 361.00 1.3% ip_route_input [kernel] 344.00 1.2% local_bh_enable_ip [kernel] 326.00 1.2% ip_rcv [kernel] 321.00 1.2% first_packet_length [kernel] 307.00 1.1% ep_remove [kernel] 303.00 1.1% dst_release [kernel] 301.00 1.1% skb_copy_datagram_iovec [kernel] 297.00 1.1% mutex_lock [kernel] -------------------------------------------------------------------------------------------------- PerfTop: 4018 irqs/sec kernel:83.3% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ______________________ 4274.00 9.7% sky2_poll [sky2] 2473.00 5.6% _raw_spin_lock_irqsave [kernel] 1585.00 3.6% system_call [kernel] 1179.00 2.7% copy_user_generic_string [kernel] 1089.00 2.5% fget [kernel] 1019.00 2.3% _raw_spin_unlock_irqrestore [kernel] 1011.00 2.3% sys_epoll_ctl [kernel] 965.00 2.2% datagram_poll [kernel] 902.00 2.0% kmem_cache_free [kernel] 841.00 1.9% _raw_spin_lock_bh [kernel] 837.00 1.9% schedule [kernel] 735.00 1.7% vread_tsc [kernel].vsyscall_fn 730.00 1.7% udp_recvmsg [kernel] 729.00 1.7% _raw_spin_lock [kernel] 678.00 1.5% kmem_cache_alloc [kernel] 651.00 1.5% sys_epoll_wait [kernel] 635.00 1.4% __udp4_lib_lookup [kernel] 595.00 1.3% fput [kernel] 568.00 1.3% local_bh_enable_ip [kernel] 562.00 1.3% ip_route_input [kernel] 516.00 1.2% dst_release [kernel] 502.00 1.1% ep_remove [kernel] 485.00 1.1% skb_copy_datagram_iovec [kernel] 484.00 1.1% first_packet_length [kernel] 476.00 1.1% ip_rcv [kernel] 470.00 1.1% __alloc_skb [kernel] 459.00 1.0% epoll_ctl /lib/libc-2.7.so 458.00 1.0% mutex_lock [kernel] 
-------------------------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 3534.00 34.7% sky2_poll [sky2] 545.00 5.3% __udp4_lib_lookup [kernel] 537.00 5.3% ip_route_input [kernel] 427.00 4.2% _raw_spin_lock_irqsave [kernel] 401.00 3.9% __alloc_skb [kernel] 360.00 3.5% ip_rcv [kernel] 332.00 3.3% _raw_spin_lock [kernel] 292.00 2.9% sock_queue_rcv_skb [kernel] 291.00 2.9% __udp4_lib_rcv [kernel] 273.00 2.7% sock_def_readable [kernel] 269.00 2.6% __netif_receive_skb [kernel] 209.00 2.1% __wake_up_common [kernel] 196.00 1.9% __kmalloc [kernel] 164.00 1.6% _raw_read_lock [kernel] 157.00 1.5% kmem_cache_alloc [kernel] 157.00 1.5% ep_poll_callback [kernel] 133.00 1.3% resched_task [kernel] 128.00 1.3% task_rq_lock [kernel] 120.00 1.2% swiotlb_sync_single [kernel] 120.00 1.2% sky2_rx_submit [sky2] 117.00 1.1% udp_queue_rcv_skb [kernel] 108.00 1.1% ip_local_deliver [kernel] 104.00 1.0% try_to_wake_up [kernel] 102.00 1.0% _raw_spin_unlock_irqrestore [kernel] 98.00 1.0% select_task_rq_fair [kernel] -------------------------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 4601.00 34.0% sky2_poll [sky2] 732.00 5.4% __udp4_lib_lookup [kernel] 724.00 5.3% ip_route_input [kernel] 527.00 3.9% _raw_spin_lock_irqsave [kernel] 520.00 3.8% __alloc_skb [kernel] 483.00 3.6% ip_rcv [kernel] 441.00 3.3% _raw_spin_lock [kernel] 401.00 3.0% sock_queue_rcv_skb [kernel] 373.00 2.8% __udp4_lib_rcv [kernel] 365.00 2.7% sock_def_readable [kernel] 353.00 2.6% __netif_receive_skb [kernel] 285.00 2.1% __wake_up_common [kernel] 273.00 2.0% __kmalloc [kernel] 230.00 1.7% _raw_read_lock [kernel] 208.00 1.5% ep_poll_callback [kernel] 199.00 1.5% kmem_cache_alloc [kernel] 180.00 1.3% task_rq_lock [kernel] 172.00 1.3% sky2_rx_submit [sky2] 171.00 1.3% resched_task [kernel] 165.00 1.2% ip_local_deliver [kernel] 162.00 1.2% udp_queue_rcv_skb [kernel] 158.00 1.2% _raw_spin_unlock_irqrestore [kernel] 148.00 1.1% select_task_rq_fair [kernel] 144.00 1.1% try_to_wake_up [kernel] 142.00 1.0% sky2_remove [sky2] 140.00 1.0% swiotlb_sync_single [kernel] 95.00 0.7% cache_alloc_refill [kernel] 92.00 0.7% dev_gro_receive [kernel] 82.00 0.6% is_swiotlb_buffer [kernel] -------------------------------------------------------------------------------------------------- PerfTop: 622 irqs/sec kernel:74.9% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ _____________________________________ 113.00 6.5% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 105.00 6.0% system_call /lib/modules/2.6.34-rc5/build/vmlinux 69.00 3.9% fget /lib/modules/2.6.34-rc5/build/vmlinux 64.00 3.7% datagram_poll /lib/modules/2.6.34-rc5/build/vmlinux 56.00 3.2% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 55.00 3.1% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 53.00 3.0% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 46.00 2.6% _raw_spin_lock_bh 
/lib/modules/2.6.34-rc5/build/vmlinux 42.00 2.4% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 37.00 2.1% dst_release /lib/modules/2.6.34-rc5/build/vmlinux 37.00 2.1% schedule /lib/modules/2.6.34-rc5/build/vmlinux 35.00 2.0% mutex_lock /lib/modules/2.6.34-rc5/build/vmlinux 35.00 2.0% vread_tsc [kernel].vsyscall_fn 35.00 2.0% udp_recvmsg /lib/modules/2.6.34-rc5/build/vmlinux 34.00 1.9% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 31.00 1.8% local_bh_enable_ip /lib/modules/2.6.34-rc5/build/vmlinux 29.00 1.7% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux 28.00 1.6% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 27.00 1.5% process_recv /home/hadi/udp_sink/mcpudp 25.00 1.4% mutex_unlock /lib/modules/2.6.34-rc5/build/vmlinux 24.00 1.4% ep_send_events_proc /lib/modules/2.6.34-rc5/build/vmlinux 24.00 1.4% clock_gettime /lib/librt-2.7.so 23.00 1.3% fput /lib/modules/2.6.34-rc5/build/vmlinux 23.00 1.3% skb_copy_datagram_iovec /lib/modules/2.6.34-rc5/build/vmlinux 20.00 1.1% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 20.00 1.1% inet_recvmsg /lib/modules/2.6.34-rc5/build/vmlinux 19.00 1.1% epoll_dispatch /usr/lib/libevent-1.3e.so.1.0.3 19.00 1.1% first_packet_length /lib/modules/2.6.34-rc5/build/vmlinux -------------------------------------------------------------------------------------------------- PerfTop: 625 irqs/sec kernel:83.0% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ _____________________________________ 315.00 6.8% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 232.00 5.0% system_call /lib/modules/2.6.34-rc5/build/vmlinux 175.00 3.8% fget /lib/modules/2.6.34-rc5/build/vmlinux 174.00 3.8% datagram_poll /lib/modules/2.6.34-rc5/build/vmlinux 168.00 3.6% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 155.00 3.4% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 144.00 3.1% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 133.00 2.9% _raw_spin_lock_bh /lib/modules/2.6.34-rc5/build/vmlinux 126.00 2.7% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 113.00 2.4% vread_tsc [kernel].vsyscall_fn 110.00 2.4% _raw_spin_unlock_bh /lib/modules/2.6.34-rc5/build/vmlinux 106.00 2.3% schedule /lib/modules/2.6.34-rc5/build/vmlinux 103.00 2.2% local_bh_enable_ip /lib/modules/2.6.34-rc5/build/vmlinux 101.00 2.2% udp_recvmsg /lib/modules/2.6.34-rc5/build/vmlinux 97.00 2.1% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 84.00 1.8% dst_release /lib/modules/2.6.34-rc5/build/vmlinux 78.00 1.7% fput /lib/modules/2.6.34-rc5/build/vmlinux 75.00 1.6% first_packet_length /lib/modules/2.6.34-rc5/build/vmlinux 74.00 1.6% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 71.00 1.5% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux 69.00 1.5% epoll_ctl /lib/libc-2.7.so 67.00 1.5% mutex_lock /lib/modules/2.6.34-rc5/build/vmlinux 65.00 1.4% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 65.00 1.4% inet_recvmsg /lib/modules/2.6.34-rc5/build/vmlinux 64.00 1.4% process_recv /home/hadi/udp_sink/mcpudp 62.00 1.3% skb_copy_datagram_iovec /lib/modules/2.6.34-rc5/build/vmlinux 60.00 1.3% clock_gettime /lib/librt-2.7.so -------------------------------------------------------------------------------------------------- PerfTop: 700 irqs/sec kernel:84.3% [1000Hz cycles], (all, cpu: 2) 
-------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ _____________________________________ 489.00 6.4% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 376.00 4.9% system_call /lib/modules/2.6.34-rc5/build/vmlinux 308.00 4.0% fget /lib/modules/2.6.34-rc5/build/vmlinux 302.00 3.9% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 280.00 3.6% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 274.00 3.6% datagram_poll /lib/modules/2.6.34-rc5/build/vmlinux 249.00 3.2% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 223.00 2.9% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 221.00 2.9% _raw_spin_unlock_bh /lib/modules/2.6.34-rc5/build/vmlinux 221.00 2.9% local_bh_enable_ip /lib/modules/2.6.34-rc5/build/vmlinux 208.00 2.7% vread_tsc [kernel].vsyscall_fn 200.00 2.6% _raw_spin_lock_bh /lib/modules/2.6.34-rc5/build/vmlinux 191.00 2.5% schedule /lib/modules/2.6.34-rc5/build/vmlinux 188.00 2.4% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 177.00 2.3% udp_recvmsg /lib/modules/2.6.34-rc5/build/vmlinux 141.00 1.8% fput /lib/modules/2.6.34-rc5/build/vmlinux 140.00 1.8% first_packet_length /lib/modules/2.6.34-rc5/build/vmlinux 128.00 1.7% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 119.00 1.5% dst_release /lib/modules/2.6.34-rc5/build/vmlinux 105.00 1.4% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux 104.00 1.4% epoll_ctl /lib/libc-2.7.so 102.00 1.3% skb_copy_datagram_iovec /lib/modules/2.6.34-rc5/build/vmlinux 100.00 1.3% mutex_lock /lib/modules/2.6.34-rc5/build/vmlinux 95.00 1.2% mutex_unlock /lib/modules/2.6.34-rc5/build/vmlinux 94.00 1.2% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 92.00 1.2% ep_send_events_proc /lib/modules/2.6.34-rc5/build/vmlinux 92.00 1.2% clock_gettime /lib/librt-2.7.so 92.00 1.2% __skb_recv_datagram /lib/modules/2.6.34-rc5/build/vmlinux 91.00 1.2% process_recv /home/hadi/udp_sink/mcpudp 88.00 1.1% kfree /lib/modules/2.6.34-rc5/build/vmlinux 86.00 1.1% _raw_spin_lock /lib/modules/2.6.34-rc5/build/vmlinux II: net-next with rps = ee 94.43% -------------- -------------------------------------------------------------------------------------------------- PerfTop: 4328 irqs/sec kernel:84.0% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ______________________ 3908.00 17.1% sky2_poll [sky2] 694.00 3.0% _raw_spin_lock_irqsave [kernel] 584.00 2.6% sky2_intr [sky2] 557.00 2.4% system_call [kernel] 490.00 2.1% _raw_spin_unlock_irqrestore [kernel] 488.00 2.1% fget [kernel] 425.00 1.9% ip_rcv [kernel] 405.00 1.8% sys_epoll_ctl [kernel] 398.00 1.7% __netif_receive_skb [kernel] 375.00 1.6% _raw_spin_lock [kernel] 365.00 1.6% copy_user_generic_string [kernel] 363.00 1.6% ip_route_input [kernel] 350.00 1.5% kmem_cache_free [kernel] 346.00 1.5% schedule [kernel] 319.00 1.4% call_function_single_interrupt [kernel] 295.00 1.3% vread_tsc [kernel].vsyscall_fn 270.00 1.2% __udp4_lib_lookup [kernel] 264.00 1.2% kmem_cache_alloc [kernel] 235.00 1.0% fput [kernel] 219.00 1.0% datagram_poll [kernel] -------------------------------------------------------------------------------------------------- PerfTop: 3791 irqs/sec kernel:84.4% [1000Hz cycles], (all, 8 CPUs) 
-------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ______________________ 6274.00 17.2% sky2_poll [sky2] 1139.00 3.1% _raw_spin_lock_irqsave [kernel] 953.00 2.6% system_call [kernel] 942.00 2.6% sky2_intr [sky2] 785.00 2.2% _raw_spin_unlock_irqrestore [kernel] 745.00 2.0% fget [kernel] 695.00 1.9% ip_rcv [kernel] 653.00 1.8% sys_epoll_ctl [kernel] 609.00 1.7% ip_route_input [kernel] 606.00 1.7% __netif_receive_skb [kernel] 583.00 1.6% _raw_spin_lock [kernel] 569.00 1.6% kmem_cache_free [kernel] 564.00 1.5% copy_user_generic_string [kernel] 554.00 1.5% schedule [kernel] 510.00 1.4% call_function_single_interrupt [kernel] 488.00 1.3% vread_tsc [kernel].vsyscall_fn 459.00 1.3% kmem_cache_alloc [kernel] 417.00 1.1% __udp4_lib_lookup [kernel] 387.00 1.1% fput [kernel] 358.00 1.0% __udp4_lib_rcv [kernel] 347.00 1.0% event_base_loop libevent-1.3e.so.1.0.3 ----------------------------------------------------------------------------------------------- PerfTop: 997 irqs/sec kernel:98.2% [1000Hz cycles], (all, cpu: 0) ----------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________________ ________ 3926.00 61.0% sky2_poll [sky2] 671.00 10.4% sky2_intr [sky2] 192.00 3.0% __alloc_skb [kernel] 126.00 2.0% get_rps_cpu [kernel] 111.00 1.7% __kmalloc [kernel] 97.00 1.5% enqueue_to_backlog [kernel] 95.00 1.5% _raw_spin_lock_irqsave [kernel] 93.00 1.4% _raw_spin_lock [kernel] 79.00 1.2% kmem_cache_alloc [kernel] 63.00 1.0% sky2_rx_submit [sky2] ----------------------------------------------------------------------------------------------- PerfTop: 980 irqs/sec kernel:98.0% [1000Hz cycles], (all, cpu: 0) ----------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________________ ____________________ 6945.00 61.4% sky2_poll [sky2] 1219.00 10.8% sky2_intr [sky2] 323.00 2.9% __alloc_skb [kernel] 243.00 2.1% get_rps_cpu [kernel] 195.00 1.7% __kmalloc [kernel] 161.00 1.4% _raw_spin_lock_irqsave [kernel] 149.00 1.3% enqueue_to_backlog [kernel] 139.00 1.2% _raw_spin_lock [kernel] 136.00 1.2% kmem_cache_alloc [kernel] 135.00 1.2% irq_entries_start [kernel] 108.00 1.0% sky2_rx_submit [sky2] ----------------------------------------------------------------------------------------------- PerfTop: 458 irqs/sec kernel:80.8% [1000Hz cycles], (all, cpu: 2) ----------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _____________________________________ 130.00 4.7% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 114.00 4.1% system_call /lib/modules/2.6.34-rc5/build/vmlinux 91.00 3.3% ip_rcv /lib/modules/2.6.34-rc5/build/vmlinux 82.00 3.0% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 74.00 2.7% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux 74.00 2.7% fget /lib/modules/2.6.34-rc5/build/vmlinux 71.00 2.6% __netif_receive_skb /lib/modules/2.6.34-rc5/build/vmlinux 69.00 2.5% ip_route_input /lib/modules/2.6.34-rc5/build/vmlinux 66.00 2.4% schedule /lib/modules/2.6.34-rc5/build/vmlinux 63.00 2.3% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 61.00 2.2% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 61.00 2.2% __udp4_lib_lookup 
/lib/modules/2.6.34-rc5/build/vmlinux 57.00 2.1% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 49.00 1.8% vread_tsc [kernel].vsyscall_fn 49.00 1.8% _raw_spin_lock /lib/modules/2.6.34-rc5/build/vmlinux 47.00 1.7% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux 45.00 1.6% fput /lib/modules/2.6.34-rc5/build/vmlinux 44.00 1.6% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 40.00 1.4% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 40.00 1.4% local_bh_enable_ip /lib/modules/2.6.34-rc5/build/vmlinux 38.00 1.4% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 35.00 1.3% process_recv /home/hadi/udp_sink/mcpudp 34.00 1.2% mutex_unlock /lib/modules/2.6.34-rc5/build/vmlinux 31.00 1.1% _raw_spin_unlock_bh /lib/modules/2.6.34-rc5/build/vmlinux 31.00 1.1% event_base_loop /usr/lib/libevent-1.3e.so.1.0.3 ----------------------------------------------------------------------------------------------- PerfTop: 552 irqs/sec kernel:82.4% [1000Hz cycles], (all, cpu: 2) ----------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _____________________________________ 204.00 4.7% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 169.00 3.9% system_call /lib/modules/2.6.34-rc5/build/vmlinux 151.00 3.5% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 132.00 3.0% ip_rcv /lib/modules/2.6.34-rc5/build/vmlinux 129.00 3.0% fget /lib/modules/2.6.34-rc5/build/vmlinux 123.00 2.8% __netif_receive_skb /lib/modules/2.6.34-rc5/build/vmlinux 115.00 2.6% ip_route_input /lib/modules/2.6.34-rc5/build/vmlinux 112.00 2.6% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux 112.00 2.6% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 103.00 2.4% schedule /lib/modules/2.6.34-rc5/build/vmlinux 94.00 2.2% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 89.00 2.0% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 86.00 2.0% _raw_spin_lock /lib/modules/2.6.34-rc5/build/vmlinux 83.00 1.9% __udp4_lib_lookup /lib/modules/2.6.34-rc5/build/vmlinux 76.00 1.7% vread_tsc [kernel].vsyscall_fn 68.00 1.6% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux 67.00 1.5% fput /lib/modules/2.6.34-rc5/build/vmlinux 64.00 1.5% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 62.00 1.4% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 60.00 1.4% dst_release /lib/modules/2.6.34-rc5/build/vmlinux 60.00 1.4% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 56.00 1.3% _raw_spin_lock_bh /lib/modules/2.6.34-rc5/build/vmlinux 53.00 1.2% event_base_loop /usr/lib/libevent-1.3e.so.1.0.3 51.00 1.2% datagram_poll /lib/modules/2.6.34-rc5/build/vmlinux 48.00 1.1% epoll_ctl /lib/libc-2.7.so 48.00 1.1% kfree /lib/modules/2.6.34-rc5/build/vmlinux 47.00 1.1% _raw_spin_unlock_bh /lib/modules/2.6.34-rc5/build/vmlinux 47.00 1.1% mutex_unlock /lib/modules/2.6.34-rc5/build/vmlinux 45.00 1.0% __udp4_lib_rcv /lib/modules/2.6.34-rc5/build/vmlinux 45.00 1.0% tick_nohz_stop_sched_tick /lib/modules/2.6.34-rc5/build/vmlinux ----------------------------------------------------------------------------------------------- PerfTop: 408 irqs/sec kernel:82.1% [1000Hz cycles], (all, cpu: 2) ----------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _____________________________________ 240.00 4.8% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 
200.00 4.0% system_call /lib/modules/2.6.34-rc5/build/vmlinux 165.00 3.3% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 161.00 3.2% ip_rcv /lib/modules/2.6.34-rc5/build/vmlinux 158.00 3.1% fget /lib/modules/2.6.34-rc5/build/vmlinux 150.00 3.0% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 135.00 2.7% __netif_receive_skb /lib/modules/2.6.34-rc5/build/vmlinux 122.00 2.4% ip_route_input /lib/modules/2.6.34-rc5/build/vmlinux 117.00 2.3% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux 114.00 2.3% schedule /lib/modules/2.6.34-rc5/build/vmlinux 110.00 2.2% _raw_spin_lock /lib/modules/2.6.34-rc5/build/vmlinux 108.00 2.1% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 101.00 2.0% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 94.00 1.9% vread_tsc [kernel].vsyscall_fn 90.00 1.8% __udp4_lib_lookup /lib/modules/2.6.34-rc5/build/vmlinux 85.00 1.7% fput /lib/modules/2.6.34-rc5/build/vmlinux 78.00 1.5% dst_release /lib/modules/2.6.34-rc5/build/vmlinux 77.00 1.5% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux 75.00 1.5% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 74.00 1.5% _raw_spin_lock_bh /lib/modules/2.6.34-rc5/build/vmlinux 69.00 1.4% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 68.00 1.3% event_base_loop /usr/lib/libevent-1.3e.so.1.0.3 68.00 1.3% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 62.00 1.2% _raw_spin_unlock_bh /lib/modules/2.6.34-rc5/build/vmlinux 62.00 1.2% datagram_poll /lib/modules/2.6.34-rc5/build/vmlinux 55.00 1.1% epoll_ctl /lib/libc-2.7.so 53.00 1.1% local_bh_enable_ip /lib/modules/2.6.34-rc5/build/vmlinux 53.00 1.1% tick_nohz_stop_sched_tick /lib/modules/2.6.34-rc5/build/vmlinux 52.00 1.0% mutex_unlock /lib/modules/2.6.34-rc5/build/vmlinux ----------------------------------------------------------------------------------------------- PerfTop: 440 irqs/sec kernel:85.0% [1000Hz cycles], (all, cpu: 2) ----------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _____________________________________ 226.00 4.6% _raw_spin_lock_irqsave /lib/modules/2.6.34-rc5/build/vmlinux 213.00 4.3% system_call /lib/modules/2.6.34-rc5/build/vmlinux 154.00 3.1% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux 148.00 3.0% ip_rcv /lib/modules/2.6.34-rc5/build/vmlinux 143.00 2.9% fget /lib/modules/2.6.34-rc5/build/vmlinux 143.00 2.9% ip_route_input /lib/modules/2.6.34-rc5/build/vmlinux 140.00 2.8% __netif_receive_skb /lib/modules/2.6.34-rc5/build/vmlinux 124.00 2.5% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux 124.00 2.5% sys_epoll_ctl /lib/modules/2.6.34-rc5/build/vmlinux 104.00 2.1% copy_user_generic_string /lib/modules/2.6.34-rc5/build/vmlinux 103.00 2.1% vread_tsc [kernel].vsyscall_fn 101.00 2.0% schedule /lib/modules/2.6.34-rc5/build/vmlinux 100.00 2.0% kmem_cache_free /lib/modules/2.6.34-rc5/build/vmlinux 99.00 2.0% _raw_spin_lock /lib/modules/2.6.34-rc5/build/vmlinux 93.00 1.9% __udp4_lib_lookup /lib/modules/2.6.34-rc5/build/vmlinux 80.00 1.6% fput /lib/modules/2.6.34-rc5/build/vmlinux 76.00 1.5% kmem_cache_alloc /lib/modules/2.6.34-rc5/build/vmlinux 75.00 1.5% sock_recv_ts_and_drops /lib/modules/2.6.34-rc5/build/vmlinux 73.00 1.5% dst_release /lib/modules/2.6.34-rc5/build/vmlinux 70.00 1.4% sys_epoll_wait /lib/modules/2.6.34-rc5/build/vmlinux 69.00 1.4% datagram_poll /lib/modules/2.6.34-rc5/build/vmlinux 65.00 1.3% event_base_loop 
/usr/lib/libevent-1.3e.so.1.0.3 65.00 1.3% ep_remove /lib/modules/2.6.34-rc5/build/vmlinux III: Kernel compiled with Erics patch, rps mask 00 Avg udp packets sunk: 98.74% ------------------------------------------------------------------------------- PerfTop: 4202 irqs/sec kernel:82.5% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ______________________ 1639.00 9.0% sky2_poll [sky2] 1051.00 5.8% _raw_spin_lock_irqsave [kernel] 665.00 3.7% system_call [kernel] 578.00 3.2% fget [kernel] 476.00 2.6% _raw_spin_unlock_irqrestore [kernel] 457.00 2.5% copy_user_generic_string [kernel] 427.00 2.4% sys_epoll_ctl [kernel] 401.00 2.2% datagram_poll [kernel] 391.00 2.2% kmem_cache_free [kernel] 349.00 1.9% schedule [kernel] 339.00 1.9% vread_tsc [kernel].vsyscall_fn 323.00 1.8% udp_recvmsg [kernel] 292.00 1.6% kmem_cache_alloc [kernel] 285.00 1.6% _raw_spin_lock [kernel] 272.00 1.5% _raw_spin_lock_bh [kernel] 268.00 1.5% sys_epoll_wait [kernel] 260.00 1.4% fput [kernel] 234.00 1.3% ip_route_input [kernel] 221.00 1.2% __udp4_lib_lookup [kernel] 212.00 1.2% dst_release [kernel] 209.00 1.2% ip_rcv [kernel] 203.00 1.1% ep_remove [kernel] 202.00 1.1% first_packet_length [kernel] ------------------------------------------------------------------------------- PerfTop: 3999 irqs/sec kernel:82.3% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ______________________ 3452.00 9.3% sky2_poll [sky2] 2212.00 5.9% _raw_spin_lock_irqsave [kernel] 1350.00 3.6% system_call [kernel] 1187.00 3.2% fget [kernel] 1010.00 2.7% copy_user_generic_string [kernel] 965.00 2.6% _raw_spin_unlock_irqrestore [kernel] 842.00 2.3% sys_epoll_ctl [kernel] 833.00 2.2% datagram_poll [kernel] 770.00 2.1% kmem_cache_free [kernel] 710.00 1.9% vread_tsc [kernel].vsyscall_fn 688.00 1.8% schedule [kernel] 651.00 1.7% udp_recvmsg [kernel] 603.00 1.6% _raw_spin_lock_bh [kernel] 599.00 1.6% _raw_spin_lock [kernel] 597.00 1.6% sys_epoll_wait [kernel] 594.00 1.6% kmem_cache_alloc [kernel] 553.00 1.5% ip_route_input [kernel] 528.00 1.4% fput [kernel] 496.00 1.3% __udp4_lib_lookup [kernel] 444.00 1.2% dst_release [kernel] 433.00 1.2% ip_rcv [kernel] 408.00 1.1% first_packet_length [kernel] ------------------------------------------------------------------------------- PerfTop: 3765 irqs/sec kernel:83.7% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ______________________ 4275.00 9.5% sky2_poll [sky2] 2684.00 6.0% _raw_spin_lock_irqsave [kernel] 1654.00 3.7% system_call [kernel] 1447.00 3.2% fget [kernel] 1223.00 2.7% copy_user_generic_string [kernel] 1146.00 2.5% _raw_spin_unlock_irqrestore [kernel] 1036.00 2.3% sys_epoll_ctl [kernel] 1019.00 2.3% datagram_poll [kernel] 974.00 2.2% kmem_cache_free [kernel] 843.00 1.9% vread_tsc [kernel].vsyscall_fn 799.00 1.8% schedule [kernel] 761.00 1.7% udp_recvmsg [kernel] 736.00 1.6% kmem_cache_alloc [kernel] 719.00 1.6% _raw_spin_lock_bh [kernel] 716.00 1.6% _raw_spin_lock [kernel] 696.00 1.5% sys_epoll_wait [kernel] 680.00 1.5% ip_route_input [kernel] 657.00 1.5% fput [kernel] 613.00 1.4% __udp4_lib_lookup [kernel] 552.00 1.2% dst_release [kernel] 507.00 1.1% ip_rcv [kernel] 
------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:99.9% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 669.00 32.2% sky2_poll [sky2] 128.00 6.2% ip_route_input [kernel] 106.00 5.1% ip_rcv [kernel] 105.00 5.1% __udp4_lib_lookup [kernel] 86.00 4.1% _raw_spin_lock [kernel] 85.00 4.1% _raw_spin_lock_irqsave [kernel] 82.00 3.9% __alloc_skb [kernel] 78.00 3.8% sock_queue_rcv_skb [kernel] 57.00 2.7% __netif_receive_skb [kernel] 53.00 2.6% __wake_up_common [kernel] 47.00 2.3% __udp4_lib_rcv [kernel] 42.00 2.0% sock_def_readable [kernel] 37.00 1.8% kmem_cache_alloc [kernel] 34.00 1.6% ep_poll_callback [kernel] 34.00 1.6% __kmalloc [kernel] 34.00 1.6% select_task_rq_fair [kernel] 30.00 1.4% _raw_read_lock [kernel] 27.00 1.3% _raw_spin_unlock_irqrestore [kernel] 24.00 1.2% sky2_rx_submit [sky2] 22.00 1.1% udp_queue_rcv_skb [kernel] 21.00 1.0% try_to_wake_up [kernel] ------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 3061.00 31.9% sky2_poll [sky2] 529.00 5.5% ip_route_input [kernel] 518.00 5.4% __udp4_lib_lookup [kernel] 424.00 4.4% ip_rcv [kernel] 390.00 4.1% _raw_spin_lock_irqsave [kernel] 389.00 4.1% __alloc_skb [kernel] 365.00 3.8% _raw_spin_lock [kernel] 326.00 3.4% sock_queue_rcv_skb [kernel] 297.00 3.1% __netif_receive_skb [kernel] 273.00 2.8% __udp4_lib_rcv [kernel] 223.00 2.3% sock_def_readable [kernel] 205.00 2.1% __wake_up_common [kernel] 181.00 1.9% __kmalloc [kernel] 151.00 1.6% kmem_cache_alloc [kernel] 147.00 1.5% _raw_read_lock [kernel] 143.00 1.5% ep_poll_callback [kernel] 136.00 1.4% sky2_rx_submit [sky2] 123.00 1.3% task_rq_lock [kernel] 118.00 1.2% _raw_spin_unlock_irqrestore [kernel] 114.00 1.2% select_task_rq_fair [kernel] 104.00 1.1% resched_task [kernel] 104.00 1.1% sky2_remove [sky2] 102.00 1.1% udp_queue_rcv_skb [kernel] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 3898.00 31.0% sky2_poll [sky2] 715.00 5.7% ip_route_input [kernel] 651.00 5.2% __udp4_lib_lookup [kernel] 576.00 4.6% ip_rcv [kernel] 534.00 4.2% __alloc_skb [kernel] 518.00 4.1% _raw_spin_lock_irqsave [kernel] 441.00 3.5% sock_queue_rcv_skb [kernel] 439.00 3.5% _raw_spin_lock [kernel] 396.00 3.1% __netif_receive_skb [kernel] 351.00 2.8% __udp4_lib_rcv [kernel] 300.00 2.4% sock_def_readable [kernel] 264.00 2.1% __wake_up_common [kernel] 260.00 2.1% __kmalloc [kernel] 198.00 1.6% kmem_cache_alloc [kernel] 193.00 1.5% ep_poll_callback [kernel] 192.00 1.5% _raw_read_lock [kernel] 168.00 1.3% sky2_rx_submit [sky2] 167.00 1.3% task_rq_lock [kernel] 153.00 1.2% udp_queue_rcv_skb [kernel] 149.00 1.2% _raw_spin_unlock_irqrestore [kernel] 147.00 1.2% ip_local_deliver [kernel] 144.00 1.1% resched_task [kernel] 137.00 1.1% sky2_remove [sky2] ------------------------------------------------------------------------------- PerfTop: 663 irqs/sec kernel:81.9% [1000Hz cycles], (all, cpu: 2) 
------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 129.00 7.0% _raw_spin_lock_irqsave [kernel] 84.00 4.5% fget [kernel] 83.00 4.5% system_call [kernel] 82.00 4.4% copy_user_generic_string [kernel] 67.00 3.6% _raw_spin_unlock_irqrestore [kernel] 63.00 3.4% datagram_poll [kernel] 57.00 3.1% udp_recvmsg [kernel] 55.00 3.0% sys_epoll_ctl [kernel] 55.00 3.0% vread_tsc [kernel].vsyscall_fn 43.00 2.3% sys_epoll_wait [kernel] 43.00 2.3% _raw_spin_lock_bh [kernel] 41.00 2.2% first_packet_length [kernel] 40.00 2.2% dst_release [kernel] 37.00 2.0% fput [kernel] 37.00 2.0% kmem_cache_free [kernel] 36.00 1.9% mutex_unlock [kernel] 35.00 1.9% schedule [kernel] 34.00 1.8% skb_copy_datagram_iovec [kernel] 34.00 1.8% ep_remove [kernel] 29.00 1.6% mutex_lock [kernel] 29.00 1.6% _raw_spin_lock [kernel] 28.00 1.5% __skb_recv_datagram [kernel] 25.00 1.4% epoll_ctl /lib/libc-2.7.so 25.00 1.4% tick_nohz_stop_sched_tick [kernel] ------------------------------------------------------------------------------- PerfTop: 629 irqs/sec kernel:81.1% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ______________________ 351.00 7.9% _raw_spin_lock_irqsave [kernel] 248.00 5.6% system_call [kernel] 219.00 5.0% fget [kernel] 194.00 4.4% copy_user_generic_string [kernel] 184.00 4.2% datagram_poll [kernel] 162.00 3.7% sys_epoll_ctl [kernel] 159.00 3.6% _raw_spin_unlock_irqrestore [kernel] 129.00 2.9% udp_recvmsg [kernel] 129.00 2.9% kmem_cache_free [kernel] 123.00 2.8% vread_tsc [kernel].vsyscall_fn 108.00 2.4% schedule [kernel] 107.00 2.4% _raw_spin_lock_bh [kernel] 104.00 2.4% sys_epoll_wait [kernel] 100.00 2.3% fput [kernel] 94.00 2.1% dst_release [kernel] 78.00 1.8% first_packet_length [kernel] 73.00 1.7% ep_remove [kernel] 69.00 1.6% epoll_ctl /lib/libc-2.7.so 66.00 1.5% skb_copy_datagram_iovec [kernel] 66.00 1.5% mutex_unlock [kernel] 64.00 1.4% __skb_recv_datagram [kernel] 64.00 1.4% mutex_lock [kernel] 57.00 1.3% sock_recv_ts_and_drops [kernel] 51.00 1.2% kmem_cache_alloc [kernel] 49.00 1.1% ep_send_events_proc [kernel] ------------------------------------------------------------------------------- PerfTop: 457 irqs/sec kernel:72.0% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ______________________ 411.00 7.8% _raw_spin_lock_irqsave [kernel] 280.00 5.3% system_call [kernel] 269.00 5.1% fget [kernel] 239.00 4.5% copy_user_generic_string [kernel] 232.00 4.4% datagram_poll [kernel] 175.00 3.3% _raw_spin_unlock_irqrestore [kernel] 170.00 3.2% sys_epoll_ctl [kernel] 169.00 3.2% kmem_cache_free [kernel] 149.00 2.8% udp_recvmsg [kernel] 144.00 2.7% vread_tsc [kernel].vsyscall_fn 129.00 2.4% sys_epoll_wait [kernel] 128.00 2.4% _raw_spin_lock_bh [kernel] 115.00 2.2% fput [kernel] 112.00 2.1% schedule [kernel] 108.00 2.0% dst_release [kernel] 88.00 1.7% first_packet_length [kernel] 86.00 1.6% ep_remove [kernel] 83.00 1.6% mutex_lock [kernel] 79.00 1.5% skb_copy_datagram_iovec [kernel] 76.00 1.4% mutex_unlock [kernel] 75.00 1.4% epoll_ctl /lib/libc-2.7.so 73.00 1.4% sock_recv_ts_and_drops [kernel] 67.00 1.3% __skb_recv_datagram [kernel] 65.00 1.2% tick_nohz_stop_sched_tick [kernel] Interesting stuff; check cache miss contributions - wow, how low is 
eth_type_trans.. and yet we keep optimizing that! ------------------------------------------------------------------------------- PerfTop: 1021 irqs/sec kernel:98.8% [1000Hz cache-misses], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _______________________________ ________ 5271.00 77.8% sky2_poll [sky2] 706.00 10.4% kmem_cache_alloc [kernel] 154.00 2.3% dev_gro_receive [kernel] 149.00 2.2% __napi_gro_receive [kernel] 128.00 1.9% napi_gro_receive [kernel] 106.00 1.6% __alloc_skb [kernel] 57.00 0.8% eth_type_trans [kernel] 45.00 0.7% skb_gro_reset_offset [kernel] 26.00 0.4% drain_array [kernel] 23.00 0.3% perf_session__mmap_read_counter perf 10.00 0.1% cache_alloc_refill [kernel] 9.00 0.1% __netdev_alloc_skb [kernel] 9.00 0.1% event__preprocess_sample perf ------------------------------------------------------------------------------- PerfTop: 997 irqs/sec kernel:100.0% [1000Hz cache-misses], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ____________________ ________ 3019.00 79.4% sky2_poll [sky2] 360.00 9.5% kmem_cache_alloc [kernel] 91.00 2.4% dev_gro_receive [kernel] 86.00 2.3% __alloc_skb [kernel] 83.00 2.2% __napi_gro_receive [kernel] 69.00 1.8% napi_gro_receive [kernel] 45.00 1.2% eth_type_trans [kernel] 25.00 0.7% skb_gro_reset_offset [kernel] 9.00 0.2% __netdev_alloc_skb [kernel] 5.00 0.1% cache_alloc_refill [kernel] 5.00 0.1% skb_pull [kernel] ------------------------------------------------------------------------------- PerfTop: 997 irqs/sec kernel:100.0% [1000Hz cache-misses], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ____________________ ________ 8887.00 79.8% sky2_poll [sky2] 1138.00 10.2% kmem_cache_alloc [kernel] 273.00 2.5% __napi_gro_receive [kernel] 246.00 2.2% dev_gro_receive [kernel] 189.00 1.7% napi_gro_receive [kernel] 159.00 1.4% __alloc_skb [kernel] 119.00 1.1% eth_type_trans [kernel] 86.00 0.8% skb_gro_reset_offset [kernel] 13.00 0.1% __netdev_alloc_skb [kernel] 8.00 0.1% skb_pull [kernel] 7.00 0.1% cache_alloc_refill [kernel] Not much going on in other cpus .. i.e hardly anything shows up in the profile .. 
IV: rps with ee and irq affinity to cpu0 Avg udp packets sunk: 95.15% ------------------------------------------------------------------------------- PerfTop: 3558 irqs/sec kernel:84.6% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 3096.00 17.1% sky2_poll [sky2] 645.00 3.6% _raw_spin_lock_irqsave [kernel] 493.00 2.7% system_call [kernel] 462.00 2.6% sky2_intr [sky2] 416.00 2.3% _raw_spin_unlock_irqrestore [kernel] 382.00 2.1% fget [kernel] 361.00 2.0% __netif_receive_skb [kernel] 342.00 1.9% ip_rcv [kernel] 334.00 1.8% _raw_spin_lock [kernel] 320.00 1.8% sys_epoll_ctl [kernel] 298.00 1.6% copy_user_generic_string [kernel] 288.00 1.6% call_function_single_interrup [kernel] 277.00 1.5% load_balance [kernel] 271.00 1.5% ip_route_input [kernel] 270.00 1.5% vread_tsc [kernel].vsyscall_fn 256.00 1.4% kmem_cache_free [kernel] 222.00 1.2% __udp4_lib_lookup [kernel] 222.00 1.2% schedule [kernel] 194.00 1.1% fput [kernel] 189.00 1.0% kmem_cache_alloc [kernel] 171.00 0.9% sys_epoll_wait [kernel] 164.00 0.9% ep_remove [kernel] ------------------------------------------------------------------------------- PerfTop: 3452 irqs/sec kernel:84.3% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 5033.00 16.2% sky2_poll [sky2] 1147.00 3.7% _raw_spin_lock_irqsave [kernel] 888.00 2.9% system_call [kernel] 774.00 2.5% sky2_intr [sky2] 757.00 2.4% _raw_spin_unlock_irqrestore [kernel] 702.00 2.3% fget [kernel] 630.00 2.0% __netif_receive_skb [kernel] 609.00 2.0% _raw_spin_lock [kernel] 607.00 2.0% ip_rcv [kernel] 553.00 1.8% sys_epoll_ctl [kernel] 514.00 1.7% ip_route_input [kernel] 508.00 1.6% call_function_single_interrup [kernel] 504.00 1.6% copy_user_generic_string [kernel] 466.00 1.5% kmem_cache_free [kernel] 452.00 1.5% schedule [kernel] 450.00 1.4% vread_tsc [kernel].vsyscall_fn 390.00 1.3% load_balance [kernel] 377.00 1.2% fput [kernel] 364.00 1.2% __udp4_lib_lookup [kernel] 329.00 1.1% kmem_cache_alloc [kernel] 314.00 1.0% ep_remove [kernel] 289.00 0.9% dst_release [kernel] 276.00 0.9% sys_epoll_wait [kernel] 265.00 0.9% datagram_poll [kernel] ------------------------------------------------------------------------------- PerfTop: 3328 irqs/sec kernel:85.7% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 6788.00 17.5% sky2_poll [sky2] 1413.00 3.6% _raw_spin_lock_irqsave [kernel] 1042.00 2.7% system_call [kernel] 997.00 2.6% sky2_intr [sky2] 903.00 2.3% _raw_spin_unlock_irqrestore [kernel] 837.00 2.2% fget [kernel] 740.00 1.9% _raw_spin_lock [kernel] 725.00 1.9% __netif_receive_skb [kernel] 722.00 1.9% ip_rcv [kernel] 651.00 1.7% sys_epoll_ctl [kernel] 609.00 1.6% call_function_single_interrup [kernel] 604.00 1.6% ip_route_input [kernel] 601.00 1.5% copy_user_generic_string [kernel] 573.00 1.5% schedule [kernel] 561.00 1.4% kmem_cache_free [kernel] 538.00 1.4% load_balance [kernel] 515.00 1.3% vread_tsc [kernel].vsyscall_fn 480.00 1.2% fput [kernel] 421.00 1.1% kmem_cache_alloc [kernel] 418.00 1.1% __udp4_lib_lookup [kernel] 377.00 1.0% ep_remove [kernel] 347.00 0.9% datagram_poll [kernel] 335.00 0.9% dst_release [kernel] 
------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:96.2% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 2109.00 61.3% sky2_poll [sky2] 366.00 10.6% sky2_intr [sky2] 84.00 2.4% __alloc_skb [kernel] 57.00 1.7% _raw_spin_lock_irqsave [kernel] 56.00 1.6% get_rps_cpu [kernel] 52.00 1.5% __kmalloc [kernel] 39.00 1.1% irq_entries_start [kernel] 39.00 1.1% enqueue_to_backlog [kernel] 34.00 1.0% kmem_cache_alloc [kernel] 33.00 1.0% default_send_IPI_mask_sequenc [kernel] 32.00 0.9% sky2_rx_submit [sky2] 30.00 0.9% swiotlb_sync_single [kernel] 28.00 0.8% _raw_spin_lock [kernel] 23.00 0.7% sky2_remove [sky2] 22.00 0.6% __smp_call_function_single [kernel] 19.00 0.6% system_call [kernel] 18.00 0.5% sys_epoll_ctl [kernel] 18.00 0.5% fget [kernel] 17.00 0.5% cache_alloc_refill [kernel] 16.00 0.5% copy_user_generic_string [kernel] 16.00 0.5% _raw_spin_unlock_irqrestore [kernel] 15.00 0.4% dev_gro_receive [kernel] 14.00 0.4% net_rx_action [kernel] ------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:97.9% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _______________________________ ____________________ 4479.00 60.9% sky2_poll [sky2] 849.00 11.5% sky2_intr [sky2] 163.00 2.2% __alloc_skb [kernel] 155.00 2.1% get_rps_cpu [kernel] 121.00 1.6% _raw_spin_lock_irqsave [kernel] 92.00 1.3% __kmalloc [kernel] 89.00 1.2% _raw_spin_lock [kernel] 83.00 1.1% enqueue_to_backlog [kernel] 79.00 1.1% irq_entries_start [kernel] 78.00 1.1% kmem_cache_alloc [kernel] 69.00 0.9% sky2_rx_submit [sky2] 65.00 0.9% swiotlb_sync_single [kernel] 58.00 0.8% default_send_IPI_mask_sequence_ [kernel] 50.00 0.7% system_call [kernel] 45.00 0.6% fget [kernel] 40.00 0.5% sky2_remove [sky2] 37.00 0.5% __smp_call_function_single [kernel] 36.00 0.5% datagram_poll [kernel] 36.00 0.5% _raw_spin_unlock_irqrestore [kernel] 34.00 0.5% cache_alloc_refill [kernel] 31.00 0.4% net_rx_action [kernel] 28.00 0.4% kmem_cache_free [kernel] 27.00 0.4% _raw_spin_lock_bh [kernel] 27.00 0.4% copy_user_generic_string [kernel] 25.00 0.3% dev_gro_receive [kernel] ------------------------------------------------------------------------------- PerfTop: 980 irqs/sec kernel:97.3% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _______________________________ ____________________ 6544.00 61.6% sky2_poll [sky2] 1098.00 10.3% sky2_intr [sky2] 248.00 2.3% __alloc_skb [kernel] 198.00 1.9% get_rps_cpu [kernel] 182.00 1.7% _raw_spin_lock_irqsave [kernel] 144.00 1.4% __kmalloc [kernel] 138.00 1.3% _raw_spin_lock [kernel] 127.00 1.2% kmem_cache_alloc [kernel] 125.00 1.2% irq_entries_start [kernel] 119.00 1.1% enqueue_to_backlog [kernel] 93.00 0.9% sky2_rx_submit [sky2] 91.00 0.9% swiotlb_sync_single [kernel] 83.00 0.8% default_send_IPI_mask_sequence_ [kernel] 82.00 0.8% system_call [kernel] 64.00 0.6% sky2_remove [sky2] 60.00 0.6% fget [kernel] 58.00 0.5% cache_alloc_refill [kernel] 57.00 0.5% _raw_spin_unlock_irqrestore [kernel] 51.00 0.5% datagram_poll [kernel] 47.00 0.4% copy_user_generic_string [kernel] ------------------------------------------------------------------------------- PerfTop: 315 irqs/sec 
kernel:81.0% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 114.00 4.5% system_call [kernel] 98.00 3.9% _raw_spin_lock_irqsave [kernel] 89.00 3.5% _raw_spin_unlock_irqrestore [kernel] 89.00 3.5% ip_rcv [kernel] 83.00 3.3% call_function_single_interrup [kernel] 76.00 3.0% __netif_receive_skb [kernel] 67.00 2.6% fget [kernel] 62.00 2.4% ip_route_input [kernel] 59.00 2.3% vread_tsc [kernel].vsyscall_fn 54.00 2.1% kmem_cache_free [kernel] 54.00 2.1% sys_epoll_ctl [kernel] 51.00 2.0% schedule [kernel] 49.00 1.9% _raw_spin_lock [kernel] 49.00 1.9% __udp4_lib_lookup [kernel] 44.00 1.7% ep_remove [kernel] 44.00 1.7% copy_user_generic_string [kernel] 41.00 1.6% fput [kernel] 38.00 1.5% sys_epoll_wait [kernel] 37.00 1.5% tick_nohz_stop_sched_tick [kernel] 36.00 1.4% kmem_cache_alloc [kernel] 34.00 1.3% datagram_poll [kernel] 33.00 1.3% __udp4_lib_rcv [kernel] 31.00 1.2% process_recv mcpudp ------------------------------------------------------------------------------- PerfTop: 292 irqs/sec kernel:82.9% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 154.00 4.7% _raw_spin_lock_irqsave [kernel] 140.00 4.2% system_call [kernel] 111.00 3.4% ip_rcv [kernel] 106.00 3.2% _raw_spin_unlock_irqrestore [kernel] 96.00 2.9% call_function_single_interrup [kernel] 95.00 2.9% fget [kernel] 90.00 2.7% __netif_receive_skb [kernel] 89.00 2.7% sys_epoll_ctl [kernel] 77.00 2.3% copy_user_generic_string [kernel] 77.00 2.3% ip_route_input [kernel] 76.00 2.3% kmem_cache_free [kernel] 74.00 2.2% _raw_spin_lock [kernel] 71.00 2.1% schedule [kernel] 69.00 2.1% vread_tsc [kernel].vsyscall_fn 58.00 1.8% __udp4_lib_lookup [kernel] 52.00 1.6% __udp4_lib_rcv [kernel] 51.00 1.5% fput [kernel] 47.00 1.4% ep_remove [kernel] 47.00 1.4% event_base_loop libevent-1.3e.so.1.0.3 39.00 1.2% process_recv mcpudp 39.00 1.2% sys_epoll_wait [kernel] 38.00 1.2% udp_recvmsg [kernel] 38.00 1.2% sock_recv_ts_and_drops [kernel] 37.00 1.1% __switch_to [kernel] ------------------------------------------------------------------------------- PerfTop: 290 irqs/sec kernel:82.1% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ _____________________________ ______________________ 175.00 4.7% _raw_spin_lock_irqsave [kernel] 153.00 4.2% system_call [kernel] 122.00 3.3% ip_rcv [kernel] 114.00 3.1% _raw_spin_unlock_irqrestore [kernel] 114.00 3.1% fget [kernel] 105.00 2.8% __netif_receive_skb [kernel] 101.00 2.7% sys_epoll_ctl [kernel] 100.00 2.7% call_function_single_interrup [kernel] 90.00 2.4% copy_user_generic_string [kernel] 84.00 2.3% schedule [kernel] 76.00 2.1% kmem_cache_free [kernel] 76.00 2.1% _raw_spin_lock [kernel] 72.00 2.0% ip_route_input [kernel] 70.00 1.9% vread_tsc [kernel].vsyscall_fn 68.00 1.8% __udp4_lib_lookup [kernel] 68.00 1.8% __udp4_lib_rcv [kernel] 57.00 1.5% ep_remove [kernel] 57.00 1.5% fput [kernel] 55.00 1.5% kmem_cache_alloc [kernel] 51.00 1.4% process_recv mcpudp ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 23:44 ` [PATCH net-next-2.6] net: speedup udp receive path jamal
@ 2010-04-29  0:00   ` jamal
  2010-04-29  4:09   ` Eric Dumazet
  1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-04-29 0:00 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Wed, 2010-04-28 at 19:45 -0400, jamal wrote:
> Your patch has improved the performance of rps relative to what is in
> net-next only very slightly; but it has also improved the performance of
> non-rps;->

Correction: the last part of that sentence is not true (obvious if you
look at the results I attached).

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 23:44 ` [PATCH net-next-2.6] net: speedup udp receive path jamal
  2010-04-29  0:00   ` jamal
@ 2010-04-29  4:09   ` Eric Dumazet
  2010-04-29 11:35     ` jamal
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 4:09 UTC (permalink / raw)
To: hadi
Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Wednesday, 28 April 2010 at 19:44 -0400, jamal wrote:
> On Wed, 2010-04-28 at 16:06 +0200, Eric Dumazet wrote:
>
> > Here it is ;)
>
> Sorry - things got a little hectic with TheMan.
>
> I am afraid I don't have good news.
> Actually, I should say I don't have good news in regards to rps.
> For my sample app, two things seem to be happening:
> a) The overall performance has gotten better for both rps and non-rps.
> b) non-rps is now performing relatively better.
>
> This is just what I see in net-next, not related to your patch.
> It seems the kernels I tested prior to April 23 showed rps better.
> The one I tested on Apr 23 showed rps being about the same as non-rps.
> As I stated in my last result posting, I thought I hadn't tested
> properly, but I tested again today and saw the same thing. And now
> non-rps is _consistently_ better.
> So some regression is going on...
>
> Your patch has improved the performance of rps relative to what is in
> net-next only very slightly; but it has also improved the performance of
> non-rps;->
> My traces look different for the app cpu than yours - likely because
> the apps are different.
>
> At the moment I don't have time to dig deeper into the code, but I can
> test as cycles show up.
>
> I am attaching the profile traces and results.
>
> cheers,
> jamal

Hi Jamal

I don't see in your results the number of pps, the number of udp ports,
or the number of flows.

In my latest results, I can handle more pps than before, regardless of
rps being on or off, and with various numbers of udp ports (one user
thread per port) and numbers of flows (many src addrs, so that rps
spreads packets over many cpus).

If/when contention windows are smaller, can a cpu run uncontended and
spend more cycles processing more frames?

With a not-yet-published patch, I can even reach 600,000 pps in DDoS
situations, instead of 400,000.

Thanks !

^ permalink raw reply [flat|nested] 108+ messages in thread
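The "many src addrs" point matters because rps picks the target cpu from a hash over a flow's addresses and ports: a single flow always lands on the same cpu, so spreading only happens across distinct flows. As a rough illustration only - a user-space sketch, not the kernel code, with pick_rps_cpu() a made-up name - the selection step in get_rps_cpu() scales the 32-bit flow hash (skb->rxhash, computed with jhash in the kernel and just an input here) onto the configured cpu map:

#include <stdint.h>
#include <stdio.h>

static int pick_rps_cpu(uint32_t rxhash, const int *map_cpus, unsigned int map_len)
{
	/* map the hash onto [0, map_len) with a multiply and a shift,
	 * avoiding a per-packet division */
	return map_cpus[((uint64_t)rxhash * map_len) >> 32];
}

int main(void)
{
	int map[] = { 1, 2, 3, 5, 6, 7 };	/* an rps mask of 0xee: cpus 1-3 and 5-7 */
	int i;

	for (i = 0; i < 8; i++) {
		uint32_t hash = (uint32_t)i << 29;	/* a few sample hash values */

		printf("rxhash %08x -> cpu %d\n", hash, pick_rps_cpu(hash, map, 6));
	}
	return 0;
}

Two identical hashes always map to the same cpu, which is why a test with only a few flows cannot exercise more cpus than it has flows.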
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29  4:09 ` Eric Dumazet
@ 2010-04-29 11:35   ` jamal
  2010-04-29 12:12     ` Changli Gao
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 11:35 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 2089 bytes --]

On Thu, 2010-04-29 at 06:09 +0200, Eric Dumazet wrote:

> I don't see in your results the number of pps, the number of udp ports,
> or the number of flows.

My test scenario is still the same: send 1M packets of 8 flows,
round-robin, at 750Kpps. Repeat the test 4-6 times and average the
results. The 8 flows map to 8 cpus. At any rate above 750Kpps the driver
starts dropping. The flows are {fixed dst IP, fixed src IP, fixed src
port, 8 variable dst ports}. ip_rcv and friends show up in the profile,
as we have already discussed, but I don't want to change the test
characteristics because then I can't do a fair backward comparison.
Also, I use rps mask ee to use all the cpus except the core doing the
demux (core 0). In the results, when I say "udp sink 90%" it means 90%
of 750Kpps was successfully received by the app (on the multiple cpus).

> In my latest results, I can handle more pps than before, regardless of
> rps being on or off,

Same here - even in my worst-case scenario, 88.5% of 750Kpps > 600Kpps.
Attached are the historical results, to make more sense of what I am
saying: we have net-next kernels from apr14, apr23, apr23 with Changli's
change, apr28, and apr28 with your change. What you'll see is that
non-rps (blue) gets better, and rps (orange) gets better slowly, then by
apr28 it is worse.

> and with various numbers of udp ports (one user thread per port) and
> numbers of flows (many src addrs, so that rps spreads packets over
> many cpus).

This is true for me, except for non-rps getting relatively better and
rps getting worse in plain net-next for Apr 28. Sorry, I don't have time
to dissect where things changed, but I figured if I reported it, it
might point to something obvious.

> If/when contention windows are smaller, can a cpu run uncontended and
> spend more cycles processing more frames?
>
> With a not-yet-published patch, I can even reach 600,000 pps in DDoS
> situations, instead of 400,000.

So my tests are simpler. What I was hoping to see was, at minimum, rps
maintaining its gap of 6-7% more capacity. I don't mind seeing rps get
better. If both rps and non-rps get better, that is even more
interesting.

cheers,
jamal

[-- Attachment #2: rps-hist.pdf --]
[-- Type: application/pdf, Size: 212033 bytes --]

^ permalink raw reply [flat|nested] 108+ messages in thread
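For reference, the rps mask jamal mentions is set per receive queue through sysfs - in shell it is just an echo of the mask into rps_cpus. A minimal C equivalent, as a sketch (eth0 and rx-0 are example names, not taken from jamal's setup):

#include <stdio.h>

int main(void)
{
	/* 0xee = 11101110 binary, i.e. bits 1-3 and 5-7: packets are
	 * steered to those six cpus; cpu 0 (the core taking the NIC
	 * interrupt) and cpu 4 stay out of the map. */
	const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "ee\n");
	return fclose(f) ? 1 : 0;
}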
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 11:35 ` jamal
@ 2010-04-29 12:12   ` Changli Gao
  2010-04-29 12:45     ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: Changli Gao @ 2010-04-29 12:12 UTC (permalink / raw)
To: hadi
Cc: Eric Dumazet, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thu, Apr 29, 2010 at 7:35 PM, jamal <hadi@cyberus.ca> wrote:
>
> Same here - even in my worst-case scenario, 88.5% of 750Kpps > 600Kpps.
> Attached are the historical results, to make more sense of what I am
> saying: we have net-next kernels from apr14, apr23, apr23 with Changli's
> change, apr28, and apr28 with your change. What you'll see is that
> non-rps (blue) gets better, and rps (orange) gets better slowly, then by
> apr28 it is worse.

Did the number of IPIs increase in the apr28 test? The final patch with
Eric's change may introduce more IPIs. And I am wondering why
23rdcl-non-rps is better than before. Maybe it is a side effect of my
patch: it effectively enlarges the netdev_max_backlog.

--
Regards,
Changli Gao (xiaosuo@gmail.com)

^ permalink raw reply [flat|nested] 108+ messages in thread
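The netdev_max_backlog Changli refers to is the per-cpu input queue limit; the "enlarging" he means is a side effect of how his patch queues packets, not a change to the sysctl itself. To rule out two test kernels simply running with different configured limits, the value can be read back from procfs; a minimal sketch:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/netdev_max_backlog", "r");
	int limit;

	if (!f) {
		perror("netdev_max_backlog");
		return 1;
	}
	if (fscanf(f, "%d", &limit) == 1)
		printf("netdev_max_backlog = %d\n", limit);	/* 1000 by default */
	fclose(f);
	return 0;
}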
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 12:12 ` Changli Gao
@ 2010-04-29 12:45   ` Eric Dumazet
  2010-04-29 13:17     ` jamal
  2010-04-29 23:07     ` Changli Gao
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 12:45 UTC (permalink / raw)
To: Changli Gao
Cc: hadi, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thursday, 29 April 2010 at 20:12 +0800, Changli Gao wrote:
> On Thu, Apr 29, 2010 at 7:35 PM, jamal <hadi@cyberus.ca> wrote:
> >
> > Same here - even in my worst-case scenario, 88.5% of 750Kpps > 600Kpps.
> > Attached are the historical results, to make more sense of what I am
> > saying: we have net-next kernels from apr14, apr23, apr23 with Changli's
> > change, apr28, and apr28 with your change. What you'll see is that
> > non-rps (blue) gets better, and rps (orange) gets better slowly, then by
> > apr28 it is worse.
>
> Did the number of IPIs increase in the apr28 test? The final patch with
> Eric's change may introduce more IPIs. And I am wondering why
> 23rdcl-non-rps is better than before. Maybe it is a side effect of my
> patch: it effectively enlarges the netdev_max_backlog.

Changli, I wonder how you can cook up "performance" patches without
testing them at all for real... Surely this cannot be true?

When the cpu doing the device softirq is flooded, it handles 300 packets
per net_rx_action() round (netdev_budget), so it sends at most 6 IPIs
per 300 packets - with or without my patch, and with or without your
patch as well.

("At most" because if the remote cpus are flooded as well, they don't
napi_complete, so no IPI is needed at all.)

(My patch had an effect only on normal load, i.e. one packet received
once in a while... up to 50,000 pps, I would say.) And it also has a
nice effect on non-rps loads (mostly the more typical load for the
following years). If a second packet comes 3us after the first one, and
before the 2nd cpu has handled it, we _can_ afford an extra IPI.

750,000 / 50 = 15,000 IPIs per second.

Even with 200,000 IPIs per second, 'perf top -C CPU_IPI_sender' shows
that sending IPIs is very cheap (maybe ~1% of cpu cycles):

# Samples: 32033467127
#
# Overhead  Command      Shared Object  Symbol
# ........  .......  .................  ......
    18.05%  init  [kernel.kallsyms]  [k] poll_idle
    10.91%  init  [kernel.kallsyms]  [k] bnx2x_rx_int
    10.42%  init  [kernel.kallsyms]  [k] eth_type_trans
     5.72%  init  [kernel.kallsyms]  [k] kmem_cache_alloc_node
     5.43%  init  [kernel.kallsyms]  [k] __memset
     5.20%  init  [kernel.kallsyms]  [k] get_rps_cpu
     4.82%  init  [kernel.kallsyms]  [k] __slab_alloc
     4.34%  init  [kernel.kallsyms]  [k] get_partial_node
     4.22%  init  [kernel.kallsyms]  [k] _raw_spin_lock
     3.41%  init  [kernel.kallsyms]  [k] __kmalloc_node_track_caller
     3.01%  init  [kernel.kallsyms]  [k] __alloc_skb
     2.22%  init  [kernel.kallsyms]  [k] enqueue_to_backlog
     2.10%  init  [kernel.kallsyms]  [k] vlan_gro_common
     1.34%  init  [kernel.kallsyms]  [k] swiotlb_map_page
     1.25%  init  [kernel.kallsyms]  [k] skb_put
     1.06%  init  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
     0.92%  init  [kernel.kallsyms]  [k] dev_gro_receive
     0.88%  init  [kernel.kallsyms]  [k] swiotlb_dma_mapping_error
     0.83%  init  [kernel.kallsyms]  [k] vlan_gro_receive
     0.83%  init  [kernel.kallsyms]  [k] __phys_addr
     0.83%  init  [kernel.kallsyms]  [k] __napi_complete
     0.83%  init  [kernel.kallsyms]  [k] default_send_IPI_mask_sequence_phys
     0.77%  init  [kernel.kallsyms]  [k] is_swiotlb_buffer
     0.76%  init  [kernel.kallsyms]  [k] __netdev_alloc_skb
     0.74%  init  [kernel.kallsyms]  [k] deactivate_slab
     0.73%  init  [kernel.kallsyms]  [k] netif_receive_skb
     0.72%  init  [kernel.kallsyms]  [k] unmap_single
     0.69%  init  [kernel.kallsyms]  [k] csd_lock
     0.63%  init  [kernel.kallsyms]  [k] bnx2x_poll
     0.61%  init  [kernel.kallsyms]  [k] bnx2x_msix_fp_int
     0.59%  init  [kernel.kallsyms]  [k] irq_entries_start
     0.59%  init  [kernel.kallsyms]  [k] swiotlb_sync_single
     0.54%  init  [kernel.kallsyms]  [k] get_slab
     0.46%  init  [kernel.kallsyms]  [k] napi_skb_finish

^ permalink raw reply [flat|nested] 108+ messages in thread
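Eric's bound, restated as a tiny calculation (constants from the thread: netdev_budget of 300, and with an rps mask of ee there are at most six remote cpus to signal per net_rx_action() round):

#include <stdio.h>

int main(void)
{
	const unsigned int pps = 750000;	/* jamal's offered load */
	const unsigned int budget = 300;	/* netdev_budget: packets per net_rx_action() round */
	const unsigned int remote_cpus = 6;	/* rps mask ee: six target cpus */

	/* at most one IPI per remote cpu per round, so under flood the
	 * cost is bounded by one IPI per (budget / remote_cpus) packets */
	unsigned int pkts_per_ipi = budget / remote_cpus;	/* 50 */
	unsigned int ipis_per_sec = pps / pkts_per_ipi;		/* 15000 */

	printf("at most %u IPIs/sec at %u pps\n", ipis_per_sec, pps);
	return 0;
}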
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 12:45 ` Eric Dumazet
@ 2010-04-29 13:17   ` jamal
  2010-04-29 13:21     ` Eric Dumazet
  2010-04-29 23:07   ` Changli Gao
  1 sibling, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 13:17 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thu, 2010-04-29 at 14:45 +0200, Eric Dumazet wrote:
>
> Changli, I wonder how you can cook up "performance" patches without
> testing them at all for real... Surely this cannot be true?

Eric, I am with you; however, you are in the minority of people who test
and produce numbers ;-> The system rewards people for sending patches,
not much for anything else - so I can't blame Changli ;->

> When the cpu doing the device softirq is flooded, it handles 300 packets
> per net_rx_action() round (netdev_budget), so it sends at most 6 IPIs
> per 300 packets - with or without my patch, and with or without your
> patch as well.
>
> ("At most" because if the remote cpus are flooded as well, they don't
> napi_complete, so no IPI is needed at all.)
>
> (My patch had an effect only on normal load, i.e. one packet received
> once in a while... up to 50,000 pps, I would say.) And it also has a
> nice effect on non-rps loads (mostly the more typical load for the
> following years). If a second packet comes 3us after the first one, and
> before the 2nd cpu has handled it, we _can_ afford an extra IPI.
>
> 750,000 / 50 = 15,000 IPIs per second.

Could we have some stat in there that shows IPIs being produced? I think
it would help to at least observe any changes over a variety of tests.
I did try to patch my system during the first few tests to record IPIs,
but it seems to make more sense to have it as a perf stat.

> Even with 200,000 IPIs per second, 'perf top -C CPU_IPI_sender' shows
> that sending IPIs is very cheap (maybe ~1% of cpu cycles):
>
> # Samples: 32033467127
> #

One thing I observed is that our profiles seem different. Could you send
me your .config for a single nehalem, and I will try to get as close as
possible to it? I have a sky2 instead of a bnx - but I suspect
everything else will be very similar... I apologize that I don't have
much time to look into details - but what I can do is test, at least.

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:17 ` jamal
@ 2010-04-29 13:21   ` Eric Dumazet
  2010-04-29 13:37     ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 13:21 UTC (permalink / raw)
To: hadi
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thursday, 29 April 2010 at 09:17 -0400, jamal wrote:
> Could we have some stat in there that shows IPIs being produced? I think
> it would help to at least observe any changes over a variety of tests.
> I did try to patch my system during the first few tests to record IPIs,
> but it seems to make more sense to have it as a perf stat.
>
> One thing I observed is that our profiles seem different. Could you send
> me your .config for a single nehalem, and I will try to get as close as
> possible to it? I have a sky2 instead of a bnx - but I suspect
> everything else will be very similar...
> I apologize that I don't have much time to look into details - but what
> I can do is test, at least.

I'm going to redo some tests on my 'old machine', with the tg3 driver.

You could try the following program:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

struct softnet_stat_vals {
	int flip;
	unsigned int tab[2][10];
};

/* Sum /proc/net/softnet_stat (one row per cpu, ten 8-digit hex fields)
 * into tab[flip]; the previous snapshot stays in tab[flip ^ 1]. */
int read_file(struct softnet_stat_vals *v)
{
	char buffer[1024];
	FILE *F = fopen("/proc/net/softnet_stat", "r");

	if (!F)
		return -1;
	v->flip ^= 1;
	memset(v->tab[v->flip], 0, 10 * sizeof(unsigned int));
	while (fgets(buffer, sizeof(buffer), F)) {
		int i, pos = 0;
		unsigned int val;

		for (i = 0; ;) {
			if (sscanf(buffer + pos, "%08x", &val) != 1)
				break;
			v->tab[v->flip][i] += val;
			pos += 9;
			if (++i == 10)
				break;
		}
	}
	fclose(F);
	return 0;
}

int main(int argc, char *argv[])
{
	struct softnet_stat_vals *v = calloc(1, sizeof(struct softnet_stat_vals));

	read_file(v);
	for (;;) {
		sleep(1);
		read_file(v);
		/* per-second delta of the last column, the rps counter */
		printf("%u rps\n", v->tab[v->flip][9] - v->tab[v->flip ^ 1][9]);
	}
}

^ permalink raw reply [flat|nested] 108+ messages in thread
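A note on what the program above reads: /proc/net/softnet_stat has one row per cpu, each row ten hex fields, and the last field is the rps counter that the program sums across cpus and differences, so its once-a-second output is the system-wide rps event rate - roughly the IPI stat jamal asked for. It builds with a plain gcc invocation and needs no extra libraries.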
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:21 ` Eric Dumazet
@ 2010-04-29 13:37   ` jamal
  2010-04-29 13:49     ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 13:37 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thu, 2010-04-29 at 15:21 +0200, Eric Dumazet wrote:
>
> You could try the following program:

Will do later today (the test machine is not on the network and is about
20 minutes from here, so worst case I will get you results by the end of
the day). I guess this program is good enough, since it tells me the
system-wide IPI count - what my patch did was also break it down by
which cpu got how many IPIs (it served to check whether the distribution
was uneven).

> Is your application mono-threaded and receiving data on 8 sockets?

I fork one instance per detected cpu and bind to a different port each
time. For example: bind to port 8200 on cpu0, 8201 on cpu1, etc.

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:37 ` jamal
@ 2010-04-29 13:49   ` Eric Dumazet
  2010-04-29 13:56     ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 13:49 UTC (permalink / raw)
To: hadi
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thursday, 29 April 2010 at 09:37 -0400, jamal wrote:
> On Thu, 2010-04-29 at 15:21 +0200, Eric Dumazet wrote:
> >
> > You could try the following program:
>
> Will do later today (the test machine is not on the network and is about
> 20 minutes from here, so worst case I will get you results by the end of
> the day). I guess this program is good enough, since it tells me the
> system-wide IPI count - what my patch did was also break it down by
> which cpu got how many IPIs (it served to check whether the distribution
> was uneven).
>
> > Is your application mono-threaded and receiving data on 8 sockets?
>
> I fork one instance per detected cpu and bind to a different port each
> time. For example: bind to port 8200 on cpu0, 8201 on cpu1, etc.

I guess this is the problem ;)

With RPS, you should not bind your threads to a cpu.
It is the rps hash that will decide for you.

I am using the following program:

/*
 * Usage: udpsink [-p baseport] nbports
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <pthread.h>

struct worker_data {
	int fd;
	unsigned long pack_count;
	unsigned long bytes_count;
	unsigned long _padd[16 - 3]; /* pad so workers don't share cache lines */
};

void usage(int code)
{
	fprintf(stderr, "Usage: udpsink [-p baseport] nbports\n");
	exit(code);
}

/* one receiver thread: count packets and bytes on its socket */
void *worker_func(void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;
	char buffer[4096];
	struct sockaddr_in addr;
	int lu;

	while (1) {
		socklen_t len = sizeof(addr);

		lu = recvfrom(wdata->fd, buffer, sizeof(buffer), 0,
			      (struct sockaddr *)&addr, &len);
		if (lu > 0) {
			wdata->pack_count++;
			wdata->bytes_count += lu;
		}
	}
}

int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;
	int concurrent = 0;
	int verbose = 0;
	int i;

	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}

	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	wdata = calloc(nbthreads, sizeof(struct worker_data));
	if (!wdata) {
		perror("calloc");
		return 1;
	}
	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;

		if (i && concurrent) {
			/* -c: all threads share the first socket */
			wdata[i].fd = wdata[0].fd;
		} else {
			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
//			addr.sin_addr.s_addr = inet_addr(argv[optind]);
			addr.sin_port = htons(baseport + i);
			if (bind(wdata[i].fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
				perror("bind");
				return 1;
			}
//			fcntl(wdata[i].fd, F_SETFL, O_NDELAY);
		}
		pthread_create(&tid, NULL, worker_func, wdata + i);
	}

	/* once a second, print the aggregate packet rate */
	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++)
			total += wdata[i].pack_count;
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i, wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}

^ permalink raw reply [flat|nested] 108+ messages in thread
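A note for reproducing Eric's runs: udpsink must be linked against pthreads (for example gcc -O2 -o udpsink udpsink.c -lpthread). 'udpsink 8' starts one receiver thread per port on ports 4000-4007; -p changes the base port, -c makes every thread share the first socket instead of binding its own port, and -v adds per-thread packet counts to the once-a-second totals.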
* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:49 ` Eric Dumazet
@ 2010-04-29 13:56   ` jamal
  2010-04-29 20:36     ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 13:56 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thu, 2010-04-29 at 15:49 +0200, Eric Dumazet wrote:

> > I fork one instance per detected cpu and bind to a different port each
> > time. For example: bind to port 8200 on cpu0, 8201 on cpu1, etc.
>
> I guess this is the problem ;)
>
> With RPS, you should not bind your threads to a cpu.
> It is the rps hash that will decide for you.

Sorry - I was not clear; I have the option of binding to a cpu via the
setsched api, but what I meant in this case is (see the sketch after
this message):
- for each cpu detected, fork
  -- open a socket
  --- bind it to udp port 8200 + cpu#
I could also bind to a cpu in that last step, and I did notice that it
improved distribution - but none of my tests since apr23 do that ;->

> I am using the following program:

I will try your program instead, so we can reduce the variables.

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
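The sketch promised above: a minimal version of jamal's fork-per-cpu pattern (one process per detected cpu, each bound to udp port 8200 + cpu#). It is an illustration, not jamal's tool: his program drives the sockets with epoll via libevent, which is omitted here, and this version does no cpu binding.

#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	long i;

	for (i = 0; i < ncpus; i++) {
		struct sockaddr_in addr;
		char buf[4096];
		int fd;

		if (fork() != 0)
			continue;	/* parent: spawn the next sink */

		/* child: one socket bound to udp port 8200 + cpu#,
		 * then a plain blocking receive loop */
		fd = socket(PF_INET, SOCK_DGRAM, 0);
		if (fd == -1) {
			perror("socket");
			exit(1);
		}
		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;
		addr.sin_port = htons(8200 + i);
		if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
			perror("bind");
			exit(1);
		}
		for (;;)
			recv(fd, buf, sizeof(buf), 0);
	}
	while (wait(NULL) > 0)
		;	/* parent: wait on all children */
	return 0;
}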
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-29 13:56 ` jamal @ 2010-04-29 20:36 ` jamal 2010-04-29 21:01 ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet 2010-04-30 19:30 ` [PATCH net-next-2.6] net: speedup udp receive path jamal 0 siblings, 2 replies; 108+ messages in thread
From: jamal @ 2010-04-29 20:36 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 738 bytes --]

On Thu, 2010-04-29 at 09:56 -0400, jamal wrote:
> > I will try your program instead so we can reduce the variables

Results attached. With your app, rps does a hell of a lot better and non-rps worse ;->
With my proggie, non-rps does much better than yours, and rps does a lot worse for the same setup. I see the scheduler kicking in quite a bit in non-rps for you...

The main difference between us, as I see it, is:
a) I use epoll - actually linked to libevent (1.0.something)
b) I fork processes and you use pthreads.

I don't have time to chase it today, but 1) I am either going to change yours to use libevent or make mine get rid of it, then 2) move towards pthreads or have yours fork.. then observe if that makes any difference..

cheers, jamal

[-- Attachment #2: apr29-res.txt --]
[-- Type: text/plain, Size: 29074 bytes --]

No RPS; same kernel as yesterday with Eric's changes

-------------------------------------------------------------------------------
   PerfTop: 2572 irqs/sec kernel:94.7% [1000Hz cycles], (all, 8 CPUs)
-------------------------------------------------------------------------------

samples pcnt function DSO
2901.00 17.4% sky2_poll [sky2]
781.00 4.7% schedule [kernel]
574.00 3.4% __skb_recv_datagram [kernel]
518.00 3.1% _raw_spin_lock_irqsave [kernel]
460.00 2.8% udp_recvmsg [kernel]
457.00 2.7% copy_user_generic_string [kernel]
397.00 2.4% _raw_spin_lock_bh [kernel]
340.00 2.0% __udp4_lib_lookup [kernel]
320.00 1.9% ip_route_input [kernel]
295.00 1.8% _raw_spin_lock [kernel]
293.00 1.8% dst_release [kernel]
282.00 1.7% ip_rcv [kernel]
275.00 1.6% skb_copy_datagram_iovec [kernel]
263.00 1.6% __switch_to [kernel]
257.00 1.5% __alloc_skb [kernel]
256.00 1.5% system_call [kernel]
243.00 1.5% sock_recv_ts_and_drops [kernel]
227.00 1.4% sock_queue_rcv_skb [kernel]
225.00 1.3% _raw_spin_unlock_irqrestore [kernel]
220.00 1.3% fget_light [kernel]
218.00 1.3% pick_next_task_fair [kernel]

-------------------------------------------------------------------------------
   PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0)
-------------------------------------------------------------------------------

samples pcnt function DSO
1508.00 37.9% sky2_poll [sky2]
198.00 5.0% ip_route_input [kernel]
184.00 4.6% __udp4_lib_lookup [kernel]
172.00 4.3% ip_rcv [kernel]
139.00 3.5% _raw_spin_lock [kernel]
131.00 3.3% __alloc_skb [kernel]
130.00 3.3% sock_queue_rcv_skb [kernel]
111.00 2.8% __udp4_lib_rcv [kernel]
101.00 2.5% __netif_receive_skb [kernel]
78.00 2.0% select_task_rq_fair [kernel]
74.00 1.9% try_to_wake_up [kernel]
73.00 1.8% sock_def_readable [kernel]
72.00 1.8% _raw_spin_lock_irqsave [kernel]
67.00 1.7% task_rq_lock [kernel]
66.00 1.7% _raw_read_lock [kernel]
64.00 1.6% __kmalloc [kernel]
62.00 1.6% resched_task [kernel]
61.00 1.5% sky2_rx_submit [sky2]
52.00 1.3% ip_local_deliver [kernel]
51.00 1.3% kmem_cache_alloc [kernel]
51.00 1.3% swiotlb_sync_single [kernel]
43.00 1.1% sky2_remove [sky2]
41.00 1.0% udp_queue_rcv_skb [kernel]
39.00 1.0% __wake_up_common [kernel]

-------------------------------------------------------------------------------
   PerfTop: 368 irqs/sec kernel:95.9% [1000Hz cycles], (all, cpu: 1)
-------------------------------------------------------------------------------

samples pcnt function DSO
279.00 8.2% schedule [kernel]
260.00 7.7% __skb_recv_datagram [kernel]
196.00 5.8% _raw_spin_lock_bh [kernel]
180.00 5.3% copy_user_generic_string [kernel]
176.00 5.2% udp_recvmsg [kernel]
150.00 4.4% _raw_spin_lock_irqsave [kernel]
142.00 4.2% dst_release [kernel]
106.00 3.1% skb_copy_datagram_iovec [kernel]
97.00 2.9% sock_recv_ts_and_drops [kernel]
93.00 2.7% tick_nohz_stop_sched_tick [kernel]
89.00 2.6% sys_recvfrom [kernel]
89.00 2.6% __switch_to [kernel]
86.00 2.5% pick_next_task_fair [kernel]
82.00 2.4% sock_rfree [kernel]
75.00 2.2% system_call [kernel]
73.00 2.2% fget_light [kernel]
70.00 2.1% _raw_spin_lock_irq [kernel]
63.00 1.9% kmem_cache_free [kernel]
61.00 1.8% _raw_spin_unlock_irqrestore [kernel]
60.00 1.8% kfree [kernel]
56.00 1.7% select_nohz_load_balancer [kernel]
55.00 1.6% finish_task_switch [kernel]
48.00 1.4% inet_recvmsg [kernel]
41.00 1.2% security_socket_recvmsg [kernel]

-------------------------------------------------------------------------------
   PerfTop: 97 irqs/sec kernel:81.4% [1000Hz cycles], (all, cpu: 7)
-------------------------------------------------------------------------------

samples pcnt function DSO
55.00 10.8% schedule [kernel]
38.00 7.5% __skb_recv_datagram [kernel]
36.00 7.1% udp_recvmsg [kernel]
32.00 6.3% _raw_spin_lock_irqsave [kernel]
31.00 6.1% _raw_spin_lock_bh [kernel]
30.00 5.9% copy_user_generic_string [kernel]
29.00 5.7% sock_recv_ts_and_drops [kernel]
27.00 5.3% skb_copy_datagram_iovec [kernel]
17.00 3.3% system_call [kernel]
17.00 3.3% dst_release [kernel]
14.00 2.7% _raw_spin_unlock_irqrestore [kernel]
12.00 2.4% __switch_to [kernel]
12.00 2.4% pick_next_task_fair [kernel]
11.00 2.2% inet_recvmsg [kernel]
11.00 2.2% sys_recvfrom [kernel]
10.00 2.0% finish_task_switch [kernel]
10.00 2.0% sock_rfree [kernel]
10.00 2.0% select_nohz_load_balancer [kernel]
7.00 1.4% rcu_enter_nohz [kernel]
7.00 1.4% tick_nohz_stop_sched_tick [kernel]
7.00 1.4% tick_nohz_restart_sched_tick [kernel]
5.00 1.0% ktime_get [kernel]

Run1
----
557257 pps (557257 0:69750 1:69417 2:69063 3:68818 4:70139 5:69824 6:70135 7:70113)
737468 pps (1294725 0:162765 1:162430 2:162075 3:155770 4:163150 5:162838 6:163150 7:162549)
744238 pps (2038963 0:255795 1:255460 2:255105 3:248800 4:256180 5:255867 6:256180 7:255579)
719343 pps (2758306 0:348825 1:348202 2:348135 3:338166 4:349210 5:333030 6:349210 7:343528)
741830 pps (3500136 0:440870 1:440933 2:441165 3:430162 4:442240 5:425970 6:442240 7:436558)
686289 pps (4186425 0:533900 1:533749 2:515637 3:511486 4:531997 5:504717 6:525536 7:529406)
681708 pps (4868133 0:613701 1:617409 2:608667 3:599774 4:607480 5:589487 6:609802 7:621817)
697577 pps (5565710 0:704183 1:710439 2:688904 3:681696 4:689120 5:673932 6:702448 7:714988)
729284 pps (6294994 0:797213 1:803469 2:775863 3:770959 4:781160 5:766105 6:792207 7:808018)
734160 pps (7029154 0:886389 1:896504 2:868898 3:863506 4:868426 5:859138 6:885242 7:901053)
728541 pps (7757695 0:978789 1:989534 2:961928 3:946834 4:961458 5:952170 6:978272 7:988714)
709578 pps (8467273 0:1071819 1:1079000 2:1041101 3:1038974 4:1047215 5:1037254 6:1070168 7:1081744)
684154 pps (9151427 0:1160855 1:1158471 2:1122874 3:1129012 4:1136563 5:1120258 6:1153624 7:1169773)
498291 pps (9649718 0:1224303 1:1214178 2:1185737 3:1191467 4:1200058 5:1183753 6:1217121 7:1233101)

Essentially sank in about 96.5% of the 10M packets.

run2
----
402553 pps (402553 0:51530 1:53289 2:53625 3:45748 4:53625 5:49484 6:42292 7:52960)
711539 pps (1114092 0:144028 1:146426 2:144237 3:124551 4:146760 5:142619 6:119376 7:146095)
692319 pps (1806411 0:208285 1:239557 2:220103 3:211096 4:239890 5:235749 6:212506 7:239225)
731896 pps (2538307 0:301450 1:332723 2:308718 3:304264 4:333055 5:320036 6:305671 7:332390)
712869 pps (3251176 0:393270 1:418806 2:397578 3:396844 4:426245 5:406943 6:398861 7:412629)
681513 pps (3932689 0:486300 1:501926 2:490613 3:489874 4:466455 5:499973 6:491891 7:505659)
697308 pps (4629997 0:567969 1:585032 2:583643 3:576712 4:548243 5:589399 6:581080 7:597922)
712903 pps (5342900 0:657579 1:660221 2:676673 3:669744 4:641273 5:682222 6:674110 7:681082)
687765 pps (6030665 0:744421 1:752470 2:764631 3:751445 4:722250 5:771799 6:761224 7:762426)
695799 pps (6726464 0:832438 1:842797 2:853337 3:844470 4:804427 5:857412 6:846918 7:844668)
720011 pps (7446475 0:925210 1:934696 2:934883 3:937280 4:894644 5:949883 6:932740 7:937142)
712021 pps (8158496 0:1017246 1:1027726 2:1016841 3:1024712 4:978513 5:1042913 6:1023516 7:1027031)
709810 pps (8868306 0:1098522 1:1111823 2:1109871 3:1117444 4:1070124 5:1131774 6:1109841 7:1118909)
591817 pps (9460123 0:1178005 1:1185698 2:1189381 3:1196367 4:1143880 5:1198406 6:1176121 7:1192265)

94.6%

run3
----
682714 pps (682714 0:83336 1:86683 2:86895 3:86243 4:84616 5:81152 6:86895 7:86895)
691212 pps (1373926 0:164602 1:179240 2:171897 3:174162 4:176509 5:158115 6:174083 7:175321)
661913 pps (2035839 0:243004 1:263829 2:259312 3:267160 4:268875 5:231009 6:253411 7:249239)
715612 pps (2751451 0:336034 1:350220 2:346461 3:360190 4:359219 5:317625 6:346441 7:335265)
655354 pps (3406805 0:419339 1:434934 2:432010 3:442138 4:437837 5:394805 6:427064 7:418679)
592126 pps (3998931 0:494253 1:511454 2:508829 3:511992 4:508978 5:474866 6:496884 7:491679)
697177 pps (4696108 0:584474 1:601703 2:589111 3:602252 4:598767 5:565114 6:582153 7:572539)
681004 pps (5377112 0:662864 1:684427 2:678825 3:688402 4:685441 5:651962 6:673697 7:651495)
669622 pps (6046734 0:740275 1:765126 2:762764 3:773772 4:772144 5:731330 6:762339 7:738987)
645906 pps (6692640 0:825606 1:850550 2:846793 3:858243 4:850408 5:812402 6:838248 7:810391)
705873 pps (7398513 0:916877 1:937693 2:929956 3:950433 4:938179 5:894913 6:928125 7:902337)
735460 pps (8133973 0:1009907 1:1030722 2:1022986 3:1037959 4:1031209 5:987943 6:1021155 7:992092)
707605 pps (8841578 0:1102933 1:1122367 2:1101160 3:1129212 4:1124239 5:1063617 6:1112929 7:1085122)
347807 pps (9189385 0:1149677 1:1168026 2:1147905 3:1170556 4:1158858 5:1110362 6:1152134 7:1131867)

91.9%

run4
----
552606 pps (552606 0:72743 1:75411 2:67732 3:70204 4:63741 5:64934 6:66096 7:71746)
684450 pps (1237056 0:162839 1:165064 2:148974 3:160417 4:153919 5:135895 6:156238 7:153710)
696799 pps (1933855 0:254440 1:252304 2:240107 3:249399 4:246028 5:228009 6:247409 7:216161)
676546 pps (2610401 0:341132 1:336959 2:325332 3:330438 4:336250 5:305238 6:336208 7:298848)
712251 pps (3322652 0:432976 1:428990 2:413228 3:419977 4:425918 5:386917 6:426275 7:388371)
615680 pps (3938332 0:515679 1:497421 2:491618 3:505449 4:489452 5:462820 6:505336 7:470561)
635467 pps (4573799 0:597340 1:582917 2:555389 3:582751 4:573273 5:545378 6:584378 7:552373)
725581 pps (5299380 0:690038 1:675870 2:636347 3:676029 4:666231 5:632208 6:677337 7:645324)
699015 pps (5998395 0:783068 1:763654 2:725184 3:762784 4:752559 5:709123 6:764439 7:737586)
674472 pps (6672867 0:872645 1:847669 2:808333 3:827766 4:842267 5:798997 6:853779 7:821412)
680913 pps (7353780 0:961487 1:926760 2:887273 3:919158 4:925165 5:891082 6:929793 7:913064)
666279 pps (8020059 0:1050823 1:1012028 2:972691 3:988738 4:1009904 5:974127 6:1017940 7:993808)
680615 pps (8700674 0:1124223 1:1087779 2:1057541 3:1080546 4:1094373 5:1066880 6:1102496 7:1086838)
420306 pps (9120980 0:1177541 1:1130287 2:1111621 3:1134624 4:1148453 5:1120960 6:1156576 7:1140918)

91.2%

run5
----
294229 pps (294229 0:38805 1:30946 2:32655 3:36613 4:38805 5:38805 6:38800 7:38801)
694748 pps (988977 0:124394 1:123976 2:114107 3:128079 4:111317 5:131835 6:131835 7:123434)
690185 pps (1679162 0:217405 1:216988 2:194192 3:204091 4:195948 5:224678 6:220924 7:204937)
726561 pps (2405723 0:307828 1:309671 2:278163 3:296811 4:286642 5:317346 6:311296 7:297967)
695974 pps (3101697 0:391228 1:395256 2:371056 3:388790 4:379533 5:410242 6:393051 7:372541)
665395 pps (3767092 0:473134 1:484367 2:447394 3:462837 4:471026 5:491170 6:473947 7:463219)
671483 pps (4438575 0:562883 1:574014 2:534258 3:544512 4:534064 5:581420 6:560073 7:547353)
679400 pps (5117975 0:641135 1:663809 2:618019 3:633448 4:605085 5:674433 6:649865 7:632183)
696263 pps (5814238 0:734516 1:743715 2:711049 3:717481 4:693193 5:758493 6:740374 7:715417)
681791 pps (6496029 0:823596 1:836004 2:795579 3:809104 4:783457 5:820061 6:820219 7:808010)
670672 pps (7166701 0:911202 1:927618 2:888127 3:875504 4:874363 5:889342 6:911838 7:888707)
743444 pps (7910145 0:1004233 1:1020652 2:981157 3:968534 4:967393 5:982078 6:1004362 7:981737)
725623 pps (8635768 0:1096546 1:1113682 2:1059978 3:1061564 4:1060423 5:1072761 6:1097392 7:1073423)
662504 pps (9298272 0:1171688 1:1197579 2:1137559 3:1154595 4:1146405 5:1161670 6:1176001 7:1152776)
12979 pps (9311251 0:1173488 1:1199379 2:1137914 3:1156399 4:1148209 5:1163475 6:1177806 7:1154581)

93.1%

Average for no-rps: 93.5% of 10M incoming at ~750Kpps.
# echo 1 > /proc/irq/55/smp_affinity
# echo ee > /sys/class/net/eth0/queues/rx-0/rps_cpus

-------------------------------------------------------------------------------
   PerfTop: 2273 irqs/sec kernel:93.7% [1000Hz cycles], (all, 8 CPUs)
-------------------------------------------------------------------------------

samples pcnt function DSO
922.00 10.3% sky2_poll [sky2]
402.00 4.5% __netif_receive_skb [kernel]
400.00 4.4% ip_rcv [kernel]
356.00 4.0% call_function_single_interrupt [kernel]
339.00 3.8% ip_route_input [kernel]
282.00 3.1% schedule [kernel]
194.00 2.2% _raw_spin_lock_irqsave [kernel]
180.00 2.0% sock_recv_ts_and_drops [kernel]
178.00 2.0% _raw_spin_lock [kernel]
173.00 1.9% __udp4_lib_lookup [kernel]
171.00 1.9% __udp4_lib_rcv [kernel]
162.00 1.8% system_call [kernel]
154.00 1.7% kfree [kernel]
147.00 1.6% __skb_recv_datagram [kernel]
146.00 1.6% copy_user_generic_string [kernel]
136.00 1.5% dst_release [kernel]
136.00 1.5% _raw_spin_unlock_irqrestore [kernel]
126.00 1.4% fget_light [kernel]
126.00 1.4% sky2_intr [sky2]
122.00 1.4% udp_recvmsg [kernel]
111.00 1.2% sock_queue_rcv_skb [kernel]

-------------------------------------------------------------------------------
   PerfTop: 325 irqs/sec kernel:93.2% [1000Hz cycles], (all, cpu: 0)
-------------------------------------------------------------------------------

samples pcnt function DSO
1033.00 62.9% sky2_poll [sky2]
159.00 9.7% sky2_intr [sky2]
119.00 7.3% irq_entries_start [kernel]
51.00 3.1% __alloc_skb [kernel]
48.00 2.9% get_rps_cpu [kernel]
24.00 1.5% __kmalloc [kernel]
23.00 1.4% swiotlb_sync_single [kernel]
20.00 1.2% _raw_spin_lock [kernel]
17.00 1.0% sky2_rx_submit [sky2]
15.00 0.9% enqueue_to_backlog [kernel]
14.00 0.9% kmem_cache_alloc [kernel]
11.00 0.7% default_send_IPI_mask_sequence_phys [kernel]
10.00 0.6% sky2_remove [sky2]
10.00 0.6% cache_alloc_refill [kernel]
8.00 0.5% _raw_spin_lock_irqsave [kernel]
7.00 0.4% dev_gro_receive [kernel]
6.00 0.4% net_rx_action [kernel]
6.00 0.4% __netdev_alloc_skb [kernel]
6.00 0.4% load_balance [kernel]
5.00 0.3% __smp_call_function_single [kernel]

-------------------------------------------------------------------------------
   PerfTop: 347 irqs/sec kernel:96.3% [1000Hz cycles], (all, cpu: 1)
-------------------------------------------------------------------------------

samples pcnt function DSO
104.00 6.7% call_function_single_interrupt [kernel]
104.00 6.7% __netif_receive_skb [kernel]
95.00 6.1% ip_rcv [kernel]
93.00 6.0% ip_route_input [kernel]
62.00 4.0% schedule [kernel]
49.00 3.2% sock_recv_ts_and_drops [kernel]
46.00 3.0% system_call [kernel]
46.00 3.0% dst_release [kernel]
45.00 2.9% _raw_spin_lock [kernel]
41.00 2.7% _raw_spin_lock_irqsave [kernel]
40.00 2.6% _raw_spin_unlock_irqrestore [kernel]
36.00 2.3% copy_user_generic_string [kernel]
34.00 2.2% __udp4_lib_rcv [kernel]
30.00 1.9% fget_light [kernel]
30.00 1.9% sock_queue_rcv_skb [kernel]
28.00 1.8% udp_recvmsg [kernel]
28.00 1.8% __udp4_lib_lookup [kernel]
26.00 1.7% select_task_rq_fair [kernel]
25.00 1.6% tick_nohz_stop_sched_tick [kernel]
23.00 1.5% __napi_complete [kernel]
20.00 1.3% __switch_to [kernel]
20.00 1.3% finish_task_switch [kernel]
20.00 1.3% kmem_cache_free [kernel]
20.00 1.3% sys_recvfrom [kernel]
19.00 1.2% kfree [kernel]
19.00 1.2% __skb_recv_datagram [kernel]

-------------------------------------------------------------------------------
   PerfTop: 243 irqs/sec kernel:95.5% [1000Hz cycles], (all, cpu: 7)
-------------------------------------------------------------------------------

samples pcnt function DSO
92.00 7.3% ip_rcv [kernel]
74.00 5.9% __netif_receive_skb [kernel]
57.00 4.6% ip_route_input [kernel]
49.00 3.9% sock_recv_ts_and_drops [kernel]
49.00 3.9% system_call [kernel]
47.00 3.8% schedule [kernel]
39.00 3.1% _raw_spin_lock_irqsave [kernel]
36.00 2.9% call_function_single_interrupt [kernel]
34.00 2.7% udp_recvmsg [kernel]
32.00 2.6% __udp4_lib_rcv [kernel]
31.00 2.5% copy_user_generic_string [kernel]
31.00 2.5% fget_light [kernel]
30.00 2.4% __udp4_lib_lookup [kernel]
26.00 2.1% kfree [kernel]
25.00 2.0% __skb_recv_datagram [kernel]
25.00 2.0% sock_queue_rcv_skb [kernel]
23.00 1.8% __switch_to [kernel]
22.00 1.8% sock_recvmsg [kernel]
22.00 1.8% _raw_spin_unlock_irqrestore [kernel]
21.00 1.7% select_task_rq_fair [kernel]
18.00 1.4% _raw_spin_lock [kernel]
17.00 1.4% process_backlog [kernel]
17.00 1.4% sys_recvfrom [kernel]
17.00 1.4% _raw_spin_lock_bh [kernel]

run1
----
590479 pps (590479 0:73820 1:73817 2:73820 3:73819 4:73815 5:73815 6:73815 7:73815)
744641 pps (1335120 0:166895 1:166895 2:166895 3:166895 4:166895 5:166895 6:166895 7:166895)
744374 pps (2079494 0:259940 1:259940 2:259940 3:259940 4:259940 5:259940 6:259940 7:259940)
744340 pps (2823834 0:352985 1:352985 2:352985 3:352985 4:352985 5:352985 6:352980 7:352985)
744390 pps (3568224 0:446035 1:446035 2:446035 3:446035 4:446035 5:446035 6:446032 7:446030)
744404 pps (4312628 0:539085 1:539085 2:539085 3:539081 4:539085 5:539085 6:539085 7:539085)
744369 pps (5056997 0:632130 1:632130 2:632130 3:632130 4:632130 5:632130 6:632130 7:632130)
744394 pps (5801391 0:725180 1:725180 2:725180 3:725180 4:725180 5:725180 6:725180 7:725180)
744399 pps (6545790 0:818230 1:818230 2:818229 3:818230 4:818230 5:818226 6:818225 7:818225)
744354 pps (7290144 0:911275 1:911275 2:911275 3:911275 4:911270 5:911270 6:911270 7:911270)
744363 pps (8034507 0:1004320 1:1004320 2:1004320 3:1004320 4:1004320 5:1004306 6:1004320 7:1004317)
744379 pps (8778886 0:1097370 1:1097368 2:1097370 3:1097370 4:1097370 5:1097356 6:1097367 7:1097365)
744449 pps (9523335 0:1190425 1:1190425 2:1190425 3:1190421 4:1190425 5:1190411 6:1190425 7:1190425)
476651 pps (9999986 0:1250000 1:1250000 2:1250000 3:1250000 4:1250000 5:1249986 6:1250000 7:1250000)

99.9% !

rps counter..
865721 rps
1067721 rps

run2
----
573759 pps (573759 0:71720 1:71720 2:71720 3:71723 4:71721 5:71720 6:71720 7:71719)
744249 pps (1318008 0:164755 1:164753 2:164750 3:164750 4:164750 5:164750 6:164750 7:164750)
744260 pps (2062268 0:257785 1:257785 2:257785 3:257785 4:257785 5:257783 6:257780 7:257780)
744238 pps (2806506 0:350815 1:350815 2:350815 3:350815 4:350815 5:350811 6:350810 7:350810)
744233 pps (3550739 0:443845 1:443845 2:443845 3:443845 4:443844 5:443841 6:443841 7:443840)
744236 pps (4294975 0:536875 1:536875 2:536875 3:536870 4:536870 5:536870 6:536870 7:536870)
744244 pps (5039219 0:629905 1:629905 2:629905 3:629905 4:629905 5:629901 6:629901 7:629900)
744240 pps (5783459 0:722935 1:722935 2:722935 3:722934 4:722930 5:722930 6:722930 7:722930)
744214 pps (6527673 0:815962 1:815960 2:815965 3:815963 4:815962 5:815960 6:815955 7:815955)
744268 pps (7271941 0:908995 1:908995 2:908995 3:908995 4:908991 5:908990 6:908990 7:908990)
744239 pps (8016180 0:1002025 1:1002025 2:1002025 3:1002025 4:1002020 5:1002020 6:1002020 7:1002020)
744241 pps (8760421 0:1095055 1:1095055 2:1095052 3:1095055 4:1095055 5:1095050 6:1095050 7:1095050)
744234 pps (9504655 0:1188085 1:1188085 2:1188084 3:1188085 4:1188085 5:1188081 6:1188080 7:1188080)
495345 pps (10000000 0:1250000 1:1250000 2:1250000 3:1250000 4:1250000 5:1250000 6:1250000 7:1250000)

100.0% !!!

rps count ..
3651 rps
1455997 rps
498777 rps

run3
----
72947 pps (72947 0:9120 1:9120 2:9120 3:9120 4:9120 5:9117 6:9115 7:9115)
744616 pps (817563 0:102198 1:102195 2:102195 3:102195 4:102195 5:102195 6:102195 7:102195)
744710 pps (1562273 0:195285 1:195285 2:195285 3:195285 4:195285 5:195285 6:195285 7:195283)
744478 pps (2306751 0:288345 1:288345 2:288345 3:288345 4:288345 5:288345 6:288341 7:288340)
744603 pps (3051354 0:381422 1:381420 2:381420 3:381414 4:381420 5:381420 6:381420 7:381420)
744475 pps (3795829 0:474480 1:474480 2:474480 3:474472 4:474480 5:474480 6:474480 7:474477)
744740 pps (4540569 0:567575 1:567575 2:567575 3:567564 4:567570 5:567570 6:567570 7:567570)
744641 pps (5285210 0:660655 1:660655 2:660655 3:660646 4:660650 5:660650 6:660650 7:660650)
744300 pps (6029510 0:753695 1:753690 2:753690 3:753682 4:753690 5:753690 6:753690 7:753690)
744249 pps (6773759 0:846725 1:846725 2:846725 3:846712 4:846720 5:846720 6:846720 7:846720)
744709 pps (7518468 0:939814 1:939810 2:939810 3:939802 4:939810 5:939810 6:939810 7:939810)
744647 pps (8263115 0:1032893 1:1032890 2:1032890 3:1032882 4:1032890 5:1032890 6:1032890 7:1032890)
744672 pps (9007787 0:1125976 1:1125975 2:1125975 3:1125967 4:1125975 5:1125975 6:1125975 7:1125970)
744692 pps (9752479 0:1219065 1:1219065 2:1219062 3:1219056 4:1219060 5:1219060 6:1219060 7:1219060)
247513 pps (9999992 0:1250000 1:1250000 2:1250000 3:1249992 4:1250000 5:1250000 6:1250000 7:1250000)

99.9%!

rps count ...
1118484 rps
842940 rps

run4
----
288558 pps (288558 0:36070 1:36070 2:36070 3:36070 4:36070 5:36070 6:36070 7:36068)
744237 pps (1032795 0:129103 1:129100 2:129105 3:129100 4:129100 5:129100 6:129095 7:129095)
742988 pps (1775783 0:222135 1:222135 2:222135 3:222135 4:220853 5:222130 6:222130 7:222130)
744210 pps (2519993 0:315160 1:315160 2:315160 3:315160 4:313883 5:315160 6:315155 7:315155)
744214 pps (3264207 0:408189 1:408185 2:408185 3:408185 4:406908 5:408185 6:408185 7:408185)
744278 pps (4008485 0:501223 1:501220 2:501220 3:501220 4:499943 5:501220 6:501220 7:501220)
743699 pps (4752184 0:594252 1:594250 2:593718 3:594250 4:592973 5:594250 6:594248 7:594245)
744243 pps (5496427 0:687280 1:687280 2:686748 3:687280 4:686003 5:687280 6:687280 7:687276)
744231 pps (6240658 0:780310 1:780310 2:779778 3:780310 4:779033 5:780300 6:780310 7:780307)
743958 pps (6984616 0:873342 1:873340 2:872808 3:873340 4:872063 5:873043 6:873340 7:873340)
744241 pps (7728857 0:966373 1:966370 2:965838 3:966370 4:965093 5:966073 6:966370 7:966370)
744232 pps (8473089 0:1059400 1:1059400 2:1058868 3:1059400 4:1058123 5:1059103 6:1059397 7:1059398)
743660 pps (9216749 0:1152434 1:1152430 2:1151898 3:1152430 4:1151153 5:1151556 6:1152427 7:1152430)
744251 pps (9961000 0:1245463 1:1245460 2:1244928 3:1245460 4:1244183 5:1244586 6:1245460 7:1245460)
36317 pps (9997317 0:1250000 1:1250000 2:1249468 3:1250000 4:1248723 5:1249126 6:1250000 7:1250000)

99.9%!

rps count
818552 rps
1146570 rps

run5
----
686211 pps (686211 0:85780 1:85780 2:85775 3:85779 4:85780 5:85780 6:85775 7:85775)
744260 pps (1430471 0:178810 1:178810 2:178810 3:178810 4:178810 5:178810 6:178806 7:178805)
744242 pps (2174713 0:271840 1:271840 2:271840 3:271840 4:271840 5:271840 6:271838 7:271835)
744241 pps (2918954 0:364870 1:364870 2:364870 3:364870 4:364870 5:364870 6:364869 7:364865)
744238 pps (3663192 0:457900 1:457900 2:457900 3:457900 4:457900 5:457900 6:457900 7:457899)
744240 pps (4407432 0:550930 1:550930 2:550930 3:550930 4:550930 5:550930 6:550927 7:550925)
744244 pps (5151676 0:643960 1:643960 2:643960 3:643960 4:643960 5:643960 6:643960 7:643956)
744236 pps (5895912 0:736990 1:736990 2:736990 3:736990 4:736990 5:736990 6:736987 7:736985)
744241 pps (6640153 0:830020 1:830020 2:830020 3:830020 4:830020 5:830020 6:830018 7:830015)
744235 pps (7384388 0:923050 1:923050 2:923050 3:923050 4:923050 5:923049 6:923045 7:923047)
744244 pps (8128632 0:1016080 1:1016080 2:1016080 3:1016080 4:1016080 5:1016080 6:1016079 7:1016075)
744231 pps (8872863 0:1109110 1:1109110 2:1109110 3:1109110 4:1109108 5:1109105 6:1109105 7:1109105)
744258 pps (9617121 0:1202141 1:1202140 2:1202140 3:1202140 4:1202140 5:1202140 6:1202140 7:1202140)
382879 pps (10000000 0:1250000 1:1250000 2:1250000 3:1250000 4:1250000 5:1250000 6:1250000 7:1250000)

100%

rpsipi count ..
768383 rps
1178132 rps

^ permalink raw reply [flat|nested] 108+ messages in thread
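In the runs above, the NIC interrupt is pinned to cpu0 (smp_affinity = 1) while rps_cpus = ee (binary 11101110, i.e. cpus 1-3 and 5-7) spreads protocol processing across the remaining cores. RPS steering is per-flow and hash-based; the following is a simplified userspace sketch of the selection step (an illustration of what get_rps_cpu() does with the CPU map, not the kernel code itself):

#include <stdint.h>
#include <stdio.h>

/* Pick a CPU for a flow, given its 32-bit hash and an rps_cpus bitmask.
 * The scaling step maps the hash uniformly onto the enabled CPUs, so a
 * given flow always lands on the same CPU. */
static int steer(uint32_t flow_hash, unsigned long rps_cpus)
{
	int map[8 * sizeof(rps_cpus)];
	int cpu, len = 0;

	for (cpu = 0; cpu < (int)(8 * sizeof(rps_cpus)); cpu++)
		if (rps_cpus & (1UL << cpu))
			map[len++] = cpu;	/* 0xee -> cpus 1,2,3,5,6,7 */
	if (!len)
		return -1;			/* RPS disabled */
	return map[((uint64_t)flow_hash * len) >> 32];
}

int main(void)
{
	uint32_t hash;

	for (hash = 0x10000000; hash < 0xf0000000; hash += 0x20000000)
		printf("hash %08x -> cpu %d\n", hash, steer(hash, 0xee));
	return 0;
}

Since steering is deterministic per flow hash, the near-perfect per-CPU balance in the runs above also tells us the generator was producing many distinct flows.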
* [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-04-29 20:36 ` jamal @ 2010-04-29 21:01 ` Eric Dumazet 2010-04-30 13:55 ` Brian Bloniarz 2010-04-30 23:35 ` David Miller 2010-04-30 19:30 ` [PATCH net-next-2.6] net: speedup udp receive path jamal 1 sibling, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 21:01 UTC (permalink / raw)
To: hadi
Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thursday, 29 April 2010 at 16:36 -0400, jamal wrote:
> Results attached. With your app, rps does a hell of a lot better and non-rps worse ;->
> With my proggie, non-rps does much better than yours, and rps does a lot worse
> for the same setup. I see the scheduler kicking in quite a bit in non-rps for you...
>
> The main difference between us, as I see it, is:
> a) I use epoll - actually linked to libevent (1.0.something)
> b) I fork processes and you use pthreads.
>
> I don't have time to chase it today, but 1) I am either going to change
> yours to use libevent or make mine get rid of it, then 2) move towards
> pthreads or have yours fork.. then observe if that makes any difference..

Thanks !

Here is the last 'patch of the day' from me ;)
The next one will be able to coalesce wakeup calls (they'll be delayed to the end of net_rx_action(), like a patch I did last year to help multicast reception).

vger seems to be down, I suspect I'll have to resend this later.

[PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion

The sk_callback_lock rwlock actually protects the sk->sk_sleep pointer, so we need two atomic operations (and the associated cache-line dirtying) per incoming packet.

An RCU conversion is pretty much needed:

1) Add a new structure, "struct socket_wq", to hold all fields that need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [A future patch will add a list anchor for wakeup coalescing.]

2) Attach one such structure to each "struct socket" created in sock_alloc_inode().

3) Respect the RCU grace period when freeing a "struct socket_wq".

4) Replace the sk_sleep pointer in "struct sock" with sk_wq, a pointer to "struct socket_wq".

5) Change the sk_sleep() function to use the new sk->sk_wq instead of sk->sk_sleep.

6) Change sk_has_sleeper() to wq_has_sleeper(), which must be used inside an rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to:
   - use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock);
   - use wq_has_sleeper() to eventually wake up tasks;
   - use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock).

8) Modify sock_wake_async() to use RCU protection as well.

9) Exceptions: macvtap, drivers/net/tun.c and af_unix use an integrated "struct socket_wq" instead of a dynamically allocated one, so they don't need RCU freeing.

Some cleanups or followups are probably needed (a possible sk_callback_lock conversion to a spinlock, for example...).
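Condensed, the pattern the patch introduces looks like this (a sketch assembled from the diff below, not a standalone patch): the wakeup path runs locklessly under rcu_read_lock(), and the inode destruction path defers the kfree() through call_rcu() so a racing reader can never touch freed memory.

struct socket_wq {
	wait_queue_head_t	wait;
	struct fasync_struct	*fasync_list;
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

/* Reader side: no atomic ops, no lock - just an RCU read-side section. */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);	/* NULL once the socket is orphaned */
	if (wq_has_sleeper(wq))			/* smp_mb() + waitqueue_active() */
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

/* Writer side: the wq is freed only after a grace period has elapsed. */
static void wq_free_rcu(struct rcu_head *head)
{
	struct socket_wq *wq = container_of(head, struct socket_wq, rcu);

	kfree(wq);
}

static void sock_destroy_inode(struct inode *inode)
{
	struct socket_alloc *ei = container_of(inode, struct socket_alloc, vfs_inode);

	call_rcu(&ei->socket.wq->rcu, wq_free_rcu);
	kmem_cache_free(sock_inode_cachep, ei);
}

wq_has_sleeper() supplies the smp_mb() that used to be implied by the read_lock(), keeping the wait-queue check ordered against the sleeper's add_wait_queue(); its counterpart barrier lives in sock_poll_wait().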
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> --- drivers/net/macvtap.c | 13 +++++++--- drivers/net/tun.c | 21 +++++++++------- include/linux/net.h | 14 +++++++---- include/net/af_unix.h | 20 ++++++++-------- include/net/sock.h | 40 ++++++++++++++++---------------- net/atm/common.c | 22 +++++++++++------ net/core/sock.c | 50 ++++++++++++++++++++++++---------------- net/core/stream.c | 10 +++++--- net/dccp/output.c | 10 ++++---- net/iucv/af_iucv.c | 11 +++++--- net/phonet/pep.c | 8 +++--- net/phonet/socket.c | 2 - net/rxrpc/af_rxrpc.c | 10 ++++---- net/sctp/socket.c | 2 - net/socket.c | 47 ++++++++++++++++++++++++++++--------- net/unix/af_unix.c | 17 ++++++------- 16 files changed, 182 insertions(+), 115 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index d97e1fd..1c4110d 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -37,6 +37,7 @@ struct macvtap_queue { struct sock sk; struct socket sock; + struct socket_wq wq; struct macvlan_dev *vlan; struct file *file; unsigned int flags; @@ -242,12 +243,15 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = { static void macvtap_sock_write_space(struct sock *sk) { + wait_queue_head_t *wqueue; + if (!sock_writeable(sk) || !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; - if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND); + wqueue = sk_sleep(sk); + if (wqueue && waitqueue_active(wqueue)) + wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND); } static int macvtap_open(struct inode *inode, struct file *file) @@ -272,7 +276,8 @@ static int macvtap_open(struct inode *inode, struct file *file) if (!q) goto out; - init_waitqueue_head(&q->sock.wait); + q->sock.wq = &q->wq; + init_waitqueue_head(&q->wq.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; q->sock.file = file; @@ -308,7 +313,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait) goto out; mask = 0; - poll_wait(file, &q->sock.wait, wait); + poll_wait(file, &q->wq.wait, wait); if (!skb_queue_empty(&q->sk.sk_receive_queue)) mask |= POLLIN | POLLRDNORM; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 20a1793..e525a6c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -109,7 +109,7 @@ struct tun_struct { struct tap_filter txflt; struct socket socket; - + struct socket_wq wq; #ifdef TUN_DEBUG int debug; #endif @@ -323,7 +323,7 @@ static void tun_net_uninit(struct net_device *dev) /* Inform the methods they need to stop using the dev. 
*/ if (tfile) { - wake_up_all(&tun->socket.wait); + wake_up_all(&tun->wq.wait); if (atomic_dec_and_test(&tfile->count)) __tun_detach(tun); } @@ -398,7 +398,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) /* Notify and wake up reader process */ if (tun->flags & TUN_FASYNC) kill_fasync(&tun->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tun->socket.wait, POLLIN | + wake_up_interruptible_poll(&tun->wq.wait, POLLIN | POLLRDNORM | POLLRDBAND); return NETDEV_TX_OK; @@ -498,7 +498,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name); - poll_wait(file, &tun->socket.wait, wait); + poll_wait(file, &tun->wq.wait, wait); if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; @@ -773,7 +773,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name); - add_wait_queue(&tun->socket.wait, &wait); + add_wait_queue(&tun->wq.wait, &wait); while (len) { current->state = TASK_INTERRUPTIBLE; @@ -804,7 +804,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, } current->state = TASK_RUNNING; - remove_wait_queue(&tun->socket.wait, &wait); + remove_wait_queue(&tun->wq.wait, &wait); return ret; } @@ -861,6 +861,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = { static void tun_sock_write_space(struct sock *sk) { struct tun_struct *tun; + wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; @@ -868,8 +869,9 @@ static void tun_sock_write_space(struct sock *sk) if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; - if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT | + wqueue = sk_sleep(sk); + if (wqueue && waitqueue_active(wqueue)) + wake_up_interruptible_sync_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND); tun = tun_sk(sk)->tun; @@ -1039,7 +1041,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (!sk) goto err_free_dev; - init_waitqueue_head(&tun->socket.wait); + tun->socket.wq = &tun->wq; + init_waitqueue_head(&tun->wq.wait); tun->socket.ops = &tun_socket_ops; sock_init_data(&tun->socket, sk); sk->sk_write_space = tun_sock_write_space; diff --git a/include/linux/net.h b/include/linux/net.h index 4157b5d..2b4deee 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -59,6 +59,7 @@ typedef enum { #include <linux/wait.h> #include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */ #include <linux/kmemcheck.h> +#include <linux/rcupdate.h> struct poll_table_struct; struct pipe_inode_info; @@ -116,6 +117,12 @@ enum sock_shutdown_cmd { SHUT_RDWR = 2, }; +struct socket_wq { + wait_queue_head_t wait; + struct fasync_struct *fasync_list; + struct rcu_head rcu; +} ____cacheline_aligned_in_smp; + /** * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) @@ -135,11 +142,8 @@ struct socket { kmemcheck_bitfield_end(type); unsigned long flags; - /* - * Please keep fasync_list & wait fields in the same cache line - */ - struct fasync_struct *fasync_list; - wait_queue_head_t wait; + + struct socket_wq *wq; struct file *file; struct sock *sk; diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 1614d78..20725e2 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -30,7 +30,7 @@ struct unix_skb_parms { #endif }; -#define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb)) +#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) #define UNIXCREDS(skb) 
(&UNIXCB((skb)).creds) #define UNIXSID(skb) (&UNIXCB((skb)).secid) @@ -45,21 +45,23 @@ struct unix_skb_parms { struct unix_sock { /* WARNING: sk has to be the first member */ struct sock sk; - struct unix_address *addr; - struct dentry *dentry; - struct vfsmount *mnt; + struct unix_address *addr; + struct dentry *dentry; + struct vfsmount *mnt; struct mutex readlock; - struct sock *peer; - struct sock *other; + struct sock *peer; + struct sock *other; struct list_head link; - atomic_long_t inflight; - spinlock_t lock; + atomic_long_t inflight; + spinlock_t lock; unsigned int gc_candidate : 1; unsigned int gc_maybe_cycle : 1; - wait_queue_head_t peer_wait; + struct socket_wq peer_wq; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) +#define peer_wait peer_wq.wait + #ifdef CONFIG_SYSCTL extern int unix_sysctl_register(struct net *net); extern void unix_sysctl_unregister(struct net *net); diff --git a/include/net/sock.h b/include/net/sock.h index d361c77..03d0046 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -159,7 +159,7 @@ struct sock_common { * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_rcvbuf: size of receive buffer in bytes - * @sk_sleep: sock wait queue + * @sk_wq: sock wait queue and async head * @sk_dst_cache: destination cache * @sk_dst_lock: destination cache lock * @sk_policy: flow policy @@ -257,7 +257,7 @@ struct sock { struct sk_buff *tail; int len; } sk_backlog; - wait_queue_head_t *sk_sleep; + struct socket_wq *sk_wq; struct dst_entry *sk_dst_cache; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; @@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) static inline wait_queue_head_t *sk_sleep(struct sock *sk) { - return sk->sk_sleep; + return &sk->sk_wq->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. @@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk) write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); - sk->sk_sleep = NULL; + sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); - sk->sk_sleep = &parent->wait; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); security_sock_graft(sk, parent); @@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk) } /** - * sk_has_sleeper - check if there are any waiting processes - * @sk: socket + * wq_has_sleeper - check if there are any waiting processes + * @sk: struct socket_wq * - * Returns true if socket has waiting processes + * Returns true if socket_wq has waiting processes * - * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory + * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths: @@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk) * ... ... * tp->rcv_nxt check sock_def_readable * ... { - * schedule ... - * if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - * wake_up_interruptible(sk_sleep(sk)) + * schedule rcu_read_lock(); + * wq = rcu_dereference(sk->sk_wq); + * if (wq && waitqueue_active(&wq->wait)) + * wake_up_interruptible(&wq->wait) * ... 
* } * @@ -1421,28 +1422,27 @@ static inline int sk_has_allocations(const struct sock *sk) * could then endup calling schedule and sleep forever if there are no more * data on the socket. * - * The sk_has_sleeper is always called right after a call to read_lock, so we - * can use smp_mb__after_lock barrier. */ -static inline int sk_has_sleeper(struct sock *sk) +static inline bool wq_has_sleeper(struct socket_wq *wq) { + /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier is paired in the sock_poll_wait. */ - smp_mb__after_lock(); - return sk_sleep(sk) && waitqueue_active(sk_sleep(sk)); + smp_mb(); + return wq && waitqueue_active(&wq->wait); } - + /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file * @wait_address: socket wait queue * @p: poll_table * - * See the comments in the sk_has_sleeper function. + * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp, * We need to be sure we are in sync with the * socket flags modification. * - * This memory barrier is paired in the sk_has_sleeper. + * This memory barrier is paired in the wq_has_sleeper. */ smp_mb(); } diff --git a/net/atm/common.c b/net/atm/common.c index e3e10e6..b43feb1 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -90,10 +90,13 @@ static void vcc_sock_destruct(struct sock *sk) static void vcc_def_wakeup(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up(sk_sleep(sk)); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up(&wq->wait); + rcu_read_unlock(); } static inline int vcc_writable(struct sock *sk) @@ -106,16 +109,19 @@ static inline int vcc_writable(struct sock *sk) static void vcc_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); if (vcc_writable(sk)) { - if (sk_has_sleeper(sk)) - wake_up_interruptible(sk_sleep(sk)); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static struct proto vcc_proto = { diff --git a/net/core/sock.c b/net/core/sock.c index 5104175..94c4aff 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1211,7 +1211,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); - newsk->sk_sleep = NULL; + newsk->sk_wq = NULL; if (newsk->sk_prot->sockets_allocated) percpu_counter_inc(newsk->sk_prot->sockets_allocated); @@ -1800,41 +1800,53 @@ EXPORT_SYMBOL(sock_no_sendpage); static void sock_def_wakeup(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up_interruptible_all(sk_sleep(sk)); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up_interruptible_poll(sk_sleep(sk), POLLERR); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + 
wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_readable(struct sock *sk, int len) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN | + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT | + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); /* Should agree with poll, otherwise some programs break */ @@ -1842,7 +1854,7 @@ static void sock_def_write_space(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_destruct(struct sock *sk) @@ -1896,10 +1908,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) if (sock) { sk->sk_type = sock->type; - sk->sk_sleep = &sock->wait; + sk->sk_wq = sock->wq; sock->sk = sk; } else - sk->sk_sleep = NULL; + sk->sk_wq = NULL; spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); diff --git a/net/core/stream.c b/net/core/stream.c index 7b3c3f3..cc196f4 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -28,15 +28,19 @@ void sk_stream_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; + struct socket_wq *wq; if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); - if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); - if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); } } diff --git a/net/dccp/output.c b/net/dccp/output.c index 2d3dcb3..aadbdb5 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -195,15 +195,17 @@ EXPORT_SYMBOL_GPL(dccp_sync_mss); void dccp_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; - if (sk_has_sleeper(sk)) - wake_up_interruptible(sk_sleep(sk)); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } /** diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 9636b7d..8be324f 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -305,11 +305,14 @@ static inline int iucv_below_msglim(struct sock *sk) */ static void iucv_sock_wake_msglim(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - 
wake_up_interruptible_all(sk_sleep(sk)); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } /* Timers */ diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e2a9576..af4d38b 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -664,12 +664,12 @@ static int pep_wait_connreq(struct sock *sk, int noblock) if (signal_pending(tsk)) return sock_intr_errno(timeo); - prepare_to_wait_exclusive(&sk->sk_socket->wait, &wait, + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); - finish_wait(&sk->sk_socket->wait, &wait); + finish_wait(sk_sleep(sk), &wait); } return 0; @@ -910,10 +910,10 @@ disabled: goto out; } - prepare_to_wait(&sk->sk_socket->wait, &wait, + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits)); - finish_wait(&sk->sk_socket->wait, &wait); + finish_wait(sk_sleep(sk), &wait); if (sk->sk_state != TCP_ESTABLISHED) goto disabled; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index c785bfd..6e9848b 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -265,7 +265,7 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock, struct pep_sock *pn = pep_sk(sk); unsigned int mask = 0; - poll_wait(file, &sock->wait, wait); + poll_wait(file, sk_sleep(sk), wait); switch (sk->sk_state) { case TCP_LISTEN: diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index c432d76..0b9bb20 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -62,13 +62,15 @@ static inline int rxrpc_writable(struct sock *sk) static void rxrpc_write_space(struct sock *sk) { _enter("%p", sk); - read_lock(&sk->sk_callback_lock); + rcu_read_lock(); if (rxrpc_writable(sk)) { - if (sk_has_sleeper(sk)) - wake_up_interruptible(sk_sleep(sk)); + struct socket_wq *wq = rcu_dereference(sk->sk_wq); + + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 13d8229..d54700a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6065,7 +6065,7 @@ static void __sctp_write_space(struct sctp_association *asoc) * here by modeling from the current TCP/UDP code. * We have not tested with it yet. 
*/ - if (sock->fasync_list && + if (sock->wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); diff --git a/net/socket.c b/net/socket.c index 9822081..a0a59cb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -252,9 +252,14 @@ static struct inode *sock_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; - init_waitqueue_head(&ei->socket.wait); + ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL); + if (!ei->socket.wq) { + kmem_cache_free(sock_inode_cachep, ei); + return NULL; + } + init_waitqueue_head(&ei->socket.wq->wait); + ei->socket.wq->fasync_list = NULL; - ei->socket.fasync_list = NULL; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; ei->socket.ops = NULL; @@ -264,10 +269,21 @@ static struct inode *sock_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } + +static void wq_free_rcu(struct rcu_head *head) +{ + struct socket_wq *wq = container_of(head, struct socket_wq, rcu); + + kfree(wq); +} + static void sock_destroy_inode(struct inode *inode) { - kmem_cache_free(sock_inode_cachep, - container_of(inode, struct socket_alloc, vfs_inode)); + struct socket_alloc *ei; + + ei = container_of(inode, struct socket_alloc, vfs_inode); + call_rcu(&ei->socket.wq->rcu, wq_free_rcu); + kmem_cache_free(sock_inode_cachep, ei); } static void init_once(void *foo) @@ -513,7 +529,7 @@ void sock_release(struct socket *sock) module_put(owner); } - if (sock->fasync_list) + if (sock->wq->fasync_list) printk(KERN_ERR "sock_release: fasync list not empty!\n"); percpu_sub(sockets_in_use, 1); @@ -1080,9 +1096,9 @@ static int sock_fasync(int fd, struct file *filp, int on) lock_sock(sk); - fasync_helper(fd, filp, on, &sock->fasync_list); + fasync_helper(fd, filp, on, &sock->wq->fasync_list); - if (!sock->fasync_list) + if (!sock->wq->fasync_list) sock_reset_flag(sk, SOCK_FASYNC); else sock_set_flag(sk, SOCK_FASYNC); @@ -1091,12 +1107,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock */ +/* This function may be called only under socket lock or callback_lock or rcu_lock */ int sock_wake_async(struct socket *sock, int how, int band) { - if (!sock || !sock->fasync_list) + struct socket_wq *wq; + + if (!sock) return -1; + rcu_read_lock(); + wq = rcu_dereference(sock->wq); + if (!wq || !wq->fasync_list) { + rcu_read_unlock(); + return -1; + } switch (how) { case SOCK_WAKE_WAITD: if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) @@ -1108,11 +1132,12 @@ int sock_wake_async(struct socket *sock, int how, int band) /* fall through */ case SOCK_WAKE_IO: call_kill: - kill_fasync(&sock->fasync_list, SIGIO, band); + kill_fasync(&wq->fasync_list, SIGIO, band); break; case SOCK_WAKE_URG: - kill_fasync(&sock->fasync_list, SIGURG, band); + kill_fasync(&wq->fasync_list, SIGURG, band); } + rcu_read_unlock(); return 0; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 87c0360..fef2cc5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk) static void unix_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); if (unix_writable(sk)) { - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync(sk_sleep(sk)); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - 
read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive @@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion) skpair->sk_err = ECONNRESET; unix_state_unlock(skpair); skpair->sk_state_change(skpair); - read_lock(&skpair->sk_callback_lock); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); - read_unlock(&skpair->sk_callback_lock); } sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; @@ -1142,7 +1143,7 @@ restart: newsk->sk_peercred.pid = task_tgid_vnr(current); current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid); newu = unix_sk(newsk); - newsk->sk_sleep = &newu->peer_wait; + newsk->sk_wq = &newu->peer_wq; otheru = unix_sk(other); /* copy address information from listening to new sock*/ @@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - read_lock(&other->sk_callback_lock); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - read_unlock(&other->sk_callback_lock); } if (other) sock_put(other); ^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-04-29 21:01 ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet @ 2010-04-30 13:55 ` Brian Bloniarz 2010-04-30 17:26 ` Eric Dumazet 2010-04-30 23:35 ` David Miller 1 sibling, 1 reply; 108+ messages in thread
From: Brian Bloniarz @ 2010-04-30 13:55 UTC (permalink / raw)
To: Eric Dumazet
Cc: hadi, Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein

Eric Dumazet wrote:
> Here is the last 'patch of the day' from me ;)
> The next one will be able to coalesce wakeup calls (they'll be delayed to
> the end of net_rx_action(), like a patch I did last year to help
> multicast reception).
>
> vger seems to be down, I suspect I'll have to resend this later.
>
> [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
>
> The sk_callback_lock rwlock actually protects the sk->sk_sleep pointer, so we
> need two atomic operations (and the associated cache-line dirtying) per incoming
> packet.

This patch boots for me; I haven't noticed any strangeness yet.

I ran a few benchmarks (the multicast fan-out mcasttest.c from last year, plus a few other things we have lying around). I think I see a modest improvement from this and your other two patches. Presumably the big wins are where multiple cores perform bh processing for the same socket, which is not the case in these benchmarks. If it's appropriate:

Tested-by: Brian Bloniarz <bmb@athenacr.com>

> The next one will be able to coalesce wakeup calls (they'll be delayed to
> the end of net_rx_action(), like a patch I did last year to help
> multicast reception).

Keep 'em coming :)

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-04-30 13:55 ` Brian Bloniarz @ 2010-04-30 17:26 ` Eric Dumazet 0 siblings, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-30 17:26 UTC (permalink / raw)
To: Brian Bloniarz
Cc: hadi, Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein

On Friday, 30 April 2010 at 09:55 -0400, Brian Bloniarz wrote:
>
> This patch boots for me; I haven't noticed any strangeness yet.
>
> I ran a few benchmarks (the multicast fan-out mcasttest.c
> from last year, plus a few other things we have lying around).
> I think I see a modest improvement from this and your other
> two patches. Presumably the big wins are where multiple cores
> perform bh processing for the same socket, which is not the case in
> these benchmarks. If it's appropriate:
>
> Tested-by: Brian Bloniarz <bmb@athenacr.com>
>
> > The next one will be able to coalesce wakeup calls (they'll be delayed to
> > the end of net_rx_action(), like a patch I did last year to help
> > multicast reception).
>
> Keep 'em coming :)

Thanks for testing !

Here is a respin of the "net: relax dst refcnt in input path" patch for net-next-2.6.

Not ready for inclusion, but it seems to work quite well under multicast load: I get about 20% more packets on mcasttest. (It avoids atomic ops on dst entries on the input path, and partly on the forwarding path.) On mcasttest, all sockets share the same dst, so producers and consumers all fight over a single cache line.

Old reference (for information):
http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753

Not-Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

 include/linux/skbuff.h    | 45 +++++++++++++++++++++++++++++++++-
 include/net/dst.h         | 47 +++++++++++++++++++++++++++++++++---
 include/net/route.h       | 2 -
 include/net/sock.h        | 2 +
 net/bridge/br_netfilter.c | 2 -
 net/core/dev.c            | 3 ++
 net/core/skbuff.c         | 3 +-
 net/core/sock.c           | 6 ++++
 net/ipv4/arp.c            | 2 -
 net/ipv4/icmp.c           | 8 +++---
 net/ipv4/ip_forward.c     | 1
 net/ipv4/ip_fragment.c    | 2 -
 net/ipv4/ip_input.c       | 2 -
 net/ipv4/ip_options.c     | 11 ++++----
 net/ipv4/netfilter.c      | 8 +++---
 net/ipv4/route.c          | 15 +++++----
 net/ipv4/xfrm4_input.c    | 2 -
 net/ipv6/ip6_tunnel.c     | 2 -
 net/netfilter/nf_queue.c  | 2 +
 net/sched/sch_generic.c   | 2 -
 20 files changed, 136 insertions(+), 31 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 82f5116..6195bcf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -414,16 +414,59 @@ struct sk_buff { #include <asm/system.h> +/* + * skb might have a dst pointer attached, refcounted or not + * _skb_dst low order bit is set if refcount was taken + */ +#define SKB_DST_NOREF 1UL +#define SKB_DST_PTRMASK ~(SKB_DST_NOREF) + +/** + * skb_dst - returns skb dst_entry + * @skb: buffer + * + * Returns skb dst_entry, regardless of reference taken or not.
+ */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { - return (struct dst_entry *)skb->_skb_dst; + return (struct dst_entry *)(skb->_skb_dst & SKB_DST_PTRMASK); } +/** + * skb_dst_set - sets skb dst + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was taken on dst and should + * be released by skb_dst_drop() + */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->_skb_dst = (unsigned long)dst; } +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was _not_ taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + skb->_skb_dst = (unsigned long)dst | SKB_DST_NOREF; +} + +/** + * skb_dst_is_noref - Test is skb dst isnt refcounted + * @skb: buffer + */ +static inline bool skb_dst_is_noref(const struct sk_buff *skb) +{ + return (skb->_skb_dst & SKB_DST_NOREF) && skb_dst(skb); +} + static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); diff --git a/include/net/dst.h b/include/net/dst.h index aac5a5f..ad6ea9e 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time) dst->lastuse = time; } +static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +{ + dst->__use++; + dst->lastuse = time; +} + static inline struct dst_entry * dst_clone(struct dst_entry * dst) { @@ -177,11 +183,46 @@ struct dst_entry * dst_clone(struct dst_entry * dst) } extern void dst_release(struct dst_entry *dst); + +static inline void __skb_dst_drop(unsigned long _skb_dst) +{ + if (!(_skb_dst & SKB_DST_NOREF)) + dst_release((struct dst_entry *)(_skb_dst & SKB_DST_PTRMASK)); +} + +/** + * skb_dst_drop - drops skb dst + * @skb: buffer + * + * Drops dst reference count if a reference was taken. 
+ */ static inline void skb_dst_drop(struct sk_buff *skb) { - if (skb->_skb_dst) - dst_release(skb_dst(skb)); - skb->_skb_dst = 0UL; + if (skb->_skb_dst) { + __skb_dst_drop(skb->_skb_dst); + skb->_skb_dst = 0UL; + } +} + +static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) +{ + nskb->_skb_dst = oskb->_skb_dst; + if (!(nskb->_skb_dst & SKB_DST_NOREF)) + dst_clone(skb_dst(nskb)); +} + +/** + * skb_dst_force - makes sure skb dst is refcounted + * @skb: buffer + * + * If dst is not yet refcounted, let's do it + */ +static inline void skb_dst_force(struct sk_buff *skb) +{ + if (skb->_skb_dst & SKB_DST_NOREF) { + skb->_skb_dst &= ~SKB_DST_NOREF; + dst_clone(skb_dst(skb)); + } } /* Children define the path of the packet through the diff --git a/include/net/route.h b/include/net/route.h index 2c9fba7..443f6d4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -112,7 +112,7 @@ extern void rt_cache_flush_batch(void); extern int __ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp); extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp); extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); -extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin); +extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, bool noref); extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev); extern void ip_rt_send_redirect(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index d361c77..0a0f14d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -598,6 +598,8 @@ static inline int sk_stream_memory_free(struct sock *sk) /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { + /* dont let skb dst not referenced, we are going to leave rcu lock */ + skb_dst_force(skb); if (!sk->sk_backlog.tail) { sk->sk_backlog.head = sk->sk_backlog.tail = skb; } else { diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 4c4977d..c943ad4 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -350,7 +350,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) } nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; if (dnat_took_place(skb)) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev, false))) { struct flowi fl = { .nl_u = { .ip4_u = { diff --git a/net/core/dev.c b/net/core/dev.c index 100dcbd..c331b0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2047,6 +2047,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. 
*/ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); __qdisc_update_bstats(q, skb->len); if (sch_direct_xmit(skb, q, dev, txq, root_lock)) __qdisc_run(q); @@ -2055,6 +2057,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { + skb_dst_force(skb); rc = qdisc_enqueue_root(skb, q); qdisc_run(q); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4218ff4..f400196 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -531,7 +531,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; - skb_dst_set(new, dst_clone(skb_dst(old))); + + skb_dst_copy(new, old); new->rxhash = old->rxhash; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); diff --git a/net/core/sock.c b/net/core/sock.c index 5104175..894bed6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ skb_len = skb->len; + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + spin_lock_irqsave(&list->lock, flags); skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(list, skb); @@ -1535,6 +1540,7 @@ static void __release_sock(struct sock *sk) do { struct sk_buff *next = skb->next; + WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6e74706..502ac9f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -854,7 +854,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input(skb, tip, sip, 0, dev) == 0) { + ip_route_input(skb, tip, sip, 0, dev, true) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f3d339f..a113c08 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) err = __ip_route_output_key(net, &rt2, &fl); else { struct flowi fl2 = {}; - struct dst_entry *odst; + unsigned long odst; fl2.fl4_dst = fl.fl4_src; if (ip_route_output_key(net, &rt2, &fl2)) goto relookup_failed; /* Ugh! 
*/ - odst = skb_dst(skb_in); + odst = skb_in->_skb_dst; /* save old dst */ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->u.dst.dev); + RT_TOS(tos), rt2->u.dst.dev, false); dst_release(&rt2->u.dst); rt2 = skb_rtable(skb_in); - skb_dst_set(skb_in, odst); + skb_in->_skb_dst = odst; /* restore old dst */ } if (err) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index af10942..0f58609 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -57,6 +57,7 @@ int ip_forward(struct sk_buff *skb) struct rtable *rt; /* Route we use */ struct ip_options * opt = &(IPCB(skb)->opt); +/* pr_err("ip_forward() skb->dst=%lx\n", skb->_skb_dst);*/ if (skb_warn_if_lro(skb)) goto drop; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 75347ea..cbcde7a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -220,7 +220,7 @@ static void ip_expire(unsigned long arg) if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { const struct iphdr *iph = ip_hdr(head); int err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + iph->tos, head->dev, false); if (unlikely(err)) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f8ab7a3..5d365e8 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -332,7 +332,7 @@ static int ip_rcv_finish(struct sk_buff *skb) */ if (skb_dst(skb) == NULL) { int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, - skb->dev); + skb->dev, true); if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 4c09a31..1b65d68 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; + unsigned long odst; int err; if (!opt->srr) @@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb) } memcpy(&nexthop, &optptr[srrptr-1], 4); - rt = skb_rtable(skb); + odst = skb->_skb_dst; skb_dst_set(skb, NULL); - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev, false); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { - ip_rt_put(rt2); - skb_dst_set(skb, &rt->u.dst); + skb_dst_drop(skb); + skb->_skb_dst = odst; return -EINVAL; } - ip_rt_put(rt); + __skb_dst_drop(odst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 82fb43c..e505007 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; - struct dst_entry *odst; + unsigned long odst; unsigned int hh_len; unsigned int type; @@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) if (ip_route_output_key(net, &rt, &fl) != 0) return -1; - odst = skb_dst(skb); + odst = skb->_skb_dst; if (ip_route_input(skb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + RT_TOS(iph->tos), rt->u.dst.dev, false) != 0) { dst_release(&rt->u.dst); return -1; } dst_release(&rt->u.dst); - dst_release(odst); + __skb_dst_drop(odst); } if (skb_dst(skb)->error) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a947428..4f169ce 100644 --- 
a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2300,7 +2300,7 @@ martian_source: } int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + u8 tos, struct net_device *dev, bool noref) { struct rtable * rth; unsigned hash; @@ -2326,10 +2326,15 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); + if (noref) { + dst_use_noref(&rth->u.dst, jiffies); + skb_dst_set_noref(skb, &rth->u.dst); + } else { + dst_use(&rth->u.dst, jiffies); + skb_dst_set(skb, &rth->u.dst); + } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2991,7 +2996,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb->protocol = htons(ETH_P_IP); skb->dev = dev; local_bh_disable(); - err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev, false); local_bh_enable(); rt = skb_rtable(skb); @@ -3055,7 +3060,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (rt_is_expired(rt)) continue; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set_noref(skb, dst_clone(&rt->u.dst)); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index c791bb6..0366cbc 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, - skb->dev)) + skb->dev, true)) goto drop; } return dst_input(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 2599870..7ae0fa5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -570,7 +570,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, - skb2->dev) || + skb2->dev, false) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) goto out; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index c49ef21..cb3cde4 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -9,6 +9,7 @@ #include <linux/rcupdate.h> #include <net/protocol.h> #include <net/netfilter/nf_queue.h> +#include <net/dst.h> #include "nf_internals.h" @@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb, dev_hold(physoutdev); } #endif + skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aeddabf..21e3976 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -179,7 +179,7 @@ static inline int qdisc_restart(struct Qdisc *q) skb = dequeue_skb(q); if (unlikely(!skb)) return 0; - + WARN_ON_ONCE(skb_dst_is_noref(skb)); root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); ^ permalink raw reply related [flat|nested] 108+ messages in thread
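The noref scheme above relies on struct dst_entry pointers being at least two-byte aligned, so bit 0 of skb->_skb_dst is free to carry a "no reference was taken" flag. A minimal user-space sketch of the same pointer tagging; only SKB_DST_NOREF and SKB_DST_PTRMASK mirror the patch, the structs and demo names are illustrative stand-ins:

#include <assert.h>
#include <stdio.h>

#define SKB_DST_NOREF	1UL
#define SKB_DST_PTRMASK	(~(SKB_DST_NOREF))

struct dst_entry { int refcnt; };		/* stand-in, not the kernel struct */
struct sk_buff { unsigned long _skb_dst; };	/* pointer and flag share one word */

static void skb_dst_set_noref_demo(struct sk_buff *skb, struct dst_entry *dst)
{
	skb->_skb_dst = (unsigned long)dst | SKB_DST_NOREF;	/* tag bit 0 */
}

static struct dst_entry *skb_dst_demo(const struct sk_buff *skb)
{
	/* mask the flag away to recover the real pointer */
	return (struct dst_entry *)(skb->_skb_dst & SKB_DST_PTRMASK);
}

int main(void)
{
	struct dst_entry d = { .refcnt = 1 };
	struct sk_buff skb;

	skb_dst_set_noref_demo(&skb, &d);
	assert(skb_dst_demo(&skb) == &d);	/* pointer survives the tagging */
	assert(skb._skb_dst & SKB_DST_NOREF);	/* and the flag is readable */
	printf("dst=%p, noref flag set\n", (void *)skb_dst_demo(&skb));
	return 0;
}

This is why skb_dst_drop() can skip dst_release() for noref entries, and why skb_dst_force() must clear the bit and take a real reference before an skb escapes the rcu-protected region.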
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-04-29 21:01 ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet 2010-04-30 13:55 ` Brian Bloniarz @ 2010-04-30 23:35 ` David Miller 2010-05-01 4:56 ` Eric Dumazet 2010-05-01 7:02 ` Eric Dumazet 1 sibling, 2 replies; 108+ messages in thread From: David Miller @ 2010-04-30 23:35 UTC (permalink / raw) To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb From: Eric Dumazet <eric.dumazet@gmail.com> Date: Thu, 29 Apr 2010 23:01:49 +0200 > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion So what's the difference between call_rcu() freeing this little waitqueue struct and doing it for the entire socket? We'll still be doing an RCU call every socket destroy, and now we also have a new memory allocation/free per connection. This has to show up in things like 'lat_connect' and friends, does it not? ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-04-30 23:35 ` David Miller @ 2010-05-01 4:56 ` Eric Dumazet 2010-05-01 7:02 ` Eric Dumazet 1 sibling, 0 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 4:56 UTC (permalink / raw) To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

On Friday, 30 April 2010 at 16:35 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 29 Apr 2010 23:01:49 +0200
>
> > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
>
> So what's the difference between call_rcu() freeing this little waitqueue
> struct and doing it for the entire socket?
>
> We'll still be doing an RCU call every socket destroy, and now we also have
> a new memory allocation/free per connection.
>
> This has to show up in things like 'lat_connect' and friends, does it not?

The difference is that this structure is small, one cache line at most, so the cost of call_rcu() on it, with the well-known cache miss, is very much reduced.

The thing that might cost is the smp_mb(), because it translates to an "mfence" instruction, and that appears to cost more than a regular "lock ..."

Unfortunately, oprofile doesn't work anymore on my bl460c machine after the last BIOS upgrade... Oh well...

^ permalink raw reply [flat|nested] 108+ messages in thread
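A hedged, kernel-style sketch of the scheme described here: the wait queue is split out of struct sock into its own small allocation, so the object handed to call_rcu() is a single cache line rather than the whole cache-cold socket. The struct and function names are illustrative, not the actual patch:

struct sock_wq {			/* illustrative name, one cache line */
	wait_queue_head_t	wait;
	struct rcu_head		rcu;
};

static void sock_wq_free(struct rcu_head *head)
{
	kfree(container_of(head, struct sock_wq, rcu));
}

static void sock_wq_release(struct sock_wq *wq)
{
	/* only this small struct waits for the grace period; the big,
	 * cache-cold socket is freed through the normal path */
	call_rcu(&wq->rcu, sock_wq_free);
}

/* reader side, e.g. sock_def_readable(): no sk_refcnt, no wq refcount */
static void wake_readers(struct sock_wq **wqp)
{
	struct sock_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(*wqp);
	if (wq && waitqueue_active(&wq->wait))
		wake_up_interruptible(&wq->wait);
	rcu_read_unlock();
}

The design choice is to pay the grace-period bookkeeping only for the object readers actually dereference locklessly, keeping the socket's own free path unchanged.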
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-04-30 23:35 ` David Miller 2010-05-01 4:56 ` Eric Dumazet @ 2010-05-01 7:02 ` Eric Dumazet 2010-05-01 8:03 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 7:02 UTC (permalink / raw) To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

On Friday, 30 April 2010 at 16:35 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 29 Apr 2010 23:01:49 +0200
>
> > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
>
> So what's the difference between call_rcu() freeing this little waitqueue
> struct and doing it for the entire socket?
>
> We'll still be doing an RCU call every socket destroy, and now we also have
> a new memory allocation/free per connection.
>
> This has to show up in things like 'lat_connect' and friends, does it not?

Before the patch:

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 27.8872 microseconds

After:

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 20.7681 microseconds

Strange, isn't it?

(Special care should be taken with this bench, as it leaves many sockets in TIME_WAIT state, so to get consistent numbers we have to wait a while before restarting it.)

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-05-01 7:02 ` Eric Dumazet @ 2010-05-01 8:03 ` Eric Dumazet 2010-05-01 22:00 ` David Miller 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 8:03 UTC (permalink / raw) To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

On Saturday, 1 May 2010 at 09:02 +0200, Eric Dumazet wrote:
> On Friday, 30 April 2010 at 16:35 -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 29 Apr 2010 23:01:49 +0200
> >
> > > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
> >
> > So what's the difference between call_rcu() freeing this little waitqueue
> > struct and doing it for the entire socket?
> >
> > We'll still be doing an RCU call every socket destroy, and now we also have
> > a new memory allocation/free per connection.
> >
> > This has to show up in things like 'lat_connect' and friends, does it not?
>
> Before the patch:
>
> lat_connect -N 10 127.0.0.1
> TCP/IP connection cost to 127.0.0.1: 27.8872 microseconds
>
> After:
>
> lat_connect -N 10 127.0.0.1
> TCP/IP connection cost to 127.0.0.1: 20.7681 microseconds
>
> Strange, isn't it?
>
> (Special care should be taken with this bench, as it leaves many sockets
> in TIME_WAIT state, so to get consistent numbers we have to wait a while
> before restarting it.)

Oops, this was with the other patch (about dst no_refcounting in the input path), sorry.

With the "sock_def_readable() and friends RCU conversion" patch I got:

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 27.6244 microseconds

Anyway, this lat_connect seems very unreliable (a lot of variance):
with linux-2.6.31, ~33 us
with linux-2.6.33, ~30 us

David, I also need this RCU thing in order to be able to group all wakeups at the end of net_rx_action().

The plan is to use RCU, so that I don't need to increase sk_refcnt when queueing a "wakeup" (and decrease sk_refcnt a long time after).

The previous attempt was a bit hacky: http://patchwork.ozlabs.org/patch/24179/

I expect the 2010 one will be cleaner :)

^ permalink raw reply [flat|nested] 108+ messages in thread
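A rough sketch of the wakeup-grouping idea mentioned here, under the assumption that the whole softirq runs inside an RCU read-side section, so a wait queue freed with call_rcu() cannot disappear before the flush runs at softirq end; all names and sizes are illustrative, not a real patch:

#define WAKEUP_BATCH 64				/* illustrative size */

struct wakeup_batch {				/* illustrative, per-softirq */
	int cnt;
	wait_queue_head_t *wq[WAKEUP_BATCH];
};

static void defer_readable(struct wakeup_batch *b, wait_queue_head_t *wq)
{
	if (b->cnt < WAKEUP_BATCH)
		b->wq[b->cnt++] = wq;		/* no sk_refcnt taken */
	else
		wake_up_interruptible(wq);	/* batch full, wake now */
}

/* run once at the end of net_rx_action() */
static void flush_wakeups(struct wakeup_batch *b)
{
	int i;

	for (i = 0; i < b->cnt; i++)
		wake_up_interruptible(b->wq[i]);
	b->cnt = 0;
}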
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion 2010-05-01 8:03 ` Eric Dumazet @ 2010-05-01 22:00 ` David Miller 0 siblings, 0 replies; 108+ messages in thread From: David Miller @ 2010-05-01 22:00 UTC (permalink / raw) To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb From: Eric Dumazet <eric.dumazet@gmail.com> Date: Sat, 01 May 2010 10:03:31 +0200 > David, I also need this RCU thing in order to be able to group all > wakeups at the end of net_rx_action(). > > Plan was to use RCU, so that I dont need to increase sk_refcnt when > queueing a "wakeup" (and decrease sk_refcnt a long time after) > > Previous attempt was a bit hacky, > http://patchwork.ozlabs.org/patch/24179/ > > I expect 2010 one will be cleaner :) Fair enough, I'm convinced now, applied thanks! ^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-29 20:36 ` jamal 2010-04-29 21:01 ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet @ 2010-04-30 19:30 ` jamal 2010-04-30 20:40 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: jamal @ 2010-04-30 19:30 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 1322 bytes --]

Eric!

I managed to mod your program to look conceptually similar to mine, and I reproduced the results with the same test kernel from yesterday. So the issue is likely in using epoll vs not using any async notification, as in your case. Results are attached, as well as the modified program.

Note the key thing to remember: rps with this program gets worse over time and across the different net-next kernels since Apr 14 (look at the graph I supplied). Sorry, I am too busy to dig any further.

cheers,
jamal

On Thu, 2010-04-29 at 16:36 -0400, jamal wrote:
> On Thu, 2010-04-29 at 09:56 -0400, jamal wrote:
> >
> > I will try your program instead so we can reduce the variables
>
> Results attached.
> With your app rps does a hell of a lot better and non-rps worse ;->
> With my proggie, non-rps does much better than yours and rps does
> a lot worse for the same setup. I see the scheduler kicking in quite a bit
> in non-rps for you...
>
> The main difference between us as I see it is:
> a) I use epoll - actually linked to libevent (1.0.something)
> b) I fork processes and you use pthreads.
>
> I don't have time to chase it today, but 1) I am either going to change
> yours to use libevent or make mine get rid of it, then 2) move towards
> pthreads or have yours fork..
> then observe if that makes any difference..
> > > cheers, > jamal [-- Attachment #2: apr30-ericmod --] [-- Type: text/plain, Size: 8919 bytes --] First a few runs with Eric's code + epoll/libevent ------------------------------------------------------------------------------- PerfTop: 4009 irqs/sec kernel:83.4% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 2097.00 8.6% sky2_poll [sky2] 1742.00 7.2% _raw_spin_lock_irqsave [kernel] 831.00 3.4% system_call [kernel] 654.00 2.7% copy_user_generic_string [kernel] 654.00 2.7% datagram_poll [kernel] 647.00 2.7% fget [kernel] 623.00 2.6% _raw_spin_unlock_irqrestore [kernel] 547.00 2.3% _raw_spin_lock_bh [kernel] 506.00 2.1% sys_epoll_ctl [kernel] 475.00 2.0% kmem_cache_free [kernel] 466.00 1.9% schedule [kernel] 436.00 1.8% vread_tsc [kernel].vsyscall_fn 417.00 1.7% fput [kernel] 415.00 1.7% sys_epoll_wait [kernel] 402.00 1.7% _raw_spin_lock [kernel] ------------------------------------------------------------------------------- PerfTop: 616 irqs/sec kernel:98.7% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 2534.00 28.6% sky2_poll [sky2] 503.00 5.7% ip_route_input [kernel] 438.00 4.9% _raw_spin_lock_irqsave [kernel] 418.00 4.7% __udp4_lib_lookup [kernel] 378.00 4.3% __alloc_skb [kernel] 364.00 4.1% ip_rcv [kernel] 323.00 3.6% _raw_spin_lock [kernel] 315.00 3.5% sock_queue_rcv_skb [kernel] 284.00 3.2% __netif_receive_skb [kernel] 281.00 3.2% __udp4_lib_rcv [kernel] 266.00 3.0% __wake_up_common [kernel] 238.00 2.7% sock_def_readable [kernel] 181.00 2.0% __kmalloc [kernel] 163.00 1.8% kmem_cache_alloc [kernel] 150.00 1.7% ep_poll_callback [kernel] ------------------------------------------------------------------------------- PerfTop: 854 irqs/sec kernel:80.2% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 341.00 8.0% _raw_spin_lock_irqsave [kernel] 235.00 5.5% system_call [kernel] 174.00 4.1% datagram_poll [kernel] 174.00 4.1% fget [kernel] 173.00 4.1% copy_user_generic_string [kernel] 135.00 3.2% _raw_spin_unlock_irqrestore [kernel] 125.00 2.9% _raw_spin_lock_bh [kernel] 122.00 2.9% schedule [kernel] 113.00 2.6% sys_epoll_ctl [kernel] 113.00 2.6% kmem_cache_free [kernel] 108.00 2.5% vread_tsc [kernel].vsyscall_fn 105.00 2.5% sys_epoll_wait [kernel] 102.00 2.4% udp_recvmsg [kernel] 95.00 2.2% mutex_lock [kernel] Average 97.55% of 10M packets at 750Kpps Turn on rps mask ee and irq affinity to cpu0 ------------------------------------------------------------------------------- PerfTop: 3885 irqs/sec kernel:83.6% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ________ 2945.00 16.7% sky2_poll [sky2] 653.00 3.7% _raw_spin_lock_irqsave [kernel] 460.00 2.6% system_call [kernel] 420.00 2.4% _raw_spin_unlock_irqrestore [kernel] 414.00 2.3% sky2_intr [sky2] 392.00 2.2% fget [kernel] 360.00 2.0% ip_rcv [kernel] 324.00 1.8% sys_epoll_ctl [kernel] 323.00 1.8% __netif_receive_skb [kernel] 310.00 1.8% schedule [kernel] 292.00 1.7% ip_route_input [kernel] 292.00 1.7% _raw_spin_lock [kernel] 291.00 1.7% copy_user_generic_string [kernel] 
284.00 1.6% kmem_cache_free [kernel] 262.00 1.5% call_function_single_interrupt [kernel] ------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:98.1% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________________ ________ 4170.00 61.9% sky2_poll [sky2] 723.00 10.7% sky2_intr [sky2] 159.00 2.4% __alloc_skb [kernel] 140.00 2.1% get_rps_cpu [kernel] 106.00 1.6% __kmalloc [kernel] 95.00 1.4% enqueue_to_backlog [kernel] 86.00 1.3% kmem_cache_alloc [kernel] 85.00 1.3% irq_entries_start [kernel] 85.00 1.3% _raw_spin_lock_irqsave [kernel] 82.00 1.2% _raw_spin_lock [kernel] 66.00 1.0% swiotlb_sync_single [kernel] 58.00 0.9% sky2_remove [sky2] 49.00 0.7% default_send_IPI_mask_sequence_phys [kernel] 47.00 0.7% sky2_rx_submit [sky2] 36.00 0.5% _raw_spin_unlock_irqrestore [kernel] ------------------------------------------------------------------------------- PerfTop: 344 irqs/sec kernel:84.3% [1000Hz cycles], (all, cpu: 2) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ____________________ 114.00 5.2% _raw_spin_lock_irqsave [kernel] 79.00 3.6% fget [kernel] 78.00 3.6% ip_rcv [kernel] 78.00 3.6% system_call [kernel] 75.00 3.4% _raw_spin_unlock_irqrestore [kernel] 67.00 3.1% sys_epoll_ctl [kernel] 65.00 3.0% schedule [kernel] 61.00 2.8% ip_route_input [kernel] 48.00 2.2% vread_tsc [kernel].vsyscall_fn 48.00 2.2% call_function_single_interrupt [kernel] 46.00 2.1% kmem_cache_free [kernel] 45.00 2.1% __netif_receive_skb [kernel] 41.00 1.9% process_recv snkudp 40.00 1.8% kfree [kernel] 39.00 1.8% _raw_spin_lock [kernel] 92.97% of 10M packets at 750Kpps Ok, so this is exactly what i saw with my app. non-rps is better. To summarize: It used to be the opposite on net-next before around Apr14. rps has gotten worse. 
[-- Attachment #3: udpsnkfrk.c --] [-- Type: text/x-csrc, Size: 3650 bytes --]

/*
 * Usage: udpsink [ -p baseport] nbports
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <pthread.h>	/* for pthread_create() */
#include <event.h>

struct worker_data {
	struct event *snk_ev;
	struct event_base *base;
	struct timeval t;
	unsigned long pack_count;
	unsigned long bytes_count;
	unsigned long tout;
	int fd;		/* move to avoid hole on 64-bit */
	int pad1;	/* 64B - let Eric figure the math ;-> */
	//unsigned long _padd[16 - 3]; /* alignment */
};

void usage(int code)
{
	fprintf(stderr, "Usage: udpsink [-p baseport] nbports\n");
	exit(code);
}

void process_recv(int fd, short ev, void *arg)
{
	char buffer[4096];
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);
	struct worker_data *wdata = (struct worker_data *)arg;
	int lu = 0;

	/* non-persistent event: must be re-added on every callback */
	if ((event_add(wdata->snk_ev, &wdata->t)) < 0) {
		perror("cb event_add");
		return;
	}
	if (ev == EV_TIMEOUT) {
		wdata->tout++;
	} else {
		lu = recvfrom(wdata->fd, buffer, sizeof(buffer), 0,
			      (struct sockaddr *)&addr, &len);
		if (lu > 0) {
			wdata->pack_count++;
			wdata->bytes_count += lu;
		}
	}
}

int prep_thread(struct worker_data *wdata)
{
	wdata->t.tv_sec = 1;
	wdata->t.tv_usec = random() % 50000L;
	wdata->base = event_init();
	event_set(wdata->snk_ev, wdata->fd, EV_READ, process_recv, wdata);
	event_base_set(wdata->base, wdata->snk_ev);
	if ((event_add(wdata->snk_ev, &wdata->t)) < 0) {
		perror("event_add");
		return -1;
	}
	return 0;
}

void *worker_func(void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;

	return (void *)event_base_loop(wdata->base, 0);
}

int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;
	int concurrent = 0;
	int verbose = 0;
	int i;

	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}
	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	wdata = calloc(sizeof(struct worker_data), nbthreads);
	if (!wdata) {
		perror("calloc");
		return 1;
	}
	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;

		if (i && concurrent) {
			wdata[i].fd = wdata[0].fd;
		} else {
			wdata[i].snk_ev = malloc(sizeof(struct event));
			if (!wdata[i].snk_ev)
				return 1;
			memset(wdata[i].snk_ev, 0, sizeof(struct event));
			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				free(wdata[i].snk_ev);
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
//			addr.sin_addr.s_addr = inet_addr(argv[optind]);
			addr.sin_port = htons(baseport + i);
			if (bind(wdata[i].fd, (struct sockaddr *)&addr,
				 sizeof(addr)) < 0) {
				free(wdata[i].snk_ev);
				perror("bind");
				return 1;
			}
//			fcntl(wdata[i].fd, F_SETFL, O_NDELAY);
		}
		if (prep_thread(wdata + i)) {
			printf("failed to allocate thread %d, exit\n", i);
			exit(0);
		}
		pthread_create(&tid, NULL, worker_func, wdata + i);
	}
	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++)
			total += wdata[i].pack_count;
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i,
						       wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}

^ permalink raw reply [flat|nested] 108+ messages in thread
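Eric's later version of this program pads struct worker_data heavily ("avoid false sharing"). A minimal sketch of the underlying point, assuming 64-byte cache lines (the struct and array here are illustrative, not part of the benchmark):

#include <stdio.h>

/* Force each worker's hot counters onto their own cache line, so one
 * cpu's per-packet increments do not keep invalidating the line that
 * holds a neighbouring worker's counters. */
struct counters {
	unsigned long pack_count;
	unsigned long bytes_count;
} __attribute__((aligned(64)));		/* one 64-byte line per worker */

static struct counters stats[8];	/* illustrative worker count */

int main(void)
{
	/* prints 64: the _padd[99] array in Eric's version achieves the
	 * same separation by brute force */
	printf("sizeof(struct counters) = %zu\n", sizeof(struct counters));
	return 0;
}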
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-30 19:30 ` [PATCH net-next-2.6] net: speedup udp receive path jamal @ 2010-04-30 20:40 ` Eric Dumazet 2010-05-01 0:06 ` jamal 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-04-30 20:40 UTC (permalink / raw) To: hadi Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Friday, 30 April 2010 at 15:30 -0400, jamal wrote:
> Eric!
>
> I managed to mod your program to look conceptually similar to mine,
> and I reproduced the results with the same test kernel from yesterday.
> So the issue is likely in using epoll vs not using any async notification,
> as in your case.
> Results are attached, as well as the modified program.
>
> Note the key thing to remember:
> rps with this program gets worse over time and across the different net-next
> kernels since Apr 14 (look at the graph I supplied). Sorry, I am
> too busy to dig any further.
>
> cheers,
> jamal

I am lost.

I used your program, and with RPS off, I can get at most 220.000 pps with my "old" hardware. I don't understand how you can reach 700.000 pps with RPS off. Or is it with your Nehalem?

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-30 20:40 ` Eric Dumazet @ 2010-05-01 0:06 ` jamal 2010-05-01 5:57 ` Eric Dumazet 0 siblings, 1 reply; 108+ messages in thread From: jamal @ 2010-05-01 0:06 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Fri, 2010-04-30 at 22:40 +0200, Eric Dumazet wrote:
>
> I used your program, and with RPS off, I can get at most 220.000 pps
> with my "old" hardware. I don't understand how you can reach 700.000 pps
> with RPS off. Or is it with your Nehalem?

Yes, Nehalem.
RPS off is better (~700Kpps) than RPS on (~650Kpps). Are you seeing the same trend on the old hardware?

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 0:06 ` jamal @ 2010-05-01 5:57 ` Eric Dumazet 2010-05-01 6:14 ` Eric Dumazet 2010-05-01 11:23 ` jamal 0 siblings, 2 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 5:57 UTC (permalink / raw) To: hadi Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Friday, 30 April 2010 at 20:06 -0400, jamal wrote:
> Yes, Nehalem.
> RPS off is better (~700Kpps) than RPS on (~650Kpps). Are you seeing the
> same trend on the old hardware?

Of course not! Otherwise RPS would be useless :(

I changed your program a bit to use EV_PERSIST (to avoid the epoll_ctl() overhead for each packet...)

RPS off: 220.000 pps

RPS on (ee mask): 700.000 pps (with a slightly modified tg3 driver), 96% of delivered packets

This is on a tg3 adapter, and tg3 has a copybreak feature: small packets are copied into an skb of the right size.

define TG3_RX_COPY_THRESHOLD 256 -> 40 ...

We really should disable this feature for RPS workloads; unfortunately, ethtool cannot tweak this.

So the profile of cpu 0 (RPS ON) looks like:

--------------------------------------------------------------------------------
   PerfTop: 1001 irqs/sec  kernel:99.7% [1000Hz cycles], (all, cpu: 0)
--------------------------------------------------------------------------------

   samples  pcnt  function                DSO
   _______  _____ ______________________  _______

    819.00  12.6% __alloc_skb             vmlinux
    592.00   9.1% eth_type_trans          vmlinux
    509.00   7.8% _raw_spin_lock          vmlinux
    475.00   7.3% __kmalloc_track_caller  vmlinux
    358.00   5.5% tg3_read32              vmlinux
    345.00   5.3% __netdev_alloc_skb      vmlinux
    329.00   5.0% kmem_cache_alloc        vmlinux
    307.00   4.7% _raw_spin_lock_irqsave  vmlinux
    284.00   4.4% bnx2_interrupt          vmlinux
    277.00   4.2% skb_pull                vmlinux
    248.00   3.8% tg3_poll_work           vmlinux
    202.00   3.1% __slab_alloc            vmlinux
    197.00   3.0% get_rps_cpu             vmlinux
    106.00   1.6% enqueue_to_backlog      vmlinux
     87.00   1.3% _raw_spin_lock_bh       vmlinux
     80.00   1.2% __copy_to_user_ll       vmlinux
     77.00   1.2% nommu_map_page          vmlinux
     77.00   1.2% __napi_gro_receive      vmlinux
     65.00   1.0% tg3_alloc_rx_skb        vmlinux
     60.00   0.9% skb_gro_reset_offset    vmlinux
     57.00   0.9% skb_put                 vmlinux
     57.00   0.9% __slab_free             vmlinux

/*
 * Usage: udpsnkfrk [ -p baseport] nbports
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <pthread.h>	/* for pthread_create() */
#include <event.h>

struct worker_data {
	struct event *snk_ev;
	struct event_base *base;
	struct timeval t;
	unsigned long pack_count;
	unsigned long bytes_count;
	unsigned long tout;
	int fd;		/* move to avoid hole on 64-bit */
	int pad1;
	unsigned long _padd[99]; /* avoid false sharing */
};

void usage(int code)
{
	fprintf(stderr, "Usage: udpsink [-p baseport] nbports\n");
	exit(code);
}

void process_recv(int fd, short ev, void *arg)
{
	char buffer[4096];
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);
	struct worker_data *wdata = (struct worker_data *)arg;
	int lu = 0;

	if (ev == EV_TIMEOUT) {
		wdata->tout++;
		if ((event_add(wdata->snk_ev, &wdata->t)) < 0) {
			perror("cb event_add");
			return;
		}
	} else {
		/* persistent event: drain the socket instead of re-adding */
		do {
			lu = recvfrom(wdata->fd, buffer, sizeof(buffer), 0,
				      (struct sockaddr *)&addr, &len);
			if (lu > 0) {
				wdata->pack_count++;
				wdata->bytes_count += lu;
			}
		} while (lu > 0);
	}
}

int prep_thread(struct worker_data *wdata)
{
	wdata->t.tv_sec = 1;
	wdata->t.tv_usec = random() % 50000L;
	wdata->base = event_init();
	event_set(wdata->snk_ev, wdata->fd,
		  EV_READ|EV_PERSIST, process_recv, wdata);
	event_base_set(wdata->base, wdata->snk_ev);
	if ((event_add(wdata->snk_ev, &wdata->t)) < 0) {
		perror("event_add");
		return -1;
	}
	return 0;
}

void *worker_func(void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;

	return (void *)event_base_loop(wdata->base, 0);
}

int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;
	int concurrent = 0;
	int verbose = 0;
	int i;

	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}
	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	wdata = calloc(sizeof(struct worker_data), nbthreads);
	if (!wdata) {
		perror("calloc");
		return 1;
	}
	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;

		if (i && concurrent) {
			wdata[i].fd = wdata[0].fd;
		} else {
			wdata[i].snk_ev = malloc(sizeof(struct event));
			if (!wdata[i].snk_ev)
				return 1;
			memset(wdata[i].snk_ev, 0, sizeof(struct event));
			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				free(wdata[i].snk_ev);
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
//			addr.sin_addr.s_addr = inet_addr(argv[optind]);
			addr.sin_port = htons(baseport + i);
			if (bind(wdata[i].fd, (struct sockaddr *)&addr,
				 sizeof(addr)) < 0) {
				free(wdata[i].snk_ev);
				perror("bind");
				return 1;
			}
			fcntl(wdata[i].fd, F_SETFL, O_NDELAY);
		}
		if (prep_thread(wdata + i)) {
			printf("failed to allocate thread %d, exit\n", i);
			exit(0);
		}
		pthread_create(&tid, NULL, worker_func, wdata + i);
	}
	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++)
			total += wdata[i].pack_count;
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i,
						       wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}

^ permalink raw reply [flat|nested] 108+ messages in thread
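For reference, a hedged sketch of what a driver rx copybreak path looks like (illustrative, not the actual tg3 code): below the threshold, the frame is copied into a small fresh skb and the big rx buffer is recycled. With RPS, running this copy after get_rps_cpu() rather than before would move the memcpy onto the target cpu.

#define RX_COPY_THRESHOLD 256		/* tg3's default threshold */

static struct sk_buff *rx_copybreak(struct net_device *dev,
				    struct sk_buff *big, unsigned int len)
{
	struct sk_buff *small;

	if (len >= RX_COPY_THRESHOLD)
		return big;		/* pass the full buffer up */

	small = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!small)
		return big;		/* allocation failed, fall back */

	skb_reserve(small, NET_IP_ALIGN);
	skb_copy_to_linear_data(small, big->data, len);
	skb_put(small, len);
	/* the caller re-posts 'big' to the rx ring instead of freeing it */
	return small;
}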
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 5:57 ` Eric Dumazet @ 2010-05-01 6:14 ` Eric Dumazet 2010-05-01 10:24 ` Changli Gao 2010-05-01 11:29 ` jamal 1 sibling, 2 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 6:14 UTC (permalink / raw) To: hadi Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Saturday, 1 May 2010 at 07:57 +0200, Eric Dumazet wrote:
> On Friday, 30 April 2010 at 20:06 -0400, jamal wrote:
>
> > Yes, Nehalem.
> > RPS off is better (~700Kpps) than RPS on (~650Kpps). Are you seeing the
> > same trend on the old hardware?
>
> Of course not! Otherwise RPS would be useless :(
>
> I changed your program a bit to use EV_PERSIST (to avoid the epoll_ctl()
> overhead for each packet...)
>
> RPS off: 220.000 pps
>
> RPS on (ee mask): 700.000 pps (with a slightly modified tg3 driver),
> 96% of delivered packets

BTW, using the ee mask, cpu4 is not used at _all_, even for the user threads. The scheduler does a bad job, IMHO.

Using the fe mask, I get all packets (sent at 733311 pps by my pktgen machine), and my CPU0 even has idle time!!!

The limit seems to be around 800.000 pps.

--------------------------------------------------------------------------------
   PerfTop: 5616 irqs/sec  kernel:93.9% [1000Hz cycles], (all, 8 CPUs)
--------------------------------------------------------------------------------

   samples  pcnt  function                     DSO
   _______  _____ ___________________________  _______

   3492.00   6.2% __slab_free                  vmlinux
   2334.00   4.2% _raw_spin_lock               vmlinux
   2314.00   4.1% _raw_spin_lock_irqsave       vmlinux
   1807.00   3.2% ip_rcv                       vmlinux
   1605.00   2.9% schedule                     vmlinux
   1474.00   2.6% __netif_receive_skb          vmlinux
   1464.00   2.6% kfree                        vmlinux
   1405.00   2.5% ip_route_input               vmlinux
   1318.00   2.4% __copy_to_user_ll            vmlinux
   1214.00   2.2% __alloc_skb                  vmlinux
   1160.00   2.1% nf_hook_slow                 vmlinux
   1020.00   1.8% eth_type_trans               vmlinux
    860.00   1.5% sched_clock_local            vmlinux
    775.00   1.4% read_tsc                     vmlinux
    773.00   1.4% ipt_do_table                 vmlinux
    766.00   1.4% _raw_spin_unlock_irqrestore  vmlinux
    748.00   1.3% sock_recv_ts_and_drops       vmlinux
    747.00   1.3% ia32_sysenter_target         vmlinux
    740.00   1.3% select_nohz_load_balancer    vmlinux
    644.00   1.2% __kmalloc_track_caller       vmlinux
    596.00   1.1% tg3_read32                   vmlinux
    566.00   1.0% __udp4_lib_lookup            vmlinux

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 6:14 ` Eric Dumazet @ 2010-05-01 10:24 ` Changli Gao 2010-05-01 10:47 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: Changli Gao @ 2010-05-01 10:24 UTC (permalink / raw) To: Eric Dumazet Cc: hadi, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Sat, May 1, 2010 at 2:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> BTW, using the ee mask, cpu4 is not used at _all_, even for the user
> threads. The scheduler does a bad job, IMHO.
>
> Using the fe mask, I get all packets (sent at 733311 pps by my pktgen
> machine), and my CPU0 even has idle time!!!
>
> The limit seems to be around 800.000 pps.
>
>    PerfTop: 5616 irqs/sec  kernel:93.9% [1000Hz cycles], (all, 8 CPUs)
>

Oh, cpu0 usage is about 100-(100-93.9)*8 = 51.2% (am I right?). If we do weighted packet distribution, with cpu0's weight 1 and the other cpus' weight 2, maybe we can utilize all the cpu power.

--
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply [flat|nested] 108+ messages in thread
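A small sketch of the weighted distribution Changli suggests (illustrative, not the kernel's get_rps_cpu()): expand each cpu into a number of buckets equal to its weight, then pick a bucket by flow hash, so a lightly weighted cpu0 receives proportionally fewer flows:

#include <stdio.h>

static int bucket[16];			/* weights must sum to <= 16 here */
static int nbuckets;

static void add_cpu(int cpu, int weight)
{
	while (weight-- > 0)
		bucket[nbuckets++] = cpu;	/* cpu appears 'weight' times */
}

static int pick_cpu(unsigned int rxhash)
{
	return bucket[rxhash % nbuckets];
}

int main(void)
{
	unsigned int h;

	add_cpu(0, 1);			/* cpu0 also services the irq: weight 1 */
	add_cpu(1, 2);			/* the other cpus get weight 2 */
	add_cpu(2, 2);
	add_cpu(3, 2);
	for (h = 0; h < 7; h++)
		printf("hash %u -> cpu %d\n", h, pick_cpu(h));
	return 0;
}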
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 10:24 ` Changli Gao @ 2010-05-01 10:47 ` Eric Dumazet 0 siblings, 0 replies; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 10:47 UTC (permalink / raw) To: Changli Gao Cc: hadi, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Saturday, 1 May 2010 at 18:24 +0800, Changli Gao wrote:
> On Sat, May 1, 2010 at 2:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > BTW, using the ee mask, cpu4 is not used at _all_, even for the user
> > threads. The scheduler does a bad job, IMHO.
> >
> > Using the fe mask, I get all packets (sent at 733311 pps by my pktgen
> > machine), and my CPU0 even has idle time!!!
> >
> > The limit seems to be around 800.000 pps.
>
> Oh, cpu0 usage is about 100-(100-93.9)*8 = 51.2% (am I right?). If we
> do weighted packet distribution, with cpu0's weight 1 and the other
> cpus' weight 2, maybe we can utilize all the cpu power.

Nope, cpu0 was at 100% in this test; the other cpus were at about 50% each.

Weighted would be OK if I wanted to use cpu0 among the 'slave' cpus (RPS targets). But I know the workload I am interested in, and for the ability to resist DDOS, I want to keep cpu0 outside of the IP/TCP/UDP stack.

Later, inlining skb_pull() in eth_type_trans() permitted reaching 840.000 pps.

top - 12:42:55 up 3:00, 2 users, load average: 0.44, 0.11, 0.03
Tasks: 126 total, 1 running, 125 sleeping, 0 stopped, 0 zombie
Cpu(s): 2.2%us, 16.5%sy, 0.0%ni, 46.5%id, 11.4%wa, 0.9%hi, 22.5%si, 0.0%st
Mem: 4148112k total, 211152k used, 3936960k free, 15228k buffers
Swap: 4192928k total, 0k used, 4192928k free, 121804k cached

You can see an average idle of 46%, so there are probably more optimizations to do, to reach maybe 1.300.000 pps ;)

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 6:14 ` Eric Dumazet 2010-05-01 10:24 ` Changli Gao @ 2010-05-01 11:29 ` jamal 1 sibling, 0 replies; 108+ messages in thread From: jamal @ 2010-05-01 11:29 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 08:14 +0200, Eric Dumazet wrote:

> BTW, using the ee mask, cpu4 is not used at _all_, even for the user
> threads. The scheduler does a bad job, IMHO.

I have the opposite frustration ;-> I did notice it got used. My goal was to totally avoid using it, for the simple reason that it is an SMT thread that shares the same core as cpu0. In retrospect I should probably set irq affinity to cpu0 and 4 then.

> Using the fe mask, I get all packets (sent at 733311 pps by my pktgen
> machine), and my CPU0 even has idle time!!!

I will try this the next time I get the chance.

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
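One way to express jamal's goal in code is an affinity mask that keeps user threads off the irq core and its SMT sibling. A minimal sketch, assuming the cpu0/cpu4 sibling pair of this thread's Nehalem box (not a universal topology):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

/* Allow the thread to run on every cpu except the irq core and its
 * SMT sibling, leaving those to NIC interrupt and softirq work. */
static int pin_away_from_irq_cpus(pthread_t tid, int ncpus)
{
	cpu_set_t set;
	int cpu;

	CPU_ZERO(&set);
	for (cpu = 0; cpu < ncpus; cpu++)
		if (cpu != 0 && cpu != 4)	/* irq core + sibling */
			CPU_SET(cpu, &set);
	return pthread_setaffinity_np(tid, sizeof(set), &set);
}

int main(void)
{
	int err = pin_away_from_irq_cpus(pthread_self(), 8);

	if (err)
		fprintf(stderr, "pthread_setaffinity_np: %s\n", strerror(err));
	else
		printf("worker confined to cpus 1-3,5-7\n");
	return 0;
}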
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 5:57 ` Eric Dumazet 2010-05-01 6:14 ` Eric Dumazet @ 2010-05-01 11:23 ` jamal 2010-05-01 11:42 ` Eric Dumazet 1 sibling, 1 reply; 108+ messages in thread From: jamal @ 2010-05-01 11:23 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 07:57 +0200, Eric Dumazet wrote:

> I changed your program a bit to use EV_PERSIST (to avoid the epoll_ctl()
> overhead for each packet...)

That's a different test case then ;-> You can also get rid of the timer (I doubt it will show much difference in results); I have it in there because I am trying to replicate what I saw causing the regression.

> RPS off: 220.000 pps
>
> RPS on (ee mask): 700.000 pps (with a slightly modified tg3 driver),
> 96% of delivered packets

That's a very, very huge gap. What were the numbers before you changed to EV_PERSIST?

Note: I did not add any of your other patches for dst refcnt, sockets, etc. Were you running with those patches in these tests? I will try, the next opportunity I get, to have the latest kernel + those patches.

> This is on a tg3 adapter, and tg3 has a copybreak feature: small packets
> are copied into an skb of the right size.

Ok, so the driver tuning is also important then (and it shows in the profile).

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 11:23 ` jamal @ 2010-05-01 11:42 ` Eric Dumazet 2010-05-01 11:56 ` jamal 0 siblings, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 11:42 UTC (permalink / raw) To: hadi Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Saturday, 1 May 2010 at 07:23 -0400, jamal wrote:
> On Sat, 2010-05-01 at 07:57 +0200, Eric Dumazet wrote:
>
> > I changed your program a bit to use EV_PERSIST (to avoid the epoll_ctl()
> > overhead for each packet...)
>
> That's a different test case then ;-> You can also get rid of the timer
> (I doubt it will show much difference in results); I have it in there
> because I am trying to replicate what I saw causing the regression.
>
> > RPS off: 220.000 pps
> >
> > RPS on (ee mask): 700.000 pps (with a slightly modified tg3 driver),
> > 96% of delivered packets
>
> That's a very, very huge gap. What were the numbers before you changed to
> EV_PERSIST?

But the whole point of epoll is to not change the interest set each time you get an event.

Without EV_PERSIST, you need two more syscalls per recvfrom():

epoll_wait()
epoll_ctl(REMOVE)
epoll_ctl(ADD)
recvfrom()

Even poll() would be faster in your case:

poll(one fd)
recvfrom()

> Note: I did not add any of your other patches for dst refcnt, sockets,
> etc. Were you running with those patches in these tests? I will try, the
> next opportunity I get, to have the latest kernel + those patches.
>
> > This is on a tg3 adapter, and tg3 has a copybreak feature: small packets
> > are copied into an skb of the right size.
>
> Ok, so the driver tuning is also important then (and it shows in the
> profile).

I always thought copybreak was borderline... It can help to reduce memory footprint (allocating 128 bytes instead of 2048/4096 bytes per frame), but with RPS, it would make sense to perform copybreak after RPS, not before.

Reducing memory footprint also means fewer changes to udp_memory_allocated/tcp_memory_allocated (memory reclaim logic).

^ permalink raw reply [flat|nested] 108+ messages in thread
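What EV_PERSIST buys, translated to raw epoll (a sketch, assuming a level-triggered UDP socket): a persistent registration is a single epoll_ctl() for the fd's lifetime, while a one-shot style registration must be re-armed around every event, which is the per-packet epoll_ctl() overhead described above.

#include <string.h>
#include <sys/epoll.h>

/* register once; the fd stays armed for every later epoll_wait() */
static int register_persistent(int epfd, int fd)
{
	struct epoll_event ev;

	memset(&ev, 0, sizeof(ev));
	ev.events = EPOLLIN;		/* no re-add needed per event */
	ev.data.fd = fd;
	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}

/* one-shot style: this must run after every single event before the fd
 * can fire again (the fd was added once beforehand with EPOLLONESHOT) */
static int rearm_oneshot(int epfd, int fd)
{
	struct epoll_event ev;

	memset(&ev, 0, sizeof(ev));
	ev.events = EPOLLIN | EPOLLONESHOT;
	ev.data.fd = fd;
	return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}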
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 11:42 ` Eric Dumazet @ 2010-05-01 11:56 ` jamal 2010-05-01 13:22 ` Eric Dumazet 2010-05-03 20:10 ` jamal 0 siblings, 2 replies; 108+ messages in thread From: jamal @ 2010-05-01 11:56 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 13:42 +0200, Eric Dumazet wrote:

> But the whole point of epoll is to not change the interest set each time
> you get an event.
>
> Without EV_PERSIST, you need two more syscalls per recvfrom():
>
> epoll_wait()
> epoll_ctl(REMOVE)
> epoll_ctl(ADD)
> recvfrom()
>
> Even poll() would be faster in your case:
>
> poll(one fd)
> recvfrom()

This is true - but my goal was/is to replicate the regression I was seeing [1]. I will try with PERSIST at the next opportunity. If it gets better, then it is something that needs documentation in the doc Tom promised ;->

> I always thought copybreak was borderline...
> It can help to reduce memory footprint (allocating 128 bytes instead of
> 2048/4096 bytes per frame), but with RPS, it would make sense to perform
> copybreak after RPS, not before.
>
> Reducing memory footprint also means fewer changes to
> udp_memory_allocated/tcp_memory_allocated (memory reclaim logic)

Indeed, something that didn't cross my mind in the rush to test - it is one of those things that needs to be mentioned in some doc somewhere. Tom, are you listening? ;->

cheers,
jamal

[1] i.e. with this program rps was getting worse (it was much better before, say, net-next of Apr 14), and non-rps has been getting better numbers since. The regression is real - but it is likely in another subsystem.

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 11:56 ` jamal @ 2010-05-01 13:22 ` Eric Dumazet 2010-05-01 13:49 ` jamal 1 sibling, 1 reply; 108+ messages in thread From: Eric Dumazet @ 2010-05-01 13:22 UTC (permalink / raw) To: hadi Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Saturday, 1 May 2010 at 07:56 -0400, jamal wrote:
>
> [1] i.e. with this program rps was getting worse (it was much better
> before, say, net-next of Apr 14), and non-rps has been getting better
> numbers since. The regression is real - but it is likely in another
> subsystem.

You must understand that the whole 'bench' is mostly governed by scheduler artifacts. The regression you mention is probably a side effect.

By slowing down one part, it's possible to zap all calls to the scheduler and go maybe 300% faster (because consumer threads can avoid 3/4 of the calls into the scheduler).

Reciprocally, optimizing one part of the network stack might make threads hit an empty queue and need to call the scheduler more often.

This is why some highly specialized programs never block/schedule and perform busy loops instead.

^ permalink raw reply [flat|nested] 108+ messages in thread
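A minimal sketch of that busy-loop style: a nonblocking socket drained in a spin loop, trading a fully burned cpu for zero scheduler involvement (no wakeups, no context switches on the consumer side).

#include <errno.h>
#include <sys/socket.h>

/* Spin on a socket without ever blocking: EAGAIN just means "poll
 * again"; the thread never enters the scheduler to wait for data. */
static unsigned long busy_drain(int fd)
{
	char buf[4096];
	unsigned long packets = 0;
	ssize_t n;

	for (;;) {
		n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
		if (n > 0)
			packets++;
		else if (n < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
			break;		/* a real error ends the loop */
	}
	return packets;
}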
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 13:22 ` Eric Dumazet @ 2010-05-01 13:49 ` jamal 0 siblings, 0 replies; 108+ messages in thread From: jamal @ 2010-05-01 13:49 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 15:22 +0200, Eric Dumazet wrote:

> You must understand that the whole 'bench' is mostly governed by
> scheduler artifacts. The regression you mention is probably a side
> effect.

Likely.

> By slowing down one part, it's possible to zap all calls to the scheduler
> and go maybe 300% faster (because consumer threads can avoid 3/4 of the
> calls into the scheduler).
>
> Reciprocally, optimizing one part of the network stack might make
> threads hit an empty queue and need to call the scheduler more often.

It is fair to say that what I am seeing is _not_ fatal, because it is rps that is regressing; non-rps is fine. I would consider non-rps to be the common use scenario, and if that were doing badly then it would be a problem. The good news is that it is getting better - likely because of some changes made on behalf of rps ;->

With rps, one could follow some instructions on how to make it better. I am hoping that some of the system "magic" gets documented, as Tom mentioned he would.

> This is why some highly specialized programs never block/schedule and
> perform busy loops instead.

Agreed. My brain cells should learn to accept this fact ;->

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-05-01 11:56 ` jamal 2010-05-01 13:22 ` Eric Dumazet @ 2010-05-03 20:10 ` jamal 1 sibling, 0 replies; 108+ messages in thread From: jamal @ 2010-05-03 20:10 UTC (permalink / raw) To: Eric Dumazet Cc: Changli Gao, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 07:56 -0400, jamal wrote:
> On Sat, 2010-05-01 at 13:42 +0200, Eric Dumazet wrote:
>
> > But the whole point of epoll is to not change the interest set each time
> > you get an event.
> >
> > Without EV_PERSIST, you need two more syscalls per recvfrom():
> >
> > epoll_wait()
> > epoll_ctl(REMOVE)
> > epoll_ctl(ADD)
> > recvfrom()
> >
> > Even poll() would be faster in your case:
> >
> > poll(one fd)
> > recvfrom()
>
> This is true - but my goal was/is to replicate the regression I was
> seeing [1].
> I will try with PERSIST at the next opportunity. If it gets better,
> then it is something that needs documentation in the doc Tom
> promised ;->

I tried it with PERSIST and today's net-next, and you are right: rps was better (99.4% vs 98.1% of 750Kpps). If however I removed the PERSIST, i.e. both rps and non-rps have two extra syscalls, rps again performed worse (93.2% vs 97.8% of 750Kpps).

Eric, I know the answer is not to use the non-PERSIST mode for rps ;-> But let's just ignore that for a sec: what the heck is going on? I would expect the degradation to be the same for both rps and non-rps. I also want to do the broken-record reminder that kernels before net-next of Apr 14 were doing about 97% (as opposed to 93% currently for the same test).

cheers,
jamal

^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [PATCH net-next-2.6] net: speedup udp receive path 2010-04-29 12:45 ` Eric Dumazet 2010-04-29 13:17 ` jamal @ 2010-04-29 23:07 ` Changli Gao 1 sibling, 0 replies; 108+ messages in thread From: Changli Gao @ 2010-04-29 23:07 UTC (permalink / raw) To: Eric Dumazet Cc: hadi, David Miller, therbert, shemminger, netdev, Eilon Greenstein, Brian Bloniarz

On Thu, Apr 29, 2010 at 8:45 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Changli, I wonder how you can cook "performance" patches without testing
> them at all for real... This cannot be true?

I am sorry. But I wasn't against your patch, and I just wanted to understand the test result from jamal. It was my fault to submit a performance patch without testing it. I should not rely on code inspection for a performance patch.

--
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply [flat|nested] 108+ messages in thread
end of thread, other threads:[~2010-05-04 1:09 UTC | newest] Thread overview: 108+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2010-04-23 8:12 [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Changli Gao 2010-04-23 9:27 ` Eric Dumazet 2010-04-23 22:02 ` jamal 2010-04-24 14:10 ` jamal 2010-04-26 14:03 ` Eric Dumazet 2010-04-26 14:55 ` Eric Dumazet 2010-04-26 21:06 ` jamal [not found] ` <20100429174056.GA8044@gargoyle.fritz.box> 2010-04-29 17:56 ` Eric Dumazet 2010-04-29 18:10 ` OFT - reserving CPU's for networking Stephen Hemminger 2010-04-29 19:19 ` Thomas Gleixner 2010-04-29 20:02 ` Eric Dumazet 2010-04-30 18:15 ` Brian Bloniarz 2010-04-30 18:57 ` David Miller 2010-04-30 19:58 ` Thomas Gleixner 2010-04-30 21:01 ` Andi Kleen 2010-04-30 22:30 ` David Miller 2010-05-01 10:53 ` Andi Kleen 2010-05-01 22:03 ` David Miller 2010-05-01 22:58 ` Andi Kleen 2010-05-01 23:29 ` David Miller 2010-05-01 23:44 ` Ben Hutchings 2010-05-01 20:31 ` Martin Josefsson 2010-05-01 22:13 ` David Miller [not found] ` <20100429182347.GA8512@gargoyle.fritz.box> 2010-04-29 19:12 ` [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Eric Dumazet [not found] ` <20100429214144.GA10663@gargoyle.fritz.box> 2010-04-30 5:25 ` Eric Dumazet 2010-04-30 23:38 ` David Miller 2010-05-01 11:00 ` Andi Kleen 2010-05-02 6:56 ` Eric Dumazet 2010-05-02 9:20 ` Andi Kleen 2010-05-02 10:54 ` Eric Dumazet 2010-05-02 14:13 ` Arjan van de Ven 2010-05-02 14:27 ` Eric Dumazet 2010-05-02 15:32 ` Eric Dumazet 2010-05-02 17:54 ` Arjan van de Ven 2010-05-02 19:22 ` Eric Dumazet 2010-05-02 22:06 ` Andi Kleen 2010-05-03 3:50 ` Arjan van de Ven 2010-05-03 5:17 ` Eric Dumazet 2010-05-03 10:22 ` Arjan van de Ven 2010-05-03 10:34 ` Andi Kleen 2010-05-03 14:09 ` Arjan van de Ven 2010-05-03 14:45 ` Brian Bloniarz 2010-05-04 1:10 ` Arjan van de Ven 2010-05-03 15:52 ` Andi Kleen 2010-05-04 1:11 ` Arjan van de Ven 2010-05-02 21:30 ` Andi Kleen 2010-05-02 15:46 ` Andi Kleen 2010-05-02 16:35 ` Eric Dumazet 2010-05-02 17:43 ` Arjan van de Ven 2010-05-02 17:47 ` Eric Dumazet 2010-05-02 21:25 ` Andi Kleen 2010-05-02 21:45 ` Eric Dumazet 2010-05-02 21:54 ` Andi Kleen 2010-05-02 22:08 ` Eric Dumazet 2010-05-03 20:15 ` jamal 2010-04-26 21:03 ` jamal 2010-04-23 10:26 ` Eric Dumazet 2010-04-27 22:08 ` David Miller 2010-04-27 22:18 ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet 2010-04-27 22:19 ` David Miller 2010-04-28 13:14 ` Eilon Greenstein 2010-04-28 15:44 ` Eliezer Tamir 2010-04-28 16:53 ` David Miller [not found] ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com> 2010-04-28 16:55 ` David Miller 2010-04-28 11:33 ` jamal 2010-04-28 12:33 ` Eric Dumazet 2010-04-28 12:36 ` jamal 2010-04-28 14:06 ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet 2010-04-28 14:19 ` Eric Dumazet 2010-04-28 14:34 ` Eric Dumazet 2010-04-28 21:36 ` David Miller 2010-04-28 22:22 ` [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper Eric Dumazet 2010-04-28 22:39 ` David Miller 2010-04-28 23:44 ` [PATCH net-next-2.6] net: speedup udp receive path jamal 2010-04-29 0:00 ` jamal 2010-04-29 4:09 ` Eric Dumazet 2010-04-29 11:35 ` jamal 2010-04-29 12:12 ` Changli Gao 2010-04-29 12:45 ` Eric Dumazet 2010-04-29 13:17 ` jamal 2010-04-29 13:21 ` Eric Dumazet 2010-04-29 13:37 ` jamal 2010-04-29 13:49 ` Eric Dumazet 2010-04-29 13:56 ` jamal 2010-04-29 20:36 ` jamal 2010-04-29 21:01 ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet 
2010-04-30 13:55 ` Brian Bloniarz 2010-04-30 17:26 ` Eric Dumazet 2010-04-30 23:35 ` David Miller 2010-05-01 4:56 ` Eric Dumazet 2010-05-01 7:02 ` Eric Dumazet 2010-05-01 8:03 ` Eric Dumazet 2010-05-01 22:00 ` David Miller 2010-04-30 19:30 ` [PATCH net-next-2.6] net: speedup udp receive path jamal 2010-04-30 20:40 ` Eric Dumazet 2010-05-01 0:06 ` jamal 2010-05-01 5:57 ` Eric Dumazet 2010-05-01 6:14 ` Eric Dumazet 2010-05-01 10:24 ` Changli Gao 2010-05-01 10:47 ` Eric Dumazet 2010-05-01 11:29 ` jamal 2010-05-01 11:23 ` jamal 2010-05-01 11:42 ` Eric Dumazet 2010-05-01 11:56 ` jamal 2010-05-01 13:22 ` Eric Dumazet 2010-05-01 13:49 ` jamal 2010-05-03 20:10 ` jamal 2010-04-29 23:07 ` Changli Gao