* [PATCH net-next] net: percpu net_device refcount @ 2010-10-07 17:12 Eric Dumazet 2010-10-07 17:30 ` Stephen Hemminger 2010-10-11 19:13 ` David Miller 0 siblings, 2 replies; 13+ messages in thread From: Eric Dumazet @ 2010-10-07 17:12 UTC (permalink / raw) To: David Miller; +Cc: netdev We tried very hard to remove all possible dev_hold()/dev_put() pairs in network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers) We can switch to a percpu refcount implementation, now dynamic per_cpu infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code : before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> --- include/linux/netdevice.h | 6 ++-- net/core/dev.c | 47 ++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6abcef6..fa1d88d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1020,7 +1020,7 @@ struct net_device { struct timer_list watchdog_timer; /* Number of references to this device */ - atomic_t refcnt ____cacheline_aligned_in_smp; + int __percpu *pcpu_refcnt; /* delayed register/unregister */ struct list_head todo_list; @@ -1792,7 +1792,7 @@ extern void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { - atomic_dec(&dev->refcnt); + irqsafe_cpu_dec(*dev->pcpu_refcnt); } /** @@ -1803,7 +1803,7 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { - atomic_inc(&dev->refcnt); + irqsafe_cpu_inc(*dev->pcpu_refcnt); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on diff --git a/net/core/dev.c b/net/core/dev.c index 7d14955..2186e1e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5195,9 +5195,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -5205,6 +5202,19 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); +#if 0 + /* We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount + */ + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) + return -ENOMEM; + + /* initialize the ref count */ + dev_hold(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -5246,6 +5256,15 @@ out: } EXPORT_SYMBOL(register_netdev); +static int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -5260,11 +5279,14 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -5291,11 +5313,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -5353,7 +5377,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); @@ -5523,9 +5547,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (dev_addr_init(dev)) + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) goto free_tx; + if (dev_addr_init(dev)) + goto free_pcpu; + dev_mc_init(dev); dev_uc_init(dev); @@ -5556,6 +5584,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_tx: kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); free_p: kfree(p); return NULL; @@ -5589,6 +5619,9 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet @ 2010-10-07 17:30 ` Stephen Hemminger 2010-10-07 18:06 ` Eric Dumazet 2010-10-08 21:56 ` Paul E. McKenney 2010-10-11 19:13 ` David Miller 1 sibling, 2 replies; 13+ messages in thread From: Stephen Hemminger @ 2010-10-07 17:30 UTC (permalink / raw) To: Eric Dumazet; +Cc: David Miller, netdev On Thu, 07 Oct 2010 19:12:35 +0200 Eric Dumazet <eric.dumazet@gmail.com> wrote: > We tried very hard to remove all possible dev_hold()/dev_put() pairs in > network stack, using RCU conversions. > > There is still an unavoidable device refcount change for every dst we > create/destroy, and this can slow down some workloads (routers or some > app servers) > > We can switch to a percpu refcount implementation, now dynamic per_cpu > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes > per device. > It makes sense, but what about 256 cores and 1024 Vlans? That adds up to 4M of memory which is might be noticeable. -- ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-07 17:30 ` Stephen Hemminger @ 2010-10-07 18:06 ` Eric Dumazet 2010-10-08 21:56 ` Paul E. McKenney 1 sibling, 0 replies; 13+ messages in thread From: Eric Dumazet @ 2010-10-07 18:06 UTC (permalink / raw) To: Stephen Hemminger; +Cc: David Miller, netdev Le jeudi 07 octobre 2010 à 10:30 -0700, Stephen Hemminger a écrit : > It makes sense, but what about 256 cores and 1024 Vlans? > That adds up to 4M of memory which is might be noticeable. > > Well, 256 cores and 1024 Vlan -> 1 Mbyte of memory, not 4 ;) This seems reasonable to me. Eventually we could use a fallback, if percpu allocation failed -> use a static field in net_device. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-07 17:30 ` Stephen Hemminger 2010-10-07 18:06 ` Eric Dumazet @ 2010-10-08 21:56 ` Paul E. McKenney 2010-10-09 6:23 ` Eric Dumazet 1 sibling, 1 reply; 13+ messages in thread From: Paul E. McKenney @ 2010-10-08 21:56 UTC (permalink / raw) To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, netdev On Thu, Oct 07, 2010 at 10:30:51AM -0700, Stephen Hemminger wrote: > On Thu, 07 Oct 2010 19:12:35 +0200 > Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > We tried very hard to remove all possible dev_hold()/dev_put() pairs in > > network stack, using RCU conversions. > > > > There is still an unavoidable device refcount change for every dst we > > create/destroy, and this can slow down some workloads (routers or some > > app servers) > > > > We can switch to a percpu refcount implementation, now dynamic per_cpu > > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes > > per device. > > It makes sense, but what about 256 cores and 1024 Vlans? > That adds up to 4M of memory which is might be noticeable. I bet that systems that have 256 cores have >100GB of memory, at which point 4MB is way down in the noise. Thanx, Paul ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-08 21:56 ` Paul E. McKenney @ 2010-10-09 6:23 ` Eric Dumazet 2010-10-09 16:58 ` Paul E. McKenney 0 siblings, 1 reply; 13+ messages in thread From: Eric Dumazet @ 2010-10-09 6:23 UTC (permalink / raw) To: paulmck; +Cc: Stephen Hemminger, David Miller, netdev Le vendredi 08 octobre 2010 à 14:56 -0700, Paul E. McKenney a écrit : > On Thu, Oct 07, 2010 at 10:30:51AM -0700, Stephen Hemminger wrote: > > On Thu, 07 Oct 2010 19:12:35 +0200 > > Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > > > We tried very hard to remove all possible dev_hold()/dev_put() pairs in > > > network stack, using RCU conversions. > > > > > > There is still an unavoidable device refcount change for every dst we > > > create/destroy, and this can slow down some workloads (routers or some > > > app servers) > > > > > > We can switch to a percpu refcount implementation, now dynamic per_cpu > > > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes > > > per device. > > > > It makes sense, but what about 256 cores and 1024 Vlans? > > That adds up to 4M of memory which is might be noticeable. > > I bet that systems that have 256 cores have >100GB of memory, at which > point 4MB is way down in the noise. Well, first its 1MB added, and secondly we added percpu stats for vlan devices, and this consumed 8x more : (struct vlan_rx_stats is 32 bytes per cpu and per vlan 32*256*1024 -> 8 Mbytes Some strange machines have many cores sharing a small amount of memory, but I am not sure they want to run many net devices ;) ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-09 6:23 ` Eric Dumazet @ 2010-10-09 16:58 ` Paul E. McKenney 0 siblings, 0 replies; 13+ messages in thread From: Paul E. McKenney @ 2010-10-09 16:58 UTC (permalink / raw) To: Eric Dumazet; +Cc: Stephen Hemminger, David Miller, netdev On Sat, Oct 09, 2010 at 08:23:16AM +0200, Eric Dumazet wrote: > Le vendredi 08 octobre 2010 à 14:56 -0700, Paul E. McKenney a écrit : > > On Thu, Oct 07, 2010 at 10:30:51AM -0700, Stephen Hemminger wrote: > > > On Thu, 07 Oct 2010 19:12:35 +0200 > > > Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > > > > > We tried very hard to remove all possible dev_hold()/dev_put() pairs in > > > > network stack, using RCU conversions. > > > > > > > > There is still an unavoidable device refcount change for every dst we > > > > create/destroy, and this can slow down some workloads (routers or some > > > > app servers) > > > > > > > > We can switch to a percpu refcount implementation, now dynamic per_cpu > > > > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes > > > > per device. > > > > > > It makes sense, but what about 256 cores and 1024 Vlans? > > > That adds up to 4M of memory which is might be noticeable. > > > > I bet that systems that have 256 cores have >100GB of memory, at which > > point 4MB is way down in the noise. > > Well, first its 1MB added, and secondly we added percpu stats for vlan > devices, and this consumed 8x more : > > (struct vlan_rx_stats is 32 bytes per cpu and per vlan > 32*256*1024 -> 8 Mbytes > > Some strange machines have many cores sharing a small amount of memory, > but I am not sure they want to run many net devices ;) I do have to admit that the rapid growth rate in the data required might well be cause for concern. But only if it continues. ;-) Thanx, Paul ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet 2010-10-07 17:30 ` Stephen Hemminger @ 2010-10-11 19:13 ` David Miller 2010-10-11 19:38 ` Eric Dumazet 1 sibling, 1 reply; 13+ messages in thread From: David Miller @ 2010-10-11 19:13 UTC (permalink / raw) To: eric.dumazet; +Cc: netdev From: Eric Dumazet <eric.dumazet@gmail.com> Date: Thu, 07 Oct 2010 19:12:35 +0200 > We tried very hard to remove all possible dev_hold()/dev_put() pairs in > network stack, using RCU conversions. > > There is still an unavoidable device refcount change for every dst we > create/destroy, and this can slow down some workloads (routers or some > app servers) > > We can switch to a percpu refcount implementation, now dynamic per_cpu > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes > per device. ... > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Ok, I'm fine with this. But could you please get rid of that "#if 0" code block? The comment is fine and should stay, but the commented out code shouldn't really stay there. Thanks! ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-11 19:13 ` David Miller @ 2010-10-11 19:38 ` Eric Dumazet 2010-10-11 19:41 ` David Miller 0 siblings, 1 reply; 13+ messages in thread From: Eric Dumazet @ 2010-10-11 19:38 UTC (permalink / raw) To: David Miller; +Cc: netdev Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit : > Ok, I'm fine with this. > > But could you please get rid of that "#if 0" code block? The comment > is fine and should stay, but the commented out code shouldn't really > stay there. Sure, here is second version, against latest net-next-2.6 Thanks ! [PATCH net-next] net: percpu net_device refcount We tried very hard to remove all possible dev_hold()/dev_put() pairs in network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet) We can switch to a percpu refcount implementation, now dynamic per_cpu infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code : before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> --- include/linux/netdevice.h | 6 ++--- net/core/dev.c | 39 +++++++++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4160db3..c0b5e05 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1026,7 +1026,7 @@ struct net_device { struct timer_list watchdog_timer; /* Number of references to this device */ - atomic_t refcnt ____cacheline_aligned_in_smp; + int __percpu *pcpu_refcnt; /* delayed register/unregister */ struct list_head todo_list; @@ -1798,7 +1798,7 @@ extern void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { - atomic_dec(&dev->refcnt); + irqsafe_cpu_dec(*dev->pcpu_refcnt); } /** @@ -1809,7 +1809,7 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { - atomic_inc(&dev->refcnt); + irqsafe_cpu_inc(*dev->pcpu_refcnt); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on diff --git a/net/core/dev.c b/net/core/dev.c index 193eafa..4b2d820 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -5243,6 +5245,15 @@ out: } EXPORT_SYMBOL(register_netdev); +static int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -5257,11 +5268,14 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -5288,11 +5302,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -5350,7 +5366,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); @@ -5520,9 +5536,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (dev_addr_init(dev)) + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) goto free_tx; + if (dev_addr_init(dev)) + goto free_pcpu; + dev_mc_init(dev); dev_uc_init(dev); @@ -5553,6 +5573,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_tx: kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); free_p: kfree(p); return NULL; @@ -5586,6 +5608,9 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-11 19:38 ` Eric Dumazet @ 2010-10-11 19:41 ` David Miller 2010-10-11 19:49 ` David Miller 0 siblings, 1 reply; 13+ messages in thread From: David Miller @ 2010-10-11 19:41 UTC (permalink / raw) To: eric.dumazet; +Cc: netdev From: Eric Dumazet <eric.dumazet@gmail.com> Date: Mon, 11 Oct 2010 21:38:49 +0200 > Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit : > >> Ok, I'm fine with this. >> >> But could you please get rid of that "#if 0" code block? The comment >> is fine and should stay, but the commented out code shouldn't really >> stay there. > > Sure, here is second version, against latest net-next-2.6 > > Thanks ! > > [PATCH net-next] net: percpu net_device refcount Excellent, applied, thanks! ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-11 19:41 ` David Miller @ 2010-10-11 19:49 ` David Miller 2010-10-11 19:51 ` Eric Dumazet 0 siblings, 1 reply; 13+ messages in thread From: David Miller @ 2010-10-11 19:49 UTC (permalink / raw) To: eric.dumazet; +Cc: netdev From: David Miller <davem@davemloft.net> Date: Mon, 11 Oct 2010 12:41:37 -0700 (PDT) > From: Eric Dumazet <eric.dumazet@gmail.com> > Date: Mon, 11 Oct 2010 21:38:49 +0200 > >> [PATCH net-next] net: percpu net_device refcount > > Excellent, applied, thanks! Actually I have to revert, this breaks the infiniband drivers which access netdev->refcnt directly. drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd': drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt' drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp': drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt' drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect': drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt' drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept': drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt' ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount 2010-10-11 19:49 ` David Miller @ 2010-10-11 19:51 ` Eric Dumazet 2010-10-11 20:22 ` [PATCH net-next V3] " Eric Dumazet 0 siblings, 1 reply; 13+ messages in thread From: Eric Dumazet @ 2010-10-11 19:51 UTC (permalink / raw) To: David Miller; +Cc: netdev Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit : > Actually I have to revert, this breaks the infiniband drivers > which access netdev->refcnt directly. > > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd': > drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt' > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp': > drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt' > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect': > drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt' > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept': > drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt' Ah ok, I'll make a build test before submitting v3, sorry for the inconvenience. Thanks ^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH net-next V3] net: percpu net_device refcount 2010-10-11 19:51 ` Eric Dumazet @ 2010-10-11 20:22 ` Eric Dumazet 2010-10-12 19:36 ` David Miller 0 siblings, 1 reply; 13+ messages in thread From: Eric Dumazet @ 2010-10-11 20:22 UTC (permalink / raw) To: David Miller; +Cc: netdev Le lundi 11 octobre 2010 à 21:51 +0200, Eric Dumazet a écrit : > Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit : > > > Actually I have to revert, this breaks the infiniband drivers > > which access netdev->refcnt directly. > > > > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd': > > drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt' > > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp': > > drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt' > > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect': > > drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt' > > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept': > > drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt' > > Ah ok, I'll make a build test before submitting v3, sorry for the > inconvenience. > This was a bit long (allyesconfig), but eventually succeeded ... Thanks ! [PATCH net-next V3] net: percpu net_device refcount We tried very hard to remove all possible dev_hold()/dev_put() pairs in network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet) We can switch to a percpu refcount implementation, now dynamic per_cpu infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code : before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> --- V3: export netdev_refcnt_read() for infiniband debugging drivers/infiniband/hw/nes/nes_cm.c | 4 +- drivers/infiniband/hw/nes/nes_verbs.c | 4 +- include/linux/netdevice.h | 7 ++-- net/core/dev.c | 40 +++++++++++++++++++----- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 61e0efd..6220d9d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt) nesibdev = nesvnic->nesibdev; nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", - atomic_read(&nesvnic->netdev->refcnt)); + netdev_refcnt_read(nesvnic->netdev)); if (nesqp->active_conn) { @@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) atomic_inc(&cm_accepts); nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", - atomic_read(&nesvnic->netdev->refcnt)); + netdev_refcnt_read(nesvnic->netdev)); /* allocate the ietf frame and space for private data */ nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 9046e66..546fc22 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, - atomic_read(&nesvnic->netdev->refcnt)); + netdev_refcnt_read(nesvnic->netdev)); err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, nesadapter->max_pd, &pd_num, &nesadapter->next_pd); @@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, /* update the QP table */ nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", - atomic_read(&nesvnic->netdev->refcnt)); + netdev_refcnt_read(nesvnic->netdev)); return &nesqp->ibqp; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4160db3..14fbb04 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1026,7 +1026,7 @@ struct net_device { struct timer_list watchdog_timer; /* Number of references to this device */ - atomic_t refcnt ____cacheline_aligned_in_smp; + int __percpu *pcpu_refcnt; /* delayed register/unregister */ struct list_head todo_list; @@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev) unregister_netdevice_queue(dev, NULL); } +extern int netdev_refcnt_read(const struct net_device *dev); extern void free_netdev(struct net_device *dev); extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); @@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { - atomic_dec(&dev->refcnt); + irqsafe_cpu_dec(*dev->pcpu_refcnt); } /** @@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { - atomic_inc(&dev->refcnt); + irqsafe_cpu_inc(*dev->pcpu_refcnt); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on diff --git a/net/core/dev.c b/net/core/dev.c index 193eafa..04972a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -5243,6 +5245,16 @@ out: } EXPORT_SYMBOL(register_netdev); +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -5350,7 +5367,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); @@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (dev_addr_init(dev)) + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) goto free_tx; + if (dev_addr_init(dev)) + goto free_pcpu; + dev_mc_init(dev); dev_uc_init(dev); @@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_tx: kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); free_p: kfree(p); return NULL; @@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next V3] net: percpu net_device refcount 2010-10-11 20:22 ` [PATCH net-next V3] " Eric Dumazet @ 2010-10-12 19:36 ` David Miller 0 siblings, 0 replies; 13+ messages in thread From: David Miller @ 2010-10-12 19:36 UTC (permalink / raw) To: eric.dumazet; +Cc: netdev From: Eric Dumazet <eric.dumazet@gmail.com> Date: Mon, 11 Oct 2010 22:22:12 +0200 > This was a bit long (allyesconfig), but eventually succeeded ... ... > [PATCH net-next V3] net: percpu net_device refcount Applied, thanks Eric. ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2010-10-12 19:36 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet 2010-10-07 17:30 ` Stephen Hemminger 2010-10-07 18:06 ` Eric Dumazet 2010-10-08 21:56 ` Paul E. McKenney 2010-10-09 6:23 ` Eric Dumazet 2010-10-09 16:58 ` Paul E. McKenney 2010-10-11 19:13 ` David Miller 2010-10-11 19:38 ` Eric Dumazet 2010-10-11 19:41 ` David Miller 2010-10-11 19:49 ` David Miller 2010-10-11 19:51 ` Eric Dumazet 2010-10-11 20:22 ` [PATCH net-next V3] " Eric Dumazet 2010-10-12 19:36 ` David Miller
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).