* [PATCH net-next] net: percpu net_device refcount
@ 2010-10-07 17:12 Eric Dumazet
2010-10-07 17:30 ` Stephen Hemminger
2010-10-11 19:13 ` David Miller
0 siblings, 2 replies; 13+ messages in thread
From: Eric Dumazet @ 2010-10-07 17:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev
We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.
There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers)
We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.
On x86, dev_hold(dev) code :
before
lock incl 0x280(%ebx)
after:
movl 0x260(%ebx),%eax
incl fs:(%eax)
Stress bench :
(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)
Before:
real 1m1.662s
user 0m14.373s
sys 12m55.960s
After:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/linux/netdevice.h | 6 ++--
net/core/dev.c | 47 ++++++++++++++++++++++++++++++------
2 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6abcef6..fa1d88d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1020,7 +1020,7 @@ struct net_device {
struct timer_list watchdog_timer;
/* Number of references to this device */
- atomic_t refcnt ____cacheline_aligned_in_smp;
+ int __percpu *pcpu_refcnt;
/* delayed register/unregister */
struct list_head todo_list;
@@ -1792,7 +1792,7 @@ extern void netdev_run_todo(void);
*/
static inline void dev_put(struct net_device *dev)
{
- atomic_dec(&dev->refcnt);
+ irqsafe_cpu_dec(*dev->pcpu_refcnt);
}
/**
@@ -1803,7 +1803,7 @@ static inline void dev_put(struct net_device *dev)
*/
static inline void dev_hold(struct net_device *dev)
{
- atomic_inc(&dev->refcnt);
+ irqsafe_cpu_inc(*dev->pcpu_refcnt);
}
/* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 7d14955..2186e1e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5195,9 +5195,6 @@ int init_dummy_netdev(struct net_device *dev)
*/
dev->reg_state = NETREG_DUMMY;
- /* initialize the ref count */
- atomic_set(&dev->refcnt, 1);
-
/* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list);
@@ -5205,6 +5202,19 @@ int init_dummy_netdev(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
+#if 0
+ /* We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount
+ */
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
+ return -ENOMEM;
+
+ /* initialize the ref count */
+ dev_hold(dev);
+#endif
+
return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5246,6 +5256,15 @@ out:
}
EXPORT_SYMBOL(register_netdev);
+static int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+
/*
* netdev_wait_allrefs - wait until all references are gone.
*
@@ -5260,11 +5279,14 @@ EXPORT_SYMBOL(register_netdev);
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
+ int refcnt;
linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 0) {
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
@@ -5291,11 +5313,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
msleep(250);
+ refcnt = netdev_refcnt_read(dev);
+
if (time_after(jiffies, warning_time + 10 * HZ)) {
printk(KERN_EMERG "unregister_netdevice: "
"waiting for %s to become free. Usage "
"count = %d\n",
- dev->name, atomic_read(&dev->refcnt));
+ dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -5353,7 +5377,7 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(atomic_read(&dev->refcnt));
+ BUG_ON(netdev_refcnt_read(dev));
WARN_ON(rcu_dereference_raw(dev->ip_ptr));
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
@@ -5523,9 +5547,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
- if (dev_addr_init(dev))
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
goto free_tx;
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
dev_mc_init(dev);
dev_uc_init(dev);
@@ -5556,6 +5584,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
free_tx:
kfree(tx);
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
free_p:
kfree(p);
return NULL;
@@ -5589,6 +5619,9 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet
@ 2010-10-07 17:30 ` Stephen Hemminger
2010-10-07 18:06 ` Eric Dumazet
2010-10-08 21:56 ` Paul E. McKenney
2010-10-11 19:13 ` David Miller
1 sibling, 2 replies; 13+ messages in thread
From: Stephen Hemminger @ 2010-10-07 17:30 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev
On Thu, 07 Oct 2010 19:12:35 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> We tried very hard to remove all possible dev_hold()/dev_put() pairs in
> network stack, using RCU conversions.
>
> There is still an unavoidable device refcount change for every dst we
> create/destroy, and this can slow down some workloads (routers or some
> app servers)
>
> We can switch to a percpu refcount implementation, now dynamic per_cpu
> infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
> per device.
>
It makes sense, but what about 256 cores and 1024 Vlans?
That adds up to 4M of memory which is might be noticeable.
--
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-07 17:30 ` Stephen Hemminger
@ 2010-10-07 18:06 ` Eric Dumazet
2010-10-08 21:56 ` Paul E. McKenney
1 sibling, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2010-10-07 18:06 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Le jeudi 07 octobre 2010 à 10:30 -0700, Stephen Hemminger a écrit :
> It makes sense, but what about 256 cores and 1024 Vlans?
> That adds up to 4M of memory which is might be noticeable.
>
>
Well, 256 cores and 1024 Vlan -> 1 Mbyte of memory, not 4 ;)
This seems reasonable to me.
Eventually we could use a fallback, if percpu allocation failed -> use a
static field in net_device.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-07 17:30 ` Stephen Hemminger
2010-10-07 18:06 ` Eric Dumazet
@ 2010-10-08 21:56 ` Paul E. McKenney
2010-10-09 6:23 ` Eric Dumazet
1 sibling, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2010-10-08 21:56 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, netdev
On Thu, Oct 07, 2010 at 10:30:51AM -0700, Stephen Hemminger wrote:
> On Thu, 07 Oct 2010 19:12:35 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > We tried very hard to remove all possible dev_hold()/dev_put() pairs in
> > network stack, using RCU conversions.
> >
> > There is still an unavoidable device refcount change for every dst we
> > create/destroy, and this can slow down some workloads (routers or some
> > app servers)
> >
> > We can switch to a percpu refcount implementation, now dynamic per_cpu
> > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
> > per device.
>
> It makes sense, but what about 256 cores and 1024 Vlans?
> That adds up to 4M of memory which is might be noticeable.
I bet that systems that have 256 cores have >100GB of memory, at which
point 4MB is way down in the noise.
Thanx, Paul
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-08 21:56 ` Paul E. McKenney
@ 2010-10-09 6:23 ` Eric Dumazet
2010-10-09 16:58 ` Paul E. McKenney
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2010-10-09 6:23 UTC (permalink / raw)
To: paulmck; +Cc: Stephen Hemminger, David Miller, netdev
Le vendredi 08 octobre 2010 à 14:56 -0700, Paul E. McKenney a écrit :
> On Thu, Oct 07, 2010 at 10:30:51AM -0700, Stephen Hemminger wrote:
> > On Thu, 07 Oct 2010 19:12:35 +0200
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > > We tried very hard to remove all possible dev_hold()/dev_put() pairs in
> > > network stack, using RCU conversions.
> > >
> > > There is still an unavoidable device refcount change for every dst we
> > > create/destroy, and this can slow down some workloads (routers or some
> > > app servers)
> > >
> > > We can switch to a percpu refcount implementation, now dynamic per_cpu
> > > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
> > > per device.
> >
> > It makes sense, but what about 256 cores and 1024 Vlans?
> > That adds up to 4M of memory which is might be noticeable.
>
> I bet that systems that have 256 cores have >100GB of memory, at which
> point 4MB is way down in the noise.
Well, first its 1MB added, and secondly we added percpu stats for vlan
devices, and this consumed 8x more :
(struct vlan_rx_stats is 32 bytes per cpu and per vlan
32*256*1024 -> 8 Mbytes
Some strange machines have many cores sharing a small amount of memory,
but I am not sure they want to run many net devices ;)
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-09 6:23 ` Eric Dumazet
@ 2010-10-09 16:58 ` Paul E. McKenney
0 siblings, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2010-10-09 16:58 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Stephen Hemminger, David Miller, netdev
On Sat, Oct 09, 2010 at 08:23:16AM +0200, Eric Dumazet wrote:
> Le vendredi 08 octobre 2010 à 14:56 -0700, Paul E. McKenney a écrit :
> > On Thu, Oct 07, 2010 at 10:30:51AM -0700, Stephen Hemminger wrote:
> > > On Thu, 07 Oct 2010 19:12:35 +0200
> > > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >
> > > > We tried very hard to remove all possible dev_hold()/dev_put() pairs in
> > > > network stack, using RCU conversions.
> > > >
> > > > There is still an unavoidable device refcount change for every dst we
> > > > create/destroy, and this can slow down some workloads (routers or some
> > > > app servers)
> > > >
> > > > We can switch to a percpu refcount implementation, now dynamic per_cpu
> > > > infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
> > > > per device.
> > >
> > > It makes sense, but what about 256 cores and 1024 Vlans?
> > > That adds up to 4M of memory which is might be noticeable.
> >
> > I bet that systems that have 256 cores have >100GB of memory, at which
> > point 4MB is way down in the noise.
>
> Well, first its 1MB added, and secondly we added percpu stats for vlan
> devices, and this consumed 8x more :
>
> (struct vlan_rx_stats is 32 bytes per cpu and per vlan
> 32*256*1024 -> 8 Mbytes
>
> Some strange machines have many cores sharing a small amount of memory,
> but I am not sure they want to run many net devices ;)
I do have to admit that the rapid growth rate in the data required might
well be cause for concern. But only if it continues. ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet
2010-10-07 17:30 ` Stephen Hemminger
@ 2010-10-11 19:13 ` David Miller
2010-10-11 19:38 ` Eric Dumazet
1 sibling, 1 reply; 13+ messages in thread
From: David Miller @ 2010-10-11 19:13 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 07 Oct 2010 19:12:35 +0200
> We tried very hard to remove all possible dev_hold()/dev_put() pairs in
> network stack, using RCU conversions.
>
> There is still an unavoidable device refcount change for every dst we
> create/destroy, and this can slow down some workloads (routers or some
> app servers)
>
> We can switch to a percpu refcount implementation, now dynamic per_cpu
> infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
> per device.
...
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Ok, I'm fine with this.
But could you please get rid of that "#if 0" code block? The comment
is fine and should stay, but the commented out code shouldn't really
stay there.
Thanks!
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-11 19:13 ` David Miller
@ 2010-10-11 19:38 ` Eric Dumazet
2010-10-11 19:41 ` David Miller
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2010-10-11 19:38 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit :
> Ok, I'm fine with this.
>
> But could you please get rid of that "#if 0" code block? The comment
> is fine and should stay, but the commented out code shouldn't really
> stay there.
Sure, here is second version, against latest net-next-2.6
Thanks !
[PATCH net-next] net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.
There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)
We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.
On x86, dev_hold(dev) code :
before
lock incl 0x280(%ebx)
after:
movl 0x260(%ebx),%eax
incl fs:(%eax)
Stress bench :
(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)
Before:
real 1m1.662s
user 0m14.373s
sys 12m55.960s
After:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/linux/netdevice.h | 6 ++---
net/core/dev.c | 39 +++++++++++++++++++++++++++++-------
2 files changed, 35 insertions(+), 10 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3..c0b5e05 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
struct timer_list watchdog_timer;
/* Number of references to this device */
- atomic_t refcnt ____cacheline_aligned_in_smp;
+ int __percpu *pcpu_refcnt;
/* delayed register/unregister */
struct list_head todo_list;
@@ -1798,7 +1798,7 @@ extern void netdev_run_todo(void);
*/
static inline void dev_put(struct net_device *dev)
{
- atomic_dec(&dev->refcnt);
+ irqsafe_cpu_dec(*dev->pcpu_refcnt);
}
/**
@@ -1809,7 +1809,7 @@ static inline void dev_put(struct net_device *dev)
*/
static inline void dev_hold(struct net_device *dev)
{
- atomic_inc(&dev->refcnt);
+ irqsafe_cpu_inc(*dev->pcpu_refcnt);
}
/* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafa..4b2d820 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
*/
dev->reg_state = NETREG_DUMMY;
- /* initialize the ref count */
- atomic_set(&dev->refcnt, 1);
-
/* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list);
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
+ /* Note : We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount.
+ */
+
return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,15 @@ out:
}
EXPORT_SYMBOL(register_netdev);
+static int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+
/*
* netdev_wait_allrefs - wait until all references are gone.
*
@@ -5257,11 +5268,14 @@ EXPORT_SYMBOL(register_netdev);
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
+ int refcnt;
linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 0) {
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
@@ -5288,11 +5302,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
msleep(250);
+ refcnt = netdev_refcnt_read(dev);
+
if (time_after(jiffies, warning_time + 10 * HZ)) {
printk(KERN_EMERG "unregister_netdevice: "
"waiting for %s to become free. Usage "
"count = %d\n",
- dev->name, atomic_read(&dev->refcnt));
+ dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -5350,7 +5366,7 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(atomic_read(&dev->refcnt));
+ BUG_ON(netdev_refcnt_read(dev));
WARN_ON(rcu_dereference_raw(dev->ip_ptr));
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
@@ -5520,9 +5536,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
- if (dev_addr_init(dev))
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
goto free_tx;
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
dev_mc_init(dev);
dev_uc_init(dev);
@@ -5553,6 +5573,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
free_tx:
kfree(tx);
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
free_p:
kfree(p);
return NULL;
@@ -5586,6 +5608,9 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-11 19:38 ` Eric Dumazet
@ 2010-10-11 19:41 ` David Miller
2010-10-11 19:49 ` David Miller
0 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2010-10-11 19:41 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 21:38:49 +0200
> Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit :
>
>> Ok, I'm fine with this.
>>
>> But could you please get rid of that "#if 0" code block? The comment
>> is fine and should stay, but the commented out code shouldn't really
>> stay there.
>
> Sure, here is second version, against latest net-next-2.6
>
> Thanks !
>
> [PATCH net-next] net: percpu net_device refcount
Excellent, applied, thanks!
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-11 19:41 ` David Miller
@ 2010-10-11 19:49 ` David Miller
2010-10-11 19:51 ` Eric Dumazet
0 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2010-10-11 19:49 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
From: David Miller <davem@davemloft.net>
Date: Mon, 11 Oct 2010 12:41:37 -0700 (PDT)
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 11 Oct 2010 21:38:49 +0200
>
>> [PATCH net-next] net: percpu net_device refcount
>
> Excellent, applied, thanks!
Actually I have to revert, this breaks the infiniband drivers
which access netdev->refcnt directly.
drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next] net: percpu net_device refcount
2010-10-11 19:49 ` David Miller
@ 2010-10-11 19:51 ` Eric Dumazet
2010-10-11 20:22 ` [PATCH net-next V3] " Eric Dumazet
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2010-10-11 19:51 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit :
> Actually I have to revert, this breaks the infiniband drivers
> which access netdev->refcnt directly.
>
> drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
> drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
> drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
> drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
> drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
> drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
> drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
> drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'
Ah ok, I'll make a build test before submitting v3, sorry for the
inconvenience.
Thanks
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH net-next V3] net: percpu net_device refcount
2010-10-11 19:51 ` Eric Dumazet
@ 2010-10-11 20:22 ` Eric Dumazet
2010-10-12 19:36 ` David Miller
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2010-10-11 20:22 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Le lundi 11 octobre 2010 à 21:51 +0200, Eric Dumazet a écrit :
> Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit :
>
> > Actually I have to revert, this breaks the infiniband drivers
> > which access netdev->refcnt directly.
> >
> > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
> > drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
> > drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
> > drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
> > drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'
>
> Ah ok, I'll make a build test before submitting v3, sorry for the
> inconvenience.
>
This was a bit long (allyesconfig), but eventually succeeded ...
Thanks !
[PATCH net-next V3] net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.
There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)
We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.
On x86, dev_hold(dev) code :
before
lock incl 0x280(%ebx)
after:
movl 0x260(%ebx),%eax
incl fs:(%eax)
Stress bench :
(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)
Before:
real 1m1.662s
user 0m14.373s
sys 12m55.960s
After:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
V3: export netdev_refcnt_read() for infiniband debugging
drivers/infiniband/hw/nes/nes_cm.c | 4 +-
drivers/infiniband/hw/nes/nes_verbs.c | 4 +-
include/linux/netdevice.h | 7 ++--
net/core/dev.c | 40 +++++++++++++++++++-----
4 files changed, 41 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 61e0efd..6220d9d 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
nesibdev = nesvnic->nesibdev;
nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
- atomic_read(&nesvnic->netdev->refcnt));
+ netdev_refcnt_read(nesvnic->netdev));
if (nesqp->active_conn) {
@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
atomic_inc(&cm_accepts);
nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
- atomic_read(&nesvnic->netdev->refcnt));
+ netdev_refcnt_read(nesvnic->netdev));
/* allocate the ietf frame and space for private data */
nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 9046e66..546fc22 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
- atomic_read(&nesvnic->netdev->refcnt));
+ netdev_refcnt_read(nesvnic->netdev));
err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
/* update the QP table */
nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
- atomic_read(&nesvnic->netdev->refcnt));
+ netdev_refcnt_read(nesvnic->netdev));
return &nesqp->ibqp;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3..14fbb04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
struct timer_list watchdog_timer;
/* Number of references to this device */
- atomic_t refcnt ____cacheline_aligned_in_smp;
+ int __percpu *pcpu_refcnt;
/* delayed register/unregister */
struct list_head todo_list;
@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev)
unregister_netdevice_queue(dev, NULL);
}
+extern int netdev_refcnt_read(const struct net_device *dev);
extern void free_netdev(struct net_device *dev);
extern void synchronize_net(void);
extern int register_netdevice_notifier(struct notifier_block *nb);
@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void);
*/
static inline void dev_put(struct net_device *dev)
{
- atomic_dec(&dev->refcnt);
+ irqsafe_cpu_dec(*dev->pcpu_refcnt);
}
/**
@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev)
*/
static inline void dev_hold(struct net_device *dev)
{
- atomic_inc(&dev->refcnt);
+ irqsafe_cpu_inc(*dev->pcpu_refcnt);
}
/* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafa..04972a4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
*/
dev->reg_state = NETREG_DUMMY;
- /* initialize the ref count */
- atomic_set(&dev->refcnt, 1);
-
/* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list);
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
+ /* Note : We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount.
+ */
+
return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,16 @@ out:
}
EXPORT_SYMBOL(register_netdev);
+int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
/*
* netdev_wait_allrefs - wait until all references are gone.
*
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev);
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
+ int refcnt;
linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 0) {
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
msleep(250);
+ refcnt = netdev_refcnt_read(dev);
+
if (time_after(jiffies, warning_time + 10 * HZ)) {
printk(KERN_EMERG "unregister_netdevice: "
"waiting for %s to become free. Usage "
"count = %d\n",
- dev->name, atomic_read(&dev->refcnt));
+ dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(atomic_read(&dev->refcnt));
+ BUG_ON(netdev_refcnt_read(dev));
WARN_ON(rcu_dereference_raw(dev->ip_ptr));
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
- if (dev_addr_init(dev))
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
goto free_tx;
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
dev_mc_init(dev);
dev_uc_init(dev);
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
free_tx:
kfree(tx);
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
free_p:
kfree(p);
return NULL;
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next V3] net: percpu net_device refcount
2010-10-11 20:22 ` [PATCH net-next V3] " Eric Dumazet
@ 2010-10-12 19:36 ` David Miller
0 siblings, 0 replies; 13+ messages in thread
From: David Miller @ 2010-10-12 19:36 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 22:22:12 +0200
> This was a bit long (allyesconfig), but eventually succeeded ...
...
> [PATCH net-next V3] net: percpu net_device refcount
Applied, thanks Eric.
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2010-10-12 19:36 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet
2010-10-07 17:30 ` Stephen Hemminger
2010-10-07 18:06 ` Eric Dumazet
2010-10-08 21:56 ` Paul E. McKenney
2010-10-09 6:23 ` Eric Dumazet
2010-10-09 16:58 ` Paul E. McKenney
2010-10-11 19:13 ` David Miller
2010-10-11 19:38 ` Eric Dumazet
2010-10-11 19:41 ` David Miller
2010-10-11 19:49 ` David Miller
2010-10-11 19:51 ` Eric Dumazet
2010-10-11 20:22 ` [PATCH net-next V3] " Eric Dumazet
2010-10-12 19:36 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).