public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: David Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Subject: Re: [PATCH net-next] net: percpu net_device refcount
Date: Mon, 11 Oct 2010 21:38:49 +0200	[thread overview]
Message-ID: <1286825929.3218.7.camel@edumazet-laptop> (raw)
In-Reply-To: <20101011.121344.260085789.davem@davemloft.net>

Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit :

> Ok, I'm fine with this.
> 
> But could you please get rid of that "#if 0" code block?  The comment
> is fine and should stay, but the commented out code shouldn't really
> stay there.

Sure, here is second version, against latest net-next-2.6

Thanks !

[PATCH net-next] net: percpu net_device refcount

We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)

We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.

On x86, dev_hold(dev) code :

before
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    6 ++---
 net/core/dev.c            |   39 +++++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3..c0b5e05 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
 	struct timer_list	watchdog_timer;
 
 	/* Number of references to this device */
-	atomic_t		refcnt ____cacheline_aligned_in_smp;
+	int __percpu		*pcpu_refcnt;
 
 	/* delayed register/unregister */
 	struct list_head	todo_list;
@@ -1798,7 +1798,7 @@ extern void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
-	atomic_dec(&dev->refcnt);
+	irqsafe_cpu_dec(*dev->pcpu_refcnt);
 }
 
 /**
@@ -1809,7 +1809,7 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
-	atomic_inc(&dev->refcnt);
+	irqsafe_cpu_inc(*dev->pcpu_refcnt);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafa..4b2d820 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* initialize the ref count */
-	atomic_set(&dev->refcnt, 1);
-
 	/* NAPI wants this */
 	INIT_LIST_HEAD(&dev->napi_list);
 
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* Note : We dont allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' dont need to change
+	 * its refcount.
+	 */
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,15 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+static int netdev_refcnt_read(const struct net_device *dev)
+{
+	int i, refcnt = 0;
+
+	for_each_possible_cpu(i)
+		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+	return refcnt;
+}
+
 /*
  * netdev_wait_allrefs - wait until all references are gone.
  *
@@ -5257,11 +5268,14 @@ EXPORT_SYMBOL(register_netdev);
 static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
 
 	linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	refcnt = netdev_refcnt_read(dev);
+
+	while (refcnt != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -5288,11 +5302,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		msleep(250);
 
+		refcnt = netdev_refcnt_read(dev);
+
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, refcnt);
 			warning_time = jiffies;
 		}
 	}
@@ -5350,7 +5366,7 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(atomic_read(&dev->refcnt));
+		BUG_ON(netdev_refcnt_read(dev));
 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
@@ -5520,9 +5536,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
-	if (dev_addr_init(dev))
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
 		goto free_tx;
 
+	if (dev_addr_init(dev))
+		goto free_pcpu;
+
 	dev_mc_init(dev);
 	dev_uc_init(dev);
 
@@ -5553,6 +5573,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 free_tx:
 	kfree(tx);
+free_pcpu:
+	free_percpu(dev->pcpu_refcnt);
 free_p:
 	kfree(p);
 	return NULL;
@@ -5586,6 +5608,9 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);



  reply	other threads:[~2010-10-11 19:38 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-07 17:12 [PATCH net-next] net: percpu net_device refcount Eric Dumazet
2010-10-07 17:30 ` Stephen Hemminger
2010-10-07 18:06   ` Eric Dumazet
2010-10-08 21:56   ` Paul E. McKenney
2010-10-09  6:23     ` Eric Dumazet
2010-10-09 16:58       ` Paul E. McKenney
2010-10-11 19:13 ` David Miller
2010-10-11 19:38   ` Eric Dumazet [this message]
2010-10-11 19:41     ` David Miller
2010-10-11 19:49       ` David Miller
2010-10-11 19:51         ` Eric Dumazet
2010-10-11 20:22           ` [PATCH net-next V3] " Eric Dumazet
2010-10-12 19:36             ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1286825929.3218.7.camel@edumazet-laptop \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox