Netdev List
 help / color / mirror / Atom feed
* [patch 2/2] vhost: fix return code for log_access_ok()
From: Dan Carpenter @ 2010-10-11 17:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Juan Quintela, David S. Miller, Rusty Russell, kvm,
	virtualization, netdev, kernel-janitors

access_ok() returns 1 if it's OK otherwise it should return 0.

Signed-off-by: Dan Carpenter <error27@gmail.com>

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c2aa12c..f82fe57 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -371,7 +371,7 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 	/* Make sure 64 bit math will not overflow. */
 	if (a > ULONG_MAX - (unsigned long)log_base ||
 	    a + (unsigned long)log_base > ULONG_MAX)
-		return -EFAULT;
+		return 0;
 
 	return access_ok(VERIFY_WRITE, log_base + a,
 			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);

^ permalink raw reply related

* Re: [patch 1/2] vhost: potential integer overflows
From: Al Viro @ 2010-10-11 17:26 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Michael S. Tsirkin, Juan Quintela, David S. Miller, Rusty Russell,
	kvm, virtualization, netdev, kernel-janitors
In-Reply-To: <20101011172256.GF5851@bicker>

On Mon, Oct 11, 2010 at 07:22:57PM +0200, Dan Carpenter wrote:
> I did an audit for potential integer overflows of values which get passed
> to access_ok() and here are the results.

FWIW, UINT_MAX is wrong here.  What you want is maximal size_t value.

> Signed-off-by: Dan Carpenter <error27@gmail.com>
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index dd3d6f7..c2aa12c 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -429,6 +429,14 @@ static int vq_access_ok(unsigned int num,
>  			struct vring_avail __user *avail,
>  			struct vring_used __user *used)
>  {
> +
> +	if (num > UINT_MAX / sizeof *desc)
> +		return 0;
> +	if (num > UINT_MAX / sizeof *avail->ring - sizeof *avail)
> +		return 0;
> +	if (num > UINT_MAX / sizeof *used->ring - sizeof *used)
> +		return 0;
> +
>  	return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
>  	       access_ok(VERIFY_READ, avail,
>  			 sizeof *avail + num * sizeof *avail->ring) &&
> @@ -447,6 +455,9 @@ int vhost_log_access_ok(struct vhost_dev *dev)
>  /* Caller should have vq mutex and device mutex */
>  static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
>  {
> +	if (vq->num > UINT_MAX / sizeof *vq->used->ring - sizeof *vq->used)
> +		return 0;
> +
>  	return vq_memory_access_ok(log_base, vq->dev->memory,
>  			    vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
>  		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
> @@ -606,12 +617,17 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  			}
>  
>  			/* Also validate log access for used ring if enabled. */
> -			if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
> -			    !log_access_ok(vq->log_base, a.log_guest_addr,
> +			if (a.flags & (0x1 << VHOST_VRING_F_LOG)) {
> +				if (vq->num > UINT_MAX / sizeof *vq->used->ring - sizeof *vq->used) {
> +					r = -EINVAL;
> +					break;
> +				}
> +				if (!log_access_ok(vq->log_base, a.log_guest_addr,
>  					   sizeof *vq->used +
>  					   vq->num * sizeof *vq->used->ring)) {
> -				r = -EINVAL;
> -				break;
> +					r = -EINVAL;
> +					break;
> +				}
>  			}
>  		}
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] af_packet: account for VLAN when checking packet size
From: Phil Sutter @ 2010-10-11 17:29 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, johann.baudy
In-Reply-To: <20101011.090153.226774563.davem@davemloft.net>

On Mon, Oct 11, 2010 at 09:01:53AM -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 11 Oct 2010 16:03:02 +0200
> 
> > If we dont test ETH_P_8021Q protocol here, we allow sending 1504 bytes
> > frames for MTU=1500
> > 
> > Should we really care ?
> > 
> > If not, just do
> > 
> > reserve = dev->hard_header_len + VLAN_HLEN;
> 
> Thats a good point, I think we need to validate the SKB protocol
> field.

Which is set to the value of the passed struct sockaddr_ll field
sll_protocol. At least in the two userspace code samples I have here,
the later field is set to htons(ETH_P_ALL). So unless one changes the
API, the only way to find out the packet type is to actually parse the
given ethernet header.

Since tpacket_rcv() just interprets the vlan_tci skb field, such
detailed packet inspection is otherwise not done in af_packet.c.

Greetings, Phil

^ permalink raw reply

* Re: [PATCH net-next 2/5] bnx2x: save cycles in setting gso_size
From: David Miller @ 2010-10-11 17:48 UTC (permalink / raw)
  To: vladz; +Cc: bhutchings, dmitry, netdev, eilong
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDEE428246@SJEXCHCCR02.corp.ad.broadcom.com>

From: "Vladislav Zolotarov" <vladz@broadcom.com>
Date: Mon, 11 Oct 2010 10:06:19 -0700

> Dave, we will respin this patch series.

Ok, thanks guys.

^ permalink raw reply

* Re: [PATCH 1/1] ATM: solos-pci, remove use after free
From: David Miller @ 2010-10-11 18:12 UTC (permalink / raw)
  To: eric.dumazet
  Cc: jslaby, netdev, linux-atm-general, linux-kernel, jirislaby, chas
In-Reply-To: <1286785169.2737.3.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 10:19:29 +0200

> Le lundi 11 octobre 2010 à 09:50 +0200, Jiri Slaby a écrit :
>> Stanse found we do in console_show:
>>   kfree_skb(skb);
>>   return skb->len;
>> which is not good. Fix that by remembering the len and use it in the
>> function instead.
>> 
>> Signed-off-by: Jiri Slaby <jslaby@suse.cz>
>> Cc: Chas Williams <chas@cmf.nrl.navy.mil>
>> ---
> 
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH 3/3] NET: wimax, fix use after free
From: David Miller @ 2010-10-11 18:12 UTC (permalink / raw)
  To: inaky.perez-gonzalez; +Cc: jslaby, netdev, linux-kernel, jirislaby, linux-wimax
In-Reply-To: <1286815606.21592.11.camel@localhost.localdomain>

From: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
Date: Mon, 11 Oct 2010 09:46:46 -0700

> On Mon, 2010-10-11 at 02:26 -0700, Jiri Slaby wrote: 
>> Stanse found that i2400m_rx frees skb, but still uses skb->len even
>> though it has skb_len defined. So use skb_len properly in the code.
>> 
>> And also define it unsinged int rather than size_t to solve
>> compilation warnings.
>> 
>> Signed-off-by: Jiri Slaby <jslaby@suse.cz>
> 
> Ops, fail. Thanks for the catch. I assume you have compile tested it.
> 
> Acked-by: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>

Applied.

^ permalink raw reply

* Re: [PATCH 1/1] ATM: mpc, fix use after free
From: David Miller @ 2010-10-11 18:12 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jslaby, netdev, linux-atm-general, linux-kernel, jirislaby
In-Reply-To: <1286787580.2737.4.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 10:59:40 +0200

> Le lundi 11 octobre 2010 à 10:46 +0200, Jiri Slaby a écrit :
>> Stanse found that mpc_push frees skb and then it dereferences it. It
>> is a typo, new_skb should be dereferenced there.
>> 
>> Signed-off-by: Jiri Slaby <jslaby@suse.cz>
>> Cc: Eric Dumazet <eric.dumazet@gmail.com>
 ...
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH 2/3] ATM: iphase, remove sleep-inside-atomic
From: David Miller @ 2010-10-11 18:13 UTC (permalink / raw)
  To: jslaby; +Cc: netdev, linux-kernel, jirislaby, chas
In-Reply-To: <1286789218-13976-2-git-send-email-jslaby@suse.cz>

From: Jiri Slaby <jslaby@suse.cz>
Date: Mon, 11 Oct 2010 11:26:57 +0200

> Stanse found that ia_init_one locks a spinlock and inside of that it
> calls ia_start which calls:
> * request_irq
> * tx_init which does kmalloc(GFP_KERNEL)
> 
> Both of them can thus sleep and result in a deadlock. I don't see a
> reason to have a per-device spinlock there which is used only there
> and inited right before the lock location. So remove it completely.
> 
> Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Applied.

^ permalink raw reply

* Re: [PATCH 1/3] NET: pch, fix use after free
From: David Miller @ 2010-10-11 18:13 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jslaby, netdev, linux-kernel, jirislaby, masa-korg
In-Reply-To: <1286789918.2737.8.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 11:38:38 +0200

> Le lundi 11 octobre 2010 à 11:26 +0200, Jiri Slaby a écrit :
>> Stanse found that pch_gbe_xmit_frame uses skb after it is freed. Fix
>> that.
>> 
>> Signed-off-by: Jiri Slaby <jslaby@suse.cz>
>> Cc: Masayuki Ohtake <masa-korg@dsn.okisemi.com>
>> ---
>>  drivers/net/pch_gbe/pch_gbe_main.c |    2 +-
>>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> Applicable to net-next-2.6 only, this driver is not yet in Linus tree
> 
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

I'll apply this to net-next-2.6, thanks.

^ permalink raw reply

* Re: NFS & atl1c : "RPC: multiple fragments per record not supported"
From: J. Bruce Fields @ 2010-10-11 18:16 UTC (permalink / raw)
  To: Phil Endecott; +Cc: Linux Kernel Mailing List, Linux NFS Mailing List, netdev
In-Reply-To: <1286722097803@dmwebmail.dmwebmail.chezphil.org>

On Sun, Oct 10, 2010 at 03:48:17PM +0100, Phil Endecott wrote:
> Dear Experts,
> 
> I am seeing the error "RPC: multiple fragments per record not
> supported" on my NFS server when an NFS client with an atl1c network
> driver talks to it.
> 
> The server is a QNAP TS119 ARM box running Debian's 2.6.33.2 kernel.
> It works reliably with other clients.
> 
> The client is a new x86 system with an "Atheros Communications
> AR8131 Gigabit Ethernet (rev c0)" (1969:1063).  The kernel is
> Debian's 2.6.32-5-686 and the driver seems to be atl1c.

To my knowledge the Linux client has never sent packets that would
trigger the prink above, so off hand it does sound like some sort of
corruption at the network level.

(Independently of that: we should fix the server to support multiple
fragments per record at some point.  But if you hadn't hit that printk,
I'm guessing you would have had a failure soon enough anyway.)

> Typically NFS works for a few seconds and then stops, with that
> message repeated on the server.  Other network activity seems
> reliable (e.g. HTTP, ssh, etc.)
> 
> If I use a USB-ethernet adaptor instead of the built-in gigabit it
> works reliably.  (The USB device is not gigabit, but I do still see
> the problems if I limit the port to 100 Mbit on the switch.)
> 
> I see the problem with NFS v3 and v4.  However, I only see it with
> proto=tcp.  By changing the NFS protocol to UDP, the problem seems
> to go away [well, it has been working for about 20 minutes now
> without any issues].
> 
> Google finds a previous report here:
> http://lkml.org/lkml/2010/1/20/198 ; the suggestion is to turn off
> tcp segmentation offload, but it seems that this is not possible
> with my system:
> 
> # ethtool -K eth0 tso off
> Cannot set device tcp segmentation offload settings: Operation not supported
> 
> I have looked at the changes to atl1c since 2.6.32 (http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=history;f=drivers/net/atl1c;h=53cd10d07d040b7bec957acb1c69bc7b44897e69;hb=HEAD)
> and they seem harmless.
> 
> I wiresharked the network activity while this error was being shown,
> and it did include some packets with the high-contrast colour
> schemes that wireshark uses for "bad" packets.  Unfortunately my
> laptop ran out of battery before I could decipher these packets
> further.
> 
> So, is this a known issue?  Do people agree that the atl1c driver is
> most likely the culprit?  Can I offer any further debugging?

I haven't seen that before.  Adding netdev to the cc:, as you seem to
have reasonable evidence that the problem is the network driver.

--b.

^ permalink raw reply

* Re: [PATCH] atl1c: Add support for Atheros AR8152 and AR8152
From: Luis R. Rodriguez @ 2010-10-11 18:48 UTC (permalink / raw)
  To: David Miller
  Cc: ben@decadent.org.uk, Luis Rodriguez, netdev@vger.kernel.org,
	Jie Yang, linux-team
In-Reply-To: <20101010.210304.71107535.davem@davemloft.net>

On Sun, Oct 10, 2010 at 09:03:04PM -0700, David Miller wrote:
> From: Ben Hutchings <ben@decadent.org.uk>
> Date: Mon, 11 Oct 2010 02:18:50 +0100
> 
> > Your commit 496c185c9495629ef1c65387cb2594578393cfe0 "atl1c: Add support
> > for Atheros AR8152 and AR8152" included the following changes:
>  ...
> >> +		if (hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b) {
>  ...
> >> +	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
>  ...
> > Shouldn't the first if-statement use the same condition as the second
> > i.e. matching the previously-defined hardware types athr_l1c and
> > athr_l2c?
> 
> Yeah that definitely looks like a bug to me.

Good catch, unfortunatley I don't have the source code I used to port
this work the day I did this anymore locally, so adding 
Jie Yang who is actually our maintainer for this driver.

Jie, can you please confirm if this patch is correct?

diff --git a/drivers/net/atl1c/atl1c_hw.c b/drivers/net/atl1c/atl1c_hw.c
index d8501f0..0a7b786 100644
--- a/drivers/net/atl1c/atl1c_hw.c
+++ b/drivers/net/atl1c/atl1c_hw.c
@@ -132,7 +132,7 @@ static int atl1c_get_permanent_address(struct atl1c_hw *hw)
 			return -1;
 	}
 	/* Disable OTP_CLK */
-	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
+	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b)) {
 		otp_ctrl_data &= ~OTP_CTRL_CLK_EN;
 		AT_WRITE_REG(hw, REG_OTP_CTRL, otp_ctrl_data);
 		msleep(1);

  Luis

^ permalink raw reply related

* Re: [PATCH] atl1c: Add support for Atheros AR8152 and AR8152
From: Stephen Hemminger @ 2010-10-11 19:01 UTC (permalink / raw)
  To: Luis R. Rodriguez
  Cc: David Miller, ben@decadent.org.uk, Luis Rodriguez,
	netdev@vger.kernel.org, Jie Yang, linux-team
In-Reply-To: <20101011184835.GA10049@tux>

On Mon, 11 Oct 2010 11:48:35 -0700
"Luis R. Rodriguez" <lrodriguez@atheros.com> wrote:

> On Sun, Oct 10, 2010 at 09:03:04PM -0700, David Miller wrote:
> > From: Ben Hutchings <ben@decadent.org.uk>
> > Date: Mon, 11 Oct 2010 02:18:50 +0100
> > 
> > > Your commit 496c185c9495629ef1c65387cb2594578393cfe0 "atl1c: Add support
> > > for Atheros AR8152 and AR8152" included the following changes:
> >  ...
> > >> +		if (hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b) {
> >  ...
> > >> +	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
> >  ...
> > > Shouldn't the first if-statement use the same condition as the second
> > > i.e. matching the previously-defined hardware types athr_l1c and
> > > athr_l2c?
> > 
> > Yeah that definitely looks like a bug to me.
> 
> Good catch, unfortunatley I don't have the source code I used to port
> this work the day I did this anymore locally, so adding 
> Jie Yang who is actually our maintainer for this driver.
> 
> Jie, can you please confirm if this patch is correct?
> 
> diff --git a/drivers/net/atl1c/atl1c_hw.c b/drivers/net/atl1c/atl1c_hw.c
> index d8501f0..0a7b786 100644
> --- a/drivers/net/atl1c/atl1c_hw.c
> +++ b/drivers/net/atl1c/atl1c_hw.c
> @@ -132,7 +132,7 @@ static int atl1c_get_permanent_address(struct atl1c_hw *hw)
>  			return -1;
>  	}
>  	/* Disable OTP_CLK */
> -	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
> +	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b)) {
>  		otp_ctrl_data &= ~OTP_CTRL_CLK_EN;
>  		AT_WRITE_REG(hw, REG_OTP_CTRL, otp_ctrl_data);
>  		msleep(1);
> 

Did you notice ((gratuitous extra parens))


-- 

^ permalink raw reply

* Re: [PATCH] atl1c: Add support for Atheros AR8152 and AR8152
From: David Miller @ 2010-10-11 19:02 UTC (permalink / raw)
  To: shemminger; +Cc: lrodriguez, ben, Luis.Rodriguez, netdev, Jie.Yang, linux-team
In-Reply-To: <20101011120128.2cd460c9@nehalam>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 11 Oct 2010 12:01:28 -0700

> On Mon, 11 Oct 2010 11:48:35 -0700
> "Luis R. Rodriguez" <lrodriguez@atheros.com> wrote:
>> @@ -132,7 +132,7 @@ static int atl1c_get_permanent_address(struct atl1c_hw *hw)
>>  			return -1;
>>  	}
>>  	/* Disable OTP_CLK */
>> -	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
>> +	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b)) {
>>  		otp_ctrl_data &= ~OTP_CTRL_CLK_EN;
>>  		AT_WRITE_REG(hw, REG_OTP_CTRL, otp_ctrl_data);
>>  		msleep(1);
>> 
> 
> Did you notice ((gratuitous extra parens))

Yeah let's kill that too while we're changing this.

^ permalink raw reply

* Re: [PATCH net-next] net: percpu net_device refcount
From: David Miller @ 2010-10-11 19:13 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286471555.2912.291.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 07 Oct 2010 19:12:35 +0200

> We tried very hard to remove all possible dev_hold()/dev_put() pairs in
> network stack, using RCU conversions.
> 
> There is still an unavoidable device refcount change for every dst we
> create/destroy, and this can slow down some workloads (routers or some
> app servers)
> 
> We can switch to a percpu refcount implementation, now dynamic per_cpu
> infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
> per device.
 ...
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Ok, I'm fine with this.

But could you please get rid of that "#if 0" code block?  The comment
is fine and should stay, but the commented out code shouldn't really
stay there.

Thanks!

^ permalink raw reply

* Re: [PATCH net-next] neigh: speedup neigh_resolve_output()
From: David Miller @ 2010-10-11 19:16 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286484247.3745.91.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 07 Oct 2010 22:44:07 +0200

> Le jeudi 07 octobre 2010 à 14:53 +0200, Eric Dumazet a écrit :
> 
>> Further improvements would need to use a seqlock instead of an rwlock to
>> protect neigh->ha[], to not dirty neigh too often and remove two atomic
>> ops.
>> 
> 
> I implemented this idea in following patch, on top on previous one.
> 
> [PATCH net-next] neigh: speedup neigh_resolve_output()

So Eric do you think this is more efficient than the idea I
proposed, which is to "cmpxchg 'hh' and RCU"?

If you think this seqlock thing is faster or more desirable
for some reason, I'll add it.

Thanks!

^ permalink raw reply

* Re: [PATCH] net: clear heap allocations for privileged ethtool actions
From: David Miller @ 2010-10-11 19:24 UTC (permalink / raw)
  To: bhutchings
  Cc: kees.cook, linux-kernel, jgarzik, jeffrey.t.kirsher,
	peter.p.waskiewicz.jr, netdev
In-Reply-To: <1286487284.2271.37.camel@achroite.uk.solarflarecom.com>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 07 Oct 2010 22:34:44 +0100

> On Thu, 2010-10-07 at 14:10 -0700, Kees Cook wrote:
>> Several other ethtool functions leave heap uncleared (potentially) by
>> drivers. Some interfaces appear safe (eeprom, etc), in that the sizes
>> are well controlled. In some situations (e.g. unchecked error conditions),
>> the heap will remain unchanged in areas before copying back to userspace.
>> Note that these are less of an issue since these all require CAP_NET_ADMIN.
>> 
>> Cc: stable@kernel.org
>> Signed-off-by: Kees Cook <kees.cook@canonical.com>
 ...
> Acked-by: Ben Hutchings <bhutchings@solarflare.com>

So I've applied Kees's patch to net-2.6, and I'll merge net-2.6
into net-next-2.6 so I can resolve the vmalloc/kzalloc merge
conflict before Stephen Rothwell and others have to deal with it
in -next.

Thanks!

^ permalink raw reply

* Re: [PATCH net-next] net: percpu net_device refcount
From: Eric Dumazet @ 2010-10-11 19:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20101011.121344.260085789.davem@davemloft.net>

Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit :

> Ok, I'm fine with this.
> 
> But could you please get rid of that "#if 0" code block?  The comment
> is fine and should stay, but the commented out code shouldn't really
> stay there.

Sure, here is second version, against latest net-next-2.6

Thanks !

[PATCH net-next] net: percpu net_device refcount

We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)

We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.

On x86, dev_hold(dev) code :

before
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    6 ++---
 net/core/dev.c            |   39 +++++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3..c0b5e05 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
 	struct timer_list	watchdog_timer;
 
 	/* Number of references to this device */
-	atomic_t		refcnt ____cacheline_aligned_in_smp;
+	int __percpu		*pcpu_refcnt;
 
 	/* delayed register/unregister */
 	struct list_head	todo_list;
@@ -1798,7 +1798,7 @@ extern void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
-	atomic_dec(&dev->refcnt);
+	irqsafe_cpu_dec(*dev->pcpu_refcnt);
 }
 
 /**
@@ -1809,7 +1809,7 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
-	atomic_inc(&dev->refcnt);
+	irqsafe_cpu_inc(*dev->pcpu_refcnt);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafa..4b2d820 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* initialize the ref count */
-	atomic_set(&dev->refcnt, 1);
-
 	/* NAPI wants this */
 	INIT_LIST_HEAD(&dev->napi_list);
 
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* Note : We dont allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' dont need to change
+	 * its refcount.
+	 */
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,15 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+static int netdev_refcnt_read(const struct net_device *dev)
+{
+	int i, refcnt = 0;
+
+	for_each_possible_cpu(i)
+		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+	return refcnt;
+}
+
 /*
  * netdev_wait_allrefs - wait until all references are gone.
  *
@@ -5257,11 +5268,14 @@ EXPORT_SYMBOL(register_netdev);
 static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
 
 	linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	refcnt = netdev_refcnt_read(dev);
+
+	while (refcnt != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -5288,11 +5302,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		msleep(250);
 
+		refcnt = netdev_refcnt_read(dev);
+
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, refcnt);
 			warning_time = jiffies;
 		}
 	}
@@ -5350,7 +5366,7 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(atomic_read(&dev->refcnt));
+		BUG_ON(netdev_refcnt_read(dev));
 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
@@ -5520,9 +5536,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
-	if (dev_addr_init(dev))
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
 		goto free_tx;
 
+	if (dev_addr_init(dev))
+		goto free_pcpu;
+
 	dev_mc_init(dev);
 	dev_uc_init(dev);
 
@@ -5553,6 +5573,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 free_tx:
 	kfree(tx);
+free_pcpu:
+	free_percpu(dev->pcpu_refcnt);
 free_p:
 	kfree(p);
 	return NULL;
@@ -5586,6 +5608,9 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);



^ permalink raw reply related

* Re: [PATCH net-next] net: percpu net_device refcount
From: David Miller @ 2010-10-11 19:41 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286825929.3218.7.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 21:38:49 +0200

> Le lundi 11 octobre 2010 à 12:13 -0700, David Miller a écrit :
> 
>> Ok, I'm fine with this.
>> 
>> But could you please get rid of that "#if 0" code block?  The comment
>> is fine and should stay, but the commented out code shouldn't really
>> stay there.
> 
> Sure, here is second version, against latest net-next-2.6
> 
> Thanks !
> 
> [PATCH net-next] net: percpu net_device refcount

Excellent, applied, thanks!

^ permalink raw reply

* Re: [PATCH net-next] neigh: speedup neigh_resolve_output()
From: Eric Dumazet @ 2010-10-11 19:46 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20101011.121609.246536311.davem@davemloft.net>

Le lundi 11 octobre 2010 à 12:16 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 07 Oct 2010 22:44:07 +0200
> 
> > Le jeudi 07 octobre 2010 à 14:53 +0200, Eric Dumazet a écrit :
> > 
> >> Further improvements would need to use a seqlock instead of an rwlock to
> >> protect neigh->ha[], to not dirty neigh too often and remove two atomic
> >> ops.
> >> 
> > 
> > I implemented this idea in following patch, on top on previous one.
> > 
> > [PATCH net-next] neigh: speedup neigh_resolve_output()
> 
> So Eric do you think this is more efficient than the idea I
> proposed, which is to "cmpxchg 'hh' and RCU"?
> 
> If you think this seqlock thing is faster or more desirable
> for some reason, I'll add it.
> 
> Thanks!

Hmm, we would need a neigh->ha pointer to some struct, with rcu
protection. It adds a dereference in hot path. I believe this seqlock
(only for pathological cases, where dst are used for few packets) should
be fine.

Note: After this patch, I have a "struct neighbour" small field reorg
pending, not yet submitted.

Thanks



^ permalink raw reply

* Re: [PATCH net-next] net: percpu net_device refcount
From: David Miller @ 2010-10-11 19:49 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <20101011.124137.102555159.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Mon, 11 Oct 2010 12:41:37 -0700 (PDT)

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 11 Oct 2010 21:38:49 +0200
> 
>> [PATCH net-next] net: percpu net_device refcount
> 
> Excellent, applied, thanks!

Actually I have to revert, this breaks the infiniband drivers
which access netdev->refcnt directly.

drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'

^ permalink raw reply

* Re: [PATCH net-next] net: percpu net_device refcount
From: Eric Dumazet @ 2010-10-11 19:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20101011.124938.179933637.davem@davemloft.net>

Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit :

> Actually I have to revert, this breaks the infiniband drivers
> which access netdev->refcnt directly.
> 
> drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
> drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
> drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
> drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
> drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
> drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
> drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
> drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'

Ah ok, I'll make a build test before submitting v3, sorry for the
inconvenience.

Thanks



^ permalink raw reply

* Re: [PATCH net-next] neigh: speedup neigh_resolve_output()
From: David Miller @ 2010-10-11 20:01 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286826361.3218.14.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 21:46:01 +0200

> Hmm, we would need a neigh->ha pointer to some struct, with rcu
> protection. It adds a dereference in hot path. I believe this seqlock
> (only for pathological cases, where dst are used for few packets) should
> be fine.

Good point.

Ok, assuming it passes build testing I'll push this seqlock
patch to net-next-2.6

Thanks!


^ permalink raw reply

* Re: [PATCH net-next] net dst: use a percpu_counter to track entries
From: David Miller @ 2010-10-11 20:07 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286555854.2959.605.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 08 Oct 2010 18:37:34 +0200

> struct dst_ops tracks number of allocated dst in an atomic_t field,
> subject to high cache line contention in stress workload.
> 
> Switch to a percpu_counter, to reduce number of time we need to dirty a
> central location. Place it on a separate cache line to avoid dirtying
> read only fields.
 ...
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

When I first read the subject line for this patch, I was
scared, because I thought you were using a percpu counter
for dst entry refcounts :-)

Anyways this is fine, applied, thanks!

^ permalink raw reply

* Re: [Bugme-new] [Bug 19992] New: b44 + CONFIG_DEBUG_SHIRQ (=y on fedora) fails to resume
From: Andrew Morton @ 2010-10-11 20:15 UTC (permalink / raw)
  To: james; +Cc: bugzilla-daemon, bugme-daemon, netdev, Gary Zambrano
In-Reply-To: <bug-19992-10286@https.bugzilla.kernel.org/>


(switched to email.  Please respond via emailed reply-to-all, not via the
bugzilla web interface).

On Sun, 10 Oct 2010 16:57:11 GMT
bugzilla-daemon@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=19992
> 
>            Summary: b44 + CONFIG_DEBUG_SHIRQ (=y on fedora) fails to
>                     resume
>            Product: Drivers
>            Version: 2.5
>     Kernel Version: 2.6.36-rc7
>           Platform: All
>         OS/Version: Linux
>               Tree: Mainline
>             Status: NEW
>           Severity: high
>           Priority: P1
>          Component: Network
>         AssignedTo: drivers_network@kernel-bugs.osdl.org
>         ReportedBy: james@albanarts.com
>         Regression: Yes
> 
> 
> b44 network driver causes system to hang on resume when CONFIG_DEBUG_SHIRQ=y.
> I've done some TRACE_RESUME'ing and the following happens:
> * b44_resume() (drivers/net/b44.c) calls request_irq with IRQF_SHARED (after
> freeing it in the suspend function)
> * request_irq() (kernel/irq/manage.c) calls the interrupt handler directly if
> IRQF_SHARED and CONFIG_DEBUG_SHIRQ=y. It says "It's a shared IRQ -- the driver
> ought to be prepared for it to happen immediately, so let's make sure...."
> * b44_interrupt() gets as far as the first br32 and no further:
>   istat = br32(bp, B44_ISTAT);
> 
> I presume it hasn't yet woken the device up so reading a register somehow fails
> and hangs the system.
> 
> If I comment out the code in request_irq() to test the shared irq handler all
> works fine.
> 
> I'm guessing either the b44 driver shouldn't be freeing/requesting irqs in
> suspend/resume functions, or should be resetting the hardware first so that the
> test handler call doesn't fail, but I don't know enough about why it is freeing
> the irq across suspend to be confident fixing it.
> 
> This has been like this for a while (2.6.34 at least). Suspend used to work on
> fedora with this hardware so I think this is a regression. I'm happy to test
> any patches.

Thanks.  Yup, if the driver/device isn't ready to accept an IRQ when
request_irq() is called then there might be a problem should a real
interrupt happen very shortly after request_irq() is called.

The code looks OK to me so perhaps it is indeed some weird hardware
problem.  Maybe a little delay after the ssb_bus_powerup() is needed?


^ permalink raw reply

* [PATCH net-next V3] net: percpu net_device refcount
From: Eric Dumazet @ 2010-10-11 20:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1286826685.3218.16.camel@edumazet-laptop>

Le lundi 11 octobre 2010 à 21:51 +0200, Eric Dumazet a écrit :
> Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit :
> 
> > Actually I have to revert, this breaks the infiniband drivers
> > which access netdev->refcnt directly.
> > 
> > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
> > drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
> > drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
> > drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
> > drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'
> 
> Ah ok, I'll make a build test before submitting v3, sorry for the
> inconvenience.
> 

This was a bit long (allyesconfig), but eventually succeeded ...

Thanks !

[PATCH net-next V3] net: percpu net_device refcount

We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)

We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.

On x86, dev_hold(dev) code :

before
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
V3: export netdev_refcnt_read() for infiniband debugging

 drivers/infiniband/hw/nes/nes_cm.c    |    4 +-
 drivers/infiniband/hw/nes/nes_verbs.c |    4 +-
 include/linux/netdevice.h             |    7 ++--
 net/core/dev.c                        |   40 +++++++++++++++++++-----
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 61e0efd..6220d9d 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
 	nesibdev = nesvnic->nesibdev;
 
 	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	if (nesqp->active_conn) {
 
@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	atomic_inc(&cm_accepts);
 
 	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	/* allocate the ietf frame and space for private data */
 	nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 9046e66..546fc22 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
 
 	nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
 			nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
 			nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 	/* update the QP table */
 	nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
 	nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	return &nesqp->ibqp;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3..14fbb04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
 	struct timer_list	watchdog_timer;
 
 	/* Number of references to this device */
-	atomic_t		refcnt ____cacheline_aligned_in_smp;
+	int __percpu		*pcpu_refcnt;
 
 	/* delayed register/unregister */
 	struct list_head	todo_list;
@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev)
 	unregister_netdevice_queue(dev, NULL);
 }
 
+extern int 		netdev_refcnt_read(const struct net_device *dev);
 extern void		free_netdev(struct net_device *dev);
 extern void		synchronize_net(void);
 extern int 		register_netdevice_notifier(struct notifier_block *nb);
@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
-	atomic_dec(&dev->refcnt);
+	irqsafe_cpu_dec(*dev->pcpu_refcnt);
 }
 
 /**
@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
-	atomic_inc(&dev->refcnt);
+	irqsafe_cpu_inc(*dev->pcpu_refcnt);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafa..04972a4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* initialize the ref count */
-	atomic_set(&dev->refcnt, 1);
-
 	/* NAPI wants this */
 	INIT_LIST_HEAD(&dev->napi_list);
 
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* Note : We dont allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' dont need to change
+	 * its refcount.
+	 */
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,16 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+int netdev_refcnt_read(const struct net_device *dev)
+{
+	int i, refcnt = 0;
+
+	for_each_possible_cpu(i)
+		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+	return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
 /*
  * netdev_wait_allrefs - wait until all references are gone.
  *
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev);
 static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
 
 	linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	refcnt = netdev_refcnt_read(dev);
+
+	while (refcnt != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		msleep(250);
 
+		refcnt = netdev_refcnt_read(dev);
+
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, refcnt);
 			warning_time = jiffies;
 		}
 	}
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(atomic_read(&dev->refcnt));
+		BUG_ON(netdev_refcnt_read(dev));
 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
-	if (dev_addr_init(dev))
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
 		goto free_tx;
 
+	if (dev_addr_init(dev))
+		goto free_pcpu;
+
 	dev_mc_init(dev);
 	dev_uc_init(dev);
 
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 free_tx:
 	kfree(tx);
+free_pcpu:
+	free_percpu(dev->pcpu_refcnt);
 free_p:
 	kfree(p);
 	return NULL;
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox