Netdev List
 help / color / mirror / Atom feed
* Re: [RFC PATCH net-next] drivers/net Documentation/networking: Create directory intel_wired_lan
From: Joe Perches @ 2010-10-12  0:00 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: Don, e1000-devel, Bruce Allan, Jesse Brandeburg, linux-kernel,
	Greg Rose, John Ronciak, netdev
In-Reply-To: <AANLkTikVKufQRuMGd=cdig7ff4k2aw9Q1FWwMz3pgon-@mail.gmail.com>

On Mon, 2010-10-11 at 16:52 -0700, Jeff Kirsher wrote:
> On Sun, Oct 10, 2010 at 13:42, Joe Perches <joe@perches.com> wrote:
> > Perhaps it's better to move drivers from the very populated
> > drivers/net directory into vendor specific directories similar
> > to the Atheros approach used for drivers/net/wireless/ath/
> NAK
> First, I think we need to keep the documentation in /Documentation/networking.
> Second, the changes are extensive and would create a lot of regression testing.

I don't see any actual changes here other than layout.
What kind of regression testing do you think necessary?

> We have been looking at solutions like this for future
> drivers/hardware and is on the list of items we are currently working
> on, but feel it should not be made retroactively due to the regression
> testing and massive changes that would need to be made.

Might as well start somewhere.


------------------------------------------------------------------------------
Beautiful is writing same markup. Internet Explorer 9 supports
standards for HTML5, CSS3, SVG 1.1,  ECMAScript5, and DOM L2 & L3.
Spend less time writing and  rewriting code and more time creating great
experiences on the web. Be a part of the beta today.
http://p.sf.net/sfu/beautyoftheweb
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [RFC PATCH net-next] drivers/net Documentation/networking: Create directory intel_wired_lan
From: Jeff Kirsher @ 2010-10-11 23:52 UTC (permalink / raw)
  To: Joe Perches
  Cc: e1000-devel, Bruce Allan, Jesse Brandeburg, linux-kernel,
	Greg Rose, John Ronciak, netdev
In-Reply-To: <1286743352.11039.165.camel@Joe-Laptop>

On Sun, Oct 10, 2010 at 13:42, Joe Perches <joe@perches.com> wrote:
> Perhaps it's better to move drivers from the very populated
> drivers/net directory into vendor specific directories similar
> to the Atheros approach used for drivers/net/wireless/ath/
>
> Move intel drivers and Documentation to separate directories
> Create drivers/net/intel_wired_lan/Kconfig.<speed> and Makefile
> Modify drivers/net/Kconfig and Makefile
> Update MAINTAINERS
>
> Signed-off-by: Joe Perches <joe@perches.com>
> ---
>  .../networking/{ => intel_wired_lan}/e100.txt      |    0
>  .../networking/{ => intel_wired_lan}/e1000.txt     |    0
>  .../networking/{ => intel_wired_lan}/igb.txt       |    0
>  .../networking/{ => intel_wired_lan}/igbvf.txt     |    0
>  .../networking/{ => intel_wired_lan}/ixgb.txt      |    0
>  .../networking/{ => intel_wired_lan}/ixgbe.txt     |    0
>  .../networking/{ => intel_wired_lan}/ixgbevf.txt   |    0
>  MAINTAINERS                                        |   18 +--
>  drivers/net/Kconfig                                |  214 +-------------------
>  drivers/net/Makefile                               |    8 -
>  drivers/net/intel_wired_lan/Kconfig.100            |   25 +++
>  drivers/net/intel_wired_lan/Kconfig.1000           |  102 ++++++++++
>  drivers/net/intel_wired_lan/Kconfig.10000          |   81 ++++++++
>  drivers/net/intel_wired_lan/Makefile               |    9 +
>  drivers/net/{ => intel_wired_lan}/e100.c           |    0
>  drivers/net/{ => intel_wired_lan}/e1000/Makefile   |    0
>  drivers/net/{ => intel_wired_lan}/e1000/e1000.h    |    0
>  .../{ => intel_wired_lan}/e1000/e1000_ethtool.c    |    0
>  drivers/net/{ => intel_wired_lan}/e1000/e1000_hw.c |    0
>  drivers/net/{ => intel_wired_lan}/e1000/e1000_hw.h |    0
>  .../net/{ => intel_wired_lan}/e1000/e1000_main.c   |    0
>  .../net/{ => intel_wired_lan}/e1000/e1000_osdep.h  |    0
>  .../net/{ => intel_wired_lan}/e1000/e1000_param.c  |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/82571.c   |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/Makefile  |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/defines.h |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/e1000.h   |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/es2lan.c  |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/ethtool.c |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/hw.h      |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/ich8lan.c |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/lib.c     |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/netdev.c  |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/param.c   |    0
>  drivers/net/{ => intel_wired_lan}/e1000e/phy.c     |    0
>  drivers/net/{ => intel_wired_lan}/igb/Makefile     |    0
>  .../net/{ => intel_wired_lan}/igb/e1000_82575.c    |    0
>  .../net/{ => intel_wired_lan}/igb/e1000_82575.h    |    0
>  .../net/{ => intel_wired_lan}/igb/e1000_defines.h  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_hw.h   |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_mac.c  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_mac.h  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_mbx.c  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_mbx.h  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_nvm.c  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_nvm.h  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_phy.c  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_phy.h  |    0
>  drivers/net/{ => intel_wired_lan}/igb/e1000_regs.h |    0
>  drivers/net/{ => intel_wired_lan}/igb/igb.h        |    0
>  .../net/{ => intel_wired_lan}/igb/igb_ethtool.c    |    0
>  drivers/net/{ => intel_wired_lan}/igb/igb_main.c   |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/Makefile   |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/defines.h  |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/ethtool.c  |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/igbvf.h    |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/mbx.c      |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/mbx.h      |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/netdev.c   |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/regs.h     |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/vf.c       |    0
>  drivers/net/{ => intel_wired_lan}/igbvf/vf.h       |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/Makefile    |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb.h      |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb_ee.c   |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb_ee.h   |    0
>  .../net/{ => intel_wired_lan}/ixgb/ixgb_ethtool.c  |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb_hw.c   |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb_hw.h   |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb_ids.h  |    0
>  drivers/net/{ => intel_wired_lan}/ixgb/ixgb_main.c |    0
>  .../net/{ => intel_wired_lan}/ixgb/ixgb_osdep.h    |    0
>  .../net/{ => intel_wired_lan}/ixgb/ixgb_param.c    |    0
>  drivers/net/{ => intel_wired_lan}/ixgbe/Makefile   |    0
>  drivers/net/{ => intel_wired_lan}/ixgbe/ixgbe.h    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_82598.c  |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_82599.c  |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_common.c |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_common.h |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_dcb.c    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_dcb.h    |    0
>  .../{ => intel_wired_lan}/ixgbe/ixgbe_dcb_82598.c  |    0
>  .../{ => intel_wired_lan}/ixgbe/ixgbe_dcb_82598.h  |    0
>  .../{ => intel_wired_lan}/ixgbe/ixgbe_dcb_82599.c  |    0
>  .../{ => intel_wired_lan}/ixgbe/ixgbe_dcb_82599.h  |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_dcb_nl.c |    0
>  .../{ => intel_wired_lan}/ixgbe/ixgbe_ethtool.c    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_fcoe.c   |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_fcoe.h   |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_main.c   |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_mbx.c    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_mbx.h    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_phy.c    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_phy.h    |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_sriov.c  |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_sriov.h  |    0
>  .../net/{ => intel_wired_lan}/ixgbe/ixgbe_type.h   |    0
>  drivers/net/{ => intel_wired_lan}/ixgbevf/Makefile |    0
>  .../net/{ => intel_wired_lan}/ixgbevf/defines.h    |    0
>  .../net/{ => intel_wired_lan}/ixgbevf/ethtool.c    |    0
>  .../net/{ => intel_wired_lan}/ixgbevf/ixgbevf.h    |    0
>  .../{ => intel_wired_lan}/ixgbevf/ixgbevf_main.c   |    0
>  drivers/net/{ => intel_wired_lan}/ixgbevf/mbx.c    |    0
>  drivers/net/{ => intel_wired_lan}/ixgbevf/mbx.h    |    0
>  drivers/net/{ => intel_wired_lan}/ixgbevf/regs.h   |    0
>  drivers/net/{ => intel_wired_lan}/ixgbevf/vf.c     |    0
>  drivers/net/{ => intel_wired_lan}/ixgbevf/vf.h     |    0
>  107 files changed, 224 insertions(+), 233 deletions(-)
>

NAK
I agree with Stephen that this will generate a lot of confusion and....

First, I think we need to keep the documentation in /Documentation/networking.
Second, the changes are extensive and would create a lot of regression testing.

We have been looking at solutions like this for future
drivers/hardware and is on the list of items we are currently working
on, but feel it should not be made retroactively due to the regression
testing and massive changes that would need to be made.

-- 
Cheers,
Jeff

------------------------------------------------------------------------------
Beautiful is writing same markup. Internet Explorer 9 supports
standards for HTML5, CSS3, SVG 1.1,  ECMAScript5, and DOM L2 & L3.
Spend less time writing and  rewriting code and more time creating great
experiences on the web. Be a part of the beta today.
http://p.sf.net/sfu/beautyoftheweb
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH] b44: fix resume, request_irq after hw reset
From: Andrew Morton @ 2010-10-11 23:34 UTC (permalink / raw)
  To: James Hogan
  Cc: Gary Zambrano, Jiri Pirko, FUJITA Tomonori, Hauke Mehrtens,
	Larry Finger, netdev, linux-kernel, David S. Miller
In-Reply-To: <201010120022.13171.james@albanarts.com>

On Tue, 12 Oct 2010 00:22:12 +0100
James Hogan <james@albanarts.com> wrote:

> This driver was hanging on resume because it was requesting a shared irq
> that it wasn't ready to immediately handle, which was tested in
> request_irq because of the CONFIG_DEBUG_SHIRQ config option. The
> interrupt handler tried to read the interrupt status register but for
> some reason it hung the system.
> 
> The request_irq is now moved a bit later after resetting the hardware
> which seems to fix it.
> 
> Signed-off-by: James Hogan <james@albanarts.com>
> ---
>  drivers/net/b44.c |   12 ++++++------
>  1 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/b44.c b/drivers/net/b44.c
> index 1e620e2..dbba981 100644
> --- a/drivers/net/b44.c
> +++ b/drivers/net/b44.c
> @@ -2296,12 +2296,6 @@ static int b44_resume(struct ssb_device *sdev)
>  	if (!netif_running(dev))
>  		return 0;
>  
> -	rc = request_irq(dev->irq, b44_interrupt, IRQF_SHARED, dev->name, dev);
> -	if (rc) {
> -		netdev_err(dev, "request_irq failed\n");
> -		return rc;
> -	}
> -
>  	spin_lock_irq(&bp->lock);
>  
>  	b44_init_rings(bp);
> @@ -2309,6 +2303,12 @@ static int b44_resume(struct ssb_device *sdev)
>  	netif_device_attach(bp->dev);
>  	spin_unlock_irq(&bp->lock);
>  
> +	rc = request_irq(dev->irq, b44_interrupt, IRQF_SHARED, dev->name, dev);
> +	if (rc) {
> +		netdev_err(dev, "request_irq failed\n");
> +		return rc;
> +	}
> +
>  	b44_enable_ints(bp);
>  	netif_wake_queue(dev);

OK, running the interrupt handler before b44_init_hw() is presumably
the problem here.

The hardware surely won't be generating interrupts until we've run
b44_init_hw() and b44_enable_ints(), so this patch really is only to
keep CONFIG_DEBUG_SHIRQ happy.

^ permalink raw reply

* Re: [Bugme-new] [Bug 19992] New: b44 + CONFIG_DEBUG_SHIRQ (=y on fedora) fails to resume
From: James Hogan @ 2010-10-11 23:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: bugzilla-daemon, bugme-daemon, netdev, Gary Zambrano
In-Reply-To: <20101011131539.bbb99afe.akpm@linux-foundation.org>

On Monday 11 October 2010 21:15:39 Andrew Morton wrote:
> (switched to email.  Please respond via emailed reply-to-all, not via the
> bugzilla web interface).
> 
> On Sun, 10 Oct 2010 16:57:11 GMT
> 
> bugzilla-daemon@bugzilla.kernel.org wrote:
> > https://bugzilla.kernel.org/show_bug.cgi?id=19992
> > 
> >            Summary: b44 + CONFIG_DEBUG_SHIRQ (=y on fedora) fails to
> >            
> >                     resume
> >            
> >            Product: Drivers
> >            Version: 2.5
> >     
> >     Kernel Version: 2.6.36-rc7
> >     
> >           Platform: All
> >         
> >         OS/Version: Linux
> >         
> >               Tree: Mainline
> >             
> >             Status: NEW
> >           
> >           Severity: high
> >           Priority: P1
> >          
> >          Component: Network
> >         
> >         AssignedTo: drivers_network@kernel-bugs.osdl.org
> >         ReportedBy: james@albanarts.com
> >         Regression: Yes
> > 
> > b44 network driver causes system to hang on resume when
> > CONFIG_DEBUG_SHIRQ=y. I've done some TRACE_RESUME'ing and the following
> > happens:
> > * b44_resume() (drivers/net/b44.c) calls request_irq with IRQF_SHARED
> > (after freeing it in the suspend function)
> > * request_irq() (kernel/irq/manage.c) calls the interrupt handler
> > directly if IRQF_SHARED and CONFIG_DEBUG_SHIRQ=y. It says "It's a shared
> > IRQ -- the driver ought to be prepared for it to happen immediately, so
> > let's make sure...."
> > 
> > * b44_interrupt() gets as far as the first br32 and no further:
> >   istat = br32(bp, B44_ISTAT);
> > 
> > I presume it hasn't yet woken the device up so reading a register somehow
> > fails and hangs the system.
> > 
> > If I comment out the code in request_irq() to test the shared irq handler
> > all works fine.
> > 
> > I'm guessing either the b44 driver shouldn't be freeing/requesting irqs
> > in suspend/resume functions, or should be resetting the hardware first
> > so that the test handler call doesn't fail, but I don't know enough
> > about why it is freeing the irq across suspend to be confident fixing
> > it.
> > 
> > This has been like this for a while (2.6.34 at least). Suspend used to
> > work on fedora with this hardware so I think this is a regression. I'm
> > happy to test any patches.
> 
> Thanks.  Yup, if the driver/device isn't ready to accept an IRQ when
> request_irq() is called then there might be a problem should a real
> interrupt happen very shortly after request_irq() is called.
> 
> The code looks OK to me so perhaps it is indeed some weird hardware
> problem.  Maybe a little delay after the ssb_bus_powerup() is needed?

Thanks for the ideas. I tried a delay and it didn't work, but when I moved the 
request_irq after the spinlocked code which appears to reset the hardware, all 
was fine, which kind of makes sense.

See patch "b44: fix resume, request_irq after hw reset"

Cheers
James

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: dont use netdev_alloc_skb()
From: Eric Dumazet @ 2010-10-11 23:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Michael Chan, Eilon Greenstein
In-Reply-To: <1286838210.30423.128.camel@edumazet-laptop>

Le mardi 12 octobre 2010 à 01:03 +0200, Eric Dumazet a écrit :
> netdev_alloc_skb() is a very wrong interface, really.
> 
> We should remove/deprecate it.
> 
> For multi queue devices, it makes more sense to allocate skb on local
> node of the cpu handling RX interrupts. This allow each cpu to
> manipulate its own slub/slab queues/structures without doing expensive
> cross-node business.
> 
> For non multi queue devices, IRQ affinity should be set so that a cpu
> close to the device services interrupts. Even if not set, using
> dev_alloc_skb() is faster.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Or maybe revert :

commit b30973f877fea1a3fb84e05599890fcc082a88e5
Author: Christoph Hellwig <hch@lst.de>
Date:   Wed Dec 6 20:32:36 2006 -0800

    [PATCH] node-aware skb allocation
    
    Node-aware allocation of skbs for the receive path.
    
    Details:
    
      - __alloc_skb gets a new node argument and cals the node-aware
        slab functions with it.
      - netdev_alloc_skb passed the node number it gets from dev_to_node
        to it, everyone else passes -1 (any node)
    
    Signed-off-by: Christoph Hellwig <hch@lst.de>
    Cc: Christoph Lameter <clameter@engr.sgi.com>
    Cc: "David S. Miller" <davem@davemloft.net>
    Signed-off-by: Andrew Morton <akpm@osdl.org>


Apparently, only Christoph and Andrew signed it.




^ permalink raw reply

* [PATCH] b44: fix resume, request_irq after hw reset
From: James Hogan @ 2010-10-11 23:22 UTC (permalink / raw)
  To: Gary Zambrano, Jiri Pirko, FUJITA Tomonori, Hauke Mehrtens,
	Larry Finger <Larry.Finger@
  Cc: David S. Miller

This driver was hanging on resume because it was requesting a shared irq
that it wasn't ready to immediately handle, which was tested in
request_irq because of the CONFIG_DEBUG_SHIRQ config option. The
interrupt handler tried to read the interrupt status register but for
some reason it hung the system.

The request_irq is now moved a bit later after resetting the hardware
which seems to fix it.

Signed-off-by: James Hogan <james@albanarts.com>
---
 drivers/net/b44.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/b44.c b/drivers/net/b44.c
index 1e620e2..dbba981 100644
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -2296,12 +2296,6 @@ static int b44_resume(struct ssb_device *sdev)
 	if (!netif_running(dev))
 		return 0;
 
-	rc = request_irq(dev->irq, b44_interrupt, IRQF_SHARED, dev->name, dev);
-	if (rc) {
-		netdev_err(dev, "request_irq failed\n");
-		return rc;
-	}
-
 	spin_lock_irq(&bp->lock);
 
 	b44_init_rings(bp);
@@ -2309,6 +2303,12 @@ static int b44_resume(struct ssb_device *sdev)
 	netif_device_attach(bp->dev);
 	spin_unlock_irq(&bp->lock);
 
+	rc = request_irq(dev->irq, b44_interrupt, IRQF_SHARED, dev->name, dev);
+	if (rc) {
+		netdev_err(dev, "request_irq failed\n");
+		return rc;
+	}
+
 	b44_enable_ints(bp);
 	netif_wake_queue(dev);
 
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH 2/2 net-next] bnx2: Enable AER on PCIE devices only
From: David Miller @ 2010-10-11 23:12 UTC (permalink / raw)
  To: mchan; +Cc: netdev
In-Reply-To: <1286494973-5115-2-git-send-email-mchan@broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Thu, 7 Oct 2010 16:42:53 -0700

> To prevent unnecessary error message.  pci_save_state() is also moved to
> the end of ->probe() so that all PCI config, including AER state, will be
> saved.
> 
> Update version to 2.0.18.
> 
> Signed-off-by: Michael Chan <mchan@broadcom.com>
> Reviewed-by: Benjamin Li <mchan@broadcom.com>

Applied.

^ permalink raw reply

* Re: [PATCH 1/2 net-next] bnx2: Update firmware to 6.0.x.
From: David Miller @ 2010-10-11 23:12 UTC (permalink / raw)
  To: mchan; +Cc: netdev
In-Reply-To: <1286494973-5115-1-git-send-email-mchan@broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Thu, 7 Oct 2010 16:42:52 -0700

> - Improved flow control and simplified interface
> - Use hardware RSS indirection table instead of the slower firmware-
>   based table
> - Lower latency interrupt on 5709
> 
> Signed-off-by: Michael Chan <mchan@broadcom.com>
> Reviewed-by: Benjamin Li <benli@broadcom.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] net dst: use a percpu_counter to track entries
From: David Miller @ 2010-10-11 23:10 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286832573.30423.93.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 23:29:33 +0200

> Le lundi 11 octobre 2010 à 13:07 -0700, David Miller a écrit :
> 
>> When I first read the subject line for this patch, I was
>> scared, because I thought you were using a percpu counter
>> for dst entry refcounts :-)
>> 
>> Anyways this is fine, applied, thanks!
> 
> I though about a crazy idea for loopback device (but not a percpu
> counter for dst refcounts)
> 
> Add a cpu field somewhere so that each cpu manipulates a different dst.
> 
> Or maybe just never cache such dst, so that each tcp socket uses its own
> dst.

We could even make the cpu (in lieu of a socket) a key in the lookup,
but that would complicate handling things like PMTU indications and
redirects.

^ permalink raw reply

* Re: [PATCH net-next] neigh: reorder struct neighbour fields
From: David Miller @ 2010-10-11 23:09 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286835654.30423.107.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 12 Oct 2010 00:20:54 +0200

> Le mardi 12 octobre 2010 à 00:02 +0200, Eric Dumazet a écrit :
>> Here is the followup patch.
>> 
>> Thanks !
>> 
> 
> Oops, this was an old version, the up2date ones also took care of "used"
> field.
...
> [PATCH net-next V2] neigh: reorder struct neighbour fields

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-2.6] tg3: restore rx_dropped accounting
From: David Miller @ 2010-10-11 23:06 UTC (permalink / raw)
  To: mcarlson; +Cc: eric.dumazet, netdev, mchan
In-Reply-To: <20101011222657.GA7307@mcarlson.broadcom.com>

From: "Matt Carlson" <mcarlson@broadcom.com>
Date: Mon, 11 Oct 2010 15:26:57 -0700

> On Sun, Oct 10, 2010 at 10:55:52PM -0700, Eric Dumazet wrote:
>> commit 511d22247be7 (tg3: 64 bit stats on all arches), overlooked the
>> rx_dropped accounting.
>> 
>> We use a full "struct rtnl_link_stats64" to hold rx_dropped value, but
>> forgot to report it in tg3_get_stats64().
>> 
>> Use an "unsigned long" instead to shrink "struct tg3" by 176 bytes, and
>> report this value to stats readers.
>> 
>> Increment rx_dropped counter for oversized frames.
>> 
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Acked-by: Matt Carlson <mcarlson@broadcom.com>

Applied, thanks Matt.

^ permalink raw reply

* [PATCH net-next] bnx2x: dont use netdev_alloc_skb()
From: Eric Dumazet @ 2010-10-11 23:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Michael Chan, Eilon Greenstein

netdev_alloc_skb() is a very wrong interface, really.

We should remove/deprecate it.

For multi queue devices, it makes more sense to allocate skb on local
node of the cpu handling RX interrupts. This allow each cpu to
manipulate its own slub/slab queues/structures without doing expensive
cross-node business.

For non multi queue devices, IRQ affinity should be set so that a cpu
close to the device services interrupts. Even if not set, using
dev_alloc_skb() is faster.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/bnx2x/bnx2x_cmn.c     |   11 +++++------
 drivers/net/bnx2x/bnx2x_cmn.h     |    2 +-
 drivers/net/bnx2x/bnx2x_ethtool.c |    2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 97ef674..0561bd9 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -335,7 +335,7 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	struct sw_rx_bd *rx_buf = &fp->tpa_pool[queue];
 	struct sk_buff *skb = rx_buf->skb;
 	/* alloc new skb */
-	struct sk_buff *new_skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size);
+	struct sk_buff *new_skb = dev_alloc_skb(bp->rx_buf_size);
 
 	/* Unmap skb in the pool anyway, as we are going to change
 	   pool entry status to BNX2X_TPA_STOP even if new skb allocation
@@ -576,8 +576,7 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 			    (len <= RX_COPY_THRESH)) {
 				struct sk_buff *new_skb;
 
-				new_skb = netdev_alloc_skb(bp->dev,
-							   len + pad);
+				new_skb = dev_alloc_skb(len + pad);
 				if (new_skb == NULL) {
 					DP(NETIF_MSG_RX_ERR,
 					   "ERROR  packet dropped "
@@ -587,9 +586,9 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 				}
 
 				/* aligned copy */
-				skb_copy_from_linear_data_offset(skb, pad,
-						    new_skb->data + pad, len);
 				skb_reserve(new_skb, pad);
+				skb_copy_from_linear_data_offset(skb, pad,
+						    new_skb->data, len);
 				skb_put(new_skb, len);
 
 				bnx2x_reuse_rx_skb(fp, bd_cons, bd_prod);
@@ -841,7 +840,7 @@ void bnx2x_init_rx_rings(struct bnx2x *bp)
 		if (!fp->disable_tpa) {
 			for (i = 0; i < max_agg_queues; i++) {
 				fp->tpa_pool[i].skb =
-				   netdev_alloc_skb(bp->dev, bp->rx_buf_size);
+				   dev_alloc_skb(bp->rx_buf_size);
 				if (!fp->tpa_pool[i].skb) {
 					BNX2X_ERR("Failed to allocate TPA "
 						  "skb pool for queue[%d] - "
diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h
index 7f52cec..f422cfc 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/bnx2x/bnx2x_cmn.h
@@ -833,7 +833,7 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
 	struct eth_rx_bd *rx_bd = &fp->rx_desc_ring[index];
 	dma_addr_t mapping;
 
-	skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size);
+	skb = dev_alloc_skb(bp->rx_buf_size);
 	if (unlikely(skb == NULL))
 		return -ENOMEM;
 
diff --git a/drivers/net/bnx2x/bnx2x_ethtool.c b/drivers/net/bnx2x/bnx2x_ethtool.c
index 54fe061..5224daa 100644
--- a/drivers/net/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/bnx2x/bnx2x_ethtool.c
@@ -1430,7 +1430,7 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode, u8 link_up)
 	/* prepare the loopback packet */
 	pkt_size = (((bp->dev->mtu < ETH_MAX_PACKET_SIZE) ?
 		     bp->dev->mtu : ETH_MAX_PACKET_SIZE) + ETH_HLEN);
-	skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size);
+	skb = dev_alloc_skb(bp->rx_buf_size);
 	if (!skb) {
 		rc = -ENOMEM;
 		goto test_loopback_exit;



^ permalink raw reply related

* Re: [PATCH] b44: fix carrier detection on bind
From: David Miller @ 2010-10-11 22:45 UTC (permalink / raw)
  To: fercerpav; +Cc: zambrano, netdev, stable
In-Reply-To: <1286836950-3161-1-git-send-email-fercerpav@gmail.com>

From: Paul Fertser <fercerpav@gmail.com>
Date: Tue, 12 Oct 2010 02:42:30 +0400

> For carrier detection to work properly when binding the driver with a cable
> unplugged, netif_carrier_off() should be called after register_netdev(),
> not before.
> 
> Signed-off-by: Paul Fertser <fercerpav@gmail.com>

Applied, thanks.

^ permalink raw reply

* [PATCH] b44: fix carrier detection on bind
From: Paul Fertser @ 2010-10-11 22:42 UTC (permalink / raw)
  To: Gary Zambrano; +Cc: David S. Miller, netdev, Paul Fertser, stable

For carrier detection to work properly when binding the driver with a cable
unplugged, netif_carrier_off() should be called after register_netdev(),
not before.

Signed-off-by: Paul Fertser <fercerpav@gmail.com>
Cc: stable@kernel.org
---

Without this it's quite annoying to boot a laptop without ethernet
connection as the userspace is getting confused and can even add a default
route through a non-functional interface (when it's configured to use
static ip).

 drivers/net/b44.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/b44.c b/drivers/net/b44.c
index 37617ab..d711eb6 100644
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -2161,8 +2161,6 @@ static int __devinit b44_init_one(struct ssb_device *sdev,
 	dev->irq = sdev->irq;
 	SET_ETHTOOL_OPS(dev, &b44_ethtool_ops);
 
-	netif_carrier_off(dev);
-
 	err = ssb_bus_powerup(sdev->bus, 0);
 	if (err) {
 		dev_err(sdev->dev,
@@ -2204,6 +2202,8 @@ static int __devinit b44_init_one(struct ssb_device *sdev,
 		goto err_out_powerdown;
 	}
 
+	netif_carrier_off(dev);
+
 	ssb_set_drvdata(sdev, dev);
 
 	/* Chip reset provides power to the b44 MAC & PCI cores, which
-- 
1.7.2.2


^ permalink raw reply related

* Re: [RFC PATCH 1/2] r8169: check dma mapping failures
From: Francois Romieu @ 2010-10-11 22:08 UTC (permalink / raw)
  To: Stanislaw Gruszka; +Cc: netdev, Denis Kirjanov
In-Reply-To: <1286548203-10831-1-git-send-email-sgruszka@redhat.com>

Stanislaw Gruszka <sgruszka@redhat.com> :
[...]
> diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
> index bc669a4..b3b28b1 100644
> --- a/drivers/net/r8169.c
> +++ b/drivers/net/r8169.c
> @@ -4018,20 +4018,24 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,

You can replace the pci_dev parameter with pdev->dev

>  
>  	skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
>  	if (!skb)
> -		goto err_out;
> +		goto err0;

err_out_0 (with a big please)

>  	skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
>  
>  	mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
>  				 PCI_DMA_FROMDEVICE);
> +	if (dma_mapping_error(&pdev->dev, mapping))
> +		goto err1;

err_free_skb_1

>  	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
> -out:
> +
>  	return skb;
>  
> -err_out:
> +err1:
> +	dev_kfree_skb(skb);
> +err0:
>  	rtl8169_make_unusable_by_asic(desc);
> -	goto out;
> +	return NULL;

I'd keep the 'goto out' and NULL the skb after the dev_kfree_skb but
it's up to you.

>  }
>  
>  static void rtl8169_rx_clear(struct rtl8169_private *tp)
> @@ -4115,11 +4119,11 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
>  	tx_skb->len = 0;
>  }
>  
> -static void rtl8169_tx_clear(struct rtl8169_private *tp)
> +static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start, u32 end)
>  {
> -	unsigned int i;
> +	u32 i;
>  
> -	for (i = tp->dirty_tx; i < tp->dirty_tx + NUM_TX_DESC; i++) {
> +	for (i = start; i < end; i++) {

Feel free to fix the (existing) wraparound bug.

>  		unsigned int entry = i % NUM_TX_DESC;
>  		struct ring_info *tx_skb = tp->tx_skb + entry;
>  		unsigned int len = tx_skb->len;
> @@ -4136,6 +4140,11 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp)
>  			tp->dev->stats.tx_dropped++;
>  		}
>  	}
> +}
> +
> +static inline void rtl8169_tx_clear(struct rtl8169_private *tp)

Though used several times, it's hardly time critical : drop the inline.

> +{
> +	rtl8169_tx_clear_range(tp, tp->dirty_tx, tp->dirty_tx + NUM_TX_DESC);
>  	tp->cur_tx = tp->dirty_tx = 0;
>  }
>  
> @@ -4254,6 +4263,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,

Please add a local &tp->pci_dev->dev variable.

>  		addr = ((void *) page_address(frag->page)) + frag->page_offset;
>  		mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
>  					 PCI_DMA_TODEVICE);
> +		if (dma_mapping_error(&tp->pci_dev->dev, mapping))
> +			return -cur_frag;

I ponder adding an 'unlikely', goto some cleaning statement and return a
signification deprieved "I failed" value. The caller does it anyway but
I am not sure if it really is its business.

>  
>  		/* anti gcc 2.95.3 bugware (sic) */
>  		status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
> @@ -4314,7 +4325,10 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
>  	opts1 = DescOwn | rtl8169_tso_csum(skb, dev);
>  
>  	frags = rtl8169_xmit_frags(tp, skb, opts1);
> -	if (frags) {
> +	if (frags < 0) {

'unsigned int frags' problem.

[...]
> @@ -4355,6 +4371,10 @@ err_stop:
>  	netif_stop_queue(dev);
>  	dev->stats.tx_dropped++;
>  	return NETDEV_TX_BUSY;
> +
> +err_dma:
> +	rtl8169_tx_clear_range(tp, entry, entry + frags + 1);
> +	return NETDEV_TX_OK;

Third return statement, second NETDEV_TX_OK, no tx_dropped increase. May be
a bit heavy handed.

Otherwise it's cool.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH net-2.6] tg3: restore rx_dropped accounting
From: Matt Carlson @ 2010-10-11 22:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Matthew Carlson, Michael Chan
In-Reply-To: <1286776552.2692.232.camel@edumazet-laptop>

On Sun, Oct 10, 2010 at 10:55:52PM -0700, Eric Dumazet wrote:
> commit 511d22247be7 (tg3: 64 bit stats on all arches), overlooked the
> rx_dropped accounting.
> 
> We use a full "struct rtnl_link_stats64" to hold rx_dropped value, but
> forgot to report it in tg3_get_stats64().
> 
> Use an "unsigned long" instead to shrink "struct tg3" by 176 bytes, and
> report this value to stats readers.
> 
> Increment rx_dropped counter for oversized frames.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Acked-by: Matt Carlson <mcarlson@broadcom.com>

> CC: Michael Chan <mchan@broadcom.com>
> CC: Matt Carlson <mcarlson@broadcom.com>
> ---
>  drivers/net/tg3.c |    6 ++++--
>  drivers/net/tg3.h |    2 +-
>  2 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
> index bc3af78..1ec4b9e 100644
> --- a/drivers/net/tg3.c
> +++ b/drivers/net/tg3.c
> @@ -4666,7 +4666,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
>  				       desc_idx, *post_ptr);
>  		drop_it_no_recycle:
>  			/* Other statistics kept track of by card. */
> -			tp->net_stats.rx_dropped++;
> +			tp->rx_dropped++;
>  			goto next_pkt;
>  		}
>  
> @@ -4726,7 +4726,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
>  		if (len > (tp->dev->mtu + ETH_HLEN) &&
>  		    skb->protocol != htons(ETH_P_8021Q)) {
>  			dev_kfree_skb(skb);
> -			goto next_pkt;
> +			goto drop_it_no_recycle;
>  		}
>  
>  		if (desc->type_flags & RXD_FLAG_VLAN &&
> @@ -9240,6 +9240,8 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
>  	stats->rx_missed_errors = old_stats->rx_missed_errors +
>  		get_stat64(&hw_stats->rx_discards);
>  
> +	stats->rx_dropped = tp->rx_dropped;
> +
>  	return stats;
>  }
>  
> diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
> index 4937bd1..be7ff13 100644
> --- a/drivers/net/tg3.h
> +++ b/drivers/net/tg3.h
> @@ -2759,7 +2759,7 @@ struct tg3 {
>  
>  
>  	/* begin "everything else" cacheline(s) section */
> -	struct rtnl_link_stats64	net_stats;
> +	unsigned long			rx_dropped;
>  	struct rtnl_link_stats64	net_stats_prev;
>  	struct tg3_ethtool_stats	estats;
>  	struct tg3_ethtool_stats	estats_prev;
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply

* Re: [PATCH] atl1c: Add support for Atheros AR8152 and AR8152
From: Ben Hutchings @ 2010-10-11 22:28 UTC (permalink / raw)
  To: Luis R. Rodriguez
  Cc: David Miller, Luis Rodriguez, netdev@vger.kernel.org, Jie Yang,
	linux-team
In-Reply-To: <20101011184835.GA10049@tux>

[-- Attachment #1: Type: text/plain, Size: 1796 bytes --]

On Mon, 2010-10-11 at 11:48 -0700, Luis R. Rodriguez wrote:
> On Sun, Oct 10, 2010 at 09:03:04PM -0700, David Miller wrote:
> > From: Ben Hutchings <ben@decadent.org.uk>
> > Date: Mon, 11 Oct 2010 02:18:50 +0100
> > 
> > > Your commit 496c185c9495629ef1c65387cb2594578393cfe0 "atl1c: Add support
> > > for Atheros AR8152 and AR8152" included the following changes:
> >  ...
> > >> +		if (hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b) {
> >  ...
> > >> +	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
> >  ...
> > > Shouldn't the first if-statement use the same condition as the second
> > > i.e. matching the previously-defined hardware types athr_l1c and
> > > athr_l2c?
> > 
> > Yeah that definitely looks like a bug to me.
> 
> Good catch, unfortunatley I don't have the source code I used to port
> this work the day I did this anymore locally, so adding 
> Jie Yang who is actually our maintainer for this driver.
> 
> Jie, can you please confirm if this patch is correct?

I was suggesting that the first condition was wrong and the second was
right.

Ben.

> diff --git a/drivers/net/atl1c/atl1c_hw.c b/drivers/net/atl1c/atl1c_hw.c
> index d8501f0..0a7b786 100644
> --- a/drivers/net/atl1c/atl1c_hw.c
> +++ b/drivers/net/atl1c/atl1c_hw.c
> @@ -132,7 +132,7 @@ static int atl1c_get_permanent_address(struct atl1c_hw *hw)
>  			return -1;
>  	}
>  	/* Disable OTP_CLK */
> -	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c)) {
> +	if ((hw->nic_type == athr_l1c || hw->nic_type == athr_l2c_b)) {
>  		otp_ctrl_data &= ~OTP_CTRL_CLK_EN;
>  		AT_WRITE_REG(hw, REG_OTP_CTRL, otp_ctrl_data);
>  		msleep(1);
> 
>   Luis
> 

-- 
Ben Hutchings
Once a job is fouled up, anything done to improve it makes it worse.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: tbf/htb qdisc limitations
From: Steven Brudenell @ 2010-10-11 22:27 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: netdev
In-Reply-To: <4CB1A22B.9090701@gmail.com>

> I'm not sure you checked how the "burst" works, and doubt it could
> help you here. Anyway, do you think: rate 2KB/s with burst 5GB
> config would be useful for you?

i actually really do want something like 2KB/s with 5GB burst
(modifying parameters such that burst + rate * 30 days <= 5GB, but you
get the idea). but this isn't possible given the implementation:

i see that overall, virtual "tokens" map to "scheduler ticks", where a
"scheduler tick" is 64ns. (net/sched/sch_{tbf,htb}.c,
include/net/pkt_sched.h -- these 64ns units are called "ticks" despite
being unrelated to HZ). the "burst" parameter is also stored and
passed from userspace as a u32. so, the maximum configurable burst in
both cases is rate * 275s, since we can only track 275s worth of
"scheduler ticks" in a u32 ( (1<<32) / NSEC_PER_SEC * 64 =~ 275s ).

> My proposal is you don't bother with 1) and 2), but first do the
> hack in tbf or htb directly, using or omitting rate tables, how
> you like, and test this idea.

i'll give it a shot, though given that i hate writing the same code
twice, i would prefer to know the right way to change netlink before i
write a functional test.

due to the implementation coupling i don't see any way to make any
permanent change *without* changing the netlink interface -- even
changing that u32 to a u64, which would only need to be a u64 in
userspace because userspace does the munging today!

(what's worse, today userspace has to specify the full rate table over
netlink, instead of just specifying the rate and having the kernel
driver compute the table or whatever other data structure it deems
necessary. i think decoupling interface from implementation is a
worthy goal by itself. if they were decoupled, i could have just coded
a patch and not bothered y'all in the first place....)

> But it seems the right way is to collect monthly stats with some
> userspace tool and change qdisc config dynamically. You might
> look at network admins' lists for small ISPs exemplary scripts
> doing such nasty things to their users, or have a look at ppp
> accounting tools.

<non technical sidetrack>
i disagree outright that a userspace tool is the "right" way to solve
my constraints.

my constraints are:
1) i need to guarantee i never ever go over the monthly transfer limit
(bad experiences with Comcast... you can check out of Red Tape Hotel
any time you like, but you can never leave).
2) i want to be able to transfer short bursts at top speed whenever
possible (that's what i'm paying for in the first place).
3) i need to ration transfer usage so i am never stuck in a situation
of being limited to snail speeds until the end of the month (on a
Comcast connection in my area, i can reasonably sustain 2MB/sec
downstream, which eats 250GB in ~36 hours, so this constraint becomes
important).

tbf with a large burst size seems ideal for my constraints. i can't
quantify this, but it seems like no simpler strategy satisfies the
constraints well and no more complex strategy is necessary. i think
any userspace solution i could write would end up trying to emulate
tbf with large burst.

a userspace tool updating qdisc parameters, even if run in an infinite
loop, would always have a big chunky time resolution compared to an
inline packet shaper (which is important for #2, and for #1 to a
degree). i could write a packet shaper in userspace, but this does not
make sense to given that kernel qos already exists, and already has a
tbf implementation that just needs a little love.
</non technical sidetrack>

given all that, i'd just like to know

1) whether it's forbidden or bad to do floating point math in a packet
scheduler, and

2) the best way to go about making breaking changes to netlink.

^ permalink raw reply

* Re: [PATCH net-next] neigh: reorder struct neighbour fields
From: Eric Dumazet @ 2010-10-11 22:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1286834567.30423.102.camel@edumazet-laptop>

Le mardi 12 octobre 2010 à 00:02 +0200, Eric Dumazet a écrit :
> Here is the followup patch.
> 
> Thanks !
> 

Oops, this was an old version, the up2date ones also took care of "used"
field.

I guess its time for a sleep, sorry again.


[PATCH net-next V2] neigh: reorder struct neighbour fields

(refcnt) and (ha_lock, ha, used, dev, output, ops, primary_key) should
be placed on a separate cache lines.

refcnt can be often written, while other fields are mostly read.

This gave me good result on stress test :

before:

real    0m45.570s
user    0m15.525s
sys     9m56.669s

After:

real    0m41.841s
user    0m15.261s
sys     8m45.949s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index f04e7a2..55590ab 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -94,8 +94,6 @@ struct neighbour {
 	struct neighbour __rcu	*next;
 	struct neigh_table	*tbl;
 	struct neigh_parms	*parms;
-	struct net_device	*dev;
-	unsigned long		used;
 	unsigned long		confirmed;
 	unsigned long		updated;
 	__u8			flags;
@@ -103,16 +101,18 @@ struct neighbour {
 	__u8			type;
 	__u8			dead;
 	atomic_t		refcnt;
+	struct sk_buff_head	arp_queue;
+	struct timer_list	timer;
+	unsigned long		used;
 	atomic_t		probes;
 	rwlock_t		lock;
 	seqlock_t		ha_lock;
 	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
 	struct hh_cache		*hh;
 	int			(*output)(struct sk_buff *skb);
-	struct sk_buff_head	arp_queue;
-	struct timer_list	timer;
 	const struct neigh_ops	*ops;
 	struct rcu_head		rcu;
+	struct net_device	*dev;
 	u8			primary_key[0];
 };
 



^ permalink raw reply related

* [PATCH net-next] neigh: reorder struct neighbour fields
From: Eric Dumazet @ 2010-10-11 22:02 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20101011.130128.245392712.davem@davemloft.net>

Here is the followup patch.

Thanks !

[PATCH net-next] neigh: reorder struct neighbour fields

(refcnt) and (ha_lock, ha, dev, output, ops, primary_key) should be
placed on a separate cache lines.

refcnt can be often written, while other fields are mostly read.

This gave me good result on stress test :

before:

real    0m45.570s
user    0m15.525s
sys     9m56.669s

After:

real    0m41.841s
user    0m15.261s
sys     8m45.949s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/neighbour.h |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index f04e7a2..822802a 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -94,7 +94,6 @@ struct neighbour {
 	struct neighbour __rcu	*next;
 	struct neigh_table	*tbl;
 	struct neigh_parms	*parms;
-	struct net_device	*dev;
 	unsigned long		used;
 	unsigned long		confirmed;
 	unsigned long		updated;
@@ -103,16 +102,17 @@ struct neighbour {
 	__u8			type;
 	__u8			dead;
 	atomic_t		refcnt;
+	struct sk_buff_head	arp_queue;
+	struct timer_list	timer;
 	atomic_t		probes;
 	rwlock_t		lock;
 	seqlock_t		ha_lock;
 	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
 	struct hh_cache		*hh;
 	int			(*output)(struct sk_buff *skb);
-	struct sk_buff_head	arp_queue;
-	struct timer_list	timer;
 	const struct neigh_ops	*ops;
 	struct rcu_head		rcu;
+	struct net_device	*dev;
 	u8			primary_key[0];
 };
 



^ permalink raw reply related

* Re: [PATCH net-next] net dst: use a percpu_counter to track entries
From: Eric Dumazet @ 2010-10-11 21:29 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20101011.130758.70192030.davem@davemloft.net>

Le lundi 11 octobre 2010 à 13:07 -0700, David Miller a écrit :

> When I first read the subject line for this patch, I was
> scared, because I thought you were using a percpu counter
> for dst entry refcounts :-)
> 
> Anyways this is fine, applied, thanks!

I though about a crazy idea for loopback device (but not a percpu
counter for dst refcounts)

Add a cpu field somewhere so that each cpu manipulates a different dst.

Or maybe just never cache such dst, so that each tcp socket uses its own
dst.




^ permalink raw reply

* Re: [PATCH] net: introduce alloc_skb_order0
From: Eric Dumazet @ 2010-10-11 21:17 UTC (permalink / raw)
  To: Stanislaw Gruszka; +Cc: David Miller, Francois Romieu, netdev
In-Reply-To: <1286813133.2737.36.camel@edumazet-laptop>


> 1) Allocation of a physically continous 16Kbytes bloc for the rx-ring,
> at device initialization (GFP_KERNEL OK here)
> 
>    Here, the only thing you could do is to not allocate real skbs but
> only 16KB data blocs (no need for the sk_buf, only the ->data part), and
> force copybreak for all incoming packets (remove the rx_copybreak
> tunable)
> 

Here is the patch I cooked for net-next-2.6 to implement this idea.

I tested it on a dev machine and it works well.

[PATCH net-next] r8169: use 50% less ram for RX ring

Using standard skb allocations in r8169 leads to order-3 allocations (if
PAGE_SIZE=4096), because NIC needs 16383 bytes, and skb overhead makes
this bigger than 16384 -> 32768 bytes per "skb"

Using kmalloc() permits to reduce memory requirements of one r8169 nic
by 4Mbytes. (256 frames * 16Kbytes). This is fine since a hardware bug
requires us to copy incoming frames, so we build real skb when doing
this copy.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/r8169.c |  183 ++++++++++++++----------------------------
 1 file changed, 64 insertions(+), 119 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index bc669a4..1760533 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -187,12 +187,7 @@ static DEFINE_PCI_DEVICE_TABLE(rtl8169_pci_tbl) = {
 
 MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl);
 
-/*
- * we set our copybreak very high so that we don't have
- * to allocate 16k frames all the time (see note in
- * rtl8169_open()
- */
-static int rx_copybreak = 16383;
+static int rx_buf_sz = 16383;
 static int use_dac;
 static struct {
 	u32 msg_enable;
@@ -484,10 +479,8 @@ struct rtl8169_private {
 	struct RxDesc *RxDescArray;	/* 256-aligned Rx descriptor ring */
 	dma_addr_t TxPhyAddr;
 	dma_addr_t RxPhyAddr;
-	struct sk_buff *Rx_skbuff[NUM_RX_DESC];	/* Rx data buffers */
+	void *Rx_databuff[NUM_RX_DESC];	/* Rx data buffers */
 	struct ring_info tx_skb[NUM_TX_DESC];	/* Tx data buffers */
-	unsigned align;
-	unsigned rx_buf_sz;
 	struct timer_list timer;
 	u16 cp_cmd;
 	u16 intr_event;
@@ -515,8 +508,6 @@ struct rtl8169_private {
 
 MODULE_AUTHOR("Realtek and the Linux r8169 crew <netdev@vger.kernel.org>");
 MODULE_DESCRIPTION("RealTek RTL-8169 Gigabit Ethernet driver");
-module_param(rx_copybreak, int, 0);
-MODULE_PARM_DESC(rx_copybreak, "Copy breakpoint for copy-only-tiny-frames");
 module_param(use_dac, int, 0);
 MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot.");
 module_param_named(debug, debug.msg_enable, int, 0);
@@ -3196,7 +3187,6 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev->features |= NETIF_F_GRO;
 
 	tp->intr_mask = 0xffff;
-	tp->align = cfg->align;
 	tp->hw_start = cfg->hw_start;
 	tp->intr_event = cfg->intr_event;
 	tp->napi_event = cfg->napi_event;
@@ -3266,18 +3256,6 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
 	pci_set_drvdata(pdev, NULL);
 }
 
-static void rtl8169_set_rxbufsize(struct rtl8169_private *tp,
-				  unsigned int mtu)
-{
-	unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
-
-	if (max_frame != 16383)
-		printk(KERN_WARNING PFX "WARNING! Changing of MTU on this "
-			"NIC may lead to frame reception errors!\n");
-
-	tp->rx_buf_sz = (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE;
-}
-
 static int rtl8169_open(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
@@ -3287,18 +3265,6 @@ static int rtl8169_open(struct net_device *dev)
 	pm_runtime_get_sync(&pdev->dev);
 
 	/*
-	 * Note that we use a magic value here, its wierd I know
-	 * its done because, some subset of rtl8169 hardware suffers from
-	 * a problem in which frames received that are longer than
-	 * the size set in RxMaxSize register return garbage sizes
-	 * when received.  To avoid this we need to turn off filtering,
-	 * which is done by setting a value of 16383 in the RxMaxSize register
-	 * and allocating 16k frames to handle the largest possible rx value
-	 * thats what the magic math below does.
-	 */
-	rtl8169_set_rxbufsize(tp, 16383 - VLAN_ETH_HLEN - ETH_FCS_LEN);
-
-	/*
 	 * Rx and Tx desscriptors needs 256 bytes alignment.
 	 * dma_alloc_coherent provides more.
 	 */
@@ -3474,7 +3440,7 @@ static void rtl_hw_start_8169(struct net_device *dev)
 
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
-	rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
+	rtl_set_rx_max_size(ioaddr, rx_buf_sz);
 
 	if ((tp->mac_version == RTL_GIGA_MAC_VER_01) ||
 	    (tp->mac_version == RTL_GIGA_MAC_VER_02) ||
@@ -3735,7 +3701,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
 
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
-	rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
+	rtl_set_rx_max_size(ioaddr, rx_buf_sz);
 
 	tp->cp_cmd |= RTL_R16(CPlusCmd) | PktCntrDisable | INTT_1;
 
@@ -3915,7 +3881,7 @@ static void rtl_hw_start_8101(struct net_device *dev)
 
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
-	rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
+	rtl_set_rx_max_size(ioaddr, rx_buf_sz);
 
 	tp->cp_cmd |= rtl_rw_cpluscmd(ioaddr) | PCIMulRW;
 
@@ -3956,8 +3922,6 @@ static int rtl8169_change_mtu(struct net_device *dev, int new_mtu)
 
 	rtl8169_down(dev);
 
-	rtl8169_set_rxbufsize(tp, dev->mtu);
-
 	ret = rtl8169_init_ring(dev);
 	if (ret < 0)
 		goto out;
@@ -3978,15 +3942,15 @@ static inline void rtl8169_make_unusable_by_asic(struct RxDesc *desc)
 	desc->opts1 &= ~cpu_to_le32(DescOwn | RsvdMask);
 }
 
-static void rtl8169_free_rx_skb(struct rtl8169_private *tp,
-				struct sk_buff **sk_buff, struct RxDesc *desc)
+static void rtl8169_free_rx_databuff(struct rtl8169_private *tp,
+				     void **data_buff, struct RxDesc *desc)
 {
 	struct pci_dev *pdev = tp->pci_dev;
 
-	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
+	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), rx_buf_sz,
 			 PCI_DMA_FROMDEVICE);
-	dev_kfree_skb(*sk_buff);
-	*sk_buff = NULL;
+	kfree(*data_buff);
+	*data_buff = NULL;
 	rtl8169_make_unusable_by_asic(desc);
 }
 
@@ -4005,33 +3969,34 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
 	rtl8169_mark_to_asic(desc, rx_buf_sz);
 }
 
-static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
+static inline void *rtl8169_align(void *data)
+{
+	return (void *)ALIGN((long)data, 16);
+}
+
+static struct sk_buff *rtl8169_alloc_rx_data(struct pci_dev *pdev,
 					    struct net_device *dev,
-					    struct RxDesc *desc, int rx_buf_sz,
-					    unsigned int align, gfp_t gfp)
+					    struct RxDesc *desc)
 {
-	struct sk_buff *skb;
+	void *data;
 	dma_addr_t mapping;
-	unsigned int pad;
+	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 
-	pad = align ? align : NET_IP_ALIGN;
+	data = kmalloc_node(rx_buf_sz, GFP_KERNEL, node);
+	if (!data)
+		return NULL;
 
-	skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
-	if (!skb)
-		goto err_out;
-
-	skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
-
-	mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
+	if (rtl8169_align(data) != data) {
+		kfree(data);
+		data = kmalloc_node(rx_buf_sz + 15, GFP_KERNEL, node);
+		if (!data)
+			return NULL;
+	}
+	mapping = dma_map_single(&pdev->dev, rtl8169_align(data), rx_buf_sz,
 				 PCI_DMA_FROMDEVICE);
 
 	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
-out:
-	return skb;
-
-err_out:
-	rtl8169_make_unusable_by_asic(desc);
-	goto out;
+	return data;
 }
 
 static void rtl8169_rx_clear(struct rtl8169_private *tp)
@@ -4039,8 +4004,8 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
 	unsigned int i;
 
 	for (i = 0; i < NUM_RX_DESC; i++) {
-		if (tp->Rx_skbuff[i]) {
-			rtl8169_free_rx_skb(tp, tp->Rx_skbuff + i,
+		if (tp->Rx_databuff[i]) {
+			rtl8169_free_rx_databuff(tp, tp->Rx_databuff + i,
 					    tp->RxDescArray + i);
 		}
 	}
@@ -4052,21 +4017,21 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
 	u32 cur;
 
 	for (cur = start; end - cur != 0; cur++) {
-		struct sk_buff *skb;
+		void *data;
 		unsigned int i = cur % NUM_RX_DESC;
 
 		WARN_ON((s32)(end - cur) < 0);
 
-		if (tp->Rx_skbuff[i])
+		if (tp->Rx_databuff[i])
 			continue;
 
-		skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev,
-					   tp->RxDescArray + i,
-					   tp->rx_buf_sz, tp->align, gfp);
-		if (!skb)
+		data = rtl8169_alloc_rx_data(tp->pci_dev, dev,
+					     tp->RxDescArray + i);
+		if (!data) {
+			rtl8169_make_unusable_by_asic(tp->RxDescArray + i);
 			break;
-
-		tp->Rx_skbuff[i] = skb;
+		}
+		tp->Rx_databuff[i] = data;
 	}
 	return cur - start;
 }
@@ -4088,7 +4053,7 @@ static int rtl8169_init_ring(struct net_device *dev)
 	rtl8169_init_ring_indexes(tp);
 
 	memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
-	memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *));
+	memset(tp->Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *));
 
 	if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
 		goto err_out;
@@ -4473,27 +4438,23 @@ static inline void rtl8169_rx_csum(struct sk_buff *skb, u32 opts1)
 		skb_checksum_none_assert(skb);
 }
 
-static inline bool rtl8169_try_rx_copy(struct sk_buff **sk_buff,
-				       struct rtl8169_private *tp, int pkt_size,
-				       dma_addr_t addr)
+static struct sk_buff *rtl8169_try_rx_copy(void *data,
+					   struct rtl8169_private *tp,
+					   int pkt_size,
+					   dma_addr_t addr)
 {
 	struct sk_buff *skb;
-	bool done = false;
-
-	if (pkt_size >= rx_copybreak)
-		goto out;
-
-	skb = netdev_alloc_skb_ip_align(tp->dev, pkt_size);
-	if (!skb)
-		goto out;
 
+	data = rtl8169_align(data);
 	dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size,
 				PCI_DMA_FROMDEVICE);
-	skb_copy_from_linear_data(*sk_buff, skb->data, pkt_size);
-	*sk_buff = skb;
-	done = true;
-out:
-	return done;
+	prefetch(data);
+	skb = netdev_alloc_skb_ip_align(tp->dev, pkt_size);
+	if (skb)
+		memcpy(skb->data, data, pkt_size);
+	dma_sync_single_for_device(&tp->pci_dev->dev, addr, pkt_size,
+				   PCI_DMA_FROMDEVICE);
+	return skb;
 }
 
 /*
@@ -4508,7 +4469,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 				void __iomem *ioaddr, u32 budget)
 {
 	unsigned int cur_rx, rx_left;
-	unsigned int delta, count;
+	unsigned int count;
 	int polling = (budget != ~(u32)0) ? 1 : 0;
 
 	cur_rx = tp->cur_rx;
@@ -4537,12 +4498,11 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 				rtl8169_schedule_work(dev, rtl8169_reset_task);
 				dev->stats.rx_fifo_errors++;
 			}
-			rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
+			rtl8169_mark_to_asic(desc, rx_buf_sz);
 		} else {
-			struct sk_buff *skb = tp->Rx_skbuff[entry];
+			struct sk_buff *skb;
 			dma_addr_t addr = le64_to_cpu(desc->addr);
 			int pkt_size = (status & 0x00001FFF) - 4;
-			struct pci_dev *pdev = tp->pci_dev;
 
 			/*
 			 * The driver does not support incoming fragmented
@@ -4552,18 +4512,16 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 			if (unlikely(rtl8169_fragmented_frame(status))) {
 				dev->stats.rx_dropped++;
 				dev->stats.rx_length_errors++;
-				rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
+				rtl8169_mark_to_asic(desc, rx_buf_sz);
 				continue;
 			}
 
-			if (rtl8169_try_rx_copy(&skb, tp, pkt_size, addr)) {
-				dma_sync_single_for_device(&pdev->dev, addr,
-					pkt_size, PCI_DMA_FROMDEVICE);
-				rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
-			} else {
-				dma_unmap_single(&pdev->dev, addr, tp->rx_buf_sz,
-						 PCI_DMA_FROMDEVICE);
-				tp->Rx_skbuff[entry] = NULL;
+			skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
+						  tp, pkt_size, addr);
+			rtl8169_mark_to_asic(desc, rx_buf_sz);
+			if (!skb) {
+				dev->stats.rx_dropped++;
+				continue;
 			}
 
 			rtl8169_rx_csum(skb, status);
@@ -4592,20 +4550,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 	count = cur_rx - tp->cur_rx;
 	tp->cur_rx = cur_rx;
 
-	delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC);
-	if (!delta && count)
-		netif_info(tp, intr, dev, "no Rx buffer allocated\n");
-	tp->dirty_rx += delta;
-
-	/*
-	 * FIXME: until there is periodic timer to try and refill the ring,
-	 * a temporary shortage may definitely kill the Rx process.
-	 * - disable the asic to try and avoid an overflow and kick it again
-	 *   after refill ?
-	 * - how do others driver handle this condition (Uh oh...).
-	 */
-	if (tp->dirty_rx + NUM_RX_DESC == tp->cur_rx)
-		netif_emerg(tp, intr, dev, "Rx buffers exhausted\n");
+	tp->dirty_rx += count;
 
 	return count;
 }



^ permalink raw reply related

* [PATCH RFC] tun: dma engine support
From: Michael S. Tsirkin @ 2010-10-11 20:52 UTC (permalink / raw)
  Cc: Dan Williams, Linus Walleij, Anatolij Gustschin, Magnus Damm,
	Andrew Morton, Michael S. Tsirkin, Tejun Heo, David S. Miller,
	Herbert Xu, Eric Dumazet, Joe Perches, linux-kernel, netdev, kvm

Simple hack to use dma engine for tun RX.
Only one skb in flight at the moment.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

I am still looking at handling multiple skbs, but
sending this out for early flames and improvement suggestions.

Loopback testing seems to show only minor performance gains:
this is not really suprising as data is hot in cache already.
Where I would expect this to help more is with incoming
traffic from an external NIC. This still needs to be tested.

 drivers/dma/Kconfig   |    2 +-
 drivers/dma/iovlock.c |    2 +-
 drivers/net/tun.c     |  389 ++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 390 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 9520cf0..7e82c00 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -202,7 +202,7 @@ comment "DMA Clients"
 	depends on DMA_ENGINE
 
 config NET_DMA
-	bool "Network: TCP receive copy offload"
+	bool "Network: TCP/TUN receive copy offload"
 	depends on DMA_ENGINE && NET
 	default (INTEL_IOATDMA || FSL_DMA)
 	help
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
index c6917e8..121d7fd 100644
--- a/drivers/dma/iovlock.c
+++ b/drivers/dma/iovlock.c
@@ -138,7 +138,7 @@ void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)
 
 	kfree(pinned_list);
 }
-
+EXPORT_SYMBOL_GPL(dma_unpin_iovec_pages);
 
 /*
  * We have already pinned down the pages we will be using in the iovecs.
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 55f3a3e..ddbfbc8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -62,6 +62,8 @@
 #include <linux/nsproxy.h>
 #include <linux/virtio_net.h>
 #include <linux/rcupdate.h>
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
@@ -70,6 +72,9 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
+int tun_dma_copybreak = 0x10000;
+module_param_named(dma_copybreak, tun_dma_copybreak, int, 0644);
+MODULE_PARM_DESC(debug_level, "Use DMA engine for messages of this length and up");
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
 
@@ -547,6 +552,364 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
 	return skb;
 }
 
+#ifdef CONFIG_NET_DMA
+/* The below duplicates code from net/core and drivers/dma
+ * with the minor twist that these functions work on a const
+ * iovec with an offset. TODO: move it there? */
+static int num_pages_spanned(void __user * iov_base, size_t iov_len)
+{
+	return
+	((PAGE_ALIGN((unsigned long)iov_base + iov_len) -
+	((unsigned long)iov_base & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+static struct dma_pinned_list *dma_pin_const_iovec_pages(const struct iovec *iov,
+						       size_t iov_offset, size_t len)
+{
+	struct dma_pinned_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+	int nr_iovecs = 0;
+	int iovec_len_used = 0;
+	int iovec_pages_used = 0;
+	void __user *iov_base;
+	size_t iov_len;
+
+	/* determine how many iovecs/pages there are, up front */
+	do {
+		/* Skip offset as required. */
+		iov_len = iov[nr_iovecs].iov_len;
+		if (iov_offset >= iovec_len_used + iov_len) {
+			iov_offset -= iov_len;
+			++iov;
+			continue;
+		}
+		iov_base = iov[nr_iovecs].iov_base;
+		if (!iovec_len_used) {
+			iov_base += iov_offset;
+			iov_len -= iov_offset;
+		}
+		iovec_len_used += iov_len;
+		iovec_pages_used += num_pages_spanned(iov_base, iov_len);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for pinned list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+	if (!local_list)
+		goto out;
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	local_list->nr_iovecs = 0;
+
+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		iov_len = iov[i].iov_len + iov_offset;
+		iov_base = iov[i].iov_base + iov_offset;
+		iov_offset = 0;
+		len -= iov_len;
+
+		page_list->nr_pages = num_pages_spanned(iov_base, iov_len);
+		page_list->base_address = iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* pin pages down */
+		ret = get_user_pages_fast(
+			(unsigned long)iov_base,
+			page_list->nr_pages,
+			1,	/* write */
+			page_list->pages);
+
+		if (unlikely(ret < 0))
+			goto unpin;
+
+		local_list->nr_iovecs = i + 1;
+
+		if (unlikely(ret != page_list->nr_pages)) {
+			page_list->nr_pages = ret;
+			goto unpin;
+		}
+
+	}
+
+	return local_list;
+
+unpin:
+	dma_unpin_iovec_pages(local_list);
+out:
+	return NULL;
+}
+
+/*
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in iov array has corresponding entry in pinned_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * Initial elements in iov array's iov->iov_len will be 0 if already copied into
+ *   by another call.
+ * iov array length remaining guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_to_iovecend(struct dma_chan *chan, const struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata,
+	size_t iov_offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	size_t iov_len;
+	unsigned long iov_base;
+
+	if (!chan)
+		return memcpy_toiovecend(iov, kdata, iov_offset, len);
+
+	iovec_idx = 0;
+	for (iovec_idx = 0; iovec_idx < pinned_list->nr_iovecs; ++iovec_idx) {
+		struct dma_page_list *page_list;
+
+		iov_len = iov[iovec_idx].iov_len;
+		/* skip already used-up iovecs */
+		if (iov_len <= iov_offset) {
+			iov_offset -= iov_len;
+			continue;
+		}
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_base = (unsigned long)iov[iovec_idx].iov_base + iov_offset;
+		iov_len -= iov_offset;
+		iov_offset = 0;
+		iov_byte_offset = iov_base & ~PAGE_MASK;
+		page_idx = ((iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov_len);
+
+			dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					kdata,
+					copy);
+			/* poll for a descriptor slot */
+			if (unlikely(dma_cookie < 0)) {
+				dma_async_issue_pending(chan);
+				continue;
+			}
+
+			len -= copy;
+			iov_len -= copy;
+			iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			kdata += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+dma_cookie_t dma_memcpy_pg_to_const_iovec(struct dma_chan *chan, const struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t iov_offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	int err;
+	size_t iov_len;
+	unsigned long iov_base;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !pinned_list) {
+		u8 *vaddr = kmap(page);
+		err = memcpy_toiovecend(iov, vaddr + offset, iov_offset, len);
+		kunmap(page);
+		return err;
+	}
+
+	for (iovec_idx = 0; iovec_idx < pinned_list->nr_iovecs; ++iovec_idx) {
+		struct dma_page_list *page_list;
+
+		iov_len = iov[iovec_idx].iov_len;
+		/* skip already used-up iovecs */
+		if (iov_len <= iov_offset) {
+			iov_offset -= iov_len;
+			continue;
+		}
+
+		page_list = &pinned_list->page_list[iovec_idx];
+		iov_base = (unsigned long)iov[iovec_idx].iov_base + iov_offset;
+		iov_len -= iov_offset;
+		iov_offset = 0;
+
+		iov_byte_offset = iov_base & ~PAGE_MASK;
+		page_idx = ((iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov_len);
+
+			dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					page,
+					offset,
+					copy);
+			/* poll for a descriptor slot */
+			if (unlikely(dma_cookie < 0)) {
+				dma_async_issue_pending(chan);
+				continue;
+			}
+
+			len -= copy;
+			iov_len -= copy;
+			iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			offset += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb - buffer to copy
+ *	@offset - offset in the buffer to start copying from
+ *	@iovec - io vector to copy to
+ *	@len - amount of data to copy from buffer to iovec
+ *	@pinned_list - locked iovec buffer data
+ *
+ *	Note: the iovec is not modified during the copy.
+ *	Note: pinned_list is assumed pinned with the same offset.
+ */
+dma_cookie_t dma_skb_copy_datagram_const_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, const struct iovec *to,
+			size_t iov_offset, 
+			size_t len, struct dma_pinned_list *pinned_list)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		cookie = dma_memcpy_to_iovecend(chan, to, pinned_list,
+						skb->data + offset, iov_offset,
+						copy);
+		if (cookie < 0)
+			goto fault;
+		len -= copy;
+		if (len == 0)
+			goto end;
+		offset += copy;
+		iov_offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		copy = end - offset;
+		if (copy > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+
+			cookie = dma_memcpy_pg_to_const_iovec(chan, to, pinned_list, page,
+					frag->page_offset + offset - start, iov_offset, copy);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+			iov_offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		copy = end - offset;
+		if (copy > 0) {
+			if (copy > len)
+				copy = len;
+			cookie = dma_skb_copy_datagram_const_iovec(chan, frag_iter,
+							     offset - start,
+							     to, iov_offset, copy,
+							     pinned_list);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+			iov_offset += copy;
+		}
+		start = end;
+	}
+
+end:
+	if (!len) {
+		skb->dma_cookie = cookie;
+		return cookie;
+	}
+
+fault:
+	return -EFAULT;
+}
+#endif
+
 /* Get packet from user space buffer */
 static __inline__ ssize_t tun_get_user(struct tun_struct *tun,
 				       const struct iovec *iv, size_t count,
@@ -706,6 +1069,9 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 {
 	struct tun_pi pi = { 0, skb->protocol };
 	ssize_t total = 0;
+	struct dma_chan *dma_chan;
+	struct dma_pinned_list *pinned_list;
+	int dma_cookie;
 
 	if (!(tun->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) < 0)
@@ -768,8 +1134,29 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 	}
 
 	len = min_t(int, skb->len, len);
-
+#ifdef CONFIG_NET_DMA
+
+	if (len < tun_dma_copybreak)
+		goto copy;
+
+	dma_chan = dma_find_channel(DMA_MEMCPY);
+	if (!dma_chan)
+		goto copy;
+	pinned_list = dma_pin_const_iovec_pages(iv, total, len);
+	if (!pinned_list)
+		goto copy;
+	dma_cookie = dma_skb_copy_datagram_const_iovec(dma_chan, skb, 0, iv,
+						       total, len, pinned_list);
+	if (dma_cookie >= 0) {
+		dma_async_memcpy_issue_pending(dma_chan);
+		dma_sync_wait(dma_chan, dma_cookie);
+	}
+	dma_unpin_iovec_pages(pinned_list);
+	goto done;
+#endif
+copy:
 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
+done:
 	total += skb->len;
 
 	tun->dev->stats.tx_packets++;
-- 
1.7.3-rc1

^ permalink raw reply related

* Re: [Bugme-new] [Bug 19942] New: Not a intel bug: kernel BUG at drivers/pci/intel-iommu.c:1656
From: Andrew Morton @ 2010-10-11 20:45 UTC (permalink / raw)
  To: gronslet
  Cc: e1000-devel, netdev, bugzilla-daemon, Jesse Barnes, bugme-daemon,
	David Woodhouse
In-Reply-To: <bug-19942-10286@https.bugzilla.kernel.org/>



(switched to email.  Please respond via emailed reply-to-all, not via the
bugzilla web interface).


On Sat, 9 Oct 2010 10:07:15 GMT
bugzilla-daemon@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=19942
> 
>            Summary: Not a intel bug: kernel BUG at
>                     drivers/pci/intel-iommu.c:1656
>            Product: Drivers
>            Version: 2.5
>     Kernel Version: 2.6.36-0.35.rc7.git0.fc15.x86_64
>           Platform: All
>         OS/Version: Linux
>               Tree: Fedora
>             Status: NEW
>           Severity: normal
>           Priority: P1
>          Component: Network
>         AssignedTo: drivers_network@kernel-bugs.osdl.org
>         ReportedBy: gronslet@gmail.com
>         Regression: No
> 
> 
> On my Fedora Rawhide system, I keep getting these errors, which kills my wifi
> and require me to reboot my Lenovo Thinkpad T400. Please also see
> https://bugzilla.redhat.com/show_bug.cgi?id=637554
> https://bugs.freedesktop.org/show_bug.cgi?id=30722
> 
> In the latter, I was asked to file the bug here, as it isn't a intel bug.
> Fedora Rawhide, kernel-2.6.36-0.35.rc7.git0.fc15.x86_64,
> xorg-x11-drv-intel-2.12.0-6.fc14.1.x86_64, xorg-x11-drivers-7.4-1.fc14.x86_64
> xorg-x11-server-utils-7.4-20.fc15.x86_64,
> NetworkManager-0.8.1-7.git20100831.fc15.x86_64
> 
> This happens when I resume my laptop after suspend to ram:
> 
> [24572.218077] PM: resume devices took 0.987 seconds
> [24572.239068] PM: Finishing wakeup.
> [24572.239216] Restarting tasks ... 
> [24572.239332] usb 2-4: USB disconnect, address 2
> [24572.245520] done.
> [24572.245702] video LNXVIDEO:00: Restoring backlight state
> [24572.249109] ehci_hcd 0000:00:1d.7: dma_pool_free buffer-2048,
> ffff880134f9d000/ffffb000 (bad dma)
> [24572.249631] ehci_hcd 0000:00:1d.7: dma_pool_free buffer-2048,
> ffff880134f9d080/ffffb080 (bad dma)
> [24572.249977] cdc_ether 2-4:1.7: wwan0: unregister 'cdc_ether'
> usb-0000:00:1d.7-4, Mobile Broadband Network Device
> [24573.685674] ------------[ cut here ]------------
> [24573.685709] kernel BUG at drivers/pci/intel-iommu.c:1656!
> [24573.685734] invalid opcode: 0000 [#1] SMP 
> [24573.685761] last sysfs file: /sys/devices/system/cpu/sched_mc_power_savings
> [24573.685791] CPU 0 
> [24573.685803] Modules linked in: rfcomm sunrpc sco bnep l2cap cpufreq_ondemand
> acpi_cpufreq freq_table mperf ip6t_REJECT xt_physdev nf_conntrack_ipv6
> ip6table_filter ipt_MASQUERADE iptable_nat ip6_tables nf_nat sha256_generic
> cryptd aes_x86_64 aes_generic cbc dm_crypt uinput arc4 ecb
> snd_hda_codec_conexant snd_hda_intel iwlagn snd_hda_codec snd_hwdep zaurus
> iwlcore snd_seq snd_seq_device r852 sm_common cdc_ether nand nand_ids nand_ecc
> microcode mac80211 uvcvideo usbnet mtd mii cdc_acm snd_pcm btusb cdc_wdm
> bluetooth videodev iTCO_wdt i2c_i801 iTCO_vendor_support joydev cfg80211
> thinkpad_acpi v4l1_compat v4l2_compat_ioctl32 e1000e snd_timer rfkill
> snd_page_alloc wmi snd soundcore ipv6 sdhci_pci sdhci firewire_ohci mmc_core
> firewire_core yenta_socket crc_itu_t i915 drm_kms_helper drm i2c_algo_bit
> i2c_core video output [last unloaded: scsi_wait_scan]
> [24573.686007] 
> [24573.686007] Pid: 8321, comm: NetworkManager Not tainted
> 2.6.36-0.35.rc7.git0.fc15.x86_64 #1 6474AR4/6474AR4
> [24573.686007] RIP: 0010:[<ffffffff8126ea48>]  [<ffffffff8126ea48>]
> __domain_mapping+0x43/0x1ce
> [24573.686007] RSP: 0018:ffff880133727648  EFLAGS: 00010206
> [24573.694051] RAX: 0000000001ffffff RBX: ffff8800b4687400 RCX:
> 000000000000001b
> [24573.694051] RDX: 000000000008b621 RSI: 000ffffffffffdff RDI:
> ffff8801320f6dc0
> [24573.694051] RBP: ffff880133727698 R08: 0000000000000001 R09:
> 0000000000000003
> [24573.694051] R10: ffff8801320f6df8 R11: 0000000000000000 R12:
> 0000000000000000
> [24573.694051] R13: ffff8801320f6dc0 R14: ffff88013bc04ff8 R15:
> 0000000000000001
> [24573.694051] FS:  00007fb24c872800(0000) GS:ffff880002c00000(0000)
> knlGS:0000000000000000
> [24573.694051] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [24573.694051] CR2: 000000000042da00 CR3: 000000012fbc2000 CR4:
> 00000000000006f0
> [24573.694051] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [24573.694051] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> 0000000000000400
> [24573.694051] Process NetworkManager (pid: 8321, threadinfo ffff880133726000,
> task ffff88012f8f0000)
> [24573.694051] Stack:
> [24573.694051]  ffff88013707e240 ffff8801320f6dc0 ffff880133727698
> 000ffffffffffdff
> [24573.694051] <0> 0000000000000000 ffff8800b4687400 000000008b621000
> ffff8801320f6dc0
> [24573.694051] <0> ffff88013bc04ff8 0000000000000000 ffff8801337276f8
> ffffffff8126f710
> [24573.694051] Call Trace:
> [24573.694051]  [<ffffffff8126f710>] __intel_map_single.clone.25+0xdc/0x16b
> [24573.694051]  [<ffffffff8126f88c>] intel_alloc_coherent+0xae/0xd5
> [24573.694051]  [<ffffffffa018c128>] e1000_alloc_ring_dma.clone.28+0x94/0xc0
> [e1000e]
> [24573.694051]  [<ffffffffa018e359>] e1000e_setup_tx_resources+0x65/0xaa
> [e1000e]
> [24573.694051]  [<ffffffffa018e891>] e1000_open+0x64/0x41e [e1000e]
> [24573.694051]  [<ffffffff813eeb45>] __dev_open+0x9b/0xd2
> [24573.694051]  [<ffffffff813eed87>] __dev_change_flags+0xad/0x130
> [24573.694051]  [<ffffffff813eee8b>] dev_change_flags+0x21/0x56
> [24573.694051]  [<ffffffff813f90f9>] do_setlink+0x2ba/0x61f
> [24573.694051]  [<ffffffff8107d8c7>] ? print_lock_contention_bug+0x1b/0xd5
> [24573.694051]  [<ffffffff81249f7f>] ? debug_check_no_obj_freed+0x65/0x18a
> [24573.694051]  [<ffffffff8107d8c7>] ? print_lock_contention_bug+0x1b/0xd5
> [24573.694051]  [<ffffffff813f96be>] rtnl_setlink+0xd0/0xf2
> [24573.694051]  [<ffffffff813f99ac>] rtnetlink_rcv_msg+0x1eb/0x201
> [24573.694051]  [<ffffffff813f97c1>] ? rtnetlink_rcv_msg+0x0/0x201
> [24573.694051]  [<ffffffff8140d3e5>] netlink_rcv_skb+0x45/0x90
> [24573.694051]  [<ffffffff813f8d29>] rtnetlink_rcv+0x26/0x2d
> [24573.694051]  [<ffffffff8140cec0>] netlink_unicast+0xee/0x157
> [24573.694051]  [<ffffffff8140d1e1>] netlink_sendmsg+0x2b8/0x2d6
> [24573.694051]  [<ffffffff813da64e>] __sock_sendmsg+0x6b/0x77
> [24573.694051]  [<ffffffff813da9a8>] sock_sendmsg+0xa8/0xc1
> [24573.694051]  [<ffffffff8107ff07>] ? lock_acquire+0xee/0xfd
> [24573.694051]  [<ffffffff810fb080>] ? might_fault+0x5c/0xac
> [24573.694051]  [<ffffffff8107fe0d>] ? lock_release+0x19a/0x1a6
> [24573.694051]  [<ffffffff810fb0c9>] ? might_fault+0xa5/0xac
> [24573.694051]  [<ffffffff813e4dbb>] ? copy_from_user+0x2f/0x31
> [24573.694051]  [<ffffffff813e51ae>] ? verify_iovec+0x57/0x99
> [24573.694051]  [<ffffffff813dc971>] sys_sendmsg+0x235/0x2b3
> [24573.694051]  [<ffffffff8112bb26>] ? rcu_read_lock+0x0/0x35
> [24573.694051]  [<ffffffff8107ff07>] ? lock_acquire+0xee/0xfd
> [24573.694051]  [<ffffffff8112bb26>] ? rcu_read_lock+0x0/0x35
> [24573.694051]  [<ffffffff813dc405>] ? sys_sendto+0x125/0x152
> [24573.694051]  [<ffffffff8112c5a2>] ? fput+0x22/0x1d6
> [24573.694051]  [<ffffffff8112c4ae>] ? fget_light+0x79/0x83
> [24573.694051]  [<ffffffff81133e5b>] ? path_put+0x22/0x27
> [24573.694051]  [<ffffffff810a8443>] ? audit_syscall_entry+0x11c/0x148
> [24573.694051]  [<ffffffff8149da45>] ? trace_hardirqs_on_thunk+0x3a/0x3f
> [24573.694051]  [<ffffffff81009cf2>] system_call_fastpath+0x16/0x1b
> [24573.694051] Code: d4 48 89 ca 48 89 7d b8 6b 8f 84 00 00 00 09 48 89 75 c8
> 4d 89 c7 83 c1 12 83 f9 3f 7f 0f 4a 8d 44 06 ff 48 d3 e8 48 85 c0 74 02 <0f> 0b
> 41 f6 c1 03 b8 ea ff ff ff 0f 84 6b 01 00 00 41 81 e1 03 
> [24573.694051] RIP  [<ffffffff8126ea48>] __domain_mapping+0x43/0x1ce
> [24573.694051]  RSP <ffff880133727648>
> [24573.821392] ---[ end trace 391efc8948e1496b ]---
> [24573.832050] NetworkManager used greatest stack depth: 2064 bytes left
> [24574.026042] usb 4-2: new full speed USB device using uhci_hcd and address 3
> [24574.187102] usb 4-2: New USB device found, idVendor=0a5c, idProduct=2145
> [24574.188244] usb 4-2: New USB device strings: Mfr=1, Product=2,
> SerialNumber=0
> [24574.189418] usb 4-2: Product: ThinkPad Bluetooth with Enhanced Data Rate II
> [24574.190567] usb 4-2: Manufacturer: Lenovo Computer Corp
> [24576.230085] usb 2-4: new high speed USB device using ehci_hcd and address 3
> [24577.080715] usb 2-4: New USB device found, idVendor=0bdb, idProduct=1900
> [24577.081862] usb 2-4: New USB device strings: Mfr=1, Product=2,
> SerialNumber=3
> [24577.083009] usb 2-4: Product: Ericsson F3507g Mobile Broadband Minicard
> Composite Device
> [24577.084140] usb 2-4: Manufacturer: Ericsson
> [24577.085263] usb 2-4: SerialNumber: 3541430207407750
> [24577.144202] cdc_acm 2-4:1.1: ttyACM0: USB ACM device
> [24577.163044] cdc_acm 2-4:1.3: ttyACM1: USB ACM device
> [24577.174389] cdc_wdm 2-4:1.5: cdc-wdm0: USB WDM device
> [24577.183588] cdc_wdm 2-4:1.6: cdc-wdm1: USB WDM device
> [26974.894966] thinkpad_acpi: EC reports that Thermal Table has changed
> 
> 
> Note that I explicitly have disabled iommu for intel:
> # cat /proc/cmdline 
> ro root=/dev/VolGroup00/lv_root rhgb quiet selinux=0 vga=0x318
> SYSFONT=latarcyrheb-sun16 LANG=en_US.UTF-8 KEYTABLE=no intel_iommu=igfx_off
> 
> I've seen this on 2.6.36-0.35.rc7.git0.fc15.x86_64,
> 2.6.36-0.27.rc5.git6.fc15.x86_64,2.6.36-0.32.rc6.git2.fc15.x86_64.

I don't have any of those kernel versions here, but I'm guessing that
this test is triggering:

	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

It could be that e1000e is feeding in garbage, or it could be that
intel-iommu is screwed up.


It's a bit hard to tell what's happening because that BUG_ON was quite
poorly thought out.  It tests three different variables, doesn't tell
us their values and even though it _could_ cleanly recover and allow
the machine to continue to operate it simply whacks the box.

So we now have a pickle on our hands, because you use prebuilt kernels
and are probably not in a position to test patches.  


------------------------------------------------------------------------------
Beautiful is writing same markup. Internet Explorer 9 supports
standards for HTML5, CSS3, SVG 1.1,  ECMAScript5, and DOM L2 & L3.
Spend less time writing and  rewriting code and more time creating great
experiences on the web. Be a part of the beta today.
http://p.sf.net/sfu/beautyoftheweb
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* [PATCH net-next V3] net: percpu net_device refcount
From: Eric Dumazet @ 2010-10-11 20:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1286826685.3218.16.camel@edumazet-laptop>

Le lundi 11 octobre 2010 à 21:51 +0200, Eric Dumazet a écrit :
> Le lundi 11 octobre 2010 à 12:49 -0700, David Miller a écrit :
> 
> > Actually I have to revert, this breaks the infiniband drivers
> > which access netdev->refcnt directly.
> > 
> > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_alloc_pd':
> > drivers/infiniband/hw/nes/nes_verbs.c:786:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_verbs.c: In function 'nes_create_qp':
> > drivers/infiniband/hw/nes/nes_verbs.c:1418:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_disconnect':
> > drivers/infiniband/hw/nes/nes_cm.c:2703:2: error: 'struct net_device' has no member named 'refcnt'
> > drivers/infiniband/hw/nes/nes_cm.c: In function 'nes_accept':
> > drivers/infiniband/hw/nes/nes_cm.c:2793:2: error: 'struct net_device' has no member named 'refcnt'
> 
> Ah ok, I'll make a build test before submitting v3, sorry for the
> inconvenience.
> 

This was a bit long (allyesconfig), but eventually succeeded ...

Thanks !

[PATCH net-next V3] net: percpu net_device refcount

We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)

We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.

On x86, dev_hold(dev) code :

before
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
V3: export netdev_refcnt_read() for infiniband debugging

 drivers/infiniband/hw/nes/nes_cm.c    |    4 +-
 drivers/infiniband/hw/nes/nes_verbs.c |    4 +-
 include/linux/netdevice.h             |    7 ++--
 net/core/dev.c                        |   40 +++++++++++++++++++-----
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 61e0efd..6220d9d 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
 	nesibdev = nesvnic->nesibdev;
 
 	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	if (nesqp->active_conn) {
 
@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	atomic_inc(&cm_accepts);
 
 	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	/* allocate the ietf frame and space for private data */
 	nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 9046e66..546fc22 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
 
 	nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
 			nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
 			nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 	/* update the QP table */
 	nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
 	nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
-			atomic_read(&nesvnic->netdev->refcnt));
+			netdev_refcnt_read(nesvnic->netdev));
 
 	return &nesqp->ibqp;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3..14fbb04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
 	struct timer_list	watchdog_timer;
 
 	/* Number of references to this device */
-	atomic_t		refcnt ____cacheline_aligned_in_smp;
+	int __percpu		*pcpu_refcnt;
 
 	/* delayed register/unregister */
 	struct list_head	todo_list;
@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev)
 	unregister_netdevice_queue(dev, NULL);
 }
 
+extern int 		netdev_refcnt_read(const struct net_device *dev);
 extern void		free_netdev(struct net_device *dev);
 extern void		synchronize_net(void);
 extern int 		register_netdevice_notifier(struct notifier_block *nb);
@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
-	atomic_dec(&dev->refcnt);
+	irqsafe_cpu_dec(*dev->pcpu_refcnt);
 }
 
 /**
@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
-	atomic_inc(&dev->refcnt);
+	irqsafe_cpu_inc(*dev->pcpu_refcnt);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafa..04972a4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* initialize the ref count */
-	atomic_set(&dev->refcnt, 1);
-
 	/* NAPI wants this */
 	INIT_LIST_HEAD(&dev->napi_list);
 
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* Note : We dont allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' dont need to change
+	 * its refcount.
+	 */
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,16 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+int netdev_refcnt_read(const struct net_device *dev)
+{
+	int i, refcnt = 0;
+
+	for_each_possible_cpu(i)
+		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+	return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
 /*
  * netdev_wait_allrefs - wait until all references are gone.
  *
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev);
 static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
 
 	linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	refcnt = netdev_refcnt_read(dev);
+
+	while (refcnt != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		msleep(250);
 
+		refcnt = netdev_refcnt_read(dev);
+
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, refcnt);
 			warning_time = jiffies;
 		}
 	}
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(atomic_read(&dev->refcnt));
+		BUG_ON(netdev_refcnt_read(dev));
 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
-	if (dev_addr_init(dev))
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
 		goto free_tx;
 
+	if (dev_addr_init(dev))
+		goto free_pcpu;
+
 	dev_mc_init(dev);
 	dev_uc_init(dev);
 
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 free_tx:
 	kfree(tx);
+free_pcpu:
+	free_percpu(dev->pcpu_refcnt);
 free_p:
 	kfree(p);
 	return NULL;
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox