Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 2/5] sky2: add fake idle irq timer
From: Francois Romieu @ 2006-04-25 21:23 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Jeff Garzik, netdev
In-Reply-To: <20060425175951.444629000@localhost.localdomain>

Stephen Hemminger <shemminger@osdl.org> :
[...]
> --- sky2-2.6.17.orig/drivers/net/sky2.c	2006-04-25 10:48:47.000000000 -0700
> +++ sky2-2.6.17/drivers/net/sky2.c	2006-04-25 10:53:32.000000000 -0700
> @@ -2086,6 +2086,20 @@
>  	}
>  }
>  
> +/* If idle then force a fake soft NAPI poll once a second
> + * to work around cases where sharing an edge triggered interrupt.
> + */
> +static void sky2_idle(unsigned long arg)
> +{
> +	struct net_device *dev = (struct net_device *) arg;
> +
> +	local_irq_disable();
> +	if (__netif_rx_schedule_prep(dev))
> +		__netif_rx_schedule(dev);
> +	local_irq_enable();
> +}
> +
> +
>  static int sky2_poll(struct net_device *dev0, int *budget)
>  {
>  	struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw;
> @@ -2134,6 +2148,8 @@
>  		sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
>  	}
>  
> +	mod_timer(&hw->idle_timer, jiffies + HZ);
> +
>  	local_irq_disable();
>  	__netif_rx_complete(dev0);


Any objection to moving mod_timer() from sky2_poll() to sky2_idle()
so as to keep the poll() path unmodified?

-- 
Ueimor

^ permalink raw reply

* [PATCH 2/5] sky2: add fake idle irq timer
From: Stephen Hemminger @ 2006-04-25 17:58 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev
In-Reply-To: <20060425175849.372221000@localhost.localdomain>

[-- Attachment #1: sky2-idle-timer.patch --]
[-- Type: text/plain, Size: 1943 bytes --]

Add a fake NAPI schedule once a second. This is an attempt to work around
broken configurations with edge-triggered interrupts.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

--- sky2-2.6.17.orig/drivers/net/sky2.c	2006-04-25 10:48:47.000000000 -0700
+++ sky2-2.6.17/drivers/net/sky2.c	2006-04-25 10:53:32.000000000 -0700
@@ -2086,6 +2086,20 @@
 	}
 }
 
+/* If idle then force a fake soft NAPI poll once a second
+ * to work around cases where sharing an edge triggered interrupt.
+ */
+static void sky2_idle(unsigned long arg)
+{
+	struct net_device *dev = (struct net_device *) arg;
+
+	local_irq_disable();
+	if (__netif_rx_schedule_prep(dev))
+		__netif_rx_schedule(dev);
+	local_irq_enable();
+}
+
+
 static int sky2_poll(struct net_device *dev0, int *budget)
 {
 	struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw;
@@ -2134,6 +2148,8 @@
 		sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
 	}
 
+	mod_timer(&hw->idle_timer, jiffies + HZ);
+
 	local_irq_disable();
 	__netif_rx_complete(dev0);
 
@@ -3288,6 +3304,8 @@
 
 	sky2_write32(hw, B0_IMSK, Y2_IS_BASE);
 
+	setup_timer(&hw->idle_timer, sky2_idle, (unsigned long) dev);
+
 	pci_set_drvdata(pdev, hw);
 
 	return 0;
@@ -3323,13 +3341,15 @@
 	if (!hw)
 		return;
 
+	del_timer_sync(&hw->idle_timer);
+
+	sky2_write32(hw, B0_IMSK, 0);
 	dev0 = hw->dev[0];
 	dev1 = hw->dev[1];
 	if (dev1)
 		unregister_netdev(dev1);
 	unregister_netdev(dev0);
 
-	sky2_write32(hw, B0_IMSK, 0);
 	sky2_set_power_state(hw, PCI_D3hot);
 	sky2_write16(hw, B0_Y2LED, LED_STAT_OFF);
 	sky2_write8(hw, B0_CTST, CS_RST_SET);
--- sky2-2.6.17.orig/drivers/net/sky2.h	2006-04-25 10:48:42.000000000 -0700
+++ sky2-2.6.17/drivers/net/sky2.h	2006-04-25 10:51:33.000000000 -0700
@@ -1880,6 +1880,8 @@
 	struct sky2_status_le *st_le;
 	u32		     st_idx;
 	dma_addr_t   	     st_dma;
+
+	struct timer_list    idle_timer;
 	int		     msi_detected;
 	wait_queue_head_t    msi_wait;
 };

--


^ permalink raw reply

* [PATCH 3/5] sky2: use ALIGN() macro
From: Stephen Hemminger @ 2006-04-25 17:58 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev
In-Reply-To: <20060425175849.372221000@localhost.localdomain>

[-- Attachment #1: sky2-align.patch --]
[-- Type: text/plain, Size: 995 bytes --]

The ALIGN() macro in kernel.h does the same math that the
sky2 driver was using for padding.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

--- sky2-2.6.17.orig/drivers/net/sky2.c	2006-04-25 10:47:03.000000000 -0700
+++ sky2-2.6.17/drivers/net/sky2.c	2006-04-25 10:47:28.000000000 -0700
@@ -925,8 +925,7 @@
 	skb = alloc_skb(size + RX_SKB_ALIGN, gfp_mask);
 	if (likely(skb)) {
 		unsigned long p	= (unsigned long) skb->data;
-		skb_reserve(skb,
-			((p + RX_SKB_ALIGN - 1) & ~(RX_SKB_ALIGN - 1)) - p);
+		skb_reserve(skb, ALIGN(p, RX_SKB_ALIGN) - p);
 	}
 
 	return skb;
@@ -1686,13 +1685,12 @@
 }
 
 
-#define roundup(x, y)   ((((x)+((y)-1))/(y))*(y))
 /* Want receive buffer size to be multiple of 64 bits
  * and incl room for vlan and truncation
  */
 static inline unsigned sky2_buf_size(int mtu)
 {
-	return roundup(mtu + ETH_HLEN + VLAN_HLEN, 8) + 8;
+	return ALIGN(mtu + ETH_HLEN + VLAN_HLEN, 8) + 8;
 }
 
 static int sky2_change_mtu(struct net_device *dev, int new_mtu)

--


^ permalink raw reply

* [PATCH 4/5] sky2: reset function can be devinit
From: Stephen Hemminger @ 2006-04-25 17:58 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev
In-Reply-To: <20060425175849.372221000@localhost.localdomain>

[-- Attachment #1: sky2-devinit.patch --]
[-- Type: text/plain, Size: 488 bytes --]

The sky2_reset function is only called from sky2_probe.
Maybe the compiler was smart enough to figure this out already.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>


--- sky2-2.6.17.orig/drivers/net/sky2.c	2006-04-25 10:53:37.000000000 -0700
+++ sky2-2.6.17/drivers/net/sky2.c	2006-04-25 10:54:57.000000000 -0700
@@ -2219,7 +2219,7 @@
 }
 
 
-static int sky2_reset(struct sky2_hw *hw)
+static int __devinit sky2_reset(struct sky2_hw *hw)
 {
 	u16 status;
 	u8 t8, pmd_type;

--


^ permalink raw reply

* [PATCH 5/5] sky2: version 1.2
From: Stephen Hemminger @ 2006-04-25 17:58 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev
In-Reply-To: <20060425175849.372221000@localhost.localdomain>

[-- Attachment #1: sky2-1.2.patch --]
[-- Type: text/plain, Size: 387 bytes --]

Update to version 1.2

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>


--- sky2-2.6.17.orig/drivers/net/sky2.c	2006-04-25 10:54:57.000000000 -0700
+++ sky2-2.6.17/drivers/net/sky2.c	2006-04-25 10:55:51.000000000 -0700
@@ -51,7 +51,7 @@
 #include "sky2.h"
 
 #define DRV_NAME		"sky2"
-#define DRV_VERSION		"1.1"
+#define DRV_VERSION		"1.2"
 #define PFX			DRV_NAME " "
 
 /*

--


^ permalink raw reply

* [PATCH 1/5] sky2: reschedule if irq still pending
From: Stephen Hemminger @ 2006-04-25 17:58 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev
In-Reply-To: <20060425175849.372221000@localhost.localdomain>

[-- Attachment #1: sky2-edge.patch --]
[-- Type: text/plain, Size: 2676 bytes --]

This is a workaround for the case of edge-triggered IRQs. Several users
seem to have broken configurations sharing edge-triggered IRQs. To avoid
losing IRQs, reschedule if more work arrives.

The changes to netdevice.h are to extract the part that puts device
back in list into separate inline.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>


--- sky2-2.6.17.orig/drivers/net/sky2.c	2006-04-25 10:48:44.000000000 -0700
+++ sky2-2.6.17/drivers/net/sky2.c	2006-04-25 10:48:47.000000000 -0700
@@ -2093,6 +2093,7 @@
 	int work_done = 0;
 	u32 status = sky2_read32(hw, B0_Y2_SP_EISR);
 
+ restart_poll:
 	if (unlikely(status & ~Y2_IS_STAT_BMU)) {
 		if (status & Y2_IS_HW_ERR)
 			sky2_hw_intr(hw);
@@ -2123,7 +2124,7 @@
 	}
 
 	if (status & Y2_IS_STAT_BMU) {
-		work_done = sky2_status_intr(hw, work_limit);
+		work_done += sky2_status_intr(hw, work_limit - work_done);
 		*budget -= work_done;
 		dev0->quota -= work_done;
 
@@ -2133,9 +2134,22 @@
 		sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
 	}
 
-	netif_rx_complete(dev0);
+	local_irq_disable();
+	__netif_rx_complete(dev0);
 
 	status = sky2_read32(hw, B0_Y2_SP_LISR);
+
+	if (unlikely(status)) {
+		/* More work pending, try and keep going */
+		if (__netif_rx_schedule_prep(dev0)) {
+			__netif_rx_reschedule(dev0, work_done);
+			status = sky2_read32(hw, B0_Y2_SP_EISR);
+			local_irq_enable();
+			goto restart_poll;
+		}
+	}
+
+	local_irq_enable();
 	return 0;
 }
 
@@ -2153,8 +2167,6 @@
 	prefetch(&hw->st_le[hw->st_idx]);
 	if (likely(__netif_rx_schedule_prep(dev0)))
 		__netif_rx_schedule(dev0);
-	else
-		printk(KERN_DEBUG PFX "irq race detected\n");
 
 	return IRQ_HANDLED;
 }
--- sky2-2.6.17.orig/include/linux/netdevice.h	2006-04-25 10:48:44.000000000 -0700
+++ sky2-2.6.17/include/linux/netdevice.h	2006-04-25 10:48:47.000000000 -0700
@@ -829,19 +829,21 @@
 		__netif_rx_schedule(dev);
 }
 
-/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete().
- * Do not inline this?
- */
+
+static inline void  __netif_rx_reschedule(struct net_device *dev, int undo)
+{
+	dev->quota += undo;
+	list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
+/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete(). */
 static inline int netif_rx_reschedule(struct net_device *dev, int undo)
 {
 	if (netif_rx_schedule_prep(dev)) {
 		unsigned long flags;
-
-		dev->quota += undo;
-
 		local_irq_save(flags);
-		list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
-		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+		__netif_rx_reschedule(dev, undo);
 		local_irq_restore(flags);
 		return 1;
 	}

--


^ permalink raw reply

* [PATCH 0/5] sky2: version 1.2
From: Stephen Hemminger @ 2006-04-25 17:58 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: netdev

Update to sky2 driver. Mostly fixes to try and handle users
stuck with edge-triggered interrupts. Also, some minor cleanups.

Patches apply onto 1.1 version in 2.6.17-rc2

--

^ permalink raw reply

* Re: is it a backwards compatability catch-22?
From: Michal Schmidt @ 2006-04-25 19:10 UTC (permalink / raw)
  To: Rick Jones; +Cc: Linux Network Development list
In-Reply-To: <444D6396.4010004@hp.com>

Rick Jones wrote:
> lumber:~# cat /etc/udev/rules.d/010_netinterfaces.rules
> KERNEL="eth*",SYSFS{address}=="00:30:6e:4c:27:3c", NAME="eth0"
> KERNEL="eth*",SYSFS{address}=="00:30:6e:4c:27:3d", NAME="eth1"
> KERNEL="eth*",SYSFS{address}=="00:12:79:9e:0e:d2", NAME="eth2"
> KERNEL="eth*",SYSFS{address}=="00:12:79:9e:0e:d3", NAME="eth3"
> KERNEL="eth*",SYSFS{address}=="00:0c:fc:00:08:71", NAME="eth4"
        ^^^
BTW, you should use "==" here instead of "=". Otherwise the rules will 
break with newer udev versions which behave strictly in this regard.

Michal

^ permalink raw reply

* Re: is it a backwards compatability catch-22?
From: Rick Jones @ 2006-04-25 18:34 UTC (permalink / raw)
  To: Jesse Brandeburg; +Cc: Stephen Hemminger, Linux Network Development list
In-Reply-To: <4807377b0604250909m34f6030ar19735b3343884399@mail.gmail.com>

Jesse Brandeburg wrote:
> On 4/24/06, Rick Jones <rick.jones2@hp.com> wrote:
> 
>>>The udev stuff runs after the device has already chosen it's default name.
>>>It has to, it's part of the hotplug infrastructure, and we don't want
>>>to depend on usermode to define the name.  Just choose some other
>>>convention "eth_0"  or something like that.
>>
>>Is that because adding another NIC at a later time might cause it to
>>grab ethN out from under what I'm trying to do with udev?
> 
> 
> From what I read its likely to be because there may already be a
> device named "eth1" due to default naming when you are trying to
> rename a device (say eth0) to eth1.
> 
> this is all because Debian now has async init, right?

Beats me. I got the impression that udev things were happening "early 
enough" in my case that I didn't run into the issue.  still, init and 
device names are presently a maze of twisty passages to me. someone else 
also suggested not using the ethN stuff - or at least not starting at 0, 
but start them at N where N is reasonably large.  i decided to call them 
lan0, lan1, etc just to be perverse and see what breaks.

> BTW, since the letters in udev are all hex, it shouldn't matter
> whether they are upper or lower case, IMO

that would be my opinion as well, certainly that was my expectation - 
that I could simply "cut and paste" MAC addresses from the likes of 
ifconfig output

alas, it seems that if I leave them upper case, the renaming does not 
happen.  i am _guessing_ the comparison is a simple string compare. and 
it doesn't _really_ know that what is being compared is a MAC address?

rick jones

^ permalink raw reply

* [PATCH] bridge: allow full size vlan packets (repost)
From: Stephen Hemminger @ 2006-04-25 18:08 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

Need to allow for VLAN header when bridging.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>


--- bridge.orig/net/bridge/br_forward.c	2006-04-10 16:17:51.000000000 -0700
+++ bridge/net/bridge/br_forward.c	2006-04-19 13:50:42.000000000 -0700
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
 #include "br_private.h"
 
@@ -29,10 +30,15 @@
 	return 1;
 }
 
+static inline unsigned packet_length(const struct sk_buff *skb)
+{
+	return skb->len - (skb->protocol == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+}
+
 int br_dev_queue_push_xmit(struct sk_buff *skb)
 {
 	/* drop mtu oversized packets except tso */
-	if (skb->len > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
+	if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
 		kfree_skb(skb);
 	else {
 #ifdef CONFIG_BRIDGE_NETFILTER

^ permalink raw reply

* Re: [PATCH]: suspicious unlikely usage in tcp_transmit_skb()
From: Stephen Hemminger @ 2006-04-25 17:01 UTC (permalink / raw)
  To: Hua Zhong; +Cc: davem, netdev
In-Reply-To: <444D5E73.7020803@gmail.com>

On Mon, 24 Apr 2006 16:25:39 -0700
Hua Zhong <hzhong@gmail.com> wrote:

> Hi,
> 
> I am developing a profiling tool to check if likely/unlikely usages are wise. I find that the following one is always a miss:
> 
>       # Hit    # miss Function:Filename@Line
> !         0     50505 tcp_transmit_skb():net/ipv4/tcp_output.c@468
> 
> There is a chance that my tool is buggy, but I just want to confirm with you whether this does look suspicious and what your opinion is.
> 
> Signed-off-by: Hua Zhong <hzhong@gmail.com>
> 
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index a28ae59..743016b 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -465,7 +465,7 @@ #define SYSCTL_FLAG_SACK    0x4
>         TCP_INC_STATS(TCP_MIB_OUTSEGS);
>  
>         err = icsk->icsk_af_ops->queue_xmit(skb, 0);
> -       if (unlikely(err <= 0))
> +       if (likely(err <= 0))
>                 return err;
>  
>         tcp_enter_cwr(sk);

How about just taking off the likely/unlikely in this case.

^ permalink raw reply

* Re: skb->truesize assertion checking for TCP
From: Jesse Brandeburg @ 2006-04-25 16:49 UTC (permalink / raw)
  To: David S. Miller; +Cc: herbert, netdev
In-Reply-To: <20060419.231703.35841080.davem@davemloft.net>

On 4/19/06, David S. Miller <davem@davemloft.net> wrote:
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Thu, 20 Apr 2006 15:04:06 +1000
>
> > On Wed, Apr 19, 2006 at 09:55:13PM -0700, David S. Miller wrote:
> > > +static inline void skb_truesize_check(struct sk_buff *skb)
> > > +{
> > > +   if (unlikely((int)skb->truesize < sizeof(struct sk_buff)))
> > > +           skb_truesize_bug(skb);
> > > +}
> >
> > I think we can go for the stronger test:
> >
> > skb->truesize < sizeof(struct sk_buff) + skb->len
>
> Agreed, let me see if that triggers on my machine before
> I commit this :-)

Um, I get a log full of these now with the 7.0.33 driver in the
kernel.  BTW, it seems like it is missing a WARN_ON or
printk(__function__) - or whatever prints the function name of a
caller in the debug output.

Apr 24 15:48:36 lindenhurst-2 kernel: e1000: eth1:
e1000_watchdog_task: NIC Link is Up 100 Mbps Half Duplex
Apr 24 15:48:36 lindenhurst-2 kernel: e1000: eth1:
e1000_watchdog_task: 10/100 speed: disabling TSO
Apr 24 15:49:21 lindenhurst-2 kernel: SKB BUG: Invalid truesize (616)
len=1448, sizeof(sk_buff)=232
Apr 24 15:49:21 lindenhurst-2 last message repeated 13 times
Apr 24 15:49:21 lindenhurst-2 kernel: SKB BUG: Invalid truesize (616)
len=1408, sizeof(sk_buff)=232
Apr 24 15:49:21 lindenhurst-2 kernel: SKB BUG: Invalid truesize (616)
len=1448, sizeof(sk_buff)=232
Apr 24 15:49:21 lindenhurst-2 last message repeated 15 times
Apr 24 15:49:21 lindenhurst-2 kernel: SKB BUG: Invalid truesize (616)
len=1408, sizeof(sk_buff)=232
Apr 24 15:49:21 lindenhurst-2 kernel: SKB BUG: Invalid truesize (616)
len=1448, sizeof(sk_buff)=232
Apr 24 15:49:21 lindenhurst-2 last message repeated 97 times

I have the latest version of net/core/skbuff.c in git.

I've verified that the below patch fixes the message appearing but I
still think the message could be refined a little bit.  We're working
on separate patches for this for 17-rc and 16 stable.

Jesse

PS this is just for reference, this patch is mangled due to cut/paste

 drivers/net/e1000/e1000_main.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index add8dc4..c99e878 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -3768,6 +3768,7 @@ e1000_clean_rx_irq_ps(struct e1000_adapt
                        ps_page->ps_page[j] = NULL;
                        skb->len += length;
                        skb->data_len += length;
+                       skb->truesize += length;
                }

 copydone:

^ permalink raw reply related

* Fw: [Bugme-new] [Bug 6438] New: CISCO AIRONET 340 SERIES oops under PPC
From: Andrew Morton @ 2006-04-25 16:22 UTC (permalink / raw)
  To: Dominik Brodowski, John W. Linville
  Cc: netdev, bugme-daemon@kernel-bugs.osdl.org



Begin forwarded message:

Date: Tue, 25 Apr 2006 01:57:50 -0700
From: bugme-daemon@bugzilla.kernel.org
To: bugme-new@lists.osdl.org
Subject: [Bugme-new] [Bug 6438] New: CISCO AIRONET 340 SERIES oops under PPC


http://bugzilla.kernel.org/show_bug.cgi?id=6438

           Summary: CISCO AIRONET 340 SERIES oops under PPC
    Kernel Version: all under PPC
            Status: NEW
          Severity: blocking
             Owner: drivers_network-wireless@kernel-bugs.osdl.org
         Submitter: paszczus@gmail.com


Most recent kernel where this bug did not occur:
Distribution: PLD Linux
Hardware Environment: Apple PowerBook G3 with CISCO AIRONET 340 SERIES
Software Environment: gcc 3.3.x and 3.4.x
Problem Description:
Kernel oops while trying to load module yenta_socket while the Airo card is inside
the PCMCIA slot. The problem is probably only on ppc, because some other guy uses that
card in an x86 notebook and it works for him.
Steps to reproduce:
run pcmcia-cs service or modprobe yenta_socket. Oops is like that: 

cs: memory probe 0xfd000000-0xfdffffff:Machine check in kernel mode.
Caused by (from SRR1=49030): Transfer error ack signal
Oops: machine check, sig: 7 [#1]
NIP: CD02ACD0 LR: CD02ACA4 SP: CB3ADC70 REGS: cb3adbc0 TRAP: 0200    Not tainted
MSR: 00049030 EE: 1 PR: 0 FP: 0 ME: 1 IR/DR: 11
TASK = cb272c70[1636] 'pccardd' THREAD: cb3ac000
Last syscall: -1 
GPR00: 000000FF CB3ADC70 CB272C70 CD8AA000 00000014 00000070 00000002 CB3ADCEA 
GPR08: C4822B60 CD8AB000 00000040 00001000 22002248 1001F6AC 00000000 00000000 
GPR16: 00000000 00000000 00000000 10017B58 1000249C CBFB9E80 01000000 00000000 
GPR24: CB3ADCEA 00000021 00000002 CBD9582C 00000000 00000000 00000002 CB3ADCEA 
NIP [cd02acd0] pcmcia_read_cis_mem+0x184/0xffff64b4 [pcmcia_core]
LR [cd02aca4] pcmcia_read_cis_mem+0x158/0xffff64b4 [pcmcia_core]
Call trace:
 [cd02afe0] read_cis_cache+0x144/0xffff6164 [pcmcia_core]
 [cd02b648] pccard_get_next_tuple+0x7c/0xffff5a34 [pcmcia_core]
 [cd02b358] pccard_get_first_tuple+0x94/0xffff5d3c [pcmcia_core]
 [cd02cf98] pccard_validate_cis+0x94/0xffff40fc [pcmcia_core]
 [cd8af688] readable+0x88/0xff7cba00 [rsrc_nonstatic]
 [cd8af860] cis_readable+0xc8/0xff7cb868 [rsrc_nonstatic]
 [cd8afb30] do_mem_probe+0x1e0/0xff7cb6b0 [rsrc_nonstatic]
 [cd8afb78] inv_probe+0x30/0xff7cb4b8 [rsrc_nonstatic]
 [cd8afd20] validate_mem+0x128/0xff7cb408 [rsrc_nonstatic]
 [cd8afdf4] pcmcia_nonstatic_validate_mem+0xbc/0xff7cb2c8 [rsrc_nonstatic]
 [cd02d378] pcmcia_validate_mem+0x34/0xffff3cbc [pcmcia_core]
 [cda2ea44] pcmcia_card_add+0x28/0xfffb95e4 [pcmcia]
 [cda2f5a4] ds_event+0x80/0xfffb8adc [pcmcia]
 [cd0299d8] send_event+0x70/0xffff7698 [pcmcia_core]
 [cd029e6c] socket_insert+0xa8/0xffff723c [pcmcia_core]

------- You are receiving this mail because: -------
You are on the CC list for the bug, or are watching someone who is.

^ permalink raw reply

* [PATCH] ibmveth change buffer pools dynamically
From: Santiago Leon @ 2006-04-25 16:19 UTC (permalink / raw)
  To: jgarzik, netdev

[-- Attachment #1: Type: text/plain, Size: 1166 bytes --]

This patch provides a sysfs interface to change some properties of the
ibmveth buffer pools (size of the buffers, number of buffers per pool,
and whether a pool is active).  Ethernet drivers use ethtool to provide
this type of functionality.  However, the buffers in the ibmveth driver
can have an arbitrary size (not only regular, mini, and jumbo which are
the only sizes that ethtool can change), and also ibmveth can have an
arbitrary number of buffer pools.

Under heavy load we have seen dropped packets which obviously kills TCP
performance.  We have created several fixes that mitigate this issue,
but we definitely need a way of changing the number of buffers for an
adapter dynamically.  Also, changing the size of the buffers allows
users to change the MTU to something big (bigger than a jumbo frame)
greatly improving performance on partition to partition transfers.

The patch creates directories pool1...pool4 in the device directory in
sysfs, each with files: num, size, and active (which default to the
values in the mainline version).

Comments and suggestions are welcome...
-- 
Santiago A. Leon
Power Linux Development
IBM Linux Technology Center

[-- Attachment #2: ibmveth_flexbuff_ml.patch --]
[-- Type: text/x-patch, Size: 10852 bytes --]

--- a/drivers/net/ibmveth.h	2006-01-02 21:21:10.000000000 -0600
+++ b/drivers/net/ibmveth.h	2006-04-18 10:20:00.102520432 -0500
@@ -75,10 +75,13 @@
 
 #define IbmVethNumBufferPools 5
 #define IBMVETH_BUFF_OH 22 /* Overhead: 14 ethernet header + 8 opaque handle */
+#define IBMVETH_MAX_MTU 68
+#define IBMVETH_MAX_POOL_COUNT 4096
+#define IBMVETH_MAX_BUF_SIZE (1024 * 128)
 
-/* pool_size should be sorted */
 static int pool_size[] = { 512, 1024 * 2, 1024 * 16, 1024 * 32, 1024 * 64 };
 static int pool_count[] = { 256, 768, 256, 256, 256 };
+static int pool_active[] = { 1, 1, 0, 0, 0};
 
 #define IBM_VETH_INVALID_MAP ((u16)0xffff)
 
@@ -94,6 +97,7 @@ struct ibmveth_buff_pool {
     dma_addr_t *dma_addr;
     struct sk_buff **skbuff;
     int active;
+    struct kobject kobj;
 };
 
 struct ibmveth_rx_q {
@@ -118,6 +122,7 @@ struct ibmveth_adapter {
     dma_addr_t filter_list_dma;
     struct ibmveth_buff_pool rx_buff_pool[IbmVethNumBufferPools];
     struct ibmveth_rx_q rx_queue;
+    int pool_config;
 
     /* adapter specific stats */
     u64 replenish_task_cycles;
--- a/drivers/net/ibmveth.c	2006-01-02 21:21:10.000000000 -0600
+++ b/drivers/net/ibmveth.c	2006-04-18 10:19:55.624532480 -0500
@@ -96,6 +96,7 @@ static void ibmveth_proc_register_adapte
 static void ibmveth_proc_unregister_adapter(struct ibmveth_adapter *adapter);
 static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
 static inline void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter);
+static struct kobj_type ktype_veth_pool;
 
 #ifdef CONFIG_PROC_FS
 #define IBMVETH_PROC_DIR "net/ibmveth"
@@ -133,12 +134,13 @@ static inline int ibmveth_rxq_frame_leng
 }
 
 /* setup the initial settings for a buffer pool */
-static void ibmveth_init_buffer_pool(struct ibmveth_buff_pool *pool, u32 pool_index, u32 pool_size, u32 buff_size)
+static void ibmveth_init_buffer_pool(struct ibmveth_buff_pool *pool, u32 pool_index, u32 pool_size, u32 buff_size, u32 pool_active)
 {
 	pool->size = pool_size;
 	pool->index = pool_index;
 	pool->buff_size = buff_size;
 	pool->threshold = pool_size / 2;
+	pool->active = pool_active;
 }
 
 /* allocate and setup an buffer pool - called during open */
@@ -180,7 +182,6 @@ static int ibmveth_alloc_buffer_pool(str
 	atomic_set(&pool->available, 0);
 	pool->producer_index = 0;
 	pool->consumer_index = 0;
-	pool->active = 0;
 
 	return 0;
 }
@@ -301,7 +302,6 @@ static void ibmveth_free_buffer_pool(str
 		kfree(pool->skbuff);
 		pool->skbuff = NULL;
 	}
-	pool->active = 0;
 }
 
 /* remove a buffer from a pool */
@@ -433,7 +433,9 @@ static void ibmveth_cleanup(struct ibmve
 	}
 
 	for(i = 0; i<IbmVethNumBufferPools; i++)
-		ibmveth_free_buffer_pool(adapter, &adapter->rx_buff_pool[i]);
+		if (adapter->rx_buff_pool[i].active)
+			ibmveth_free_buffer_pool(adapter, 
+						 &adapter->rx_buff_pool[i]);
 }
 
 static int ibmveth_open(struct net_device *netdev)
@@ -489,9 +491,6 @@ static int ibmveth_open(struct net_devic
 	adapter->rx_queue.num_slots = rxq_entries;
 	adapter->rx_queue.toggle = 1;
 
-	/* call change_mtu to init the buffer pools based in initial mtu */
-	ibmveth_change_mtu(netdev, netdev->mtu);
-
 	memcpy(&mac_address, netdev->dev_addr, netdev->addr_len);
 	mac_address = mac_address >> 16;
 
@@ -522,6 +521,17 @@ static int ibmveth_open(struct net_devic
 		return -ENONET; 
 	}
 
+	for(i = 0; i<IbmVethNumBufferPools; i++) {
+		if(!adapter->rx_buff_pool[i].active)
+			continue;
+		if (ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[i])) {
+			ibmveth_error_printk("unable to alloc pool\n");
+			adapter->rx_buff_pool[i].active = 0;
+			ibmveth_cleanup(adapter);
+			return -ENOMEM ;
+		}
+	}
+
 	ibmveth_debug_printk("registering irq 0x%x\n", netdev->irq);
 	if((rc = request_irq(netdev->irq, &ibmveth_interrupt, 0, netdev->name, netdev)) != 0) {
 		ibmveth_error_printk("unable to request irq 0x%x, rc %d\n", netdev->irq, rc);
@@ -550,7 +560,8 @@ static int ibmveth_close(struct net_devi
     
 	ibmveth_debug_printk("close starting\n");
 
-	netif_stop_queue(netdev);
+	if (!adapter->pool_config)
+		netif_stop_queue(netdev);
 
 	free_irq(netdev->irq, netdev);
 
@@ -876,46 +887,22 @@ static void ibmveth_set_multicast_list(s
 static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct ibmveth_adapter *adapter = dev->priv;
+	int new_mtu_oh = new_mtu + IBMVETH_BUFF_OH;
 	int i;
-	int prev_smaller = 1;
 
-	if ((new_mtu < 68) || 
-	    (new_mtu > (pool_size[IbmVethNumBufferPools-1]) - IBMVETH_BUFF_OH))
+	if (new_mtu < IBMVETH_MAX_MTU)
 		return -EINVAL;
 
+	/* Look for an active buffer pool that can hold the new MTU */
 	for(i = 0; i<IbmVethNumBufferPools; i++) {
-		int activate = 0;
-		if (new_mtu > (pool_size[i]  - IBMVETH_BUFF_OH)) { 
-			activate = 1;
-			prev_smaller= 1;
-		} else {
-			if (prev_smaller)
-				activate = 1;
-			prev_smaller= 0;
-		}
-
-		if (activate && !adapter->rx_buff_pool[i].active) {
-			struct ibmveth_buff_pool *pool = 
-						&adapter->rx_buff_pool[i];
-			if(ibmveth_alloc_buffer_pool(pool)) {
-				ibmveth_error_printk("unable to alloc pool\n");
-				return -ENOMEM;
-			}
-			adapter->rx_buff_pool[i].active = 1;
-		} else if (!activate && adapter->rx_buff_pool[i].active) {
-			adapter->rx_buff_pool[i].active = 0;
-			h_free_logical_lan_buffer(adapter->vdev->unit_address,
-					  (u64)pool_size[i]);
+		if (!adapter->rx_buff_pool[i].active)
+			continue;
+		if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) {
+			dev->mtu = new_mtu;
+			return 0;
 		}
-
 	}
-
-	/* kick the interrupt handler so that the new buffer pools get
-	   replenished or deallocated */
-	ibmveth_interrupt(dev->irq, dev, NULL);
-
-	dev->mtu = new_mtu;
-	return 0;	
+	return -EINVAL;
 }
 
 static int __devinit ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
@@ -960,6 +947,7 @@ static int __devinit ibmveth_probe(struc
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
 	adapter->mcastFilterSize= *mcastFilterSize_p;
+	adapter->pool_config = 0;
 	
 	/* 	Some older boxes running PHYP non-natively have an OF that
 		returns a 8-byte local-mac-address field (and the first 
@@ -994,9 +982,16 @@ static int __devinit ibmveth_probe(struc
 
 	memcpy(&netdev->dev_addr, &adapter->mac_addr, netdev->addr_len);
 
-	for(i = 0; i<IbmVethNumBufferPools; i++)
+	for(i = 0; i<IbmVethNumBufferPools; i++) {
+		struct kobject *kobj = &adapter->rx_buff_pool[i].kobj;
 		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i, 
-					 pool_count[i], pool_size[i]);
+					 pool_count[i], pool_size[i], 
+					 pool_active[i]);
+		kobj->parent = &dev->dev.kobj;
+		sprintf(kobj->name, "pool%d", i);
+		kobj->ktype = &ktype_veth_pool;
+		kobject_register(kobj);
+	}
 
 	ibmveth_debug_printk("adapter @ 0x%p\n", adapter);
 
@@ -1025,6 +1020,10 @@ static int __devexit ibmveth_remove(stru
 {
 	struct net_device *netdev = dev->dev.driver_data;
 	struct ibmveth_adapter *adapter = netdev->priv;
+	int i;
+
+	for(i = 0; i<IbmVethNumBufferPools; i++)
+		kobject_unregister(&adapter->rx_buff_pool[i].kobj);
 
 	unregister_netdev(netdev);
 
@@ -1169,6 +1168,132 @@ static void ibmveth_proc_unregister_driv
 }
 #endif /* CONFIG_PROC_FS */
 
+static struct attribute veth_active_attr;
+static struct attribute veth_num_attr;
+static struct attribute veth_size_attr;
+
+static ssize_t veth_pool_show(struct kobject * kobj,
+                              struct attribute * attr, char * buf)
+{
+	struct ibmveth_buff_pool *pool = container_of(kobj, 
+						      struct ibmveth_buff_pool,
+						      kobj);
+
+	if (attr == &veth_active_attr)
+		return sprintf(buf, "%d\n", pool->active);
+	else if (attr == &veth_num_attr)
+		return sprintf(buf, "%d\n", pool->size);
+	else if (attr == &veth_size_attr)
+		return sprintf(buf, "%d\n", pool->buff_size);
+	return 0;
+}
+
+static ssize_t veth_pool_store(struct kobject * kobj, struct attribute * attr,
+const char * buf, size_t count)
+{
+	struct ibmveth_buff_pool *pool = container_of(kobj, 
+						      struct ibmveth_buff_pool,
+						      kobj);
+	struct net_device *netdev = 
+	    container_of(kobj->parent, struct device, kobj)->driver_data;
+	struct ibmveth_adapter *adapter = netdev->priv;
+	long value = simple_strtol(buf, NULL, 10);
+	long rc;
+
+	if (attr == &veth_active_attr) {
+		if (value && !pool->active) {
+			if(ibmveth_alloc_buffer_pool(pool)) {
+                                ibmveth_error_printk("unable to alloc pool\n");
+                                return -ENOMEM;
+                        }
+			pool->active = 1;
+			adapter->pool_config = 1;
+			ibmveth_close(netdev);
+			adapter->pool_config = 0;
+			if ((rc = ibmveth_open(netdev)))
+				return rc;
+		} else if (!value && pool->active) {
+			int mtu = netdev->mtu + IBMVETH_BUFF_OH;
+			int i;
+			/* Make sure there is a buffer pool with buffers that
+			   can hold a packet of the size of the MTU */
+			for(i = 0; i<IbmVethNumBufferPools; i++) {
+				if (pool == &adapter->rx_buff_pool[i])
+					continue;
+				if (!adapter->rx_buff_pool[i].active)
+					continue;
+				if (mtu < adapter->rx_buff_pool[i].buff_size) {
+					pool->active = 0;
+					h_free_logical_lan_buffer(adapter->
+								  vdev->
+								  unit_address,
+								  pool->
+								  buff_size);
+				}
+			}
+			if (pool->active) {
+				ibmveth_error_printk("no active pool >= MTU\n");
+				return -EPERM;
+			}
+		}
+	} else if (attr == &veth_num_attr) {
+		if (value <= 0 || value > IBMVETH_MAX_POOL_COUNT)
+			return -EINVAL;
+		else {
+			adapter->pool_config = 1;
+			ibmveth_close(netdev);
+			adapter->pool_config = 0;
+			pool->size = value;
+			if ((rc = ibmveth_open(netdev)))
+				return rc;
+		}
+	} else if (attr == &veth_size_attr) {
+		if (value <= IBMVETH_BUFF_OH || value > IBMVETH_MAX_BUF_SIZE)
+			return -EINVAL;
+		else {
+			adapter->pool_config = 1;
+			ibmveth_close(netdev);
+			adapter->pool_config = 0;
+			pool->buff_size = value;
+			if ((rc = ibmveth_open(netdev)))
+				return rc;
+		}
+	}
+
+	/* kick the interrupt handler to allocate/deallocate pools */
+	ibmveth_interrupt(netdev->irq, netdev, NULL);
+	return count;
+}
+
+
+#define ATTR(_name, _mode)      \
+        struct attribute veth_##_name##_attr = {               \
+        .name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE \
+        };
+
+static ATTR(active, 0644);
+static ATTR(num, 0644);
+static ATTR(size, 0644);
+
+static struct attribute * veth_pool_attrs[] = {
+	&veth_active_attr,
+	&veth_num_attr,
+	&veth_size_attr,
+	NULL,
+};
+
+static struct sysfs_ops veth_pool_ops = {
+	.show   = veth_pool_show,
+	.store  = veth_pool_store,
+};
+
+static struct kobj_type ktype_veth_pool = {
+	.release        = NULL,
+	.sysfs_ops      = &veth_pool_ops,
+	.default_attrs  = veth_pool_attrs,
+};
+
+
 static struct vio_device_id ibmveth_device_table[] __devinitdata= {
 	{ "network", "IBM,l-lan"},
 	{ "", "" }

^ permalink raw reply

* Re: is it a backwards compatability catch-22?
From: Jesse Brandeburg @ 2006-04-25 16:09 UTC (permalink / raw)
  To: Rick Jones; +Cc: Stephen Hemminger, Linux Network Development list
In-Reply-To: <444D6F78.9080309@hp.com>

On 4/24/06, Rick Jones <rick.jones2@hp.com> wrote:
>
> > The udev stuff runs after the device has already chosen its default name.
> > It has to, it's part of the hotplug infrastructure, and we don't want
> > to depend on usermode to define the name.  Just choose some other
> > convention "eth_0"  or something like that.
>
> Is that because adding another NIC at a later time might cause it to
> grab ethN out from under what I'm trying to do with udev?

From what I read, it's likely to be because there may already be a
device named "eth1" due to default naming when you are trying to
rename a device (say eth0) to eth1.

this is all because Debian now has async init, right?

BTW, since the letters in udev are all hex, it shouldn't matter
whether they are upper or lower case, IMO

Jesse

^ permalink raw reply

* Re: determine outgoing interface (eth0,eth1) for a packet according to the dest IP
From: Andi Kleen @ 2006-04-25 14:48 UTC (permalink / raw)
  To: John Que; +Cc: netdev
In-Reply-To: <ada605fb0604250744w73c7766as1d3ceb0d62511764@mail.gmail.com>

On Tuesday 25 April 2006 16:44, John Que wrote:
> Thanks a lot !
> 
>   I had tried the sending RTM_GETROUTE message using a NETLINK_ROUTE
> 	socket in a User Space program and it went OK.
> 	
> 	It gave a correct routing struct which I could parse.
> 	In fact it gave the routing table.
> 	But in sending that message I did not specify a certain
> 	dest IP.
> 	Consider the following simple scenario: I have 2 gateways
> 	(one on eth0,one on eth1), and I am sending
> 	a packet to some dest IP ; I want to know according to
> 	that ip on which interface (or gw) it will be out
> 	
> 	But where do I specify that certain dest IP ?

You add RTA_SRC and RTA_DST attributes to the query.

-Andi

^ permalink raw reply

* Re: determine outgoing interface (eth0,eth1) for a packet according to the dest IP
From: John Que @ 2006-04-25 14:44 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev
In-Reply-To: <200604250943.49422.ak@suse.de>

Thanks a lot !

  I had tried the sending RTM_GETROUTE message using a NETLINK_ROUTE
	socket in a User Space program and it went OK.
	
	It gave a correct routing struct which I could parse.
	In fact it gave the routing table.
	But in sending that message I did not specify a certain
	dest IP.
	Consider the following simple scenario: I have 2 gateways
	(one on eth0,one on eth1), and I am sending
	a packet to some dest IP ; I want to know according to
	that ip on which interface (or gw) it will be out
	
	But where do I specify that certain dest IP ?
Regards,
John
On 4/25/06, Andi Kleen <ak@suse.de> wrote:
> On Tuesday 25 April 2006 09:31, John Que wrote:
> > Hello,
> > What is the right way to determine on which interface card
> > (eth0 or eth1) will a packet be sent (according to the dest IP)?
>
> You can send a rtnetlink RTM_GETROUTE message to ask the kernel.
> Result is the interface index in RTA_OIF, which can be converted
> into a name.
>
> -Andi
>
>

^ permalink raw reply

* Re: tune back idle cwnd closing?
From: John Heffner @ 2006-04-25 14:27 UTC (permalink / raw)
  To: Zach Brown; +Cc: netdev
In-Reply-To: <44493980.1040708@oracle.com>

Zach Brown wrote:
> My apologies if this is a FAQ, I couldn't find it in the archives.
> 
> We have some dudes who are syncing large amounts of data across a
> dedicated long fat pipe at somewhat irregular intervals that are, sadly,
> longer than the rto.  They feel the pain of having to reopen the window
> between transmissions.
> 
> Is there room for a compromise tunable that would be less aggressive
> about closing cwnd during idle periods but which wouldn't violate the
> spirit of 2861?  No one wants broken TCP here.
> 
> They mention that Solaris has the tcp_slow_start_after_idle tunable and
> that it helps their situation.  I mention that only as a data point, I
> wouldn't be foolish enough to try and use the presence of something in
> Solaris as justification :)

Yours is the first complaint of this kind I recall seeing, but I've 
expected for a while someone would have this type of problem.  RFC2861 
seems conceptually nice at first, but there are a few things about it 
that bother me.  One thing in particular is that a naturally bursty 
application (like yours) will actually perform better by padding its 
connection with junk data whenever it doesn't have real data to send. 
Or equivalently, it's punished for not sending data when it doesn't need 
to.  I also think it may not do much good when there are connections 
with significantly different RTTs.

Given that RFC2861 is Experimental (and I'm not aware of any current 
efforts in the IETF to push it to the standard track), IMHO it would not 
be inappropriate to make this behavior controlled via sysctl.

Thanks,
   -John

^ permalink raw reply

* Re: Please pull 'upstream' branch of wireless-2.6
From: Dan Williams @ 2006-04-25 12:03 UTC (permalink / raw)
  To: Johannes Berg; +Cc: John W. Linville, jeff, netdev
In-Reply-To: <1145964620.4571.10.camel@localhost>

On Tue, 2006-04-25 at 13:30 +0200, Johannes Berg wrote:
> On Mon, 2006-04-24 at 20:33 -0400, Dan Williams wrote:
> 
> > Any way to get the event handling cleanup patch into 2.6.17?  It's
> > pretty much a bugfix and bcm43xx is useless with wpa_supplicant and NM
> > without the patch...
> 
> No, that's not true, the cleanup patch is exactly that, code cleanup :)
> 
> Externally, softmac behaves the same without it since your patch and
> some other patches I did on top of that have already gone in.

Ah, sorry, mistook this patch for the one that actually sent the events.

Dan

^ permalink raw reply

* Re: Please pull 'upstream' branch of wireless-2.6
From: Johannes Berg @ 2006-04-25 11:30 UTC (permalink / raw)
  To: Dan Williams; +Cc: John W. Linville, jeff, netdev
In-Reply-To: <1145925182.9369.1.camel@localhost.localdomain>

[-- Attachment #1: Type: text/plain, Size: 448 bytes --]

On Mon, 2006-04-24 at 20:33 -0400, Dan Williams wrote:

> Any way to get the event handling cleanup patch into 2.6.17?  It's
> pretty much a bugfix and bcm43xx is useless with wpa_supplicant and NM
> without the patch...

No, that's not true, the cleanup patch is exactly that, code cleanup :)

Externally, softmac behaves the same without it since your patch and
some other patches I did on top of that have already gone in.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 793 bytes --]

^ permalink raw reply

* Re: Van Jacobson's net channels and real-time
From: linux-os (Dick Johnson) @ 2006-04-25 11:29 UTC (permalink / raw)
  To: Auke Kok
  Cc: Auke Kok, Ingo Oeser, Jörn Engel, Ingo Oeser,
	David S. Miller, simlo, linux-kernel, mingo, netdev
In-Reply-To: <444D8047.9080403@foo-projects.org>

On Mon, 24 Apr 2006, Auke Kok wrote:

> linux-os (Dick Johnson) wrote:
>> On Mon, 24 Apr 2006, Auke Kok wrote:
>>
>>> Ingo Oeser wrote:
>>>> On Saturday, 22. April 2006 15:49, Jörn Engel wrote:
>>>>> That was another main point, yes.  And the endpoints should be as
>>>>> little burden on the bottlenecks as possible.  One bottleneck is the
>>>>> receive interrupt, which shouldn't wait for cachelines from other cpus
>>>>> too much.
>>>> That's right. This will be made a non issue with early demuxing
>>>> on the NIC and MSI (or was it MSI-X?) which will select
>>>> the right CPU based on hardware channels.
>>> MSI-X. with MSI you still have only one cpu handling all MSI interrupts and
>>> that doesn't look any different than ordinary interrupts. MSI-X will allow
>>> much better interrupt handling across several cpu's.
>>>
>>> Auke
>>> -
>>
>> Message signaled interrupts are just a kludge to save a trace on a
>> PC board (read make junk cheaper still).
>
> yes. Also in PCI-Express there is no physical interrupt line anymore due to
> the architecture, so even classical interrupts are sent as "message" over the bus.
>
>> They are not faster and may even be slower.
>
> thus in the case of PCI-Express, MSI interrupts are just as fast as the
> ordinary ones. I have no numbers on whether MSI is faster or not than e.g.
> interrupts on PCI-X, but generally speaking, the PCI-Express bus is not
> designed to be "low latency" at all, at best it gives you X latency, where X
> is something like microseconds. The MSI message itself only takes 10-20
> nanoseconds though, but all the handling probably adds a large factor to that
> (1000 or so). No clue on classical interrupt line latency - anyone?
>

About 9 nanosecond per foot of FR-4 (G10) trace, plus the access time
through the gate-arrays (about 20 ns) so, from the time a device needs
the CPU, until it hits the interrupt pin, you have typically 30 to
50 nanoseconds. Of course the CPU is __much__ slower. However, these
physical latencies are in series and cannot be compensated for, because
the CPU can't see into the future.

>> They will not be the salvation of any interrupt latency problems.
>
> This is also not the problem - we really don't care that our 100.000 packets
> arrive 20usec slower per packet, just as long as the bus is not idle for those
> intervals. We would care a lot if 25.000 of those arrive directly at the
> proper CPU, without the need for one of the cpu's to arbitrate on every
> interrupt. That's the idea anyway.

It forces driver-writers to loop in ISRs to handle new status changes
that happened before an asserted interrupt even got to the CPU. This
is bad. You end up polled in the ISR, with the interrupts off. Turning
on the interrupts exacerbates the problem, you may never leave the
ISR! It becomes the new "idle task". To properly use interrupts,
the hardware latency must be less than the CPUs response to the
hardware stimulus.

>
> Nowadays with irq throttling we introduce a lot of designed latency anyway,
> especially with network devices.
>
>> The solutions for increasing networking speed,
>> where the bit-rate on the wire gets close to the bit-rate on the
>> bus, is to put more and more of the networking code inside the
>> network board. The CPU get interrupted after most things (like
>> network handshakes) are complete.
>
> That is a limited vision of the situation. You could argue that the current
> CPU's have so much power that they can easily do a lot of the processing
> instead of the hardware, and thus warm caches for userspace, setup sockets
> etc. This is the whole idea of Van Jacobson's net channels. Putting more
> offloading into the hardware just brings so many problems with it that
> are far more easily solved in the OS.
>
>
> Cheers,
>
> Auke
>

Cheers,
Dick Johnson
Penguin : Linux version 2.6.16.4 on an i686 machine (5592.89 BogoMips).
Warning : 98.36% of all statistics are fiction, book release in April.
_
\x1a\x04

****************************************************************
The information transmitted in this message is confidential and may be privileged.  Any review, retransmission, dissemination, or other use of this information by persons or entities other than the intended recipient is prohibited.  If you are not the intended recipient, please notify Analogic Corporation immediately - by replying to this message or by sending an email to DeliveryErrors@analogic.com - and destroy all copies of this information, including any attachments, without reading or disclosing them.

Thank you.

^ permalink raw reply

* FIXED Re: [PATCH] remove likely in ip_rcv_finish()
From: Hua Zhong @ 2006-04-25  9:20 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <Pine.LNX.4.64.0604250204470.4481@localhost.localdomain>

Horrible typo! I really should have gone to sleep now.

Correct patch attached.

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 18d7fad..c9026db 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -337,7 +337,7 @@ static inline int ip_rcv_finish(struct s
  	 *	Initialise the virtual path cache for the packet. It describes
  	 *	how the packet travels inside Linux networking.
  	 */ 
-	if (likely(skb->dst == NULL)) {
+	if (skb->dst == NULL) {
  		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
  					 skb->dev);
  		if (unlikely(err)) {

^ permalink raw reply related

* [PATCH] remove likely in ip_rcv_finish()
From: Hua Zhong @ 2006-04-25  9:10 UTC (permalink / raw)
  To: davem, netdev

Hi,

This is another result from my likely profiling tool (dwalker@mvista.com just sent the patch of the profiling tool to linux-kernel mailing list, which is similar to what I use).

On my system (not very busy, normal development machine within a VMWare workstation), I see a 6/5 miss/hit ratio for the following "likely".

I am not sure what would happen for a busy system though (on which performance actually matters), but I am just reporting what I find and hope it might help.

Signed-off-by: Hua Zhong <hzhong@gmail.com>

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 18d7fad..9f44359 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -337,7 +337,7 @@ static inline int ip_rcv_finish(struct s
  	 *	Initialise the virtual path cache for the packet. It describes
  	 *	how the packet travels inside Linux networking.
  	 */ 
-	if (likely(skb->dst == NULL)) {
+	if (skb->dst) {
  		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
  					 skb->dev);
  		if (unlikely(err)) {

^ permalink raw reply related

* Re: determine outgoing interface (eth0,eth1) for a packet according to the dest IP
From: David S. Miller @ 2006-04-25  7:58 UTC (permalink / raw)
  To: ak; +Cc: qwejohn, netdev
In-Reply-To: <200604250943.49422.ak@suse.de>

From: Andi Kleen <ak@suse.de>
Date: Tue, 25 Apr 2006 09:43:49 +0200

> On Tuesday 25 April 2006 09:31, John Que wrote:
> > Hello,
> > What is the right way to determine on which interface card
> > (eth0 or eth1) will a packet be sent (according to the dest IP)?
> 
> You can send a rtnetlink RTM_GETROUTE message to ask the kernel.
> Result is the interface index in RTA_OIF, which can be converted
> into a name.

That scheme does not handle netfilter nor packet scheduler
classifier mangling and redirection of the packet.

^ permalink raw reply

* Re: determine outgoing interface (eth0,eth1) for a packet according to the dest IP
From: Andi Kleen @ 2006-04-25  7:43 UTC (permalink / raw)
  To: John Que; +Cc: netdev
In-Reply-To: <ada605fb0604250031r2e4a74e7id0b5079fabde9517@mail.gmail.com>

On Tuesday 25 April 2006 09:31, John Que wrote:
> Hello,
> What is the right way to determine on which interface card
> (eth0 or eth1) will a packet be sent (according to the dest IP)?

You can send a rtnetlink RTM_GETROUTE message to ask the kernel.
Result is the interface index in RTA_OIF, which can be converted
into a name.

-Andi


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox