Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next-2.6 PATCH 2/2] e1000/e1000e: implement a simple interrupt moderation
From: Jeff Kirsher @ 2010-05-05  8:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Jesse Brandeburg, Jeff Kirsher
In-Reply-To: <20100505082537.5956.86868.stgit@localhost.localdomain>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

Back before e1000-7.3.20, the e1000 driver had a simple algorithm that
managed interrupt moderation.  The driver was updated in 7.3.20 to
have the new "adaptive" interrupt moderation but we have customer
requests to redeploy the old way as an option.  This patch adds the
old functionality back.  The new functionality can be enabled via
module parameter or at runtime via ethtool.
Module parameter: set InterruptThrottleRate=4 to use this
moderation method.
Ethtool method: ethtool -C ethX rx-usecs 4

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/e1000/e1000_ethtool.c |    8 +++++---
 drivers/net/e1000/e1000_main.c    |   18 +++++++++++++++++-
 drivers/net/e1000/e1000_param.c   |   10 ++++++++--
 drivers/net/e1000e/ethtool.c      |    8 +++++---
 drivers/net/e1000e/netdev.c       |   18 +++++++++++++++++-
 drivers/net/e1000e/param.c        |    5 +++++
 6 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethtool.c b/drivers/net/e1000/e1000_ethtool.c
index d6931ca..2a3b2dc 100644
--- a/drivers/net/e1000/e1000_ethtool.c
+++ b/drivers/net/e1000/e1000_ethtool.c
@@ -1808,7 +1808,7 @@ static int e1000_get_coalesce(struct net_device *netdev,
 	if (adapter->hw.mac_type < e1000_82545)
 		return -EOPNOTSUPP;
 
-	if (adapter->itr_setting <= 3)
+	if (adapter->itr_setting <= 4)
 		ec->rx_coalesce_usecs = adapter->itr_setting;
 	else
 		ec->rx_coalesce_usecs = 1000000 / adapter->itr_setting;
@@ -1826,12 +1826,14 @@ static int e1000_set_coalesce(struct net_device *netdev,
 		return -EOPNOTSUPP;
 
 	if ((ec->rx_coalesce_usecs > E1000_MAX_ITR_USECS) ||
-	    ((ec->rx_coalesce_usecs > 3) &&
+	    ((ec->rx_coalesce_usecs > 4) &&
 	     (ec->rx_coalesce_usecs < E1000_MIN_ITR_USECS)) ||
 	    (ec->rx_coalesce_usecs == 2))
 		return -EINVAL;
 
-	if (ec->rx_coalesce_usecs <= 3) {
+	if (ec->rx_coalesce_usecs == 4) {
+		adapter->itr = adapter->itr_setting = 4;
+	} else if (ec->rx_coalesce_usecs <= 3) {
 		adapter->itr = 20000;
 		adapter->itr_setting = ec->rx_coalesce_usecs;
 	} else {
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index e6ebc22..4dd2c23 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -31,7 +31,7 @@
 
 char e1000_driver_name[] = "e1000";
 static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
-#define DRV_VERSION "7.3.21-k5-NAPI"
+#define DRV_VERSION "7.3.21-k6-NAPI"
 const char e1000_driver_version[] = DRV_VERSION;
 static const char e1000_copyright[] = "Copyright (c) 1999-2006 Intel Corporation.";
 
@@ -2386,6 +2386,22 @@ link_up:
 		}
 	}
 
+	/* Simple mode for Interrupt Throttle Rate (ITR) */
+	if (hw->mac_type >= e1000_82540 && adapter->itr_setting == 4) {
+		/*
+		 * Symmetric Tx/Rx gets a reduced ITR=2000;
+		 * Total asymmetrical Tx or Rx gets ITR=8000;
+		 * everyone else is between 2000-8000.
+		 */
+		u32 goc = (adapter->gotcl + adapter->gorcl) / 10000;
+		u32 dif = (adapter->gotcl > adapter->gorcl ?
+			    adapter->gotcl - adapter->gorcl :
+			    adapter->gorcl - adapter->gotcl) / 10000;
+		u32 itr = goc > 0 ? (dif * 6000 / goc + 2000) : 8000;
+
+		ew32(ITR, 1000000000 / (itr * 256));
+	}
+
 	/* Cause software interrupt to ensure rx ring is cleaned */
 	ew32(ICS, E1000_ICS_RXDMT0);
 
diff --git a/drivers/net/e1000/e1000_param.c b/drivers/net/e1000/e1000_param.c
index 543c6d1..9fbb562 100644
--- a/drivers/net/e1000/e1000_param.c
+++ b/drivers/net/e1000/e1000_param.c
@@ -484,11 +484,17 @@ void __devinit e1000_check_options(struct e1000_adapter *adapter)
 				adapter->itr_setting = adapter->itr;
 				adapter->itr = 20000;
 				break;
+			case 4:
+				e_dev_info("%s set to simplified "
+				           "(2000-8000) ints mode\n", opt.name);
+				adapter->itr_setting = adapter->itr;
+				break;
 			default:
 				e1000_validate_option(&adapter->itr, &opt,
 				        adapter);
-				/* save the setting, because the dynamic bits change itr */
-				/* clear the lower two bits because they are
+				/* save the setting, because the dynamic bits
+				 * change itr.
+				 * clear the lower two bits because they are
 				 * used as control */
 				adapter->itr_setting = adapter->itr & ~3;
 				break;
diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c
index 7f9915c..c81118a 100644
--- a/drivers/net/e1000e/ethtool.c
+++ b/drivers/net/e1000e/ethtool.c
@@ -1891,7 +1891,7 @@ static int e1000_get_coalesce(struct net_device *netdev,
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
-	if (adapter->itr_setting <= 3)
+	if (adapter->itr_setting <= 4)
 		ec->rx_coalesce_usecs = adapter->itr_setting;
 	else
 		ec->rx_coalesce_usecs = 1000000 / adapter->itr_setting;
@@ -1906,12 +1906,14 @@ static int e1000_set_coalesce(struct net_device *netdev,
 	struct e1000_hw *hw = &adapter->hw;
 
 	if ((ec->rx_coalesce_usecs > E1000_MAX_ITR_USECS) ||
-	    ((ec->rx_coalesce_usecs > 3) &&
+	    ((ec->rx_coalesce_usecs > 4) &&
 	     (ec->rx_coalesce_usecs < E1000_MIN_ITR_USECS)) ||
 	    (ec->rx_coalesce_usecs == 2))
 		return -EINVAL;
 
-	if (ec->rx_coalesce_usecs <= 3) {
+	if (ec->rx_coalesce_usecs == 4) {
+		adapter->itr = adapter->itr_setting = 4;
+	} else if (ec->rx_coalesce_usecs <= 3) {
 		adapter->itr = 20000;
 		adapter->itr_setting = ec->rx_coalesce_usecs;
 	} else {
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index d13760d..0a16465 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -52,7 +52,7 @@
 
 #include "e1000.h"
 
-#define DRV_VERSION "1.0.2-k2"
+#define DRV_VERSION "1.0.2-k4"
 char e1000e_driver_name[] = "e1000e";
 const char e1000e_driver_version[] = DRV_VERSION;
 
@@ -4087,6 +4087,22 @@ link_up:
 		}
 	}
 
+	/* Simple mode for Interrupt Throttle Rate (ITR) */
+	if (adapter->itr_setting == 4) {
+		/*
+		 * Symmetric Tx/Rx gets a reduced ITR=2000;
+		 * Total asymmetrical Tx or Rx gets ITR=8000;
+		 * everyone else is between 2000-8000.
+		 */
+		u32 goc = (adapter->gotc + adapter->gorc) / 10000;
+		u32 dif = (adapter->gotc > adapter->gorc ?
+			    adapter->gotc - adapter->gorc :
+			    adapter->gorc - adapter->gotc) / 10000;
+		u32 itr = goc > 0 ? (dif * 6000 / goc + 2000) : 8000;
+
+		ew32(ITR, 1000000000 / (itr * 256));
+	}
+
 	/* Cause software interrupt to ensure Rx ring is cleaned */
 	if (adapter->msix_entries)
 		ew32(ICS, adapter->rx_ring->ims_val);
diff --git a/drivers/net/e1000e/param.c b/drivers/net/e1000e/param.c
index f775a48..0f4077c 100644
--- a/drivers/net/e1000e/param.c
+++ b/drivers/net/e1000e/param.c
@@ -351,6 +351,11 @@ void __devinit e1000e_check_options(struct e1000_adapter *adapter)
 				adapter->itr_setting = adapter->itr;
 				adapter->itr = 20000;
 				break;
+			case 4:
+				e_info("%s set to simplified (2000-8000 ints) "
+				       "mode\n", opt.name);
+				adapter->itr_setting = 4;
+				break;
 			default:
 				/*
 				 * Save the setting, because the dynamic bits


^ permalink raw reply related

* [net-next-2.6 PATCH 1/2] e1000e: increase rx fifo size to 36K on 82574 and 82583
From: Jeff Kirsher @ 2010-05-05  8:25 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change increases the RX fifo size to 36K for standard frames and
decreases the TX fifo size to 4K.  The reason for this change is that on
slower systems the RX is much more likely to backfill and need space than
the TX is.  As long as the TX fifo is twice the size of the MTU we should
have more than enough TX fifo.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/e1000e/82571.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/e1000e/82571.c b/drivers/net/e1000e/82571.c
index 17a25e1..1e73edd 100644
--- a/drivers/net/e1000e/82571.c
+++ b/drivers/net/e1000e/82571.c
@@ -1845,7 +1845,7 @@ struct e1000_info e1000_82574_info = {
 				  | FLAG_HAS_SMART_POWER_DOWN
 				  | FLAG_HAS_AMT
 				  | FLAG_HAS_CTRLEXT_ON_LOAD,
-	.pba			= 20,
+	.pba			= 36,
 	.max_hw_frame_size	= DEFAULT_JUMBO,
 	.get_variants		= e1000_get_variants_82571,
 	.mac_ops		= &e82571_mac_ops,
@@ -1862,7 +1862,7 @@ struct e1000_info e1000_82583_info = {
 				  | FLAG_HAS_SMART_POWER_DOWN
 				  | FLAG_HAS_AMT
 				  | FLAG_HAS_CTRLEXT_ON_LOAD,
-	.pba			= 20,
+	.pba			= 36,
 	.max_hw_frame_size	= ETH_FRAME_LEN + ETH_FCS_LEN,
 	.get_variants		= e1000_get_variants_82571,
 	.mac_ops		= &e82571_mac_ops,


^ permalink raw reply related

* Re: [PATCH net-next-2.6] net: __alloc_skb() speedup
From: Eric Dumazet @ 2010-05-05  8:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, hadi, therbert
In-Reply-To: <20100505.010658.48498744.davem@davemloft.net>

Le mercredi 05 mai 2010 à 01:06 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 04 May 2010 19:10:54 +0200
> 
> > With following patch I can reach maximum rate of my pktgen+udpsink
> > simulator :
> > - 'old' machine : dual quad core E5450  @3.00GHz
> > - 64 UDP rx flows (only differ by destination port)
> > - RPS enabled, NIC interrupts serviced on cpu0
> > - rps dispatched on 7 other cores. (~130.000 IPI per second)
> > - SLAB allocator (faster than SLUB in this workload)
> > - tg3 NIC
> > - 1.080.000 pps without a single drop at NIC level.
> > 
> > Idea is to add two prefetchw() calls in __alloc_skb(), one to prefetch
> > first sk_buff cache line, the second to prefetch the shinfo part.
> > 
> > Also using one memset() to initialize all skb_shared_info fields instead
> > of one by one to reduce number of instructions, using long word moves.
> > 
> > All skb_shared_info fields before 'dataref' are cleared in 
> > __alloc_skb().
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> I'll apply this, nice work Eric.
> 
> But some caveats...
> 
> On several cpu types it is possible to "prefetch invalidate"
> cachelines.  PowerPC and sparc64 can both do it.  I'm pretty
> sure current gen x86 have SSE bits that can do this too.
> 
> In fact, the memset() for sparc64 is going to do these cacheline
> invalidates, making the prefetches on 'skb' in fact wasteful.
> It will just create spurious bus traffic.
> 

You mean memset() won't be inlined by the compiler to plain memory writes,
but will use the custom kernel memset()?



^ permalink raw reply

* [net-next-2.6 PATCH 2/2] ixgbe: Add support for VF MAC and VLAN configuration
From: Jeff Kirsher @ 2010-05-05  8:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Greg Rose, Jeff Kirsher
In-Reply-To: <20100505081129.5709.39430.stgit@localhost.localdomain>

From: Greg Rose <gregory.v.rose@intel.com>

Add support for the "ip link set" and "ip link show" commands that allow
configuration of the virtual functions' MAC and port VLAN via user space
command line.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe.h       |    3 +
 drivers/net/ixgbe/ixgbe_main.c  |    4 ++
 drivers/net/ixgbe/ixgbe_sriov.c |  104 ++++++++++++++++++++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_sriov.h |    6 ++
 drivers/net/ixgbe/ixgbe_type.h  |    5 ++
 5 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 79c35ae..d0ea3d6 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -111,7 +111,10 @@ struct vf_data_storage {
 	u16 default_vf_vlan_id;
 	u16 vlans_enabled;
 	bool clear_to_send;
+	bool pf_set_mac;
 	int rar;
+	u16 pf_vlan; /* When set, guest VLAN config not allowed. */
+	u16 pf_qos;
 };
 
 /* wrapper around a pointer to a socket buffer,
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 0a0e90e..d1a1868 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -6311,6 +6311,10 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_vlan_rx_add_vid	= ixgbe_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= ixgbe_vlan_rx_kill_vid,
 	.ndo_do_ioctl		= ixgbe_ioctl,
+	.ndo_set_vf_mac		= ixgbe_ndo_set_vf_mac,
+	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
+	.ndo_set_vf_tx_rate	= ixgbe_ndo_set_vf_bw,
+	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
diff --git a/drivers/net/ixgbe/ixgbe_sriov.c b/drivers/net/ixgbe/ixgbe_sriov.c
index 53f364d..221b2ca 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ixgbe/ixgbe_sriov.c
@@ -126,13 +126,34 @@ void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe)
 	IXGBE_WRITE_REG(hw, IXGBE_VMOLR(vf), vmolr);
 }
 
+static void ixgbe_set_vmvir(struct ixgbe_adapter *adapter, u32 vid, u32 vf)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+
+	if (vid)
+		IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf),
+				(vid | IXGBE_VMVIR_VLANA_DEFAULT));
+	else
+		IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf), 0);
+}
+
 inline void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* reset offloads to defaults */
-	ixgbe_set_vmolr(hw, vf, true);
-
+	if (adapter->vfinfo[vf].pf_vlan) {
+		ixgbe_set_vf_vlan(adapter, true,
+				  adapter->vfinfo[vf].pf_vlan, vf);
+		ixgbe_set_vmvir(adapter,
+				(adapter->vfinfo[vf].pf_vlan |
+				 (adapter->vfinfo[vf].pf_qos <<
+				  VLAN_PRIO_SHIFT)), vf);
+		ixgbe_set_vmolr(hw, vf, false);
+	} else {
+		ixgbe_set_vmvir(adapter, 0, vf);
+		ixgbe_set_vmolr(hw, vf, true);
+	}
 
 	/* reset multicast table array for vf */
 	adapter->vfinfo[vf].num_vf_mc_hashes = 0;
@@ -266,10 +287,12 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	case IXGBE_VF_SET_MAC_ADDR:
 		{
 			u8 *new_mac = ((u8 *)(&msgbuf[1]));
-			if (is_valid_ether_addr(new_mac))
+			if (is_valid_ether_addr(new_mac) &&
+			    !adapter->vfinfo[vf].pf_set_mac)
 				ixgbe_set_vf_mac(adapter, vf, new_mac);
 			else
-				retval = -1;
+				ixgbe_set_vf_mac(adapter,
+				  vf, adapter->vfinfo[vf].vf_mac_addresses);
 		}
 		break;
 	case IXGBE_VF_SET_MULTICAST:
@@ -363,3 +386,76 @@ void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter)
 	}
 }
 
+int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	if (!is_valid_ether_addr(mac) || (vf >= adapter->num_vfs))
+		return -EINVAL;
+	adapter->vfinfo[vf].pf_set_mac = true;
+	dev_info(&adapter->pdev->dev, "setting MAC %pM on VF %d\n", mac, vf);
+	dev_info(&adapter->pdev->dev, "Reload the VF driver to make this"
+				      " change effective.");
+	if (test_bit(__IXGBE_DOWN, &adapter->state)) {
+		dev_warn(&adapter->pdev->dev, "The VF MAC address has been set,"
+			 " but the PF device is not up.\n");
+		dev_warn(&adapter->pdev->dev, "Bring the PF device up before"
+			 " attempting to use the VF device.\n");
+	}
+	return ixgbe_set_vf_mac(adapter, vf, mac);
+}
+
+int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
+{
+	int err = 0;
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7))
+		return -EINVAL;
+	if (vlan || qos) {
+		err = ixgbe_set_vf_vlan(adapter, true, vlan, vf);
+		if (err)
+			goto out;
+		ixgbe_set_vmvir(adapter, vlan | (qos << VLAN_PRIO_SHIFT), vf);
+		ixgbe_set_vmolr(&adapter->hw, vf, false);
+		adapter->vfinfo[vf].pf_vlan = vlan;
+		adapter->vfinfo[vf].pf_qos = qos;
+		dev_info(&adapter->pdev->dev,
+			 "Setting VLAN %d, QOS 0x%x on VF %d\n", vlan, qos, vf);
+		if (test_bit(__IXGBE_DOWN, &adapter->state)) {
+			dev_warn(&adapter->pdev->dev,
+				 "The VF VLAN has been set,"
+				 " but the PF device is not up.\n");
+			dev_warn(&adapter->pdev->dev,
+				 "Bring the PF device up before"
+				 " attempting to use the VF device.\n");
+		}
+	} else {
+		err = ixgbe_set_vf_vlan(adapter, false,
+					adapter->vfinfo[vf].pf_vlan, vf);
+		ixgbe_set_vmvir(adapter, vlan, vf);
+		ixgbe_set_vmolr(&adapter->hw, vf, true);
+		adapter->vfinfo[vf].pf_vlan = 0;
+		adapter->vfinfo[vf].pf_qos = 0;
+       }
+out:
+       return err;
+}
+
+int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate)
+{
+	return -EOPNOTSUPP;
+}
+
+int ixgbe_ndo_get_vf_config(struct net_device *netdev,
+			    int vf, struct ifla_vf_info *ivi)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	if (vf >= adapter->num_vfs)
+		return -EINVAL;
+	ivi->vf = vf;
+	memcpy(&ivi->mac, adapter->vfinfo[vf].vf_mac_addresses, ETH_ALEN);
+	ivi->tx_rate = 0;
+	ivi->vlan = adapter->vfinfo[vf].pf_vlan;
+	ivi->qos = adapter->vfinfo[vf].pf_qos;
+	return 0;
+}
diff --git a/drivers/net/ixgbe/ixgbe_sriov.h b/drivers/net/ixgbe/ixgbe_sriov.h
index 7fb1288..184730e 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ixgbe/ixgbe_sriov.h
@@ -42,6 +42,12 @@ int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask);
 void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter);
 void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter);
 void ixgbe_dump_registers(struct ixgbe_adapter *adapter);
+int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac);
+int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan,
+			   u8 qos);
+int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate);
+int ixgbe_ndo_get_vf_config(struct net_device *netdev,
+			    int vf, struct ifla_vf_info *ivi);
 
 #endif /* _IXGBE_SRIOV_H_ */
 
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index a0f9084..4277cbb 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -219,6 +219,7 @@
 #define IXGBE_MTQC      0x08120
 #define IXGBE_VLVF(_i)  (0x0F100 + ((_i) * 4))  /* 64 of these (0-63) */
 #define IXGBE_VLVFB(_i) (0x0F200 + ((_i) * 4))  /* 128 of these (0-127) */
+#define IXGBE_VMVIR(_i) (0x08000 + ((_i) * 4))  /* 64 of these (0-63) */
 #define IXGBE_VT_CTL    0x051B0
 #define IXGBE_VFRE(_i)  (0x051E0 + ((_i) * 4))
 #define IXGBE_VFTE(_i)  (0x08110 + ((_i) * 4))
@@ -1311,6 +1312,10 @@
 #define IXGBE_VLVF_ENTRIES      64
 #define IXGBE_VLVF_VLANID_MASK  0x00000FFF
 
+/* Per VF Port VLAN insertion rules */
+#define IXGBE_VMVIR_VLANA_DEFAULT 0x40000000 /* Always use default VLAN */
+#define IXGBE_VMVIR_VLANA_NEVER   0x80000000 /* Never insert VLAN tag */
+
 #define IXGBE_ETHERNET_IEEE_VLAN_TYPE 0x8100  /* 802.1q protocol */
 
 /* STATUS Bit Masks */


^ permalink raw reply related

* [net-next-2.6 PATCH 1/2] ixgbe: Add boolean parameter to ixgbe_set_vmolr
From: Jeff Kirsher @ 2010-05-05  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Greg Rose, Jeff Kirsher

From: Greg Rose <gregory.v.rose@intel.com>

Add a boolean parameter to ixgbe_set_vmolr so that the caller can
specify whether the pool should accept untagged packets.  This is
required for a follow-on patch to enable administrative configuration
of port VLAN for virtual functions.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c  |    2 +-
 drivers/net/ixgbe/ixgbe_sriov.c |   11 +++++++----
 drivers/net/ixgbe/ixgbe_sriov.h |    2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 2ae5a51..0a0e90e 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -2722,7 +2722,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
 		IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), (1 << vf_shift));
 		IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), (1 << vf_shift));
 		IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
-		ixgbe_set_vmolr(hw, adapter->num_vfs);
+		ixgbe_set_vmolr(hw, adapter->num_vfs, true);
 	}
 
 	/* Program MRQC for the distribution of queues */
diff --git a/drivers/net/ixgbe/ixgbe_sriov.c b/drivers/net/ixgbe/ixgbe_sriov.c
index d4cd20f..53f364d 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ixgbe/ixgbe_sriov.c
@@ -113,13 +113,16 @@ int ixgbe_set_vf_vlan(struct ixgbe_adapter *adapter, int add, int vid, u32 vf)
 }
 
 
-void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf)
+void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe)
 {
 	u32 vmolr = IXGBE_READ_REG(hw, IXGBE_VMOLR(vf));
-	vmolr |= (IXGBE_VMOLR_AUPE |
-		  IXGBE_VMOLR_ROMPE |
+	vmolr |= (IXGBE_VMOLR_ROMPE |
 		  IXGBE_VMOLR_ROPE |
 		  IXGBE_VMOLR_BAM);
+	if (aupe)
+		vmolr |= IXGBE_VMOLR_AUPE;
+	else
+		vmolr &= ~IXGBE_VMOLR_AUPE;
 	IXGBE_WRITE_REG(hw, IXGBE_VMOLR(vf), vmolr);
 }
 
@@ -128,7 +131,7 @@ inline void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf)
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* reset offloads to defaults */
-	ixgbe_set_vmolr(hw, vf);
+	ixgbe_set_vmolr(hw, vf, true);
 
 
 	/* reset multicast table array for vf */
diff --git a/drivers/net/ixgbe/ixgbe_sriov.h b/drivers/net/ixgbe/ixgbe_sriov.h
index 51d1106..7fb1288 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ixgbe/ixgbe_sriov.h
@@ -32,7 +32,7 @@ int ixgbe_set_vf_multicasts(struct ixgbe_adapter *adapter,
                             int entries, u16 *hash_list, u32 vf);
 void ixgbe_restore_vf_multicasts(struct ixgbe_adapter *adapter);
 int ixgbe_set_vf_vlan(struct ixgbe_adapter *adapter, int add, int vid, u32 vf);
-void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf);
+void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe);
 void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf);
 void ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf);
 void ixgbe_msg_task(struct ixgbe_adapter *adapter);


^ permalink raw reply related

* [v5 Patch 3/3] bonding: make bonding support netpoll
From: Amerigo Wang @ 2010-05-05  8:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jay Vosburgh, Amerigo Wang, Neil Horman, netdev, Matt Mackall,
	bridge, David Miller, Jeff Moyer, Andy Gospodarek, bonding-devel
In-Reply-To: <20100505081514.5157.83783.sendpatchset@localhost.localdomain>


Based on Andy's work, but I modified a lot.

Similar to the patch for bridge, this patch does:

1) implement the 2 methods to support netpoll for bonding;

2) modify netpoll while forwarding packets via bonding;

3) disable netpoll support of bonding when a device that does not
   support netpoll is added to bonding;

4) enable netpoll support when all underlying devices support netpoll.

Cc: Andy Gospodarek <gospo@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Jay Vosburgh <fubar@us.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: WANG Cong <amwang@redhat.com>

---

Index: linux-2.6/drivers/net/bonding/bond_main.c
===================================================================
--- linux-2.6.orig/drivers/net/bonding/bond_main.c
+++ linux-2.6/drivers/net/bonding/bond_main.c
@@ -59,6 +59,7 @@
 #include <linux/uaccess.h>
 #include <linux/errno.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/etherdevice.h>
@@ -430,7 +431,18 @@ int bond_dev_queue_xmit(struct bonding *
 	}
 
 	skb->priority = 1;
-	dev_queue_xmit(skb);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (unlikely(bond->dev->priv_flags & IFF_IN_NETPOLL)) {
+		struct netpoll *np = bond->dev->npinfo->netpoll;
+		slave_dev->npinfo = bond->dev->npinfo;
+		np->real_dev = np->dev = skb->dev;
+		slave_dev->priv_flags |= IFF_IN_NETPOLL;
+		netpoll_send_skb(np, skb);
+		slave_dev->priv_flags &= ~IFF_IN_NETPOLL;
+		np->dev = bond->dev;
+	} else
+#endif
+		dev_queue_xmit(skb);
 
 	return 0;
 }
@@ -1329,6 +1341,61 @@ static void bond_detach_slave(struct bon
 	bond->slave_cnt--;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/*
+ * You must hold read lock on bond->lock before calling this.
+ */
+static bool slaves_support_netpoll(struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	int i = 0;
+	bool ret = true;
+
+	bond_for_each_slave(bond, slave, i) {
+		if ((slave->dev->priv_flags & IFF_DISABLE_NETPOLL) ||
+		    !slave->dev->netdev_ops->ndo_poll_controller)
+			ret = false;
+	}
+	return i != 0 && ret;
+}
+
+static void bond_poll_controller(struct net_device *bond_dev)
+{
+	struct net_device *dev = bond_dev->npinfo->netpoll->real_dev;
+	if (dev != bond_dev)
+		netpoll_poll_dev(dev);
+}
+
+static void bond_netpoll_cleanup(struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	const struct net_device_ops *ops;
+	int i;
+
+	read_lock(&bond->lock);
+	bond_dev->npinfo = NULL;
+	bond_for_each_slave(bond, slave, i) {
+		if (slave->dev) {
+			ops = slave->dev->netdev_ops;
+			if (ops->ndo_netpoll_cleanup)
+				ops->ndo_netpoll_cleanup(slave->dev);
+			else
+				slave->dev->npinfo = NULL;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
+#else
+
+static void bond_netpoll_cleanup(struct net_device *bond_dev)
+{
+}
+
+#endif
+
 /*---------------------------------- IOCTL ----------------------------------*/
 
 static int bond_sethwaddr(struct net_device *bond_dev,
@@ -1735,6 +1802,18 @@ int bond_enslave(struct net_device *bond
 
 	bond_set_carrier(bond);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (slaves_support_netpoll(bond_dev)) {
+		bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+		if (bond_dev->npinfo)
+			slave_dev->npinfo = bond_dev->npinfo;
+	} else if (!(bond_dev->priv_flags & IFF_DISABLE_NETPOLL)) {
+		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
+		pr_info("New slave device %s does not support netpoll\n",
+			slave_dev->name);
+		pr_info("Disabling netpoll support for %s\n", bond_dev->name);
+	}
+#endif
 	read_unlock(&bond->lock);
 
 	res = bond_create_slave_symlinks(bond_dev, slave_dev);
@@ -1801,6 +1880,7 @@ int bond_release(struct net_device *bond
 		return -EINVAL;
 	}
 
+	netdev_bonding_change(bond_dev, NETDEV_BONDING_DESLAVE);
 	write_lock_bh(&bond->lock);
 
 	slave = bond_get_slave_by_dev(bond, slave_dev);
@@ -1929,6 +2009,17 @@ int bond_release(struct net_device *bond
 
 	netdev_set_master(slave_dev, NULL);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	read_lock_bh(&bond->lock);
+	if (slaves_support_netpoll(bond_dev))
+		bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+	read_unlock_bh(&bond->lock);
+	if (slave_dev->netdev_ops->ndo_netpoll_cleanup)
+		slave_dev->netdev_ops->ndo_netpoll_cleanup(slave_dev);
+	else
+		slave_dev->npinfo = NULL;
+#endif
+
 	/* close slave before restoring its mac address */
 	dev_close(slave_dev);
 
@@ -4448,6 +4539,10 @@ static const struct net_device_ops bond_
 	.ndo_vlan_rx_register	= bond_vlan_rx_register,
 	.ndo_vlan_rx_add_vid 	= bond_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= bond_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_cleanup	= bond_netpoll_cleanup,
+	.ndo_poll_controller	= bond_poll_controller,
+#endif
 };
 
 static void bond_destructor(struct net_device *bond_dev)
@@ -4541,6 +4636,8 @@ static void bond_uninit(struct net_devic
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
+	bond_netpoll_cleanup(bond_dev);
+
 	/* Release the bonded slaves */
 	bond_release_all(bond_dev);

^ permalink raw reply

* [v5 Patch 2/3] bridge: make bridge support netpoll
From: Amerigo Wang @ 2010-05-05  8:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jay Vosburgh, Amerigo Wang, Neil Horman, netdev, Matt Mackall,
	bridge, David Miller, Jeff Moyer, Andy Gospodarek, bonding-devel
In-Reply-To: <20100505081514.5157.83783.sendpatchset@localhost.localdomain>


Based on the previous patch, make bridge support netpoll by:

1) implement the 2 methods to support netpoll for bridge;

2) modify netpoll while forwarding packets via bridge;

3) disable netpoll support of bridge when a device that does not
   support netpoll is added to bridge;

4) enable netpoll support when all underlying devices support netpoll.

Cc: David Miller <davem@davemloft.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Stephen Hemminger <shemminger@linux-foundation.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: WANG Cong <amwang@redhat.com>

---

Index: linux-2.6/net/bridge/br_device.c
===================================================================
--- linux-2.6.orig/net/bridge/br_device.c
+++ linux-2.6/net/bridge/br_device.c
@@ -13,8 +13,10 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
+#include <linux/list.h>
 
 #include <asm/uaccess.h>
 #include "br_private.h"
@@ -162,6 +164,59 @@ static int br_set_tx_csum(struct net_dev
 	return 0;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+bool br_devices_support_netpoll(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+	bool ret = true;
+	int count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&br->lock, flags);
+	list_for_each_entry(p, &br->port_list, list) {
+		count++;
+		if ((p->dev->priv_flags & IFF_DISABLE_NETPOLL) ||
+		    !p->dev->netdev_ops->ndo_poll_controller)
+			ret = false;
+	}
+	spin_unlock_irqrestore(&br->lock, flags);
+	return count != 0 && ret;
+}
+
+static void br_poll_controller(struct net_device *br_dev)
+{
+	struct netpoll *np = br_dev->npinfo->netpoll;
+
+	if (np->real_dev != br_dev)
+		netpoll_poll_dev(np->real_dev);
+}
+
+void br_netpoll_cleanup(struct net_device *br_dev)
+{
+	struct net_bridge *br = netdev_priv(br_dev);
+	struct net_bridge_port *p, *n;
+	const struct net_device_ops *ops;
+
+	br->dev->npinfo = NULL;
+	list_for_each_entry_safe(p, n, &br->port_list, list) {
+		if (p->dev) {
+			ops = p->dev->netdev_ops;
+			if (ops->ndo_netpoll_cleanup)
+				ops->ndo_netpoll_cleanup(p->dev);
+			else
+				p->dev->npinfo = NULL;
+		}
+	}
+}
+
+#else
+
+void br_netpoll_cleanup(struct net_device *br_dev)
+{
+}
+
+#endif
+
 static const struct ethtool_ops br_ethtool_ops = {
 	.get_drvinfo    = br_getinfo,
 	.get_link	= ethtool_op_get_link,
@@ -184,6 +239,10 @@ static const struct net_device_ops br_ne
 	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
 	.ndo_change_mtu		 = br_change_mtu,
 	.ndo_do_ioctl		 = br_dev_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_cleanup	 = br_netpoll_cleanup,
+	.ndo_poll_controller	 = br_poll_controller,
+#endif
 };
 
 void br_dev_setup(struct net_device *dev)
Index: linux-2.6/net/bridge/br_forward.c
===================================================================
--- linux-2.6.orig/net/bridge/br_forward.c
+++ linux-2.6/net/bridge/br_forward.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/skbuff.h>
 #include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
@@ -50,7 +51,13 @@ int br_dev_queue_push_xmit(struct sk_buf
 		else {
 			skb_push(skb, ETH_HLEN);
 
-			dev_queue_xmit(skb);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+			if (unlikely(skb->dev->priv_flags & IFF_IN_NETPOLL)) {
+				netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
+				skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
+			} else
+#endif
+				dev_queue_xmit(skb);
 		}
 	}
 
@@ -66,9 +73,23 @@ int br_forward_finish(struct sk_buff *sk
 
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct net_bridge *br = to->br;
+	if (unlikely(br->dev->priv_flags & IFF_IN_NETPOLL)) {
+		struct netpoll *np;
+		to->dev->npinfo = skb->dev->npinfo;
+		np = skb->dev->npinfo->netpoll;
+		np->real_dev = np->dev = to->dev;
+		to->dev->priv_flags |= IFF_IN_NETPOLL;
+	}
+#endif
 	skb->dev = to->dev;
 	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 			br_forward_finish);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (skb->dev->npinfo)
+		skb->dev->npinfo->netpoll->dev = br->dev;
+#endif
 }
 
 static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
Index: linux-2.6/net/bridge/br_if.c
===================================================================
--- linux-2.6.orig/net/bridge/br_if.c
+++ linux-2.6/net/bridge/br_if.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/ethtool.h>
 #include <linux/if_arp.h>
 #include <linux/module.h>
@@ -153,6 +154,14 @@ static void del_nbp(struct net_bridge_po
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (br_devices_support_netpoll(br))
+		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+	if (dev->netdev_ops->ndo_netpoll_cleanup)
+		dev->netdev_ops->ndo_netpoll_cleanup(dev);
+	else
+		dev->npinfo = NULL;
+#endif
 	call_rcu(&p->rcu, destroy_nbp_rcu);
 }
 
@@ -165,6 +174,8 @@ static void del_br(struct net_bridge *br
 		del_nbp(p);
 	}
 
+	br_netpoll_cleanup(br->dev);
+
 	del_timer_sync(&br->gc_timer);
 
 	br_sysfs_delbr(br->dev);
@@ -438,6 +449,20 @@ int br_add_if(struct net_bridge *br, str
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (br_devices_support_netpoll(br)) {
+		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+		if (br->dev->npinfo)
+			dev->npinfo = br->dev->npinfo;
+	} else if (!(br->dev->priv_flags & IFF_DISABLE_NETPOLL)) {
+		br->dev->priv_flags |= IFF_DISABLE_NETPOLL;
+		printk(KERN_INFO "New device %s does not support netpoll\n",
+			dev->name);
+		printk(KERN_INFO "Disabling netpoll for %s\n",
+			br->dev->name);
+	}
+#endif
+
 	return 0;
 err2:
 	br_fdb_delete_by_port(br, p, 1);
Index: linux-2.6/net/bridge/br_private.h
===================================================================
--- linux-2.6.orig/net/bridge/br_private.h
+++ linux-2.6/net/bridge/br_private.h
@@ -233,6 +233,8 @@ static inline int br_is_root_bridge(cons
 extern void br_dev_setup(struct net_device *dev);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
 			       struct net_device *dev);
+extern bool br_devices_support_netpoll(struct net_bridge *br);
+extern void br_netpoll_cleanup(struct net_device *br_dev);
 
 /* br_fdb.c */
 extern int br_fdb_init(void);

^ permalink raw reply

* [v5 Patch 1/3] netpoll: add generic support for bridge and bonding devices
From: Amerigo Wang @ 2010-05-05  8:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jay Vosburgh, Amerigo Wang, Neil Horman, netdev, Matt Mackall,
	bridge, David Miller, Jeff Moyer, Andy Gospodarek, bonding-devel

V5:
Fix coding style problems pointed out by David.

V4:
Use "unlikely" to mark netpoll call path, suggested by Stephen.
Handle NETDEV_GOING_DOWN case.

V3:
Update to latest Linus' tree.
Fix deadlocks when releasing slaves of bonding devices.
Thanks to Andy.

V2:
Fix some bugs of previous version.
Remove ->netpoll_setup and ->netpoll_xmit, they are not necessary.
Don't poll all underlying devices, poll ->real_dev in struct netpoll.
Thanks to David for suggesting above.

------------>

This whole patchset is for adding netpoll support to bridge and bonding
devices. I already tested it for bridge, bonding, bridge over bonding,
and bonding over bridge. It looks fine now.


To make bridge and bonding support netpoll, we need to adjust
some netpoll generic code. This patch does the following things:

1) introduce two new priv_flags for struct net_device:
   IFF_IN_NETPOLL which identifies we are processing a netpoll;
   IFF_DISABLE_NETPOLL is used to disable netpoll support for a device
   at run-time;

2) introduce one new method for netdev_ops:
   ->ndo_netpoll_cleanup() is used to clean up netpoll when a device is
     removed.

3) introduce netpoll_poll_dev() which takes a struct net_device * parameter;
   export netpoll_send_skb() and netpoll_poll_dev() which will be used later;

4) hide a pointer to struct netpoll in struct netpoll_info, ditto.

5) introduce ->real_dev for struct netpoll.

6) introduce a new status NETDEV_BONDING_DESLAVE, which is used to disable
   netconsole before releasing a slave, to avoid deadlocks.

Cc: David Miller <davem@davemloft.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: WANG Cong <amwang@redhat.com>

---

Index: linux-2.6/include/linux/if.h
===================================================================
--- linux-2.6.orig/include/linux/if.h
+++ linux-2.6/include/linux/if.h
@@ -71,6 +71,8 @@
 					 * release skb->dst
 					 */
 #define IFF_DONT_BRIDGE 0x800		/* disallow bridging this ether dev */
+#define IFF_IN_NETPOLL	0x1000		/* whether we are processing netpoll */
+#define IFF_DISABLE_NETPOLL	0x2000	/* disable netpoll at run-time */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
Index: linux-2.6/include/linux/netdevice.h
===================================================================
--- linux-2.6.orig/include/linux/netdevice.h
+++ linux-2.6/include/linux/netdevice.h
@@ -667,6 +667,7 @@ struct net_device_ops {
 						        unsigned short vid);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	void                    (*ndo_poll_controller)(struct net_device *dev);
+	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
Index: linux-2.6/include/linux/netpoll.h
===================================================================
--- linux-2.6.orig/include/linux/netpoll.h
+++ linux-2.6/include/linux/netpoll.h
@@ -14,6 +14,7 @@
 
 struct netpoll {
 	struct net_device *dev;
+	struct net_device *real_dev;
 	char dev_name[IFNAMSIZ];
 	const char *name;
 	void (*rx_hook)(struct netpoll *, int, char *, int);
@@ -36,8 +37,11 @@ struct netpoll_info {
 	struct sk_buff_head txq;
 
 	struct delayed_work tx_work;
+
+	struct netpoll *netpoll;
 };
 
+void netpoll_poll_dev(struct net_device *dev);
 void netpoll_poll(struct netpoll *np);
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
@@ -47,6 +51,7 @@ int netpoll_trap(void);
 void netpoll_set_trap(int trap);
 void netpoll_cleanup(struct netpoll *np);
 int __netpoll_rx(struct sk_buff *skb);
+void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
 
 
 #ifdef CONFIG_NETPOLL
Index: linux-2.6/net/core/netpoll.c
===================================================================
--- linux-2.6.orig/net/core/netpoll.c
+++ linux-2.6/net/core/netpoll.c
@@ -179,9 +179,8 @@ static void service_arp_queue(struct net
 	}
 }
 
-void netpoll_poll(struct netpoll *np)
+void netpoll_poll_dev(struct net_device *dev)
 {
-	struct net_device *dev = np->dev;
 	const struct net_device_ops *ops;
 
 	if (!dev || !netif_running(dev))
@@ -201,6 +200,11 @@ void netpoll_poll(struct netpoll *np)
 	zap_completion_queue();
 }
 
+void netpoll_poll(struct netpoll *np)
+{
+	netpoll_poll_dev(np->dev);
+}
+
 static void refill_skbs(void)
 {
 	struct sk_buff *skb;
@@ -282,7 +286,7 @@ static int netpoll_owner_active(struct n
 	return 0;
 }
 
-static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
 	int status = NETDEV_TX_BUSY;
 	unsigned long tries;
@@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netp
 		     tries > 0; --tries) {
 			if (__netif_tx_trylock(txq)) {
 				if (!netif_tx_queue_stopped(txq)) {
+					dev->priv_flags |= IFF_IN_NETPOLL;
 					status = ops->ndo_start_xmit(skb, dev);
+					dev->priv_flags &= ~IFF_IN_NETPOLL;
 					if (status == NETDEV_TX_OK)
 						txq_trans_update(txq);
 				}
@@ -756,7 +762,10 @@ int netpoll_setup(struct netpoll *np)
 		atomic_inc(&npinfo->refcnt);
 	}
 
-	if (!ndev->netdev_ops->ndo_poll_controller) {
+	npinfo->netpoll = np;
+
+	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+	    !ndev->netdev_ops->ndo_poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
 		       np->name, np->dev_name);
 		err = -ENOTSUPP;
@@ -878,6 +887,7 @@ void netpoll_cleanup(struct netpoll *np)
 			}
 
 			if (atomic_dec_and_test(&npinfo->refcnt)) {
+				const struct net_device_ops *ops;
 				skb_queue_purge(&npinfo->arp_tx);
 				skb_queue_purge(&npinfo->txq);
 				cancel_rearming_delayed_work(&npinfo->tx_work);
@@ -885,7 +895,11 @@ void netpoll_cleanup(struct netpoll *np)
 				/* clean after last, unfinished work */
 				__skb_queue_purge(&npinfo->txq);
 				kfree(npinfo);
-				np->dev->npinfo = NULL;
+				ops = np->dev->netdev_ops;
+				if (ops->ndo_netpoll_cleanup)
+					ops->ndo_netpoll_cleanup(np->dev);
+				else
+					np->dev->npinfo = NULL;
 			}
 		}
 
@@ -908,6 +922,7 @@ void netpoll_set_trap(int trap)
 		atomic_dec(&trapped);
 }
 
+EXPORT_SYMBOL(netpoll_send_skb);
 EXPORT_SYMBOL(netpoll_set_trap);
 EXPORT_SYMBOL(netpoll_trap);
 EXPORT_SYMBOL(netpoll_print_options);
@@ -915,4 +930,5 @@ EXPORT_SYMBOL(netpoll_parse_options);
 EXPORT_SYMBOL(netpoll_setup);
 EXPORT_SYMBOL(netpoll_cleanup);
 EXPORT_SYMBOL(netpoll_send_udp);
+EXPORT_SYMBOL(netpoll_poll_dev);
 EXPORT_SYMBOL(netpoll_poll);
Index: linux-2.6/drivers/net/netconsole.c
===================================================================
--- linux-2.6.orig/drivers/net/netconsole.c
+++ linux-2.6/drivers/net/netconsole.c
@@ -665,7 +665,8 @@ static int netconsole_netdev_event(struc
 	struct netconsole_target *nt;
 	struct net_device *dev = ptr;
 
-	if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER))
+	if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER ||
+	      event == NETDEV_BONDING_DESLAVE || event == NETDEV_GOING_DOWN))
 		goto done;
 
 	spin_lock_irqsave(&target_list_lock, flags);
@@ -677,19 +678,21 @@ static int netconsole_netdev_event(struc
 				strlcpy(nt->np.dev_name, dev->name, IFNAMSIZ);
 				break;
 			case NETDEV_UNREGISTER:
-				if (!nt->enabled)
-					break;
 				netpoll_cleanup(&nt->np);
+				/* Fall through */
+			case NETDEV_GOING_DOWN:
+			case NETDEV_BONDING_DESLAVE:
 				nt->enabled = 0;
-				printk(KERN_INFO "netconsole: network logging stopped"
-					", interface %s unregistered\n",
-					dev->name);
 				break;
 			}
 		}
 		netconsole_target_put(nt);
 	}
 	spin_unlock_irqrestore(&target_list_lock, flags);
+	if (event == NETDEV_UNREGISTER || event == NETDEV_BONDING_DESLAVE)
+		printk(KERN_INFO "netconsole: network logging stopped, "
+			"interface %s %s\n",  dev->name,
+			event == NETDEV_UNREGISTER ? "unregistered" : "released slaves");
 
 done:
 	return NOTIFY_DONE;
Index: linux-2.6/include/linux/notifier.h
===================================================================
--- linux-2.6.orig/include/linux/notifier.h
+++ linux-2.6/include/linux/notifier.h
@@ -203,6 +203,7 @@ static inline int notifier_to_errno(int 
 #define NETDEV_BONDING_NEWTYPE  0x000F
 #define NETDEV_POST_INIT	0x0010
 #define NETDEV_UNREGISTER_BATCH 0x0011
+#define NETDEV_BONDING_DESLAVE  0x0012
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: __alloc_skb() speedup
From: David Miller @ 2010-05-05  8:06 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, hadi, therbert
In-Reply-To: <1272993054.2245.21.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 04 May 2010 19:10:54 +0200

> With following patch I can reach maximum rate of my pktgen+udpsink
> simulator :
> - 'old' machine : dual quad core E5450  @3.00GHz
> - 64 UDP rx flows (only differ by destination port)
> - RPS enabled, NIC interrupts serviced on cpu0
> - rps dispatched on 7 other cores. (~130.000 IPI per second)
> - SLAB allocator (faster than SLUB in this workload)
> - tg3 NIC
> - 1.080.000 pps without a single drop at NIC level.
> 
> Idea is to add two prefetchw() calls in __alloc_skb(), one to prefetch
> first sk_buff cache line, the second to prefetch the shinfo part.
> 
> Also using one memset() to initialize all skb_shared_info fields instead
> of one by one to reduce number of instructions, using long word moves.
> 
> All skb_shared_info fields before 'dataref' are cleared in 
> __alloc_skb().
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

I'll apply this, nice work Eric.

But some caveats...

On several cpu types it is possible to "prefetch invalidate"
cachelines.  PowerPC and sparc64 can both do it.  I'm pretty
sure current gen x86 have SSE bits that can do this too.

In fact, the memset() for sparc64 is going to do these cacheline
invalidates, making the prefetches on 'skb' in fact wasteful.
It will just create spurious bus traffic.

The memset() for skb_shared_info() is going to help universally
I think.



^ permalink raw reply

* [net-next-2.6 PATCH] igb: reduce cache misses on tx cleanup
From: Jeff Kirsher @ 2010-05-05  7:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Nicholas Nunley, Jeff Kirsher

From: Nick Nunley <nicholasx.d.nunley@intel.com>

This patch reduces the number of skb cache misses in the
clean_tx_irq path, and results in an overall increase
in tx packet throughput.

Signed-off-by: Nicholas Nunley <nicholasx.d.nunley@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    4 +++-
 drivers/net/igb/igb_main.c |   45 +++++++++++++++++++-------------------------
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 735ede9..6e63d9a 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -141,8 +141,10 @@ struct igb_buffer {
 			unsigned long time_stamp;
 			u16 length;
 			u16 next_to_watch;
-			u16 mapped_as_page;
+			unsigned int bytecount;
 			u16 gso_segs;
+			union skb_shared_tx shtx;
+			u8 mapped_as_page;
 		};
 		/* RX */
 		struct {
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 438737d..589cf4a 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -3899,34 +3899,33 @@ static inline int igb_tx_map_adv(struct igb_ring *tx_ring, struct sk_buff *skb,
 {
 	struct igb_buffer *buffer_info;
 	struct device *dev = tx_ring->dev;
-	unsigned int len = skb_headlen(skb);
+	unsigned int hlen = skb_headlen(skb);
 	unsigned int count = 0, i;
 	unsigned int f;
+	u16 gso_segs = skb_shinfo(skb)->gso_segs ?: 1;
 
 	i = tx_ring->next_to_use;
 
 	buffer_info = &tx_ring->buffer_info[i];
-	BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
-	buffer_info->length = len;
+	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
+	buffer_info->length = hlen;
 	/* set time_stamp *before* dma to help avoid a possible race */
 	buffer_info->time_stamp = jiffies;
 	buffer_info->next_to_watch = i;
-	buffer_info->dma = dma_map_single(dev, skb->data, len,
+	buffer_info->dma = dma_map_single(dev, skb->data, hlen,
 					  DMA_TO_DEVICE);
 	if (dma_mapping_error(dev, buffer_info->dma))
 		goto dma_error;
 
 	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
-		struct skb_frag_struct *frag;
+		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[f];
+		unsigned int len = frag->size;
 
 		count++;
 		i++;
 		if (i == tx_ring->count)
 			i = 0;
 
-		frag = &skb_shinfo(skb)->frags[f];
-		len = frag->size;
-
 		buffer_info = &tx_ring->buffer_info[i];
 		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
 		buffer_info->length = len;
@@ -3944,7 +3943,10 @@ static inline int igb_tx_map_adv(struct igb_ring *tx_ring, struct sk_buff *skb,
 	}
 
 	tx_ring->buffer_info[i].skb = skb;
-	tx_ring->buffer_info[i].gso_segs = skb_shinfo(skb)->gso_segs ?: 1;
+	tx_ring->buffer_info[i].shtx = skb_shinfo(skb)->tx_flags;
+	/* multiply data chunks by size of headers */
+	tx_ring->buffer_info[i].bytecount = ((gso_segs - 1) * hlen) + skb->len;
+	tx_ring->buffer_info[i].gso_segs = gso_segs;
 	tx_ring->buffer_info[first].next_to_watch = i;
 
 	return ++count;
@@ -5288,22 +5290,21 @@ static void igb_systim_to_hwtstamp(struct igb_adapter *adapter,
 /**
  * igb_tx_hwtstamp - utility function which checks for TX time stamp
  * @q_vector: pointer to q_vector containing needed info
- * @skb: packet that was just sent
+ * @buffer: pointer to igb_buffer structure
  *
  * If we were asked to do hardware stamping and such a time stamp is
  * available, then it must have been for this skb here because we only
  * allow only one such packet into the queue.
  */
-static void igb_tx_hwtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb)
+static void igb_tx_hwtstamp(struct igb_q_vector *q_vector, struct igb_buffer *buffer_info)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
-	union skb_shared_tx *shtx = skb_tx(skb);
 	struct e1000_hw *hw = &adapter->hw;
 	struct skb_shared_hwtstamps shhwtstamps;
 	u64 regval;
 
 	/* if skb does not support hw timestamp or TX stamp not valid exit */
-	if (likely(!shtx->hardware) ||
+	if (likely(!buffer_info->shtx.hardware) ||
 	    !(rd32(E1000_TSYNCTXCTL) & E1000_TSYNCTXCTL_VALID))
 		return;
 
@@ -5311,7 +5312,7 @@ static void igb_tx_hwtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb)
 	regval |= (u64)rd32(E1000_TXSTMPH) << 32;
 
 	igb_systim_to_hwtstamp(adapter, &shhwtstamps, regval);
-	skb_tstamp_tx(skb, &shhwtstamps);
+	skb_tstamp_tx(buffer_info->skb, &shhwtstamps);
 }
 
 /**
@@ -5326,7 +5327,6 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 	struct net_device *netdev = tx_ring->netdev;
 	struct e1000_hw *hw = &adapter->hw;
 	struct igb_buffer *buffer_info;
-	struct sk_buff *skb;
 	union e1000_adv_tx_desc *tx_desc, *eop_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
 	unsigned int i, eop, count = 0;
@@ -5342,19 +5342,12 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 			tx_desc = E1000_TX_DESC_ADV(*tx_ring, i);
 			buffer_info = &tx_ring->buffer_info[i];
 			cleaned = (i == eop);
-			skb = buffer_info->skb;
 
-			if (skb) {
-				unsigned int segs, bytecount;
+			if (buffer_info->skb) {
+				total_bytes += buffer_info->bytecount;
 				/* gso_segs is currently only valid for tcp */
-				segs = buffer_info->gso_segs;
-				/* multiply data chunks by size of headers */
-				bytecount = ((segs - 1) * skb_headlen(skb)) +
-					    skb->len;
-				total_packets += segs;
-				total_bytes += bytecount;
-
-				igb_tx_hwtstamp(q_vector, skb);
+				total_packets += buffer_info->gso_segs;
+				igb_tx_hwtstamp(q_vector, buffer_info);
 			}
 
 			igb_unmap_and_free_tx_resource(tx_ring, buffer_info);


^ permalink raw reply related

* Re: [PATCH] net/gianfar: drop recycled skbs on MTU change
From: David Miller @ 2010-05-05  7:57 UTC (permalink / raw)
  To: afleming; +Cc: sebastian, afleming, netdev
In-Reply-To: <o2g2acbd3e41005040829rc5bf7e0dwee8419b2b65e5468@mail.gmail.com>

From: Andy Fleming <afleming@gmail.com>
Date: Tue, 4 May 2010 08:29:06 -0700

> On Mon, May 3, 2010 at 8:17 AM, Sebastian Andrzej Siewior
> <sebastian@breakpoint.cc> wrote:
>> From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>>
>> The size for skb which is added to the recycled list is using the
>> current descriptor size which is current MTU. gfar_new_skb() is also
>> using this size. So after changing or at least increasing the MTU all
>> recycled skbs should be dropped.
>>
>> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>> ---
>> I'm not 100% sure but it looks like it is wrong.
>>
>>  drivers/net/gianfar.c |    4 +++-
>>  1 files changed, 3 insertions(+), 1 deletions(-)
>>
>> diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
>> index 5267c27..9093106 100644
>> --- a/drivers/net/gianfar.c
>> +++ b/drivers/net/gianfar.c
>> @@ -2287,8 +2287,10 @@ static int gfar_change_mtu(struct net_device *dev, int new_mtu)
>>
>>        /* Only stop and start the controller if it isn't already
>>         * stopped, and we changed something */
>> -       if ((oldsize != tempsize) && (dev->flags & IFF_UP))
>> +       if ((oldsize != tempsize) && (dev->flags & IFF_UP)) {
>>                stop_gfar(dev);
>> +               skb_queue_purge(&priv->rx_recycle);
>> +       }
> 
> 
> I think we should probably do this in free_skb_resources.  And remove
> the call from gfar_close().

Ok, Sebastian please rework your patch as requested by Andy.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next-2.6] pppoe: remove unnecessary checks in pppoe_flush_dev
From: David Miller @ 2010-05-05  7:56 UTC (permalink / raw)
  To: jpirko; +Cc: netdev, ebiederm
In-Reply-To: <20100504150256.GD2845@psychotron.lab.eng.brq.redhat.com>

From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 4 May 2010 17:02:57 +0200

> pernet memory is guaranteed to exist when notifiers are called.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH] FEC: Fix kernel panic in fec_set_mac_address.
From: David Miller @ 2010-05-05  7:55 UTC (permalink / raw)
  To: mattias; +Cc: netdev
In-Reply-To: <4BE02C3E.2000705@vmlinux.org>

From: Mattias Walstrom <mattias@vmlinux.org>
Date: Tue, 04 May 2010 16:16:30 +0200

> Fix memory corruption that sometimes results in a kernel panic.
> 
> Signed-off-by: Mattias Walström <mattias@vmlinux.org>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH -next 1/3] bnx2: Add GRO support.
From: David Miller @ 2010-05-05  7:49 UTC (permalink / raw)
  To: mchan; +Cc: netdev
In-Reply-To: <1273036906-29162-1-git-send-email-mchan@broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Tue, 4 May 2010 22:21:44 -0700

> Signed-off-by: Michael Chan <mchan@broadcom.com>
> Signed-off-by: Matt Carlson <mcarlson@broadcom.com>

Please turn it on by default by setting NETIF_F_GRO in your probe
routine.

I don't want a precedent started where we have drivers that support
GRO but don't enable it by default.  If there are some performance
degradations, we fix them instead of deferring the issue by not
enabling it by default in some awkward selection of drivers.

^ permalink raw reply

* Re: Performance problem in network namespaces
From: Martín Ferrari @ 2010-05-05  7:37 UTC (permalink / raw)
  To: Benny Amorsen; +Cc: netdev, Mathieu Lacage
In-Reply-To: <m3y6g0m204.fsf@ursa.amorsen.dk>

Hi Benny,

On Tue, May 4, 2010 at 11:48, Benny Amorsen <benny+usenet@amorsen.dk> wrote:

>> When running some benchmarks to test the feasibility of using
>> namespaces for emulating networks, I have found a big drop in
>> performance when one of the namespaces is performing routing of
>> packets.

> Is this problem specific to vnet, or do the other types of interfaces
> suffer from it as well? (phys, vlan, macvlan...)

Seems that it is specific to vnet, but if there's another way of having
datagrams created locally that get into the routing code before
leaving the system, maybe that would have the same problem.

I tried a couple of combinations that somehow included routing in the mix:
- routing loop over the same ethernet device (e1000e)
- routing between eth and wlan (iwlagn)
- eth and veth (with the paired veth inside a different namespace)
- wlan and veth (ditto)

In all those cases, I see that in ip_forward() the headroom is already
enough, and I think is due to the fact that the hardware drivers use
netdev_alloc_skb which already adds NET_SKB_PAD to the length
requested.

Using macvlan over the real devices showed the same results.

I also tried:
- eth and wlan: packets arriving from eth had a headroom  of 48, but
80 was needed to pass it to wlan

-- 
Martín Ferrari

^ permalink raw reply

* Re: [PATCH -next 2/3] bnx2: Add prefetches to rx path.
From: Eric Dumazet @ 2010-05-05  5:45 UTC (permalink / raw)
  To: Michael Chan; +Cc: davem, netdev
In-Reply-To: <1273036906-29162-2-git-send-email-mchan@broadcom.com>

Le mardi 04 mai 2010 à 22:21 -0700, Michael Chan a écrit :
> Add prefetches of the skb and the next rx descriptor to speed up rx path.
> 
> Signed-off-by: Michael Chan <mchan@broadcom.com>
> Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
> ---
>  drivers/net/bnx2.c |   12 +++++++++---
>  drivers/net/bnx2.h |    1 +
>  2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
> index 6ad3184..cdee29b 100644
> --- a/drivers/net/bnx2.c
> +++ b/drivers/net/bnx2.c
> @@ -2719,6 +2719,7 @@ bnx2_alloc_rx_skb(struct bnx2 *bp, struct bnx2_rx_ring_info *rxr, u16 index)
>  	}
>  
>  	rx_buf->skb = skb;
> +	rx_buf->desc = (struct l2_fhdr *) skb->data;
>  	dma_unmap_addr_set(rx_buf, mapping, mapping);
>  
>  	rxbd->rx_bd_haddr_hi = (u64) mapping >> 32;
> @@ -2941,6 +2942,7 @@ bnx2_reuse_rx_skb(struct bnx2 *bp, struct bnx2_rx_ring_info *rxr,
>  	rxr->rx_prod_bseq += bp->rx_buf_use_size;
>  
>  	prod_rx_buf->skb = skb;
> +	prod_rx_buf->desc = (struct l2_fhdr *) skb->data;
>  
>  	if (cons == prod)
>  		return;
> @@ -3086,7 +3088,7 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
>  	while (sw_cons != hw_cons) {
>  		unsigned int len, hdr_len;
>  		u32 status;
> -		struct sw_bd *rx_buf;
> +		struct sw_bd *rx_buf, *next_rx_buf;
>  		struct sk_buff *skb;
>  		dma_addr_t dma_addr;
>  		u16 vtag = 0;
> @@ -3097,7 +3099,11 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
>  
>  		rx_buf = &rxr->rx_buf_ring[sw_ring_cons];
>  		skb = rx_buf->skb;
> +		prefetch(skb);

why not a prefetchw() ?

>  
> +		next_rx_buf =
> +			&rxr->rx_buf_ring[RX_RING_IDX(NEXT_RX_BD(sw_cons))];
> +		prefetch(next_rx_buf->desc);

So cpu is allowed to start a memory transaction on next_skb->data, while
not yet DMA unmapped?

>  		rx_buf->skb = NULL;
>  
>  		dma_addr = dma_unmap_addr(rx_buf, mapping);
> @@ -3106,7 +3112,7 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
>  			BNX2_RX_OFFSET + BNX2_RX_COPY_THRESH,
>  			PCI_DMA_FROMDEVICE);
>  
> -		rx_hdr = (struct l2_fhdr *) skb->data;
> +		rx_hdr = rx_buf->desc;
>  		len = rx_hdr->l2_fhdr_pkt_len;
>  		status = rx_hdr->l2_fhdr_status;
>  
> @@ -5764,7 +5770,7 @@ bnx2_run_loopback(struct bnx2 *bp, int loopback_mode)
>  	rx_buf = &rxr->rx_buf_ring[rx_start_idx];
>  	rx_skb = rx_buf->skb;
>  
> -	rx_hdr = (struct l2_fhdr *) rx_skb->data;
> +	rx_hdr = rx_buf->desc;
>  	skb_reserve(rx_skb, BNX2_RX_OFFSET);
>  
>  	pci_dma_sync_single_for_cpu(bp->pdev,
> diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
> index ab34a5d..dd35bd0 100644
> --- a/drivers/net/bnx2.h
> +++ b/drivers/net/bnx2.h
> @@ -6551,6 +6551,7 @@ struct l2_fhdr {
>  
>  struct sw_bd {
>  	struct sk_buff		*skb;
> +	struct l2_fhdr		*desc;
>  	DEFINE_DMA_UNMAP_ADDR(mapping);
>  };
>  

It seems there are two parts in this patch, one caching skb->data in
sw_bd, and prefetches...




^ permalink raw reply

* Re: Receive issues with bonding and vlans
From: John Fastabend @ 2010-05-05  5:35 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Leech, Christopher, netdev@vger.kernel.org, Andy Gospodarek,
	Patrick McHardy, bonding-devel@lists.sourceforge.net
In-Reply-To: <12574.1272928653@death.nxdomain.ibm.com>

Jay Vosburgh wrote:
> John Fastabend <john.r.fastabend@intel.com> wrote:
> 
>> Jay Vosburgh wrote:
>>> John Fastabend <john.r.fastabend@intel.com> wrote:
>>>
> [...]
>>>> It should be OK to allow packets to be received on the VLAN if it is not
>>>> explicitly in the bond?
>>>         Lemme see if I have this straight, because all of these special
>>> cases are making my brain hurt.  This one is for a configuration like this:
>>>
>>>         bond0 ----- eth0
>>>                    /
>>>         vlan.xxx -/
>>>
>>>         I.e., a VLAN configured directly atop an ethernet device, said
>>> ethernet also being a slave to bonding.  Is that correct?
>>>
>> Yes, this is the correct scenario that we are considering.
>>
>>>         Extrapolating from the ASCII art in a prior message in this
>>> discussion, would this configuration really be something like this:
>>>
>>>         vlan.xxx -\
>>>                    \
>>>         bond0 ----- eth1
>>>         bond0 ----- eth0
>>>                    /
>>>         vlan.xxx -/
>>>
>>>         I.e., two slaves to bonding, with VLAN xxx configured atop both
>>> of the slaves?  Or would the eth0 and eth1 use discrete VLAN ids?  The
>>> reason I ask is in regards to duplicate suppression.  The whole reason
>>> the "inactive" slave drops (most) incoming packets is to eliminate
>>> duplicates when the switch floods traffic to both slave ports.
>>>
>> These vlan ids could be the same or discrete I think both configurations
>> should be valid.
>>
>>>         This is a bit tricky, because it's not really about broadcasts /
>>> multicasts so much, but about traffic that the switch sends to all ports
>>> because the switch's MAC address table isn't up to date with the
>>> destination MAC of the traffic (which is a transient condition, so this
>>> would only happen for perhaps one second or so).  That would result in
>>> duplicate unicast packets being received by the bond (back in the day
>>> before bonding had the "drop inactive traffic" logic).
>>>
>>>         So if the same VLAN is configured atop the two slaves, I wonder
>>> if that will open a window for the duplicate unicast packet problem.
>> OK, this does appear to open a window for duplicated unicast packets. By
>> only allowing handlers with exact matches at least this issue is less
>> obvious and we are assuming the packet handler can deal with this
>> duplication.  This seems to be the current assumption made. The same issue
>> exists today for real device in the following setup,
>>
>> vlan --> bond0 --> eth
> 
>         I just tested this, and I'm not seeing duplicate packets using
> the test that used to show the problem before the "drop dups" logic went
> in (clear the switch's mac address-table, ping -c 25 -f [peer on VLAN],
> compare "packets transmitted" to "packets received").
> 
>         That doesn't mean there isn't a gap in the logic somewhere, just
> that the original problem hasn't resurfaced (as far as I can tell).
> 
>> Specifically for FCoE we use the san mac address so it wouldn't be an
>> issue here.  The expectation being that the switch will only ever use the
>> correct san mac on the port.
> 
>         The issue arises when the switch does not have the destination
> MAC in its address table, and as such is transitory, and only occurs
> after sufficiently long periods of no traffic (or a manual flush of the
> table).  The packets are sent to all ports until the MAC table updates
> (which seems to take place asynchronously), which is usually about 1
> second or so (on the midrange Cisco gear I have).
> 
>         For example, with the switch's mac address table cleared, when
> starting a "ping -f" I can watch as first every port's light blinks,
> then all but two stop blinking.  During the time that every port is
> blinking, the switch is sending all the packets to every port because
> the mac address table hasn't updated the switching logic (however that
> works under the covers).
> 
> 
> 
>>>         If the VLAN ids are different, then I'll assume this is some
>>> kind of device mapper magic, doing the load balancing elsewhere.
>> Correct device mapper handles load balancing and failover for both cases,
>> when the vlan ids are different and when they are the same.
>>
>>>> Or if we want to be more paranoid deliver packets only to handlers with
>>>> exact matches for the device. For non vlan devices we deliver skb's to
>>>> packet handlers that match exactly even on inactive slaves so doing this
>>>> on vlan devices as well makes sense and shouldn't cause any unexpected
>>>> problems.
>>>         Yah, the whole concept of "inactive" slave is pretty mutated
>>> now; it's kind of become the "active-backup with semi-manual load
>>> balance for clever protocols, oh, and duplicate suppression" mode.
>>>
>>>> Also on a somewhat unrelated note I suspect null_or_orig and null_or_bond
>>>> are not working as expected in __netif_receive_skb().  At least the
>>>> comment 'deliver only exact match' could be inaccurate.
>>>         I don't think this is unrelated at all.  This code (the packet
>>> to device lookup stuff in __netif_receive_skb) has been modified pretty
>>> extensively lately for various bonding-related special cases, and I
>>> think it is getting hard to follow.  Whatever comments are there need to
>>> be accurate, and, honestly, I think this code needs comments to explain
>>> what exactly is supposed to happen for these special cases.
>>>
>> Agreed.  This should be cleaned up and some explanations added.  The
>> current behavior in active-backup mode is that receiving packets on the
>> bonded real device in active mode fails, but putting that same real
>> device in an inactive state will cause it to receive packets.  This is an
>> inconsistency, which should probably be fixed by initializing null_or_bond
>> to orig_dev, and also renaming it orig_or_bond at that point.
>>
>>>> Here's a patch to illustrate what I'm thinking compile tested only.  If
>>>> this sounds reasonable I'll work up an official patch.
>>>>
>>>>
>>>> [PATCH] net: allow vlans on bonded real net_devices
>>>>
>>>> For converged I/O it is reasonable to use dm_multipathing to provide
>>>> failover and load balancing for storage traffic and then use bonding
>>>> for the LAN failover and load balancing.
>>>>
>>>> Currently this works if the multipathed devices are using the real
>>>> net_device and those devices are enslaved to a bonded device.  What
>>>> does not work is creating a vlan on the real device and trying to
>>>> use it outside the bond for multipathing.
>>>>
>>>> This patch adds logic so that if the skb is destined for a vlan
>>>> that is not in the bond the skb will not be dropped.
>>>>
>>>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>>>> ---
>>>>
>>>> net/8021q/vlan_core.c |   31 +++++++++++++++++++++----------
>>>> net/core/dev.c        |   11 ++++++++---
>>>> 2 files changed, 29 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
>>>> index c584a0a..3bce0c3 100644
>>>> --- a/net/8021q/vlan_core.c
>>>> +++ b/net/8021q/vlan_core.c
>>>> @@ -8,18 +8,24 @@
>>>> int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
>>>>                     u16 vlan_tci, int polling)
>>>> {
>>>> +      struct net_device *vlan_dev;
>>>> +
>>>>       if (netpoll_rx(skb))
>>>>               return NET_RX_DROP;
>>>>
>>>> -      if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
>>>> +      vlan_dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
>>>> +
>>>> +      if (!vlan_dev)
>>>> +              goto drop;
>>>> +
>>>> +      if ((vlan_dev->priv_flags & IFF_BONDING ||
>>>> +          vlan_dev_real_dev(vlan_dev)->flags & IFF_MASTER) &&
>>>> +          skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
>>>         I'm not sure this will do the right thing if the VLAN device
>>> itself is a slave to bonding, e.g., bond0 ---> vlan.xxx ---> eth0.  In
>>> that case, eth0's dev->master is NULL, and the vlan_dev (vlan.xxx's dev)
>>> doesn't have IFF_MASTER (but does have IFF_SLAVE and IFF_BONDING, I
>>> believe).
>>>
>> correct, vlan_dev does have IFF_BONDING and IFF_SLAVE here and doesn't
>> have IFF_MASTER.
>>
>>
>>>         I think this will result in all incoming traffic being accepted
>>> on such a configuration (leading to duplicates, as described above).
>>>
>>>         I suspect, but have not tested, that something like this might
>>> do what you're looking for:
>>>
>>>         if ((vlan_dev->priv_flags & IFF_BONDING ||
>>>             vlan_dev_real_dev(vlan_dev)->flags & (IFF_MASTER | IFF_SLAVE)) &&
>>>             skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
>>>
>>>         I.e., if the VLAN device is either a MASTER (configured above
>>> the bond) or a slave (configured below the bond) do the duplicate
>>> suppression.
>> Here are the three basic cases I see,
>>
>> #1. vlanx --> bond0 --> ethx
>>
>> In this case vlanx does not have IFF_BONDING set and real_dev is ethx with
>> IFF_SLAVE set.  ethx has master dev->bond0 so this should work. And shows
>> why we need the IFF_SLAVE bit as you pointed out and I dropped.
>>
>> #2. bond  --> vlanx --> ethx
>>
>> This case is broken, skb->dev->master is NULL so we would never drop this
>> pkt.  As it exists today I suspect this is broken as well.
> 
>         In the VLAN pass, yes, but the VLAN input path will call into
> netif_receive_skb, and at that point the skb->dev is the vlan device,
> and it has a dev->master.  I haven't tested this lately, but I'm fairly
> sure this works.
> 

OK, these both seem to work as expected; my test was invalid.

>> #3 bond0 --> ethx
>>   vlanx --> -|
>>
>> Here is the case where adding the IFF_SLAVE bit doesn't work as I
>> hoped. We don't want to run skb_bond_should_drop here.
> 
>         Yes, this is tricky because the VLAN device will copy the
> dev->flags from the device it's placed atop, so the VLAN will inherit
> the ethx's IFF_SLAVE flag.  This happens regardless of the setup order
> (enslave ethX, then add VLAN, or vice versa).
> 

This doesn't appear to be true; adding a VLAN on ethx and then enslaving ethx
doesn't set the IFF_SLAVE flag on the VLAN.  Unless I am missing something.

>         I suspect this case may be testable because the VLAN device has
> IFF_SLAVE, but has no dev->master.
> 
>> So I think there needs to be a bit of logic here to determine if we need
>> to check skb_bond_should_drop with the vlan device or with the
>> skb->dev->master. Something like this might do:
>>
>> should_drop_dev = vlan_dev->master ? vlan_dev->master : skb->dev->master
>>
>> This should fix case #2 without breaking case #1.  And the case I want to
>> allow is still not resolved.  I'll think about this some more maybe this
>> logic can be fixed for all cases.
> 
>         As I said above, I don't think case #2 is really broken now.
> 

Seems to be working; sorry for the noise.

<<snip>>

> 
>         Hopefully this will be the last futzing around with this, and
> won't make it too complicated.
> 

I currently believe the cleanest way to implement this is to add a
pkt_type flag PACKET_DROP to mark skbs that have been received on the
inactive slave.  I sent out a functional RFC; I would like to run a few
more tests on it, but otherwise I think it is OK.

Thanks,
John.

>         -J
> 
> ---
>         -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com


^ permalink raw reply

* [RFC PATCH] net: deliver skbs on inactive slaves to exact matches
From: John Fastabend @ 2010-05-05  5:34 UTC (permalink / raw)
  To: john.r.fastabend, netdev; +Cc: christopher.leech, andy, kaber, bonding-devel

Currently, the accelerated receive path for VLAN's will
drop packets if the real device is an inactive slave and
is not one of the special pkts tested for in
skb_bond_should_drop().  This behavior is different than
the non-accelerated path and for pkts over a bonded vlan.

For example,

vlanx -> bond0 -> ethx

will be dropped in the vlan path and not delivered to any
packet handlers.  However,

bond0 -> vlanx -> ethx

will be delivered to handlers that match the exact dev,
because the VLAN path checks the real_dev which is not a
slave and netif_receive_skb() doesn't drop frames but only
delivers them to exact matches.

This patch adds a pkt_type PACKET_DROP which is now used
to identify skbs that would previously have been dropped and
allows the skb to continue to netif_receive_skb().  Here we
add logic to check for PACKET_DROP and if so only deliver
to handlers that match exactly.  IMHO this is more
consistent and gives pkt handlers a way to identify skbs
that come from inactive slaves.

This allows a third case to function which is important for
doing multipath with FCoE traffic while LAN traffic is bonded,

bond0 -> ethx
          |
vlanx -> --

Here the vlan is not in bond0 but the FCoE handler can still
receive the skb.  Previously these skbs were dropped.

I have tested the following 4 configurations in failover modes
and load balancing modes and have not seen any duplicate packets
or unexpected behavior.

# bond0 -> ethx

# vlanx -> bond0 -> ethx

# bond0 -> vlanx -> ethx

# bond0 -> ethx
            |
  vlanx -> --

Also this removes the PACKET_FASTROUTE define which was not being
used.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/if_packet.h |    2 +-
 net/8021q/vlan_core.c     |    4 ++--
 net/core/dev.c            |   25 ++++++++++++++++++-------
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 6ac23ef..9d079fa 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -28,7 +28,7 @@ struct sockaddr_ll {
 #define PACKET_OUTGOING		4		/* Outgoing of any type */
 /* These ones are invisible by user level */
 #define PACKET_LOOPBACK		5		/* MC/BRD frame looped back */
-#define PACKET_FASTROUTE	6		/* Fastrouted frame	*/
+#define PACKET_DROP		6		/* Drop packet 		*/
 
 /* Packet socket options */
 
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index c584a0a..4510e08 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -12,7 +12,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 		return NET_RX_DROP;
 
 	if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
-		goto drop;
+		skb->pkt_type = PACKET_DROP;
 
 	skb->skb_iif = skb->dev->ifindex;
 	__vlan_hwaccel_put_tag(skb, vlan_tci);
@@ -84,7 +84,7 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
 	struct sk_buff *p;
 
 	if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
-		goto drop;
+		skb->pkt_type = PACKET_DROP;
 
 	skb->skb_iif = skb->dev->ifindex;
 	__vlan_hwaccel_put_tag(skb, vlan_tci);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4d43f1a..0a5b0b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2776,7 +2776,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	struct net_device *orig_dev;
 	struct net_device *master;
 	struct net_device *null_or_orig;
-	struct net_device *null_or_bond;
+	struct net_device *dev_or_bond;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -2793,13 +2793,24 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
 
+	/*
+	 * bonding note: skbs received on inactive slaves should only
+	 * be delivered to pkt handlers that are exact matches.  Also
+	 * the pkt_type field will be marked PACKET_DROP.  If packet
+	 * handlers are sensitive to duplicate packets these skbs will
+	 * need to be dropped at the handler.  The vlan accel path may
+	 * have already set PACKET_DROP.
+	 */
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
 	master = ACCESS_ONCE(orig_dev->master);
-	if (master) {
-		if (skb_bond_should_drop(skb, master))
+	if (skb->pkt_type == PACKET_DROP)
+		null_or_orig = orig_dev;
+	else if (master) {
+		if (skb_bond_should_drop(skb, master)) {
+			skb->pkt_type = PACKET_DROP;
 			null_or_orig = orig_dev; /* deliver only exact match */
-		else
+		} else
 			skb->dev = master;
 	}
 
@@ -2849,10 +2860,10 @@ ncls:
 	 * device that may have registered for a specific ptype.  The
 	 * handler may have to adjust skb->dev and orig_dev.
 	 */
-	null_or_bond = NULL;
+	dev_or_bond = skb->dev;
 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		null_or_bond = vlan_dev_real_dev(skb->dev);
+		dev_or_bond = vlan_dev_real_dev(skb->dev);
 	}
 
 	type = skb->protocol;
@@ -2860,7 +2871,7 @@ ncls:
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
 		if (ptype->type == type && (ptype->dev == null_or_orig ||
 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == null_or_bond)) {
+		     ptype->dev == dev_or_bond)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;


^ permalink raw reply related

* [PATCH net-next-2.6] net: Increase NET_SKB_PAD to 64 bytes
From: Eric Dumazet @ 2010-05-05  5:24 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jamal, Tom Herbert

eth_type_trans() & get_rps_cpus() currently need two 64bytes cache lines
in packet to compute rxhash.

Increasing NET_SKB_PAD from 32 to 64 reduces the need to one cache line
only, and makes RPS faster.

NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 746a652..fe5798b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1356,9 +1356,12 @@ static inline int skb_network_offset(const struct sk_buff *skb)
  *
  * Various parts of the networking layer expect at least 32 bytes of
  * headroom, you should not reduce this.
+ * With RPS, we raised NET_SKB_PAD to 64 so that get_rps_cpus() fetches span
+ * a 64 bytes aligned block to fit modern (>= 64 bytes) cache line sizes
+ * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
  */
 #ifndef NET_SKB_PAD
-#define NET_SKB_PAD	32
+#define NET_SKB_PAD	64
 #endif
 
 extern int ___pskb_trim(struct sk_buff *skb, unsigned int len);



^ permalink raw reply related

* [PATCH -next 2/3] bnx2: Add prefetches to rx path.
From: Michael Chan @ 2010-05-05  5:21 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1273036906-29162-1-git-send-email-mchan@broadcom.com>

Add prefetches of the skb and the next rx descriptor to speed up rx path.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
---
 drivers/net/bnx2.c |   12 +++++++++---
 drivers/net/bnx2.h |    1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 6ad3184..cdee29b 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2719,6 +2719,7 @@ bnx2_alloc_rx_skb(struct bnx2 *bp, struct bnx2_rx_ring_info *rxr, u16 index)
 	}
 
 	rx_buf->skb = skb;
+	rx_buf->desc = (struct l2_fhdr *) skb->data;
 	dma_unmap_addr_set(rx_buf, mapping, mapping);
 
 	rxbd->rx_bd_haddr_hi = (u64) mapping >> 32;
@@ -2941,6 +2942,7 @@ bnx2_reuse_rx_skb(struct bnx2 *bp, struct bnx2_rx_ring_info *rxr,
 	rxr->rx_prod_bseq += bp->rx_buf_use_size;
 
 	prod_rx_buf->skb = skb;
+	prod_rx_buf->desc = (struct l2_fhdr *) skb->data;
 
 	if (cons == prod)
 		return;
@@ -3086,7 +3088,7 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
 	while (sw_cons != hw_cons) {
 		unsigned int len, hdr_len;
 		u32 status;
-		struct sw_bd *rx_buf;
+		struct sw_bd *rx_buf, *next_rx_buf;
 		struct sk_buff *skb;
 		dma_addr_t dma_addr;
 		u16 vtag = 0;
@@ -3097,7 +3099,11 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
 
 		rx_buf = &rxr->rx_buf_ring[sw_ring_cons];
 		skb = rx_buf->skb;
+		prefetch(skb);
 
+		next_rx_buf =
+			&rxr->rx_buf_ring[RX_RING_IDX(NEXT_RX_BD(sw_cons))];
+		prefetch(next_rx_buf->desc);
 		rx_buf->skb = NULL;
 
 		dma_addr = dma_unmap_addr(rx_buf, mapping);
@@ -3106,7 +3112,7 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
 			BNX2_RX_OFFSET + BNX2_RX_COPY_THRESH,
 			PCI_DMA_FROMDEVICE);
 
-		rx_hdr = (struct l2_fhdr *) skb->data;
+		rx_hdr = rx_buf->desc;
 		len = rx_hdr->l2_fhdr_pkt_len;
 		status = rx_hdr->l2_fhdr_status;
 
@@ -5764,7 +5770,7 @@ bnx2_run_loopback(struct bnx2 *bp, int loopback_mode)
 	rx_buf = &rxr->rx_buf_ring[rx_start_idx];
 	rx_skb = rx_buf->skb;
 
-	rx_hdr = (struct l2_fhdr *) rx_skb->data;
+	rx_hdr = rx_buf->desc;
 	skb_reserve(rx_skb, BNX2_RX_OFFSET);
 
 	pci_dma_sync_single_for_cpu(bp->pdev,
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index ab34a5d..dd35bd0 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -6551,6 +6551,7 @@ struct l2_fhdr {
 
 struct sw_bd {
 	struct sk_buff		*skb;
+	struct l2_fhdr		*desc;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
-- 
1.6.4.GIT



^ permalink raw reply related

* [PATCH -next 1/3] bnx2: Add GRO support.
From: Michael Chan @ 2010-05-05  5:21 UTC (permalink / raw)
  To: davem; +Cc: netdev

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
---
 drivers/net/bnx2.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index ab26bbc..6ad3184 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -3207,10 +3207,10 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
 
 #ifdef BCM_VLAN
 		if (hw_vlan)
-			vlan_hwaccel_receive_skb(skb, bp->vlgrp, vtag);
+			vlan_gro_receive(&bnapi->napi, bp->vlgrp, vtag, skb);
 		else
 #endif
-			netif_receive_skb(skb);
+			napi_gro_receive(&bnapi->napi, skb);
 
 		rx_pkt++;
 
-- 
1.6.4.GIT



^ permalink raw reply related

* Re: [Patch 2/3] sysctl: add proc_do_large_bitmap
From: Cong Wang @ 2010-05-05  3:14 UTC (permalink / raw)
  To: Changli Gao
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, adobriyan, David Miller
In-Reply-To: <v2y412e6f7f1004301541yb1ede589t8c446966743ca138@mail.gmail.com>

Changli Gao wrote:
>                      add the following lines to let "echo 1-10 >>
> /proc/..." work as normal.

Hmm, I haven't tested this, what did you see if we append
lines into it?

Also, do we need to support appending lines to this /proc file when designing it?
Octavian? Eric?

Thanks.

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Cong Wang @ 2010-05-05  3:02 UTC (permalink / raw)
  To: Changli Gao
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller, adobriyan
In-Reply-To: <u2s412e6f7f1004301549tb0e88a80n4c621e42c0b31015@mail.gmail.com>

Changli Gao wrote:
> On Fri, Apr 30, 2010 at 4:25 PM, Amerigo Wang <amwang@redhat.com> wrote:
>> +       if (*p == '-' && *size > 1) {
>> +               *neg = 1;
> 
> As neg is bool*, you should use true and false instead of 1 and 0.
> 

Yeah, I only corrected those lines that I touched, I should
correct them all.

Will fix.

Thanks.


^ permalink raw reply

* MDaemon Notification -- Attachment Removed
From: Postmaster @ 2010-05-05  2:41 UTC (permalink / raw)
  To: netdev

-------------------------------------------------------------------
MDaemon has detected restricted attachments within an email message
-------------------------------------------------------------------

>From      : netdev@vger.kernel.org
To        : ngochphcm@klv.com.vn
Subject   : Mail Delivery (failure ngochphcm@klv.com.vn)
Message-ID: 

---------------------
Attachment(s) removed
---------------------
message.scr



^ permalink raw reply

* Re: RFC: Network Plugin Architecture (NPA) for vmxnet3
From: Stephen Hemminger @ 2010-05-05  2:44 UTC (permalink / raw)
  To: Pankaj Thakkar
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	virtualization@lists.linux-foundation.org, pv-drivers@vmware.com,
	Shreyas Bhatewara
In-Reply-To: <20100505001857.GQ8323@vmware.com>

On Tue, 4 May 2010 17:18:57 -0700
Pankaj Thakkar <pthakkar@vmware.com> wrote:

> The purpose of this email is to introduce the architecture and the design principles. The overall project involves more than just changes to vmxnet3 driver and hence we thought an overview email would be better. Once people agree to the design in general we intend to provide the code changes to the vmxnet3 driver.

As Dave said, we care more about what the implementation looks like than the high level
goals of the design. I think we all agree that better management of virtualized devices
is necessary, the problem is that there are so many of them (vmware, xen, HV, Xen), 
and vendors seem to lean on their own specific implementation of an offloading, 
which makes a general solution more difficult. Please, Please solve this cleanly.

The little things like API's and locking semantics and handling of dynamic versus
static control can make a good design in principle fall apart when someone does a bad
job of implementing them.

Lastly, projects that have had multiple people involved for long periods of time
in the dark often end up building a legacy mentality "but we convinced vendor XXX to include it
in their Enterprise version 666" and require lots of "retraining" before the code
becomes acceptable.

-- 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox