Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/5] Fix deadlock in bonding driver resulting from internal locking when using netpoll
From: nhorman @ 2010-10-12 20:29 UTC (permalink / raw)
  To: netdev; +Cc: bonding-devel, fubar, davem, andy, amwang, nhorman
In-Reply-To: <1286915377-1612-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>

The monitoring paths in the bonding driver take write locks that are shared by
the tx path.  If netconsole is in use, these paths can call printk which puts us
in the netpoll tx path, which, if netconsole is attached to the bonding driver,
result in deadlock (the xmit_lock guards are useless in netpoll_send_skb, as the
monitor paths in the bonding driver don't claim the xmit_lock, nor should they).
The solution is to use a per cpu flag internal to the driver to indicate when a
cpu is holding the lock in a path that might recusrse into the tx path for the
driver via netconsole.  By checking this flag on transmit, we can defer the
sending of the netconsole frames until a later time using the retransmit feature
of netpoll_send_skb that is triggered on the return code NETDEV_TX_BUSY.  I've
tested this and am able to transmit via netconsole while causing failover
conditions on the bond slave links.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 drivers/net/bonding/bond_main.c |   67 ++++++++++++++++++++++++++++++++++++--
 1 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index eb7d089..1d0d8c5 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -76,6 +76,7 @@
 #include <linux/if_vlan.h>
 #include <linux/if_bonding.h>
 #include <linux/jiffies.h>
+#include <linux/preempt.h>
 #include <net/route.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
@@ -169,6 +170,35 @@ MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on link
 
 /*----------------------------- Global variables ----------------------------*/
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static cpumask_var_t netpoll_block_tx;
+
+static inline void block_netpoll_tx(void
+{
+	preempt_disable();
+	BUG_ON(cpumask_test_and_set_cpu(smp_processor_id(),
+					netpoll_block_tx));
+}
+
+static inline void unblock_netpoll_tx(void)
+{
+	BUG_ON(!cpumask_test_and_clear_cpu(smp_processor_id(),
+					  netpoll_block_tx));
+	preempt_enable();
+}
+
+static inline int is_netpoll_tx_blocked(struct net_device *dev)
+{
+	if (unlikely(dev->priv_flags & IFF_IN_NETPOLL))
+		return cpumask_test_cpu(smp_processor_id(), netpoll_block_tx);
+	return 0;
+}
+#else
+#define block_netpoll_tx()
+#define unblock_netpoll_tx()
+#define is_netpoll_tx_blocked(dev)
+#endif
+
 static const char * const version =
 	DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n";
 
@@ -310,6 +340,7 @@ static int bond_del_vlan(struct bonding *bond, unsigned short vlan_id)
 
 	pr_debug("bond: %s, vlan id %d\n", bond->dev->name, vlan_id);
 
+	block_netpoll_tx();
 	write_lock_bh(&bond->lock);
 
 	list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
@@ -344,6 +375,7 @@ static int bond_del_vlan(struct bonding *bond, unsigned short vlan_id)
 
 out:
 	write_unlock_bh(&bond->lock);
+	unblock_netpoll_tx();
 	return res;
 }
 
@@ -1804,10 +1836,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	bond_set_carrier(bond);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
-	/*
-	 * Netpoll and bonding is broken, make sure it is not initialized
-	 * until it is fixed.
-	 */
 	if (disable_netpoll) {
 		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
 	} else {
@@ -1892,6 +1920,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		return -EINVAL;
 	}
 
+	block_netpoll_tx();
 	netdev_bonding_change(bond_dev, NETDEV_BONDING_DESLAVE);
 	write_lock_bh(&bond->lock);
 
@@ -1901,6 +1930,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		pr_info("%s: %s not enslaved\n",
 			bond_dev->name, slave_dev->name);
 		write_unlock_bh(&bond->lock);
+		unblock_netpoll_tx();
 		return -EINVAL;
 	}
 
@@ -1994,6 +2024,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 	}
 
 	write_unlock_bh(&bond->lock);
+	unblock_netpoll_tx();
 
 	/* must do this from outside any spinlocks */
 	bond_destroy_slave_symlinks(bond_dev, slave_dev);
@@ -2085,6 +2116,7 @@ static int bond_release_all(struct net_device *bond_dev)
 	struct net_device *slave_dev;
 	struct sockaddr addr;
 
+	block_netpoll_tx();
 	write_lock_bh(&bond->lock);
 
 	netif_carrier_off(bond_dev);
@@ -2183,6 +2215,7 @@ static int bond_release_all(struct net_device *bond_dev)
 
 out:
 	write_unlock_bh(&bond->lock);
+	unblock_netpoll_tx();
 
 	return 0;
 }
@@ -2232,9 +2265,11 @@ static int bond_ioctl_change_active(struct net_device *bond_dev, struct net_devi
 	    (old_active) &&
 	    (new_active->link == BOND_LINK_UP) &&
 	    IS_UP(new_active->dev)) {
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_change_active_slave(bond, new_active);
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 	} else
 		res = -EINVAL;
 
@@ -2466,9 +2501,11 @@ static void bond_miimon_commit(struct bonding *bond)
 
 do_failover:
 		ASSERT_RTNL();
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 	}
 
 	bond_set_carrier(bond);
@@ -2911,11 +2948,13 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 	}
 
 	if (do_failover) {
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 
 		bond_select_active_slave(bond);
 
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 	}
 
 re_arm:
@@ -3074,9 +3113,11 @@ static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
 
 do_failover:
 		ASSERT_RTNL();
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 	}
 
 	bond_set_carrier(bond);
@@ -4564,6 +4605,13 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 
+	/*
+	 * If we risk deadlock from transmitting this in the
+	 * netpoll path, tell netpoll to queue the frame for later tx
+	 */
+	if (is_netpoll_tx_blocked(dev))
+		return NETDEV_TX_BUSY;
+
 	if (TX_QUEUE_OVERRIDE(bond->params.mode)) {
 		if (!bond_slave_override(bond, skb))
 			return NETDEV_TX_OK;
@@ -5295,6 +5343,13 @@ static int __init bonding_init(void)
 	if (res)
 		goto err;
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (!alloc_cpumask_var(&netpoll_block_tx, GFP_KERNEL)) {
+		res = -ENOMEM;
+		bond_destroy_sysfs();
+		goto err;
+	}
+#endif
 	register_netdevice_notifier(&bond_netdev_notifier);
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
@@ -5316,6 +5371,10 @@ static void __exit bonding_exit(void)
 
 	bond_destroy_sysfs();
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	free_cpumask_var(netpoll_block_tx);
+#endif
+
 	rtnl_link_unregister(&bond_link_ops);
 	unregister_pernet_subsys(&bond_net_ops);
 }
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 1/5] Fix bonding drivers improper modification of netpoll structure
From: nhorman @ 2010-10-12 20:29 UTC (permalink / raw)
  To: netdev; +Cc: bonding-devel, fubar, davem, andy, amwang, nhorman
In-Reply-To: <1286915377-1612-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>

The bonding driver currently modifies the netpoll structure in its xmit path
while sending frames from netpoll.  This is racy, as other cpus can access the
netpoll structure in parallel. Since the bonding driver points np->dev to a
slave device, other cpus can inadvertently attempt to send data directly to
slave devices, leading to improper locking with the bonding master, lost frames,
and deadlocks.  This patch fixes that up.

This patch also removes the real_dev pointer from the netpoll structure as that
data is really only used by bonding in the poll_controller, and we can emulate
its behavior by check each slave for IS_UP.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 drivers/net/bonding/bond_main.c |   15 +++++++++------
 include/linux/netpoll.h         |    9 +++++++--
 net/core/netpoll.c              |    6 +++---
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a0bf35d..eb7d089 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -449,11 +449,9 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
 	if (unlikely(bond->dev->priv_flags & IFF_IN_NETPOLL)) {
 		struct netpoll *np = bond->dev->npinfo->netpoll;
 		slave_dev->npinfo = bond->dev->npinfo;
-		np->real_dev = np->dev = skb->dev;
 		slave_dev->priv_flags |= IFF_IN_NETPOLL;
-		netpoll_send_skb(np, skb);
+		netpoll_send_skb_on_dev(np, skb, slave_dev);
 		slave_dev->priv_flags &= ~IFF_IN_NETPOLL;
-		np->dev = bond->dev;
 	} else
 #endif
 		dev_queue_xmit(skb);
@@ -1332,9 +1330,14 @@ static bool slaves_support_netpoll(struct net_device *bond_dev)
 
 static void bond_poll_controller(struct net_device *bond_dev)
 {
-	struct net_device *dev = bond_dev->npinfo->netpoll->real_dev;
-	if (dev != bond_dev)
-		netpoll_poll_dev(dev);
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	int i;
+
+	bond_for_each_slave(bond, slave, i) {
+		if (slave->dev && IS_UP(slave->dev))
+			netpoll_poll_dev(slave->dev);
+	}
 }
 
 static void bond_netpoll_cleanup(struct net_device *bond_dev)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 50d8009..79358bb 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -14,7 +14,6 @@
 
 struct netpoll {
 	struct net_device *dev;
-	struct net_device *real_dev;
 	char dev_name[IFNAMSIZ];
 	const char *name;
 	void (*rx_hook)(struct netpoll *, int, char *, int);
@@ -53,7 +52,13 @@ void netpoll_set_trap(int trap);
 void __netpoll_cleanup(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
 int __netpoll_rx(struct sk_buff *skb);
-void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+			     struct net_device *dev);
+static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+{
+	netpoll_send_skb_on_dev(np, skb, np->dev);
+}
+
 
 
 #ifdef CONFIG_NETPOLL
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 537e01a..4e98ffa 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -288,11 +288,11 @@ static int netpoll_owner_active(struct net_device *dev)
 	return 0;
 }
 
-void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+			     struct net_device *dev)
 {
 	int status = NETDEV_TX_BUSY;
 	unsigned long tries;
-	struct net_device *dev = np->dev;
 	const struct net_device_ops *ops = dev->netdev_ops;
 	/* It is up to the caller to keep npinfo alive. */
 	struct netpoll_info *npinfo = np->dev->npinfo;
@@ -346,7 +346,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		schedule_delayed_work(&npinfo->tx_work,0);
 	}
 }
-EXPORT_SYMBOL(netpoll_send_skb);
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
 
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 {
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 5/5] Re-enable netpoll over bonding
From: nhorman @ 2010-10-12 20:29 UTC (permalink / raw)
  To: netdev; +Cc: bonding-devel, fubar, davem, andy, amwang, nhorman
In-Reply-To: <1286915377-1612-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>

With the inclusion of previous fixup patches, netpoll over bonding apears to
work reliably with failover conditions.  This reverts Gospos previous commit
c22d7ac844f1cb9c6a5fd20f89ebadc2feef891b, and allows access again to the netpoll
functionality in the bonding driver.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 drivers/net/bonding/bond_main.c |   31 +++++++++++--------------------
 1 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1d0d8c5..3b53360 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -173,7 +173,7 @@ MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on link
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static cpumask_var_t netpoll_block_tx;
 
-static inline void block_netpoll_tx(void
+static inline void block_netpoll_tx(void)
 {
 	preempt_disable();
 	BUG_ON(cpumask_test_and_set_cpu(smp_processor_id(),
@@ -209,9 +209,6 @@ static int arp_ip_count;
 static int bond_mode	= BOND_MODE_ROUNDROBIN;
 static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
 static int lacp_fast;
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static int disable_netpoll = 1;
-#endif
 
 const struct bond_parm_tbl bond_lacp_tbl[] = {
 {	"slow",		AD_LACP_SLOW},
@@ -1836,19 +1833,15 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	bond_set_carrier(bond);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
-	if (disable_netpoll) {
+	if (slaves_support_netpoll(bond_dev)) {
+		bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+		if (bond_dev->npinfo)
+			slave_dev->npinfo = bond_dev->npinfo;
+	} else if (!(bond_dev->priv_flags & IFF_DISABLE_NETPOLL)) {
 		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
-	} else {
-		if (slaves_support_netpoll(bond_dev)) {
-			bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
-			if (bond_dev->npinfo)
-				slave_dev->npinfo = bond_dev->npinfo;
-		} else if (!(bond_dev->priv_flags & IFF_DISABLE_NETPOLL)) {
-			bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
-			pr_info("New slave device %s does not support netpoll\n",
-				slave_dev->name);
-			pr_info("Disabling netpoll support for %s\n", bond_dev->name);
-		}
+		pr_info("New slave device %s does not support netpoll\n",
+			slave_dev->name);
+		pr_info("Disabling netpoll support for %s\n", bond_dev->name);
 	}
 #endif
 	read_unlock(&bond->lock);
@@ -2055,10 +2048,8 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	read_lock_bh(&bond->lock);
 
-	 /* Make sure netpoll over stays disabled until fixed. */
-	if (!disable_netpoll)
-		if (slaves_support_netpoll(bond_dev))
-				bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+	if (slaves_support_netpoll(bond_dev))
+		bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
 	read_unlock_bh(&bond->lock);
 	if (slave_dev->netdev_ops->ndo_netpoll_cleanup)
 		slave_dev->netdev_ops->ndo_netpoll_cleanup(slave_dev);
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 3/5] Fix napi poll for bonding driver
From: nhorman @ 2010-10-12 20:29 UTC (permalink / raw)
  To: netdev; +Cc: bonding-devel, fubar, davem, andy, amwang, nhorman
In-Reply-To: <1286915377-1612-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>

Usually the netpoll path, when preforming a napi poll can get away with just
polling all the napi instances of the configured device.  Thats not the case for
the bonding driver however, as the napi instances which may wind up getting
flagged as needing polling after the poll_controller call don't belong to the
bonded device, but rather to the slave devices.  Fix this by checking the device
in question for the IFF_MASTER flag, if set, we know we need to check the full
poll list for this cpu, rather than just the devices napi instance list.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 net/core/netpoll.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4e98ffa..d79d221 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -156,8 +156,15 @@ static void poll_napi(struct net_device *dev)
 {
 	struct napi_struct *napi;
 	int budget = 16;
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct list_head *nlist;
 
-	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+	if (dev->flags & IFF_MASTER)
+		nlist = &sd->poll_list;
+	else
+		nlist = &dev->napi_list;
+
+	list_for_each_entry(napi, nlist, dev_list) {
 		if (napi->poll_owner != smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
 			budget = poll_one_napi(dev->npinfo, napi, budget);
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 4/5] Fix netconsole to not deadlock on rmmod
From: nhorman @ 2010-10-12 20:29 UTC (permalink / raw)
  To: netdev; +Cc: bonding-devel, fubar, davem, andy, amwang, nhorman
In-Reply-To: <1286915377-1612-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>

Netconsole calls netpoll_cleanup on receipt of a NETDEVICE_UNREGISTER event.
The notifier subsystem calls these event handlers with rtnl_lock held, which
netpoll_cleanup also takes, resulting in deadlock.  Fix this by calling the
__netpoll_cleanup interior function instead, and fixing up the additional
pointers.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 drivers/net/netconsole.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index ca142c4..94255f0 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -678,7 +678,14 @@ static int netconsole_netdev_event(struct notifier_block *this,
 				strlcpy(nt->np.dev_name, dev->name, IFNAMSIZ);
 				break;
 			case NETDEV_UNREGISTER:
-				netpoll_cleanup(&nt->np);
+				/*
+				 * rtnl_lock already held
+				 */
+				if (nt->np.dev) {
+					__netpoll_cleanup(&nt->np);
+					dev_put(nt->np.dev);
+					nt->np.dev = NULL;
+				}
 				/* Fall through */
 			case NETDEV_GOING_DOWN:
 			case NETDEV_BONDING_DESLAVE:
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH -next] sundance: Add initial ethtool stats support
From: Denis Kirjanov @ 2010-10-12 20:36 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, eric.dumazet, Ben Hutchings, Jeff Garzik
In-Reply-To: <20101012.115138.112614145.davem@davemloft.net>

On 10/12/2010 10:51 PM, David Miller wrote:
> From: Denis Kirjanov <dkirjanov@kernel.org>
> Date: Sat, 09 Oct 2010 23:41:32 +0400
> 
>> +	/* ethtool extra stats */
>> +	struct {
>> +		unsigned long tx_multiple_collisions;
>> +		unsigned long tx_single_collisions;
>> +		unsigned long tx_late_collisions;
>> +		unsigned long tx_deffered;
>> +		unsigned long tx_deffered_excessive;
>> +		unsigned long tx_aborted;
>> +		unsigned long tx_bcasts;
>> +		unsigned long rx_bcasts;
>> +		unsigned long tx_mcasts;
>> +		unsigned long rx_mcasts;
>> +	} xstats;
> 
> I think these should be "u64".
> 

[PATCH -next v3] sundance: Add ethtool stats support
Add ethtool stats support

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
---
V2: 
 check for the ETH_SS_STATS in get_string()
 use xstats struct for ethtool stats
V3:
 make counters 64-bits wide
 
 drivers/net/sundance.c |   90 ++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/drivers/net/sundance.c b/drivers/net/sundance.c
index 4283cc5..159f7e7 100644
--- a/drivers/net/sundance.c
+++ b/drivers/net/sundance.c
@@ -363,6 +363,19 @@ struct netdev_private {
         dma_addr_t tx_ring_dma;
         dma_addr_t rx_ring_dma;
 	struct timer_list timer;		/* Media monitoring timer. */
+	/* ethtool extra stats */
+	struct {
+		u64 tx_multiple_collisions;
+		u64 tx_single_collisions;
+		u64 tx_late_collisions;
+		u64 tx_deffered;
+		u64 tx_deffered_excessive;
+		u64 tx_aborted;
+		u64 tx_bcasts;
+		u64 rx_bcasts;
+		u64 tx_mcasts;
+		u64 rx_mcasts;
+	} xstats;
 	/* Frequently used values: keep some adjacent for cache effect. */
 	spinlock_t lock;
 	int msg_enable;
@@ -1486,7 +1499,6 @@ static struct net_device_stats *get_stats(struct net_device *dev)
 {
 	struct netdev_private *np = netdev_priv(dev);
 	void __iomem *ioaddr = np->base;
-	int i;
 	unsigned long flags;
 
 	spin_lock_irqsave(&np->statlock, flags);
@@ -1494,13 +1506,23 @@ static struct net_device_stats *get_stats(struct net_device *dev)
 	dev->stats.rx_missed_errors	+= ioread8(ioaddr + RxMissed);
 	dev->stats.tx_packets += ioread16(ioaddr + TxFramesOK);
 	dev->stats.rx_packets += ioread16(ioaddr + RxFramesOK);
-	dev->stats.collisions += ioread8(ioaddr + StatsLateColl);
-	dev->stats.collisions += ioread8(ioaddr + StatsMultiColl);
-	dev->stats.collisions += ioread8(ioaddr + StatsOneColl);
 	dev->stats.tx_carrier_errors += ioread8(ioaddr + StatsCarrierError);
-	ioread8(ioaddr + StatsTxDefer);
-	for (i = StatsTxDefer; i <= StatsMcastRx; i++)
-		ioread8(ioaddr + i);
+
+	np->xstats.tx_multiple_collisions += ioread8(ioaddr + StatsMultiColl);
+	np->xstats.tx_single_collisions += ioread8(ioaddr + StatsOneColl);
+	np->xstats.tx_late_collisions += ioread8(ioaddr + StatsLateColl);
+	dev->stats.collisions += np->xstats.tx_multiple_collisions
+		+ np->xstats.tx_single_collisions
+		+ np->xstats.tx_late_collisions;
+
+	np->xstats.tx_deffered += ioread8(ioaddr + StatsTxDefer);
+	np->xstats.tx_deffered_excessive += ioread8(ioaddr + StatsTxXSDefer);
+	np->xstats.tx_aborted += ioread8(ioaddr + StatsTxAbort);
+	np->xstats.tx_bcasts += ioread8(ioaddr + StatsBcastTx);
+	np->xstats.rx_bcasts += ioread8(ioaddr + StatsBcastRx);
+	np->xstats.tx_mcasts += ioread8(ioaddr + StatsMcastTx);
+	np->xstats.rx_mcasts += ioread8(ioaddr + StatsMcastRx);
+
 	dev->stats.tx_bytes += ioread16(ioaddr + TxOctetsLow);
 	dev->stats.tx_bytes += ioread16(ioaddr + TxOctetsHigh) << 16;
 	dev->stats.rx_bytes += ioread16(ioaddr + RxOctetsLow);
@@ -1566,6 +1588,21 @@ static int __set_mac_addr(struct net_device *dev)
 	return 0;
 }
 
+static const struct {
+	const char name[ETH_GSTRING_LEN];
+} sundance_stats[] = {
+	{ "tx_multiple_collisions" },
+	{ "tx_single_collisions" },
+	{ "tx_late_collisions" },
+	{ "tx_deffered" },
+	{ "tx_deffered_excessive" },
+	{ "tx_aborted" },
+	{ "tx_bcasts" },
+	{ "rx_bcasts" },
+	{ "tx_mcasts" },
+	{ "rx_mcasts" },
+};
+
 static int check_if_running(struct net_device *dev)
 {
 	if (!netif_running(dev))
@@ -1624,6 +1661,42 @@ static void set_msglevel(struct net_device *dev, u32 val)
 	np->msg_enable = val;
 }
 
+static void get_strings(struct net_device *dev, u32 stringset,
+		u8 *data)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(data, sundance_stats, sizeof(sundance_stats));
+}
+
+static int get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(sundance_stats);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, u64 *data)
+{
+	struct netdev_private *np = netdev_priv(dev);
+	int i = 0;
+
+	get_stats(dev);
+	data[i++] = np->xstats.tx_multiple_collisions;
+	data[i++] = np->xstats.tx_single_collisions;
+	data[i++] = np->xstats.tx_late_collisions;
+	data[i++] = np->xstats.tx_deffered;
+	data[i++] = np->xstats.tx_deffered_excessive;
+	data[i++] = np->xstats.tx_aborted;
+	data[i++] = np->xstats.tx_bcasts;
+	data[i++] = np->xstats.rx_bcasts;
+	data[i++] = np->xstats.tx_mcasts;
+	data[i++] = np->xstats.rx_mcasts;
+}
+
 static const struct ethtool_ops ethtool_ops = {
 	.begin = check_if_running,
 	.get_drvinfo = get_drvinfo,
@@ -1633,6 +1706,9 @@ static const struct ethtool_ops ethtool_ops = {
 	.get_link = get_link,
 	.get_msglevel = get_msglevel,
 	.set_msglevel = set_msglevel,
+	.get_strings = get_strings,
+	.get_sset_count = get_sset_count,
+	.get_ethtool_stats = get_ethtool_stats,
 };
 
 static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-- 
1.7.0


^ permalink raw reply related

* RE: [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS enablement
From: Vladislav Zolotarov @ 2010-10-12 20:08 UTC (permalink / raw)
  To: David Miller, joe@perches.com
  Cc: Dmitry Kravkov, netdev@vger.kernel.org, eric.dumazet@gmail.com,
	Eilon Greenstein
In-Reply-To: <20101012.123142.59664847.davem@davemloft.net>




> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, October 12, 2010 9:32 PM
> To: joe@perches.com
> Cc: Dmitry Kravkov; netdev@vger.kernel.org; eric.dumazet@gmail.com;
> Vladislav Zolotarov; Eilon Greenstein
> Subject: Re: [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS
> enablement
> 
> From: Joe Perches <joe@perches.com>
> Date: Tue, 12 Oct 2010 12:26:19 -0700
> 
> > On Tue, 2010-10-12 at 21:02 +0200, Dmitry Kravkov wrote:
> >> @@ -2486,6 +2486,7 @@ void bnx2x_pf_init(struct bnx2x *bp)
> >>  	 * if (is_eth_multi(bp))
> >>  	 *	flags |= FUNC_FLG_RSS;
> >>  	 */
> >> +	flags |= FUNC_FLG_RSS;
> >>
> >>  	/* function setup */
> >>  	if (flags & FUNC_FLG_RSS) {
> >
> > Then the "if (flags & FUNC_FLG_RSS)" test should be removed.
> 
> Yeah it probably should.  If necessary it could be added back
> later.

Thanks, Joe. We will consider removing this "if" and will post
an appropriate patch. Most likely in the close patch series we
have promised to respin... ;)

Thanks to all, guys.
vlad




^ permalink raw reply

* Re: BUG ? ipip unregister_netdevice_many()
From: David Miller @ 2010-10-12 20:05 UTC (permalink / raw)
  To: ebiederm; +Cc: hans.schillstrom, daniel.lezcano, netdev
In-Reply-To: <m11v801tfr.fsf@fess.ebiederm.org>

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Fri, 08 Oct 2010 10:32:40 -0700

> It is just dealing with not flushing the entire routing cache, just the
> routes that have expired.  Which prevents one network namespace from
> flushing it's routes and DOS'ing another.

That's a very indirect and obfuscated way of handling it.

And I still don't know why we let the first contiguous set of expired
entries in the chain get freed outside of the lock, and the rest
inside the lock.  That really isn't explained by anything I've read.

How about we just do exactly what's intended, and with no ifdefs?

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/net/route.h b/include/net/route.h
index 7e5e73b..8d24761 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -106,7 +106,7 @@ extern int		ip_rt_init(void);
 extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
 				       __be32 src, struct net_device *dev);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(void);
+extern void		rt_cache_flush_batch(struct net *net);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 919f2ad..4039f56 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -999,7 +999,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		rt_cache_flush_batch();
+		rt_cache_flush_batch(dev_net(dev));
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0755aa4..6ad730c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -712,13 +712,14 @@ static inline int rt_is_expired(struct rtable *rth)
  * Can be called by a softirq or a process.
  * In the later case, we want to be reschedule if necessary
  */
-static void rt_do_flush(int process_context)
+static void rt_do_flush(struct net *net, int process_context)
 {
 	unsigned int i;
 	struct rtable *rth, *next;
-	struct rtable * tail;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
+		struct rtable *list, **pprev;
+
 		if (process_context && need_resched())
 			cond_resched();
 		rth = rt_hash_table[i].chain;
@@ -726,41 +727,27 @@ static void rt_do_flush(int process_context)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
-		{
-		struct rtable ** prev, * p;
 
-		rth = rt_hash_table[i].chain;
+		pprev = &rt_hash_table[i].chain;
+		rth = *pprev;
+		while (rth) {
+			next = rth->dst.rt_next;
+			if (dev_net(rth->dst.dev) == net) {
+				*pprev = next;
 
-		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail; tail = tail->dst.rt_next)
-			if (!rt_is_expired(tail))
-				break;
-		if (rth != tail)
-			rt_hash_table[i].chain = tail;
-
-		/* call rt_free on entries after the tail requiring flush */
-		prev = &rt_hash_table[i].chain;
-		for (p = *prev; p; p = next) {
-			next = p->dst.rt_next;
-			if (!rt_is_expired(p)) {
-				prev = &p->dst.rt_next;
-			} else {
-				*prev = next;
-				rt_free(p);
-			}
-		}
+				rth->dst.rt_next = list;
+				list = rth;
+			} else
+				pprev = &rth->dst.rt_next;
+
+			rth = next;
 		}
-#else
-		rth = rt_hash_table[i].chain;
-		rt_hash_table[i].chain = NULL;
-		tail = NULL;
-#endif
+
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
-		for (; rth != tail; rth = next) {
-			next = rth->dst.rt_next;
-			rt_free(rth);
+		for (; list; list = next) {
+			next = list->dst.rt_next;
+			rt_free(list);
 		}
 	}
 }
@@ -906,13 +893,13 @@ void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
 	if (delay >= 0)
-		rt_do_flush(!in_softirq());
+		rt_do_flush(net, !in_softirq());
 }
 
 /* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+void rt_cache_flush_batch(struct net *net)
 {
-	rt_do_flush(!in_softirq());
+	rt_do_flush(net, !in_softirq());
 }
 
 static void rt_emergency_hash_rebuild(struct net *net)

^ permalink raw reply related

* Re: [PATCH net-next] net:  allocate skbs on local node
From: David Rientjes @ 2010-10-12 19:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, Andrew Morton, Eric Dumazet, David Miller, netdev,
	Michael Chan, Eilon Greenstein, Christoph Hellwig, LKML,
	Nick Piggin
In-Reply-To: <alpine.DEB.2.00.1010120745360.31832@router.home>

On Tue, 12 Oct 2010, Christoph Lameter wrote:

> Hmmm. Given these effects I think we should be more cautious regarding the
> unification work. May be the "unified allocator" should replace SLAB
> instead and SLUB can stay unchanged?

Linus has said that he refuses to merge another allocator until one is 
removed or replaced, so that would force the unificiation patches to go 
into slab instead if you want to leave slub untouched.

> The unification patches go back to
> the one lock per node SLAB thing because the queue maintenance overhead is
> otherwise causing large regressions in hackbench because of lots of atomic
> ops. The per node lock seem to be causing problems here in the network
> stack,.

The TCP_RR regression on slub is because of what I described a couple 
years ago as "slab thrashing" where cpu slabs would be filled with 
allocations, then frees would occur to move those slabs from the full to 
partial list with only a few free objects, those partial slabs would 
quickly become full, etc.  Performance gets better if you change the 
per-node lock to a trylock when iterating the partial list and preallocate 
and have a substantially longer partial list than normal (and it still 
didn't rival slab's performance), so I don't think it's only a per-node 
lock that's the issue , it's all the slowpath overhead of swapping the cpu 
slab out for another slab.  The TCP_RR load would show slub stats that 
indicate certain caches, kmalloc-256 and kmalloc-2048, would have ~98% of 
allocations coming from the slowpath.

This gets better if you allocate higher order slabs (and kmalloc-2048 is 
already order-3 by default) but then allocating new slabs gets really slow 
if not impossible on smaller machines.  The overhead of even compaction 
will kill us.

> Take the unified as a SLAB cleanup instead? Then at least we have
> a large common code base and just differentiate through the locking
> mechanism?
> 

Will you be adding the extensive slub debugging to slab then?  It would be 
a shame to lose it because one allocator is chosen over another for 
performance reasons and then we need to recompile to debug issues as they 
arise.

^ permalink raw reply

* Re: [PATCH net-next V3] net: percpu net_device refcount
From: David Miller @ 2010-10-12 19:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286828532.30423.16.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Oct 2010 22:22:12 +0200

> This was a bit long (allyesconfig), but eventually succeeded ...
 ...
> [PATCH net-next V3] net: percpu net_device refcount

Applied, thanks Eric.

^ permalink raw reply

* Re: tbf/htb qdisc limitations
From: Steven Brudenell @ 2010-10-12 19:31 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: netdev
In-Reply-To: <20101012101022.GA8578@ff.dom.local>

> Yes, it's not allowed according to Documentation/HOWTO. Btw, as you
> can see e.g. in sch_hfsc comments, 64-bit division is avoided too.

i see sch_hfsc avoids do_div in critical areas for performance
reasons, but uses it other places. it should still be alright to
do_div in tbf_change and htb_change_class, right? it would be nice to
compute the rtabs in those functions instead of having userspace do
it.

> I can only say there is no versioning, but backward compatibility
> is crucial, so you need to do some tricks or data duplication.
> You could probably try to get opinions about it with an RFC on
> moving tbf and htb schedulers to 64 bits if you're interested
> (decoupling it from your specific burst problem).

my burst problem is the only semi-legitimate motivation i can think
of. the only other possible motivations i can imagine are setting
"limit" to buffer more than 4GB of packets and setting "rate" to
something more than 32 gigabit; both of these seem kind of dubious. is
there something else you had in mind?

looking more at the netlink tc interface: why is it that the interface
for so many qdiscs consists of passing a big options struct as a
single netlink attr, instead of a bunch of individual attrs? this kind
of seems contrary to the extensibility / flexibility spirit of
netlink, and seems to be getting in the way of changing the interface.
maybe i should RFC about this instead ;)

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS enablement
From: David Miller @ 2010-10-12 19:31 UTC (permalink / raw)
  To: joe; +Cc: dmitry, netdev, eric.dumazet, vladz, eilong
In-Reply-To: <1286911579.1117.76.camel@Joe-Laptop>

From: Joe Perches <joe@perches.com>
Date: Tue, 12 Oct 2010 12:26:19 -0700

> On Tue, 2010-10-12 at 21:02 +0200, Dmitry Kravkov wrote:
>> @@ -2486,6 +2486,7 @@ void bnx2x_pf_init(struct bnx2x *bp)
>>  	 * if (is_eth_multi(bp))
>>  	 *	flags |= FUNC_FLG_RSS;
>>  	 */
>> +	flags |= FUNC_FLG_RSS;
>>  
>>  	/* function setup */
>>  	if (flags & FUNC_FLG_RSS) {
> 
> Then the "if (flags & FUNC_FLG_RSS)" test should be removed.

Yeah it probably should.  If necessary it could be added back
later.

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS enablement
From: David Miller @ 2010-10-12 19:31 UTC (permalink / raw)
  To: eric.dumazet; +Cc: dmitry, netdev, vladz, eilong
In-Reply-To: <1286911129.2703.7.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 12 Oct 2010 21:18:49 +0200

> Le mardi 12 octobre 2010 à 21:02 +0200, Dmitry Kravkov a écrit :
>> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
>> 
>> Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
>> Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
 ...
> Thanks, this solved the problem.
> 
> Tested-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks guys.

^ permalink raw reply

* Re: [PATCH] IPv4: Remove check for ipv4_is_lbcast() that will always return false
From: David Miller @ 2010-10-12 19:28 UTC (permalink / raw)
  To: awalls; +Cc: netdev, linux-kernel, kuznet, jmorris, kaber
In-Reply-To: <1286727021.2435.17.camel@localhost>

From: Andy Walls <awalls@md.metrocast.net>
Date: Sun, 10 Oct 2010 12:10:21 -0400

> In making an IPv4 routing decision, packets with an all 1's broadcast
> destination are accepted as input packets, before being checked for being a
> martian.  Remove the martian check for the all 1's broadcast destination
> address.  Make the initial check for the all 1's broadcast destination
> address easier to read.
> 
> Signed-off-by: Andy Walls <awalls@md.metrocast.net>

Your email client corrupted this patch, by turning tab characters
into spaces, amongst other things.

Please give Documentation/email-clients.txt a read and resubmit this
patch after you have these issues sorted out.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS enablement
From: Joe Perches @ 2010-10-12 19:26 UTC (permalink / raw)
  To: Dmitry Kravkov; +Cc: davem, netdev, eric.dumazet, vladz, eilong
In-Reply-To: <1286910141.16797.23.camel@lb-tlvb-dmitry>

On Tue, 2010-10-12 at 21:02 +0200, Dmitry Kravkov wrote:
> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
> Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
> ---
>  drivers/net/bnx2x/bnx2x_main.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
> index 7a9556b..ead524b 100644
> --- a/drivers/net/bnx2x/bnx2x_main.c
> +++ b/drivers/net/bnx2x/bnx2x_main.c
> @@ -2486,6 +2486,7 @@ void bnx2x_pf_init(struct bnx2x *bp)
>  	 * if (is_eth_multi(bp))
>  	 *	flags |= FUNC_FLG_RSS;
>  	 */
> +	flags |= FUNC_FLG_RSS;
>  
>  	/* function setup */
>  	if (flags & FUNC_FLG_RSS) {

Then the "if (flags & FUNC_FLG_RSS)" test should be removed.




^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS enablement
From: Eric Dumazet @ 2010-10-12 19:18 UTC (permalink / raw)
  To: Dmitry Kravkov; +Cc: davem, netdev, vladz, eilong
In-Reply-To: <1286910141.16797.23.camel@lb-tlvb-dmitry>

Le mardi 12 octobre 2010 à 21:02 +0200, Dmitry Kravkov a écrit :
> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
> Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
> ---
>  drivers/net/bnx2x/bnx2x_main.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
> index 7a9556b..ead524b 100644
> --- a/drivers/net/bnx2x/bnx2x_main.c
> +++ b/drivers/net/bnx2x/bnx2x_main.c
> @@ -2486,6 +2486,7 @@ void bnx2x_pf_init(struct bnx2x *bp)
>  	 * if (is_eth_multi(bp))
>  	 *	flags |= FUNC_FLG_RSS;
>  	 */
> +	flags |= FUNC_FLG_RSS;
>  
>  	/* function setup */
>  	if (flags & FUNC_FLG_RSS) {

Thanks, this solved the problem.

Tested-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply

* [PATCH net-next] bnx2x: Fixing a typo: added a missing RSS enablement
From: Dmitry Kravkov @ 2010-10-12 19:02 UTC (permalink / raw)
  To: davem, netdev, eric.dumazet; +Cc: vladz, eilong

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>

Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
---
 drivers/net/bnx2x/bnx2x_main.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 7a9556b..ead524b 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -2486,6 +2486,7 @@ void bnx2x_pf_init(struct bnx2x *bp)
 	 * if (is_eth_multi(bp))
 	 *	flags |= FUNC_FLG_RSS;
 	 */
+	flags |= FUNC_FLG_RSS;
 
 	/* function setup */
 	if (flags & FUNC_FLG_RSS) {
-- 
1.7.1





^ permalink raw reply related

* Re: [PATCH -next] sundance: Add initial ethtool stats support
From: David Miller @ 2010-10-12 18:51 UTC (permalink / raw)
  To: dkirjanov; +Cc: netdev, eric.dumazet, bhutchings
In-Reply-To: <4CB0C56C.2000106@kernel.org>

From: Denis Kirjanov <dkirjanov@kernel.org>
Date: Sat, 09 Oct 2010 23:41:32 +0400

> +	/* ethtool extra stats */
> +	struct {
> +		unsigned long tx_multiple_collisions;
> +		unsigned long tx_single_collisions;
> +		unsigned long tx_late_collisions;
> +		unsigned long tx_deffered;
> +		unsigned long tx_deffered_excessive;
> +		unsigned long tx_aborted;
> +		unsigned long tx_bcasts;
> +		unsigned long rx_bcasts;
> +		unsigned long tx_mcasts;
> +		unsigned long rx_mcasts;
> +	} xstats;

I think these should be "u64".

^ permalink raw reply

* Re: net-next-2.6 [PATCH 0/6] dccp: miscellaneous fixes and helper functions
From: David Miller @ 2010-10-12 18:45 UTC (permalink / raw)
  To: gerrit; +Cc: dccp, netdev
In-Reply-To: <1286860551-6809-1-git-send-email-gerrit@erg.abdn.ac.uk>

From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 12 Oct 2010 07:15:45 +0200

>  Patch #1: fixes two problems in the DCCP  (AWL/SWL) window-size adjustment.
>  Patch #2: refactors the connect_init() function by combining related code.
>  Patch #3: removes an never-user argument from the CCID tx function.
>  Patch #4: provides a function to generalize the data-loss condition 
>            (patch originally came from the ccid-4 subtree).
>  Patch #5: schedules an Ack as carrier for a pending timestamp echo.
>  Patch #6: tidies up the format of DCCP per-connection warnings.
> 
> The set has also been placed into a fresh (today's) copy of net-next-2.6, on
> 
>     git://eden-feed.erg.abdn.ac.uk/net-next-2.6        [subtree 'dccp']

Looks good, pulled, thanks!

^ permalink raw reply

* RE: [BUG net-next] bnx2x: all traffic comes to RX queue 0
From: Vladislav Zolotarov @ 2010-10-12 18:18 UTC (permalink / raw)
  To: Eric Dumazet, Dmitry Kravkov
  Cc: netdev, Michael Chan, Eilon Greenstein, David Miller,
	Yaniv Rosner
In-Reply-To: <1286907103.2703.6.camel@edumazet-laptop>

Thanks, Eric. Dima has already found a problem and we are running 
a regression to verify nothing else is broken after the RSS is 
brought back to life.

If nothing goes wrong Dima will send a patch in an hour or so...

Thanks again for the catch.
vlad

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Eric Dumazet
> Sent: Tuesday, October 12, 2010 8:12 PM
> To: Dmitry Kravkov
> Cc: netdev; Michael Chan; Eilon Greenstein; David Miller; Vladislav
> Zolotarov; Yaniv Rosner
> Subject: RE: [BUG net-next] bnx2x: all traffic comes to RX queue 0
> 
> Le mardi 12 octobre 2010 à 09:20 -0700, Dmitry Kravkov a écrit :
> > Eric,
> >
> > I will take a look
> >
> 
> Thanks ! Here is the bisection result
> 
> # git bisect bad
> 523224a3b3cd407ce4e6731a087194e13a90db18 is the first bad commit
> commit 523224a3b3cd407ce4e6731a087194e13a90db18
> Author: Dmitry Kravkov <dmitry@broadcom.com>
> Date:   Wed Oct 6 03:23:26 2010 +0000
> 
>     bnx2x, cnic, bnx2i: use new FW/HSI
> 
>     This is the new FW HSI blob and the relevant definitions without
> logic changes.
>     It also included code adaptation for new HSI. New features are not
> enabled.
> 
>     New FW/HSI includes:
>     - Support for 57712 HW
>     - Future support for VF (not used)
>     - Improvements in FW interrupts scheme
>     - FW FCoE hooks (stubs for future usage)
> 
>     Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
>     Signed-off-by: Michael Chan <mchan@broadcom.com>
>     Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
>     Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> :040000 040000 8931a748ae89487466d99e6121a3965131e3a201
> fed2be49565ca291203a865554a0e071a2a9c3fb M	drivers
> :040000 040000 2ff6a5c290ca7e9dc339b7213f49c341f4fc5822
> 82c0a7117b58f538a8b49891c5524e3537c5ae66 M	firmware
> 
> 
> # git bisect log
> git bisect start '--' 'drivers/net/bnx2x'
> # bad: [b9e9b15966e35f3b4a8d3820cac460505552f72c] Merge branch 'master'
> of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
> git bisect bad b9e9b15966e35f3b4a8d3820cac460505552f72c
> # good: [9fe6206f400646a2322096b56c59891d530e8d51] Linux 2.6.35
> git bisect good 9fe6206f400646a2322096b56c59891d530e8d51
> # good: [b7737c9be9d3e894d1a4375c52f5f47789475f26] bnx2x: Split PHY
> functions
> git bisect good b7737c9be9d3e894d1a4375c52f5f47789475f26
> # good: [8681dc3abd54e845a2effab441921b4c4457c241] bnx2x: Moved
> enabling of MSI to the bnx2x_set_num_queues()
> git bisect good 8681dc3abd54e845a2effab441921b4c4457c241
> # bad: [c518cd1c7aca2e2a65edcf4fa5ce4160d59878c2] bnx2x: remove unused
> fields in main driver structure
> git bisect bad c518cd1c7aca2e2a65edcf4fa5ce4160d59878c2
> # bad: [fb3bff178e722fe88b5ab02319c9636da0980e25] bnx2x: rename MF
> related fields
> git bisect bad fb3bff178e722fe88b5ab02319c9636da0980e25
> # good: [560131f313ea9b9439742138289b6f68d61531ec] bnx2x: create folder
> for bnx2x firmware files
> git bisect good 560131f313ea9b9439742138289b6f68d61531ec
> # bad: [523224a3b3cd407ce4e6731a087194e13a90db18] bnx2x, cnic, bnx2i:
> use new FW/HSI
> git bisect bad 523224a3b3cd407ce4e6731a087194e13a90db18
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* RE: [BUG net-next] bnx2x: all traffic comes to RX queue 0
From: Eric Dumazet @ 2010-10-12 18:11 UTC (permalink / raw)
  To: Dmitry Kravkov
  Cc: netdev, Michael Chan, Eilon Greenstein, David Miller,
	Vladislav Zolotarov, Yaniv Rosner
In-Reply-To: <2DFD360E328B3941911E6D28B085D990129FC2028B@SJEXCHCCR01.corp.ad.broadcom.com>

Le mardi 12 octobre 2010 à 09:20 -0700, Dmitry Kravkov a écrit :
> Eric,
> 
> I will take a look
> 

Thanks ! Here is the bisection result

# git bisect bad
523224a3b3cd407ce4e6731a087194e13a90db18 is the first bad commit
commit 523224a3b3cd407ce4e6731a087194e13a90db18
Author: Dmitry Kravkov <dmitry@broadcom.com>
Date:   Wed Oct 6 03:23:26 2010 +0000

    bnx2x, cnic, bnx2i: use new FW/HSI
    
    This is the new FW HSI blob and the relevant definitions without logic changes.
    It also included code adaptation for new HSI. New features are not enabled.
    
    New FW/HSI includes:
    - Support for 57712 HW
    - Future support for VF (not used)
    - Improvements in FW interrupts scheme
    - FW FCoE hooks (stubs for future usage)
    
    Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
    Signed-off-by: Michael Chan <mchan@broadcom.com>
    Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

:040000 040000 8931a748ae89487466d99e6121a3965131e3a201 fed2be49565ca291203a865554a0e071a2a9c3fb M	drivers
:040000 040000 2ff6a5c290ca7e9dc339b7213f49c341f4fc5822 82c0a7117b58f538a8b49891c5524e3537c5ae66 M	firmware


# git bisect log
git bisect start '--' 'drivers/net/bnx2x'
# bad: [b9e9b15966e35f3b4a8d3820cac460505552f72c] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
git bisect bad b9e9b15966e35f3b4a8d3820cac460505552f72c
# good: [9fe6206f400646a2322096b56c59891d530e8d51] Linux 2.6.35
git bisect good 9fe6206f400646a2322096b56c59891d530e8d51
# good: [b7737c9be9d3e894d1a4375c52f5f47789475f26] bnx2x: Split PHY functions
git bisect good b7737c9be9d3e894d1a4375c52f5f47789475f26
# good: [8681dc3abd54e845a2effab441921b4c4457c241] bnx2x: Moved enabling of MSI to the bnx2x_set_num_queues()
git bisect good 8681dc3abd54e845a2effab441921b4c4457c241
# bad: [c518cd1c7aca2e2a65edcf4fa5ce4160d59878c2] bnx2x: remove unused fields in main driver structure
git bisect bad c518cd1c7aca2e2a65edcf4fa5ce4160d59878c2
# bad: [fb3bff178e722fe88b5ab02319c9636da0980e25] bnx2x: rename MF related fields
git bisect bad fb3bff178e722fe88b5ab02319c9636da0980e25
# good: [560131f313ea9b9439742138289b6f68d61531ec] bnx2x: create folder for bnx2x firmware files
git bisect good 560131f313ea9b9439742138289b6f68d61531ec
# bad: [523224a3b3cd407ce4e6731a087194e13a90db18] bnx2x, cnic, bnx2i: use new FW/HSI
git bisect bad 523224a3b3cd407ce4e6731a087194e13a90db18



^ permalink raw reply

* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Eric Dumazet @ 2010-10-12 17:40 UTC (permalink / raw)
  To: Joe Buehler; +Cc: netdev
In-Reply-To: <loom.20101012T185411-307@post.gmane.org>

Le mardi 12 octobre 2010 à 17:14 +0000, Joe Buehler a écrit :
> I am seeing a kernel panic (NULL pointer) in fib_rules_lookup.  There
> were some other reports for 2.6.32 back in March and May.  It looks to
> me as though "rules_list" is not in a good state when fib_rules_lookup
> traverses it.
> 
> My application is bringing TAP interfaces up and down and making
> changes to associated routing tables at a fairly good clip (say, a few
> times a second).  That use case seems to be similar to a previously
> reported crash case.
> 
> This is a MIPS kernel (Cavium Octeon) running two CPUs SMP.  I am
> using 2.6.27.7 patched by Cavium for hardware support reasons.  I
> cannot upgrade because the vendor patches are non-trivial to
> forward-port.
> 
> Here is one stack trace:
> 
> [<ffffffff814671ec>] fib_rules_lookup+0x11c/0x1a8
> [<ffffffff814bd144>] fib_lookup+0x2c/0x48
> [<ffffffff814788d8>] __ip_route_output_key+0x918/0xf38
> [<ffffffff81478f30>] ip_route_output_flow+0x38/0x2e8
> [<ffffffff8149fd1c>] tcp_v4_connect+0x134/0x498
> [<ffffffff814aef80>] inet_stream_connect+0xf8/0x2f0
> [<ffffffff81442680>] sys_connect+0xe0/0xf8
> [<ffffffff8114528c>] handle_sys+0x12c/0x148
> 
> Here is another:
> 
> [<ffffffff814671ec>] fib_rules_lookup+0x11c/0x1a8
> [<ffffffff814bd144>] fib_lookup+0x2c/0x48
> [<ffffffff814b6550>] fib_validate_source+0xb0/0x4c0
> [<ffffffff8147a524>] ip_route_input+0x11a4/0x13c0
> [<ffffffff8147c304>] ip_rcv_finish+0x2f4/0x4c0
> [<ffffffff81454220>] process_backlog+0xa8/0x160
> [<ffffffff81451ea8>] net_rx_action+0x190/0x2e0
> [<ffffffff81166978>] __do_softirq+0xf0/0x218
> [<ffffffff81166b18>] do_softirq+0x78/0x80
> [<ffffffff81100e30>] plat_irq_dispatch+0x130/0x1e0
> [<ffffffff81130948>] ret_from_irq+0x0/0x4
> [<ffffffff8151167c>] _cond_resched+0x34/0x50
> [<ffffffff81148b60>] fpu_emulator_cop1Handler+0x90/0x1c80
> [<ffffffff81136f4c>] do_cpu+0x24c/0x360
> [<ffffffff81130940>] ret_from_exception+0x0/0x8
> 
> *IF* my reading of the disassembled code at point of panic is correct,
>  the "pos" pointer in list_for_each_entry_rcu appears to be NULL.
> 
> Looking at the code in net/core/fib_rules.c I see some uses of the
> "rules_list" using rcu and some apparently not.  Has something simple
> been overlooked?
> 
> I need this fixed so will try adding a spinlock to protect rules_list
> if necessary.

2.6.27 is a bit old, you might try :

commit 7fa7cb7109d07c29ab28bb877bc7049a0150dbe5
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date:   Mon Sep 27 04:18:27 2010 +0000

    fib: use atomic_inc_not_zero() in fib_rules_lookup
    
    It seems we dont use appropriate refcount increment in an
    rcu_read_lock() protected section.
    
    fib_rule_get() might increment a null refcount and bad things could
    happen.
    
    While fib_nl_delrule() respects an rcu grace period before calling
    fib_rule_put(), fib_rules_cleanup_ops() calls fib_rule_put() without a
    grace period.
    
    Note : after this patch, we might avoid the synchronize_rcu() call done
    in fib_nl_delrule()
    
    Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 42e84e0..d078728 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -225,9 +225,11 @@ jumped:
 			err = ops->action(rule, fl, flags, arg);
 
 		if (err != -EAGAIN) {
-			fib_rule_get(rule);
-			arg->rule = rule;
-			goto out;
+			if (likely(atomic_inc_not_zero(&rule->refcnt))) {
+				arg->rule = rule;
+				goto out;
+			}
+			break;
 		}
 	}
 



^ permalink raw reply related

* Re: [PATCH] af_packet: account for VLAN when checking packet size
From: David Miller @ 2010-10-12 17:40 UTC (permalink / raw)
  To: mst; +Cc: eric.dumazet, netdev, johann.baudy
In-Reply-To: <20101012171940.GB30613@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 12 Oct 2010 19:19:41 +0200

> Yes, like eth_type_trans does I guess.  I think we had a similar
> discussion already:
> 
> http://lists.openwall.net/netdev/2010/01/06/38
> 
> Summary: if we want to make the protocol field have the correct
> value for this case we need to make it work for other
> transports not just for ethernet.

Right, so for now we should just allow 4-byte larger
than MTU TX packets, as long as the device is ethernet
and can handle VLANs properly.

^ permalink raw reply

* Re: kvm networking todo wiki
From: Michael S. Tsirkin @ 2010-10-12 17:38 UTC (permalink / raw)
  To: Dragos Tatulea
  Cc: Sridhar Samudrala, David Stevens, sri, Anthony Liguori,
	Rusty Russell, Krishna Kumar2, Shirley Ma, Xin, Xiaohui, jdike,
	herbert, lmr, akong, kvm, qemu-devel, netdev, mjt
In-Reply-To: <AANLkTikzU6g2pYKoyXOWfGPE64VZusPaYdn+TQ14tE7_@mail.gmail.com>

On Sun, Oct 10, 2010 at 01:37:45PM +0200, Dragos Tatulea wrote:
> Hi,
> 
> > More importantly: anyone's going to work on this?
> 
> I'd like to work on this. Might need some assistance though.
> 
> Thanks,
> Dragos

BTW, as in some situations hardware might not be able satisfy
requirements, a subset of this item would be to expose whatever macvtap
is capable of, to the guest.  E.g. if mac can not be changed we could at
least query the mac, something that would be convenient as noted by mjt
in an irc chat.

To enable migration we'd then need a set of flags to limit this to a least
common denominator on a given network.

-- 
MST

^ permalink raw reply

* Re: [PATCH] af_packet: account for VLAN when checking packet size
From: Michael S. Tsirkin @ 2010-10-12 17:19 UTC (permalink / raw)
  To: David Miller, eric.dumazet, netdev, johann.baudy
In-Reply-To: <20101011172932.GB12342@orbit.nwl.cc>

On Mon, Oct 11, 2010 at 07:29:32PM +0200, Phil Sutter wrote:
> On Mon, Oct 11, 2010 at 09:01:53AM -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Mon, 11 Oct 2010 16:03:02 +0200
> > 
> > > If we dont test ETH_P_8021Q protocol here, we allow sending 1504 bytes
> > > frames for MTU=1500
> > > 
> > > Should we really care ?
> > > 
> > > If not, just do
> > > 
> > > reserve = dev->hard_header_len + VLAN_HLEN;
> > 
> > Thats a good point, I think we need to validate the SKB protocol
> > field.
> 
> Which is set to the value of the passed struct sockaddr_ll field
> sll_protocol. At least in the two userspace code samples I have here,
> the later field is set to htons(ETH_P_ALL). So unless one changes the
> API, the only way to find out the packet type is to actually parse the
> given ethernet header.

Yes, like eth_type_trans does I guess.  I think we had a similar
discussion already:

http://lists.openwall.net/netdev/2010/01/06/38

Summary: if we want to make the protocol field have the correct
value for this case we need to make it work for other
transports not just for ethernet.

> Since tpacket_rcv() just interprets the vlan_tci skb field, such
> detailed packet inspection is otherwise not done in af_packet.c.
> 
> Greetings, Phil
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox