Netdev List
 help / color / mirror / Atom feed
* [v5 Patch 1/3] netpoll: add generic support for bridge and bonding devices
From: Amerigo Wang @ 2010-05-05  8:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jay Vosburgh, Amerigo Wang, Neil Horman, netdev, Matt Mackall,
	bridge, David Miller, Jeff Moyer, Andy Gospodarek, bonding-devel

V5:
Fix coding style problems pointed by David.

V4:
Use "unlikely" to mark netpoll call path, suggested by Stephen.
Handle NETDEV_GOING_DOWN case.

V3:
Update to latest Linus' tree.
Fix deadlocks when releasing slaves of bonding devices.
Thanks to Andy.

V2:
Fix some bugs of previous version.
Remove ->netpoll_setup and ->netpoll_xmit, they are not necessary.
Don't poll all underlying devices, poll ->real_dev in struct netpoll.
Thanks to David for suggesting above.

------------>

This whole patchset is for adding netpoll support to bridge and bonding
devices. I already tested it for bridge, bonding, bridge over bonding,
and bonding over bridge. It looks fine now.


To make bridge and bonding support netpoll, we need to adjust
some netpoll generic code. This patch does the following things:

1) introduce two new priv_flags for struct net_device:
   IFF_IN_NETPOLL which identifies we are processing a netpoll;
   IFF_DISABLE_NETPOLL is used to disable netpoll support for a device
   at run-time;

2) introduce one new method for netdev_ops:
   ->ndo_netpoll_cleanup() is used to clean up netpoll when a device is
     removed.

3) introduce netpoll_poll_dev() which takes a struct net_device * parameter;
   export netpoll_send_skb() and netpoll_poll_dev() which will be used later;

4) hide a pointer to struct netpoll in struct netpoll_info, ditto.

5) introduce ->real_dev for struct netpoll.

6) introduce a new status NETDEV_BONDING_DESLAE, which is used to disable
   netconsole before releasing a slave, to avoid deadlocks.

Cc: David Miller <davem@davemloft.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: WANG Cong <amwang@redhat.com>

---

Index: linux-2.6/include/linux/if.h
===================================================================
--- linux-2.6.orig/include/linux/if.h
+++ linux-2.6/include/linux/if.h
@@ -71,6 +71,8 @@
 					 * release skb->dst
 					 */
 #define IFF_DONT_BRIDGE 0x800		/* disallow bridging this ether dev */
+#define IFF_IN_NETPOLL	0x1000		/* whether we are processing netpoll */
+#define IFF_DISABLE_NETPOLL	0x2000	/* disable netpoll at run-time */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
Index: linux-2.6/include/linux/netdevice.h
===================================================================
--- linux-2.6.orig/include/linux/netdevice.h
+++ linux-2.6/include/linux/netdevice.h
@@ -667,6 +667,7 @@ struct net_device_ops {
 						        unsigned short vid);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	void                    (*ndo_poll_controller)(struct net_device *dev);
+	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
Index: linux-2.6/include/linux/netpoll.h
===================================================================
--- linux-2.6.orig/include/linux/netpoll.h
+++ linux-2.6/include/linux/netpoll.h
@@ -14,6 +14,7 @@
 
 struct netpoll {
 	struct net_device *dev;
+	struct net_device *real_dev;
 	char dev_name[IFNAMSIZ];
 	const char *name;
 	void (*rx_hook)(struct netpoll *, int, char *, int);
@@ -36,8 +37,11 @@ struct netpoll_info {
 	struct sk_buff_head txq;
 
 	struct delayed_work tx_work;
+
+	struct netpoll *netpoll;
 };
 
+void netpoll_poll_dev(struct net_device *dev);
 void netpoll_poll(struct netpoll *np);
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
@@ -47,6 +51,7 @@ int netpoll_trap(void);
 void netpoll_set_trap(int trap);
 void netpoll_cleanup(struct netpoll *np);
 int __netpoll_rx(struct sk_buff *skb);
+void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
 
 
 #ifdef CONFIG_NETPOLL
Index: linux-2.6/net/core/netpoll.c
===================================================================
--- linux-2.6.orig/net/core/netpoll.c
+++ linux-2.6/net/core/netpoll.c
@@ -179,9 +179,8 @@ static void service_arp_queue(struct net
 	}
 }
 
-void netpoll_poll(struct netpoll *np)
+void netpoll_poll_dev(struct net_device *dev)
 {
-	struct net_device *dev = np->dev;
 	const struct net_device_ops *ops;
 
 	if (!dev || !netif_running(dev))
@@ -201,6 +200,11 @@ void netpoll_poll(struct netpoll *np)
 	zap_completion_queue();
 }
 
+void netpoll_poll(struct netpoll *np)
+{
+	netpoll_poll_dev(np->dev);
+}
+
 static void refill_skbs(void)
 {
 	struct sk_buff *skb;
@@ -282,7 +286,7 @@ static int netpoll_owner_active(struct n
 	return 0;
 }
 
-static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
 	int status = NETDEV_TX_BUSY;
 	unsigned long tries;
@@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netp
 		     tries > 0; --tries) {
 			if (__netif_tx_trylock(txq)) {
 				if (!netif_tx_queue_stopped(txq)) {
+					dev->priv_flags |= IFF_IN_NETPOLL;
 					status = ops->ndo_start_xmit(skb, dev);
+					dev->priv_flags &= ~IFF_IN_NETPOLL;
 					if (status == NETDEV_TX_OK)
 						txq_trans_update(txq);
 				}
@@ -756,7 +762,10 @@ int netpoll_setup(struct netpoll *np)
 		atomic_inc(&npinfo->refcnt);
 	}
 
-	if (!ndev->netdev_ops->ndo_poll_controller) {
+	npinfo->netpoll = np;
+
+	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+	    !ndev->netdev_ops->ndo_poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
 		       np->name, np->dev_name);
 		err = -ENOTSUPP;
@@ -878,6 +887,7 @@ void netpoll_cleanup(struct netpoll *np)
 			}
 
 			if (atomic_dec_and_test(&npinfo->refcnt)) {
+				const struct net_device_ops *ops;
 				skb_queue_purge(&npinfo->arp_tx);
 				skb_queue_purge(&npinfo->txq);
 				cancel_rearming_delayed_work(&npinfo->tx_work);
@@ -885,7 +895,11 @@ void netpoll_cleanup(struct netpoll *np)
 				/* clean after last, unfinished work */
 				__skb_queue_purge(&npinfo->txq);
 				kfree(npinfo);
-				np->dev->npinfo = NULL;
+				ops = np->dev->netdev_ops;
+				if (ops->ndo_netpoll_cleanup)
+					ops->ndo_netpoll_cleanup(np->dev);
+				else
+					np->dev->npinfo = NULL;
 			}
 		}
 
@@ -908,6 +922,7 @@ void netpoll_set_trap(int trap)
 		atomic_dec(&trapped);
 }
 
+EXPORT_SYMBOL(netpoll_send_skb);
 EXPORT_SYMBOL(netpoll_set_trap);
 EXPORT_SYMBOL(netpoll_trap);
 EXPORT_SYMBOL(netpoll_print_options);
@@ -915,4 +930,5 @@ EXPORT_SYMBOL(netpoll_parse_options);
 EXPORT_SYMBOL(netpoll_setup);
 EXPORT_SYMBOL(netpoll_cleanup);
 EXPORT_SYMBOL(netpoll_send_udp);
+EXPORT_SYMBOL(netpoll_poll_dev);
 EXPORT_SYMBOL(netpoll_poll);
Index: linux-2.6/drivers/net/netconsole.c
===================================================================
--- linux-2.6.orig/drivers/net/netconsole.c
+++ linux-2.6/drivers/net/netconsole.c
@@ -665,7 +665,8 @@ static int netconsole_netdev_event(struc
 	struct netconsole_target *nt;
 	struct net_device *dev = ptr;
 
-	if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER))
+	if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER ||
+	      event == NETDEV_BONDING_DESLAVE || event == NETDEV_GOING_DOWN))
 		goto done;
 
 	spin_lock_irqsave(&target_list_lock, flags);
@@ -677,19 +678,21 @@ static int netconsole_netdev_event(struc
 				strlcpy(nt->np.dev_name, dev->name, IFNAMSIZ);
 				break;
 			case NETDEV_UNREGISTER:
-				if (!nt->enabled)
-					break;
 				netpoll_cleanup(&nt->np);
+				/* Fall through */
+			case NETDEV_GOING_DOWN:
+			case NETDEV_BONDING_DESLAVE:
 				nt->enabled = 0;
-				printk(KERN_INFO "netconsole: network logging stopped"
-					", interface %s unregistered\n",
-					dev->name);
 				break;
 			}
 		}
 		netconsole_target_put(nt);
 	}
 	spin_unlock_irqrestore(&target_list_lock, flags);
+	if (event == NETDEV_UNREGISTER || event == NETDEV_BONDING_DESLAVE)
+		printk(KERN_INFO "netconsole: network logging stopped, "
+			"interface %s %s\n",  dev->name,
+			event == NETDEV_UNREGISTER ? "unregistered" : "released slaves");
 
 done:
 	return NOTIFY_DONE;
Index: linux-2.6/include/linux/notifier.h
===================================================================
--- linux-2.6.orig/include/linux/notifier.h
+++ linux-2.6/include/linux/notifier.h
@@ -203,6 +203,7 @@ static inline int notifier_to_errno(int 
 #define NETDEV_BONDING_NEWTYPE  0x000F
 #define NETDEV_POST_INIT	0x0010
 #define NETDEV_UNREGISTER_BATCH 0x0011
+#define NETDEV_BONDING_DESLAVE  0x0012
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN

^ permalink raw reply

* [v5 Patch 2/3] bridge: make bridge support netpoll
From: Amerigo Wang @ 2010-05-05  8:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jay Vosburgh, Amerigo Wang, Neil Horman, netdev, Matt Mackall,
	bridge, David Miller, Jeff Moyer, Andy Gospodarek, bonding-devel
In-Reply-To: <20100505081514.5157.83783.sendpatchset@localhost.localdomain>


Based on the previous patch, make bridge support netpoll by:

1) implement the 2 methods to support netpoll for bridge;

2) modify netpoll during forwarding packets via bridge;

3) disable netpoll support of bridge when a netpoll-unabled device
   is added to bridge;

4) enable netpoll support when all underlying devices support netpoll.

Cc: David Miller <davem@davemloft.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Stephen Hemminger <shemminger@linux-foundation.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: WANG Cong <amwang@redhat.com>

---

Index: linux-2.6/net/bridge/br_device.c
===================================================================
--- linux-2.6.orig/net/bridge/br_device.c
+++ linux-2.6/net/bridge/br_device.c
@@ -13,8 +13,10 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
+#include <linux/list.h>
 
 #include <asm/uaccess.h>
 #include "br_private.h"
@@ -162,6 +164,59 @@ static int br_set_tx_csum(struct net_dev
 	return 0;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+bool br_devices_support_netpoll(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+	bool ret = true;
+	int count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&br->lock, flags);
+	list_for_each_entry(p, &br->port_list, list) {
+		count++;
+		if ((p->dev->priv_flags & IFF_DISABLE_NETPOLL) ||
+		    !p->dev->netdev_ops->ndo_poll_controller)
+			ret = false;
+	}
+	spin_unlock_irqrestore(&br->lock, flags);
+	return count != 0 && ret;
+}
+
+static void br_poll_controller(struct net_device *br_dev)
+{
+	struct netpoll *np = br_dev->npinfo->netpoll;
+
+	if (np->real_dev != br_dev)
+		netpoll_poll_dev(np->real_dev);
+}
+
+void br_netpoll_cleanup(struct net_device *br_dev)
+{
+	struct net_bridge *br = netdev_priv(br_dev);
+	struct net_bridge_port *p, *n;
+	const struct net_device_ops *ops;
+
+	br->dev->npinfo = NULL;
+	list_for_each_entry_safe(p, n, &br->port_list, list) {
+		if (p->dev) {
+			ops = p->dev->netdev_ops;
+			if (ops->ndo_netpoll_cleanup)
+				ops->ndo_netpoll_cleanup(p->dev);
+			else
+				p->dev->npinfo = NULL;
+		}
+	}
+}
+
+#else
+
+void br_netpoll_cleanup(struct net_device *br_dev)
+{
+}
+
+#endif
+
 static const struct ethtool_ops br_ethtool_ops = {
 	.get_drvinfo    = br_getinfo,
 	.get_link	= ethtool_op_get_link,
@@ -184,6 +239,10 @@ static const struct net_device_ops br_ne
 	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
 	.ndo_change_mtu		 = br_change_mtu,
 	.ndo_do_ioctl		 = br_dev_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_cleanup	 = br_netpoll_cleanup,
+	.ndo_poll_controller	 = br_poll_controller,
+#endif
 };
 
 void br_dev_setup(struct net_device *dev)
Index: linux-2.6/net/bridge/br_forward.c
===================================================================
--- linux-2.6.orig/net/bridge/br_forward.c
+++ linux-2.6/net/bridge/br_forward.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/skbuff.h>
 #include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
@@ -50,7 +51,13 @@ int br_dev_queue_push_xmit(struct sk_buf
 		else {
 			skb_push(skb, ETH_HLEN);
 
-			dev_queue_xmit(skb);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+			if (unlikely(skb->dev->priv_flags & IFF_IN_NETPOLL)) {
+				netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
+				skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
+			} else
+#endif
+				dev_queue_xmit(skb);
 		}
 	}
 
@@ -66,9 +73,23 @@ int br_forward_finish(struct sk_buff *sk
 
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct net_bridge *br = to->br;
+	if (unlikely(br->dev->priv_flags & IFF_IN_NETPOLL)) {
+		struct netpoll *np;
+		to->dev->npinfo = skb->dev->npinfo;
+		np = skb->dev->npinfo->netpoll;
+		np->real_dev = np->dev = to->dev;
+		to->dev->priv_flags |= IFF_IN_NETPOLL;
+	}
+#endif
 	skb->dev = to->dev;
 	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 			br_forward_finish);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (skb->dev->npinfo)
+		skb->dev->npinfo->netpoll->dev = br->dev;
+#endif
 }
 
 static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
Index: linux-2.6/net/bridge/br_if.c
===================================================================
--- linux-2.6.orig/net/bridge/br_if.c
+++ linux-2.6/net/bridge/br_if.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/ethtool.h>
 #include <linux/if_arp.h>
 #include <linux/module.h>
@@ -153,6 +154,14 @@ static void del_nbp(struct net_bridge_po
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (br_devices_support_netpoll(br))
+		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+	if (dev->netdev_ops->ndo_netpoll_cleanup)
+		dev->netdev_ops->ndo_netpoll_cleanup(dev);
+	else
+		dev->npinfo = NULL;
+#endif
 	call_rcu(&p->rcu, destroy_nbp_rcu);
 }
 
@@ -165,6 +174,8 @@ static void del_br(struct net_bridge *br
 		del_nbp(p);
 	}
 
+	br_netpoll_cleanup(br->dev);
+
 	del_timer_sync(&br->gc_timer);
 
 	br_sysfs_delbr(br->dev);
@@ -438,6 +449,20 @@ int br_add_if(struct net_bridge *br, str
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (br_devices_support_netpoll(br)) {
+		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+		if (br->dev->npinfo)
+			dev->npinfo = br->dev->npinfo;
+	} else if (!(br->dev->priv_flags & IFF_DISABLE_NETPOLL)) {
+		br->dev->priv_flags |= IFF_DISABLE_NETPOLL;
+		printk(KERN_INFO "New device %s does not support netpoll\n",
+			dev->name);
+		printk(KERN_INFO "Disabling netpoll for %s\n",
+			br->dev->name);
+	}
+#endif
+
 	return 0;
 err2:
 	br_fdb_delete_by_port(br, p, 1);
Index: linux-2.6/net/bridge/br_private.h
===================================================================
--- linux-2.6.orig/net/bridge/br_private.h
+++ linux-2.6/net/bridge/br_private.h
@@ -233,6 +233,8 @@ static inline int br_is_root_bridge(cons
 extern void br_dev_setup(struct net_device *dev);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
 			       struct net_device *dev);
+extern bool br_devices_support_netpoll(struct net_bridge *br);
+extern void br_netpoll_cleanup(struct net_device *br_dev);
 
 /* br_fdb.c */
 extern int br_fdb_init(void);

^ permalink raw reply

* [v5 Patch 3/3] bonding: make bonding support netpoll
From: Amerigo Wang @ 2010-05-05  8:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jay Vosburgh, Amerigo Wang, Neil Horman, netdev, Matt Mackall,
	bridge, David Miller, Jeff Moyer, Andy Gospodarek, bonding-devel
In-Reply-To: <20100505081514.5157.83783.sendpatchset@localhost.localdomain>


Based on Andy's work, but I modified a lot.

Similar to the patch for bridge, this patch does:

1) implement the 2 methods to support netpoll for bonding;

2) modify netpoll during forwarding packets via bonding;

3) disable netpoll support of bonding when a netpoll-unabled device
   is added to bonding;

4) enable netpoll support when all underlying devices support netpoll.

Cc: Andy Gospodarek <gospo@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Jay Vosburgh <fubar@us.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: WANG Cong <amwang@redhat.com>

---

Index: linux-2.6/drivers/net/bonding/bond_main.c
===================================================================
--- linux-2.6.orig/drivers/net/bonding/bond_main.c
+++ linux-2.6/drivers/net/bonding/bond_main.c
@@ -59,6 +59,7 @@
 #include <linux/uaccess.h>
 #include <linux/errno.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/etherdevice.h>
@@ -430,7 +431,18 @@ int bond_dev_queue_xmit(struct bonding *
 	}
 
 	skb->priority = 1;
-	dev_queue_xmit(skb);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (unlikely(bond->dev->priv_flags & IFF_IN_NETPOLL)) {
+		struct netpoll *np = bond->dev->npinfo->netpoll;
+		slave_dev->npinfo = bond->dev->npinfo;
+		np->real_dev = np->dev = skb->dev;
+		slave_dev->priv_flags |= IFF_IN_NETPOLL;
+		netpoll_send_skb(np, skb);
+		slave_dev->priv_flags &= ~IFF_IN_NETPOLL;
+		np->dev = bond->dev;
+	} else
+#endif
+		dev_queue_xmit(skb);
 
 	return 0;
 }
@@ -1329,6 +1341,61 @@ static void bond_detach_slave(struct bon
 	bond->slave_cnt--;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/*
+ * You must hold read lock on bond->lock before calling this.
+ */
+static bool slaves_support_netpoll(struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	int i = 0;
+	bool ret = true;
+
+	bond_for_each_slave(bond, slave, i) {
+		if ((slave->dev->priv_flags & IFF_DISABLE_NETPOLL) ||
+		    !slave->dev->netdev_ops->ndo_poll_controller)
+			ret = false;
+	}
+	return i != 0 && ret;
+}
+
+static void bond_poll_controller(struct net_device *bond_dev)
+{
+	struct net_device *dev = bond_dev->npinfo->netpoll->real_dev;
+	if (dev != bond_dev)
+		netpoll_poll_dev(dev);
+}
+
+static void bond_netpoll_cleanup(struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	const struct net_device_ops *ops;
+	int i;
+
+	read_lock(&bond->lock);
+	bond_dev->npinfo = NULL;
+	bond_for_each_slave(bond, slave, i) {
+		if (slave->dev) {
+			ops = slave->dev->netdev_ops;
+			if (ops->ndo_netpoll_cleanup)
+				ops->ndo_netpoll_cleanup(slave->dev);
+			else
+				slave->dev->npinfo = NULL;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
+#else
+
+static void bond_netpoll_cleanup(struct net_device *bond_dev)
+{
+}
+
+#endif
+
 /*---------------------------------- IOCTL ----------------------------------*/
 
 static int bond_sethwaddr(struct net_device *bond_dev,
@@ -1735,6 +1802,18 @@ int bond_enslave(struct net_device *bond
 
 	bond_set_carrier(bond);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (slaves_support_netpoll(bond_dev)) {
+		bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+		if (bond_dev->npinfo)
+			slave_dev->npinfo = bond_dev->npinfo;
+	} else if (!(bond_dev->priv_flags & IFF_DISABLE_NETPOLL)) {
+		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
+		pr_info("New slave device %s does not support netpoll\n",
+			slave_dev->name);
+		pr_info("Disabling netpoll support for %s\n", bond_dev->name);
+	}
+#endif
 	read_unlock(&bond->lock);
 
 	res = bond_create_slave_symlinks(bond_dev, slave_dev);
@@ -1801,6 +1880,7 @@ int bond_release(struct net_device *bond
 		return -EINVAL;
 	}
 
+	netdev_bonding_change(bond_dev, NETDEV_BONDING_DESLAVE);
 	write_lock_bh(&bond->lock);
 
 	slave = bond_get_slave_by_dev(bond, slave_dev);
@@ -1929,6 +2009,17 @@ int bond_release(struct net_device *bond
 
 	netdev_set_master(slave_dev, NULL);
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	read_lock_bh(&bond->lock);
+	if (slaves_support_netpoll(bond_dev))
+		bond_dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
+	read_unlock_bh(&bond->lock);
+	if (slave_dev->netdev_ops->ndo_netpoll_cleanup)
+		slave_dev->netdev_ops->ndo_netpoll_cleanup(slave_dev);
+	else
+		slave_dev->npinfo = NULL;
+#endif
+
 	/* close slave before restoring its mac address */
 	dev_close(slave_dev);
 
@@ -4448,6 +4539,10 @@ static const struct net_device_ops bond_
 	.ndo_vlan_rx_register	= bond_vlan_rx_register,
 	.ndo_vlan_rx_add_vid 	= bond_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= bond_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_cleanup	= bond_netpoll_cleanup,
+	.ndo_poll_controller	= bond_poll_controller,
+#endif
 };
 
 static void bond_destructor(struct net_device *bond_dev)
@@ -4541,6 +4636,8 @@ static void bond_uninit(struct net_devic
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
+	bond_netpoll_cleanup(bond_dev);
+
 	/* Release the bonded slaves */
 	bond_release_all(bond_dev);

^ permalink raw reply

* [net-next-2.6 PATCH 1/2] ixgbe: Add boolean parameter to ixgbe_set_vmolr
From: Jeff Kirsher @ 2010-05-05  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Greg Rose, Jeff Kirsher

From: Greg Rose <gregory.v.rose@intel.com>

Add a boolean parameter to ixgbe-set_vmolr so that the caller can
specify whether the pool should accept untagged packets.  Required
for a follow on patch to enable administrative configuration of port
VLAN for virtual functions.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c  |    2 +-
 drivers/net/ixgbe/ixgbe_sriov.c |   11 +++++++----
 drivers/net/ixgbe/ixgbe_sriov.h |    2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 2ae5a51..0a0e90e 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -2722,7 +2722,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
 		IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), (1 << vf_shift));
 		IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), (1 << vf_shift));
 		IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
-		ixgbe_set_vmolr(hw, adapter->num_vfs);
+		ixgbe_set_vmolr(hw, adapter->num_vfs, true);
 	}
 
 	/* Program MRQC for the distribution of queues */
diff --git a/drivers/net/ixgbe/ixgbe_sriov.c b/drivers/net/ixgbe/ixgbe_sriov.c
index d4cd20f..53f364d 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ixgbe/ixgbe_sriov.c
@@ -113,13 +113,16 @@ int ixgbe_set_vf_vlan(struct ixgbe_adapter *adapter, int add, int vid, u32 vf)
 }
 
 
-void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf)
+void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe)
 {
 	u32 vmolr = IXGBE_READ_REG(hw, IXGBE_VMOLR(vf));
-	vmolr |= (IXGBE_VMOLR_AUPE |
-		  IXGBE_VMOLR_ROMPE |
+	vmolr |= (IXGBE_VMOLR_ROMPE |
 		  IXGBE_VMOLR_ROPE |
 		  IXGBE_VMOLR_BAM);
+	if (aupe)
+		vmolr |= IXGBE_VMOLR_AUPE;
+	else
+		vmolr &= ~IXGBE_VMOLR_AUPE;
 	IXGBE_WRITE_REG(hw, IXGBE_VMOLR(vf), vmolr);
 }
 
@@ -128,7 +131,7 @@ inline void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf)
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* reset offloads to defaults */
-	ixgbe_set_vmolr(hw, vf);
+	ixgbe_set_vmolr(hw, vf, true);
 
 
 	/* reset multicast table array for vf */
diff --git a/drivers/net/ixgbe/ixgbe_sriov.h b/drivers/net/ixgbe/ixgbe_sriov.h
index 51d1106..7fb1288 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ixgbe/ixgbe_sriov.h
@@ -32,7 +32,7 @@ int ixgbe_set_vf_multicasts(struct ixgbe_adapter *adapter,
                             int entries, u16 *hash_list, u32 vf);
 void ixgbe_restore_vf_multicasts(struct ixgbe_adapter *adapter);
 int ixgbe_set_vf_vlan(struct ixgbe_adapter *adapter, int add, int vid, u32 vf);
-void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf);
+void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe);
 void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf);
 void ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf);
 void ixgbe_msg_task(struct ixgbe_adapter *adapter);


^ permalink raw reply related

* [net-next-2.6 PATCH 2/2] ixgbe: Add support for VF MAC and VLAN configuration
From: Jeff Kirsher @ 2010-05-05  8:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Greg Rose, Jeff Kirsher
In-Reply-To: <20100505081129.5709.39430.stgit@localhost.localdomain>

From: Greg Rose <gregory.v.rose@intel.com>

Add support for the "ip link set" and "ip link show" commands that allow
configuration of the virtual functions' MAC and port VLAN via user space
command line.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe.h       |    3 +
 drivers/net/ixgbe/ixgbe_main.c  |    4 ++
 drivers/net/ixgbe/ixgbe_sriov.c |  104 ++++++++++++++++++++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_sriov.h |    6 ++
 drivers/net/ixgbe/ixgbe_type.h  |    5 ++
 5 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 79c35ae..d0ea3d6 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -111,7 +111,10 @@ struct vf_data_storage {
 	u16 default_vf_vlan_id;
 	u16 vlans_enabled;
 	bool clear_to_send;
+	bool pf_set_mac;
 	int rar;
+	u16 pf_vlan; /* When set, guest VLAN config not allowed. */
+	u16 pf_qos;
 };
 
 /* wrapper around a pointer to a socket buffer,
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 0a0e90e..d1a1868 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -6311,6 +6311,10 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_vlan_rx_add_vid	= ixgbe_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= ixgbe_vlan_rx_kill_vid,
 	.ndo_do_ioctl		= ixgbe_ioctl,
+	.ndo_set_vf_mac		= ixgbe_ndo_set_vf_mac,
+	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
+	.ndo_set_vf_tx_rate	= ixgbe_ndo_set_vf_bw,
+	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
diff --git a/drivers/net/ixgbe/ixgbe_sriov.c b/drivers/net/ixgbe/ixgbe_sriov.c
index 53f364d..221b2ca 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ixgbe/ixgbe_sriov.c
@@ -126,13 +126,34 @@ void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe)
 	IXGBE_WRITE_REG(hw, IXGBE_VMOLR(vf), vmolr);
 }
 
+static void ixgbe_set_vmvir(struct ixgbe_adapter *adapter, u32 vid, u32 vf)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+
+	if (vid)
+		IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf),
+				(vid | IXGBE_VMVIR_VLANA_DEFAULT));
+	else
+		IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf), 0);
+}
+
 inline void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* reset offloads to defaults */
-	ixgbe_set_vmolr(hw, vf, true);
-
+	if (adapter->vfinfo[vf].pf_vlan) {
+		ixgbe_set_vf_vlan(adapter, true,
+				  adapter->vfinfo[vf].pf_vlan, vf);
+		ixgbe_set_vmvir(adapter,
+				(adapter->vfinfo[vf].pf_vlan |
+				 (adapter->vfinfo[vf].pf_qos <<
+				  VLAN_PRIO_SHIFT)), vf);
+		ixgbe_set_vmolr(hw, vf, false);
+	} else {
+		ixgbe_set_vmvir(adapter, 0, vf);
+		ixgbe_set_vmolr(hw, vf, true);
+	}
 
 	/* reset multicast table array for vf */
 	adapter->vfinfo[vf].num_vf_mc_hashes = 0;
@@ -266,10 +287,12 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	case IXGBE_VF_SET_MAC_ADDR:
 		{
 			u8 *new_mac = ((u8 *)(&msgbuf[1]));
-			if (is_valid_ether_addr(new_mac))
+			if (is_valid_ether_addr(new_mac) &&
+			    !adapter->vfinfo[vf].pf_set_mac)
 				ixgbe_set_vf_mac(adapter, vf, new_mac);
 			else
-				retval = -1;
+				ixgbe_set_vf_mac(adapter,
+				  vf, adapter->vfinfo[vf].vf_mac_addresses);
 		}
 		break;
 	case IXGBE_VF_SET_MULTICAST:
@@ -363,3 +386,76 @@ void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter)
 	}
 }
 
+int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	if (!is_valid_ether_addr(mac) || (vf >= adapter->num_vfs))
+		return -EINVAL;
+	adapter->vfinfo[vf].pf_set_mac = true;
+	dev_info(&adapter->pdev->dev, "setting MAC %pM on VF %d\n", mac, vf);
+	dev_info(&adapter->pdev->dev, "Reload the VF driver to make this"
+				      " change effective.");
+	if (test_bit(__IXGBE_DOWN, &adapter->state)) {
+		dev_warn(&adapter->pdev->dev, "The VF MAC address has been set,"
+			 " but the PF device is not up.\n");
+		dev_warn(&adapter->pdev->dev, "Bring the PF device up before"
+			 " attempting to use the VF device.\n");
+	}
+	return ixgbe_set_vf_mac(adapter, vf, mac);
+}
+
+int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
+{
+	int err = 0;
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7))
+		return -EINVAL;
+	if (vlan || qos) {
+		err = ixgbe_set_vf_vlan(adapter, true, vlan, vf);
+		if (err)
+			goto out;
+		ixgbe_set_vmvir(adapter, vlan | (qos << VLAN_PRIO_SHIFT), vf);
+		ixgbe_set_vmolr(&adapter->hw, vf, false);
+		adapter->vfinfo[vf].pf_vlan = vlan;
+		adapter->vfinfo[vf].pf_qos = qos;
+		dev_info(&adapter->pdev->dev,
+			 "Setting VLAN %d, QOS 0x%x on VF %d\n", vlan, qos, vf);
+		if (test_bit(__IXGBE_DOWN, &adapter->state)) {
+			dev_warn(&adapter->pdev->dev,
+				 "The VF VLAN has been set,"
+				 " but the PF device is not up.\n");
+			dev_warn(&adapter->pdev->dev,
+				 "Bring the PF device up before"
+				 " attempting to use the VF device.\n");
+		}
+	} else {
+		err = ixgbe_set_vf_vlan(adapter, false,
+					adapter->vfinfo[vf].pf_vlan, vf);
+		ixgbe_set_vmvir(adapter, vlan, vf);
+		ixgbe_set_vmolr(&adapter->hw, vf, true);
+		adapter->vfinfo[vf].pf_vlan = 0;
+		adapter->vfinfo[vf].pf_qos = 0;
+       }
+out:
+       return err;
+}
+
+int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate)
+{
+	return -EOPNOTSUPP;
+}
+
+int ixgbe_ndo_get_vf_config(struct net_device *netdev,
+			    int vf, struct ifla_vf_info *ivi)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	if (vf >= adapter->num_vfs)
+		return -EINVAL;
+	ivi->vf = vf;
+	memcpy(&ivi->mac, adapter->vfinfo[vf].vf_mac_addresses, ETH_ALEN);
+	ivi->tx_rate = 0;
+	ivi->vlan = adapter->vfinfo[vf].pf_vlan;
+	ivi->qos = adapter->vfinfo[vf].pf_qos;
+	return 0;
+}
diff --git a/drivers/net/ixgbe/ixgbe_sriov.h b/drivers/net/ixgbe/ixgbe_sriov.h
index 7fb1288..184730e 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ixgbe/ixgbe_sriov.h
@@ -42,6 +42,12 @@ int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask);
 void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter);
 void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter);
 void ixgbe_dump_registers(struct ixgbe_adapter *adapter);
+int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac);
+int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan,
+			   u8 qos);
+int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate);
+int ixgbe_ndo_get_vf_config(struct net_device *netdev,
+			    int vf, struct ifla_vf_info *ivi);
 
 #endif /* _IXGBE_SRIOV_H_ */
 
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index a0f9084..4277cbb 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -219,6 +219,7 @@
 #define IXGBE_MTQC      0x08120
 #define IXGBE_VLVF(_i)  (0x0F100 + ((_i) * 4))  /* 64 of these (0-63) */
 #define IXGBE_VLVFB(_i) (0x0F200 + ((_i) * 4))  /* 128 of these (0-127) */
+#define IXGBE_VMVIR(_i) (0x08000 + ((_i) * 4))  /* 64 of these (0-63) */
 #define IXGBE_VT_CTL    0x051B0
 #define IXGBE_VFRE(_i)  (0x051E0 + ((_i) * 4))
 #define IXGBE_VFTE(_i)  (0x08110 + ((_i) * 4))
@@ -1311,6 +1312,10 @@
 #define IXGBE_VLVF_ENTRIES      64
 #define IXGBE_VLVF_VLANID_MASK  0x00000FFF
 
+/* Per VF Port VLAN insertion rules */
+#define IXGBE_VMVIR_VLANA_DEFAULT 0x40000000 /* Always use default VLAN */
+#define IXGBE_VMVIR_VLANA_NEVER   0x80000000 /* Never insert VLAN tag */
+
 #define IXGBE_ETHERNET_IEEE_VLAN_TYPE 0x8100  /* 802.1q protocol */
 
 /* STATUS Bit Masks */


^ permalink raw reply related

* Re: [PATCH net-next-2.6] net: __alloc_skb() speedup
From: Eric Dumazet @ 2010-05-05  8:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, hadi, therbert
In-Reply-To: <20100505.010658.48498744.davem@davemloft.net>

Le mercredi 05 mai 2010 à 01:06 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 04 May 2010 19:10:54 +0200
> 
> > With following patch I can reach maximum rate of my pktgen+udpsink
> > simulator :
> > - 'old' machine : dual quad core E5450  @3.00GHz
> > - 64 UDP rx flows (only differ by destination port)
> > - RPS enabled, NIC interrupts serviced on cpu0
> > - rps dispatched on 7 other cores. (~130.000 IPI per second)
> > - SLAB allocator (faster than SLUB in this workload)
> > - tg3 NIC
> > - 1.080.000 pps without a single drop at NIC level.
> > 
> > Idea is to add two prefetchw() calls in __alloc_skb(), one to prefetch
> > first sk_buff cache line, the second to prefetch the shinfo part.
> > 
> > Also using one memset() to initialize all skb_shared_info fields instead
> > of one by one to reduce number of instructions, using long word moves.
> > 
> > All skb_shared_info fields before 'dataref' are cleared in 
> > __alloc_skb().
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> I'll apply this, nice work Eric.
> 
> But some caveats...
> 
> On several cpu types it is possible to "prefetch invalidate"
> cachelines.  PowerPC and sparc64 can both do it.  I'm pretty
> sure current gen x86 have SSE bits that can do this too.
> 
> In fact, the memset() for sparc64 is going to do these cacheline
> invalidates, making the prefetches on 'skb' in fact wasteful.
> It will just create spurious bus traffic.
> 

You mean memset() wont be inlined by ompiler to plain memory writes, but
use the custom kernel memset()  ?



^ permalink raw reply

* [net-next-2.6 PATCH 1/2] e1000e: increase rx fifo size to 36K on 82574 and 82583
From: Jeff Kirsher @ 2010-05-05  8:25 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change increases the RX fifo size to 36K for standard frames and
decreases the TX fifo size to 4K.  The reason for this change is that on
slower systems the RX is much more likely to backfill and need space than
the TX is.  As long as the TX fifo is twice the size of the MTU we should
have more than enough TX fifo.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/e1000e/82571.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/e1000e/82571.c b/drivers/net/e1000e/82571.c
index 17a25e1..1e73edd 100644
--- a/drivers/net/e1000e/82571.c
+++ b/drivers/net/e1000e/82571.c
@@ -1845,7 +1845,7 @@ struct e1000_info e1000_82574_info = {
 				  | FLAG_HAS_SMART_POWER_DOWN
 				  | FLAG_HAS_AMT
 				  | FLAG_HAS_CTRLEXT_ON_LOAD,
-	.pba			= 20,
+	.pba			= 36,
 	.max_hw_frame_size	= DEFAULT_JUMBO,
 	.get_variants		= e1000_get_variants_82571,
 	.mac_ops		= &e82571_mac_ops,
@@ -1862,7 +1862,7 @@ struct e1000_info e1000_82583_info = {
 				  | FLAG_HAS_SMART_POWER_DOWN
 				  | FLAG_HAS_AMT
 				  | FLAG_HAS_CTRLEXT_ON_LOAD,
-	.pba			= 20,
+	.pba			= 36,
 	.max_hw_frame_size	= ETH_FRAME_LEN + ETH_FCS_LEN,
 	.get_variants		= e1000_get_variants_82571,
 	.mac_ops		= &e82571_mac_ops,


^ permalink raw reply related

* [net-next-2.6 PATCH 2/2] e1000/e1000e: implement a simple interrupt moderation
From: Jeff Kirsher @ 2010-05-05  8:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Jesse Brandeburg, Jeff Kirsher
In-Reply-To: <20100505082537.5956.86868.stgit@localhost.localdomain>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

Back before e1000-7.3.20, the e1000 driver had a simple algorithm that
managed interrupt moderation.  The driver was updated in 7.3.20 to
have the new "adaptive" interrupt moderation but we have customer
requests to redeploy the old way as an option.  This patch adds the
old functionality back.  The new functionality can be enabled via
module parameter or at runtime via ethtool.
Module parameter: (InterruptThrottleRate=4) to use this new
moderation method.
Ethtool method: ethtool -C ethX rx-usecs 4

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/e1000/e1000_ethtool.c |    8 +++++---
 drivers/net/e1000/e1000_main.c    |   18 +++++++++++++++++-
 drivers/net/e1000/e1000_param.c   |   10 ++++++++--
 drivers/net/e1000e/ethtool.c      |    8 +++++---
 drivers/net/e1000e/netdev.c       |   18 +++++++++++++++++-
 drivers/net/e1000e/param.c        |    5 +++++
 6 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethtool.c b/drivers/net/e1000/e1000_ethtool.c
index d6931ca..2a3b2dc 100644
--- a/drivers/net/e1000/e1000_ethtool.c
+++ b/drivers/net/e1000/e1000_ethtool.c
@@ -1808,7 +1808,7 @@ static int e1000_get_coalesce(struct net_device *netdev,
 	if (adapter->hw.mac_type < e1000_82545)
 		return -EOPNOTSUPP;
 
-	if (adapter->itr_setting <= 3)
+	if (adapter->itr_setting <= 4)
 		ec->rx_coalesce_usecs = adapter->itr_setting;
 	else
 		ec->rx_coalesce_usecs = 1000000 / adapter->itr_setting;
@@ -1826,12 +1826,14 @@ static int e1000_set_coalesce(struct net_device *netdev,
 		return -EOPNOTSUPP;
 
 	if ((ec->rx_coalesce_usecs > E1000_MAX_ITR_USECS) ||
-	    ((ec->rx_coalesce_usecs > 3) &&
+	    ((ec->rx_coalesce_usecs > 4) &&
 	     (ec->rx_coalesce_usecs < E1000_MIN_ITR_USECS)) ||
 	    (ec->rx_coalesce_usecs == 2))
 		return -EINVAL;
 
-	if (ec->rx_coalesce_usecs <= 3) {
+	if (ec->rx_coalesce_usecs == 4) {
+		adapter->itr = adapter->itr_setting = 4;
+	} else if (ec->rx_coalesce_usecs <= 3) {
 		adapter->itr = 20000;
 		adapter->itr_setting = ec->rx_coalesce_usecs;
 	} else {
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index e6ebc22..4dd2c23 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -31,7 +31,7 @@
 
 char e1000_driver_name[] = "e1000";
 static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
-#define DRV_VERSION "7.3.21-k5-NAPI"
+#define DRV_VERSION "7.3.21-k6-NAPI"
 const char e1000_driver_version[] = DRV_VERSION;
 static const char e1000_copyright[] = "Copyright (c) 1999-2006 Intel Corporation.";
 
@@ -2386,6 +2386,22 @@ link_up:
 		}
 	}
 
+	/* Simple mode for Interrupt Throttle Rate (ITR) */
+	if (hw->mac_type >= e1000_82540 && adapter->itr_setting == 4) {
+		/*
+		 * Symmetric Tx/Rx gets a reduced ITR=2000;
+		 * Total asymmetrical Tx or Rx gets ITR=8000;
+		 * everyone else is between 2000-8000.
+		 */
+		u32 goc = (adapter->gotcl + adapter->gorcl) / 10000;
+		u32 dif = (adapter->gotcl > adapter->gorcl ?
+			    adapter->gotcl - adapter->gorcl :
+			    adapter->gorcl - adapter->gotcl) / 10000;
+		u32 itr = goc > 0 ? (dif * 6000 / goc + 2000) : 8000;
+
+		ew32(ITR, 1000000000 / (itr * 256));
+	}
+
 	/* Cause software interrupt to ensure rx ring is cleaned */
 	ew32(ICS, E1000_ICS_RXDMT0);
 
diff --git a/drivers/net/e1000/e1000_param.c b/drivers/net/e1000/e1000_param.c
index 543c6d1..9fbb562 100644
--- a/drivers/net/e1000/e1000_param.c
+++ b/drivers/net/e1000/e1000_param.c
@@ -484,11 +484,17 @@ void __devinit e1000_check_options(struct e1000_adapter *adapter)
 				adapter->itr_setting = adapter->itr;
 				adapter->itr = 20000;
 				break;
+			case 4:
+				e_dev_info("%s set to simplified "
+				           "(2000-8000) ints mode\n", opt.name);
+				adapter->itr_setting = adapter->itr;
+				break;
 			default:
 				e1000_validate_option(&adapter->itr, &opt,
 				        adapter);
-				/* save the setting, because the dynamic bits change itr */
-				/* clear the lower two bits because they are
+				/* save the setting, because the dynamic bits
+				 * change itr.
+				 * clear the lower two bits because they are
 				 * used as control */
 				adapter->itr_setting = adapter->itr & ~3;
 				break;
diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c
index 7f9915c..c81118a 100644
--- a/drivers/net/e1000e/ethtool.c
+++ b/drivers/net/e1000e/ethtool.c
@@ -1891,7 +1891,7 @@ static int e1000_get_coalesce(struct net_device *netdev,
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
-	if (adapter->itr_setting <= 3)
+	if (adapter->itr_setting <= 4)
 		ec->rx_coalesce_usecs = adapter->itr_setting;
 	else
 		ec->rx_coalesce_usecs = 1000000 / adapter->itr_setting;
@@ -1906,12 +1906,14 @@ static int e1000_set_coalesce(struct net_device *netdev,
 	struct e1000_hw *hw = &adapter->hw;
 
 	if ((ec->rx_coalesce_usecs > E1000_MAX_ITR_USECS) ||
-	    ((ec->rx_coalesce_usecs > 3) &&
+	    ((ec->rx_coalesce_usecs > 4) &&
 	     (ec->rx_coalesce_usecs < E1000_MIN_ITR_USECS)) ||
 	    (ec->rx_coalesce_usecs == 2))
 		return -EINVAL;
 
-	if (ec->rx_coalesce_usecs <= 3) {
+	if (ec->rx_coalesce_usecs == 4) {
+		adapter->itr = adapter->itr_setting = 4;
+	} else if (ec->rx_coalesce_usecs <= 3) {
 		adapter->itr = 20000;
 		adapter->itr_setting = ec->rx_coalesce_usecs;
 	} else {
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index d13760d..0a16465 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -52,7 +52,7 @@
 
 #include "e1000.h"
 
-#define DRV_VERSION "1.0.2-k2"
+#define DRV_VERSION "1.0.2-k4"
 char e1000e_driver_name[] = "e1000e";
 const char e1000e_driver_version[] = DRV_VERSION;
 
@@ -4087,6 +4087,22 @@ link_up:
 		}
 	}
 
+	/* Simple mode for Interrupt Throttle Rate (ITR) */
+	if (adapter->itr_setting == 4) {
+		/*
+		 * Symmetric Tx/Rx gets a reduced ITR=2000;
+		 * Total asymmetrical Tx or Rx gets ITR=8000;
+		 * everyone else is between 2000-8000.
+		 */
+		u32 goc = (adapter->gotc + adapter->gorc) / 10000;
+		u32 dif = (adapter->gotc > adapter->gorc ?
+			    adapter->gotc - adapter->gorc :
+			    adapter->gorc - adapter->gotc) / 10000;
+		u32 itr = goc > 0 ? (dif * 6000 / goc + 2000) : 8000;
+
+		ew32(ITR, 1000000000 / (itr * 256));
+	}
+
 	/* Cause software interrupt to ensure Rx ring is cleaned */
 	if (adapter->msix_entries)
 		ew32(ICS, adapter->rx_ring->ims_val);
diff --git a/drivers/net/e1000e/param.c b/drivers/net/e1000e/param.c
index f775a48..0f4077c 100644
--- a/drivers/net/e1000e/param.c
+++ b/drivers/net/e1000e/param.c
@@ -351,6 +351,11 @@ void __devinit e1000e_check_options(struct e1000_adapter *adapter)
 				adapter->itr_setting = adapter->itr;
 				adapter->itr = 20000;
 				break;
+			case 4:
+				e_info("%s set to simplified (2000-8000 ints) "
+				       "mode\n", opt.name);
+				adapter->itr_setting = 4;
+				break;
 			default:
 				/*
 				 * Save the setting, because the dynamic bits


^ permalink raw reply related

* Re: [PATCH net-next-2.6] net: __alloc_skb() speedup
From: David Miller @ 2010-05-05  8:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, hadi, therbert
In-Reply-To: <1273047734.2367.3.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 05 May 2010 10:22:14 +0200

> You mean memset() wont be inlined by ompiler to plain memory writes, but
> use the custom kernel memset()  ?

I hope memset() is never inlined for a 202 byte piece of memory on
sparc64 or powerpc.  What happens and makes sense on x86 is x86's
business :-)

Especially since that elides the cache invalidate optimizations, and
for anything >= 64 bytes those are absolutely critical on Niagara.

^ permalink raw reply

* [GIT]: Networking
From: David Miller @ 2010-05-05  8:27 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Remove bogus Kconfig default and fix bug in new sierra_net driver.
   From Elina Pasheva.

2) Fix powerpc OOPSer in e1000e, from Anton Blanchard.

3) FEC miscalculates register address in MAC address changing, writing
   to random memory, oops.  Fix from Mattias Walström.

4) p54pci doesn't use DMA API correctly, from Hans de Goede.

5) SOCK_DEBUG can miscompile, fix from Jan Engelhardt.

6) SCTP will skb_over_panic() when there are multiple parameter errors, fix
   from Neil Horman.

7) r8169_rx_interrupt() explodes when called from reset workqueue, fix from
   Eric Dumazet.

8) PPP input deref's two bytes before checking there are actually 2 bytes
   there in the linear packet area, furthermore ppp_read() isn't equipped
   to handle non-linear skbs.  Both fixes from Simon Arlott.

9) dm9601 writes wrong bits in DM_SHARED_CTRL register, resulting in
   severe degradation of performance (in some cases ~900Kb/s to
   ~30Kb/s).  Fix from Peter Korsgaard.

10) Add new Micrel PHY device driver, from David J. Choi.

11) ep93xx driver ACKs RX ring at the wrong spot, hanging chip.  Fix
    from Stefan Agner.

12) Default multicast hops socket setting is wrong as per RFC 3493.
    It should be '1' rather than the per-route unicast hop metric.
    Based upon a report by Elliot Hughes.

Please pull, thanks a lot!

The following changes since commit d93ac51c7a129db7a1431d859a3ef45a0b1f3fc5:
  Linus Torvalds (1):
        Merge branch 'for-linus' of git://git.kernel.org/.../sage/ceph-client

are available in the git repository at:

  master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6.git master

Anton Blanchard (1):
      e1000e: Fix oops caused by ASPM patch.

David J. Choi (1):
      drivers/net/phy: micrel phy driver

David S. Miller (3):
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless-2.6
      net: ep93xx_eth stops receiving packets
      ipv6: Fix default multicast hops setting.

Elina Pasheva (2):
      net/usb: remove default in Kconfig for sierra_net driver
      net/usb: initiate sync sequence in sierra_net.c driver

Eric Dumazet (1):
      r8169: Fix rtl8169_rx_interrupt()

Hans de Goede (1):
      p54pci: fix bugs in p54p_check_tx_ring

Jan Engelhardt (1):
      net: fix compile error due to double return type in SOCK_DEBUG

Mattias Walström (1):
      FEC: Fix kernel panic in fec_set_mac_address.

Neil Horman (1):
      sctp: Fix skb_over_panic resulting from multiple invalid parameter errors (CVE-2010-1173) (v4)

Peter Korsgaard (1):
      dm9601: fix phy/eeprom write routine

Sebastian Siewior (1):
      net/sb1250: register mdio bus in probe

Simon Arlott (2):
      ppp_generic: pull 2 bytes so that PPP_PROTO(skb) is valid
      ppp_generic: handle non-linear skbs when passing them to pppd

 drivers/net/arm/ep93xx_eth.c      |   10 ++--
 drivers/net/e1000e/netdev.c       |    3 +
 drivers/net/fec.c                 |    2 +-
 drivers/net/phy/Kconfig           |    5 ++
 drivers/net/phy/Makefile          |    1 +
 drivers/net/phy/micrel.c          |  104 +++++++++++++++++++++++++++++++++++++
 drivers/net/ppp_generic.c         |   34 ++++++++----
 drivers/net/r8169.c               |   22 ++++++--
 drivers/net/sb1250-mac.c          |   67 ++++++++++++------------
 drivers/net/usb/Kconfig           |    1 -
 drivers/net/usb/dm9601.c          |    2 +-
 drivers/net/usb/sierra_net.c      |    3 +
 drivers/net/wireless/p54/p54pci.c |    2 +-
 include/net/sctp/structs.h        |    1 +
 include/net/sock.h                |    2 +-
 net/ipv6/af_inet6.c               |    2 +-
 net/sctp/sm_make_chunk.c          |   62 ++++++++++++++++++++--
 17 files changed, 256 insertions(+), 67 deletions(-)
 create mode 100644 drivers/net/phy/micrel.c

^ permalink raw reply

* [PATCH v2] net/gianfar: drop recycled skbs on MTU change
From: Sebastian Andrzej Siewior @ 2010-05-05  8:30 UTC (permalink / raw)
  To: David Miller; +Cc: afleming, afleming, netdev
In-Reply-To: <20100505.005704.115933493.davem@davemloft.net>

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

The size for skbs which is added to the recycled list is using the
current descriptor size which is current MTU. gfar_new_skb() is also
using this size. So after changing or alteast increasing the MTU all
recycled skbs should be dropped.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
>> I think we should probably do this in free_skb_resources.  And remove
>> the call from gfar_close().
>
>Ok, Sebastian please rework your patch as requested by Andy.

This has the side effect of dropping them on reset which is not
required.

 drivers/net/gianfar.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 4e97ca1..5d3763f 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -1649,6 +1649,7 @@ static void free_skb_resources(struct gfar_private *priv)
 			sizeof(struct rxbd8) * priv->total_rx_ring_size,
 			priv->tx_queue[0]->tx_bd_base,
 			priv->tx_queue[0]->tx_bd_dma_base);
+	skb_queue_purge(&priv->rx_recycle);
 }
 
 void gfar_start(struct net_device *dev)
@@ -2088,7 +2089,6 @@ static int gfar_close(struct net_device *dev)
 
 	disable_napi(priv);
 
-	skb_queue_purge(&priv->rx_recycle);
 	cancel_work_sync(&priv->reset_task);
 	stop_gfar(dev);
 
-- 
1.6.6.1


^ permalink raw reply related

* Re: KS8851: Possible NULL dereferenced in ks8851_rx_pkts
From: Abraham Arce @ 2010-05-05  8:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100413.012844.26960227.davem@davemloft.net>

David,

>> These changes avoid a possible dereference in skb_reserve when skb is
>> NULL. I am increasing rx dropped packet count but not sure about how
>> to handle the dump of frames. Any advice is appreciated.
>
> This isn't sufficient to handle the NULL pointer.
>
> At a minimum you're going to have to do something about
> the fact that the chip has already been told to start
> bringing the packet into the RX fifo.

Ok

>
> If we just return without finishing up the hardware
> operation, the chip is probably going to hang the next
> time we come in here and tell it to start another RX
> DMA operation without having completed the previous one.
>
> The bug definitely needs to be fixed, however, but someone who knows
> this hardware and has access to it for testing will need to implement
> the fix.
>

I'll give a try and test, have a hardware with me...

Thanks!
Abraham

^ permalink raw reply

* 3 packet TCP window limit?
From: dormando @ 2010-05-05  9:10 UTC (permalink / raw)
  To: netdev

Hey,

Noticed in Linux that no matter what sysctl variable I twiddle, or what
TCP congestion algorithm is running, TCP will wait for remote acks after
sending the first 3 packets. After that it's normal.

Apologies, it's hard ot describe:

Linux server listening.

Remote -> SYN
(RTT wait)
Linux -> SYN/ACK
Remote -> ACK
Remote -> Packet (small HTTP request)
(RTT wait)
Linux -> Packet (x 3)
Remote -> (returning acks per packet)
(RTT wait)
Linux -> More packets (up to window size)

If the request response fits in 3 packets or less, that third RTT wait
never happens. The remote client gets all its data, and sends back all the
FIN/ACK packets for closing the connection.

What's bizarre is that this 3 packet/4 packet barrier is regardless of how
much data there is to send. I can cause the extra RTT to flip on or off by
sending exactly +/- 1 byte to cause an extra packet.

Holding the connection open and repeating the request any number of times
runs just fine, after the initial request.

You can pretty easily see this by:
tc qdisc add dev eth0 root netem delay 100ms
... then fetching a 3k file, then 4k file from an http server running
linux. Well. at least I can see this easily. I tried on a half dozen boxes
(2.6.11 through 2.6.32).

I'm trying to track down where in the code this is, or why my sysctl
tuning isn't affecting it. I can't discern its purpose. The lag it causes
is pretty awful for far away clients; adding 300ms of latency will make a
small request take a full second, instead of 600ms.

I'm slugging through the code but any insight would be greatly
appreciated!

-Dormando


^ permalink raw reply

* Re: [Patch 2/3] sysctl: add proc_do_large_bitmap
From: Cong Wang @ 2010-05-05  9:20 UTC (permalink / raw)
  To: Changli Gao
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, adobriyan, David Miller
In-Reply-To: <4BE0E27C.7040200@redhat.com>

Cong Wang wrote:
> Changli Gao wrote:
>>                      add the following lines to let "echo 1-10 >>
>> /proc/..." work as normal.
> 
> Hmm, I haven't tested this, what did you see if we append
> lines into it?
> 
> Also, do we need appending lines to this /proc file when design it?
> Octavian? Eric?
> 

Hmm, currently this behaves like other /proc files, IOW,
echo 'foo' >> /proc/XXX is the same with echo 'foo' > /proc/XXX.

I think it is reasonable for bitmap /proc files to have
echo 'foo' >> /proc/XXX behaves like non-proc files, that is
appending numbers into that file, like what Changli mentioned.

Any objections?

Thanks.

^ permalink raw reply

* [Patch v10 0/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-05-05 10:26 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, ebiederm, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, xiaosuo, adobriyan, David Miller


Changes from the previous version:
- Use 'true' and 'false' for bool's;
- Fix some coding style problems;
- Allow appending lines to bitmap proc file so that it will be
  easier to add new bits.

------------------>

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

There are still some miss behaviors with regard to proc parsing in odd
invalid cases (for "40000\0-40001" all is acknowledged but only 40000
is accepted) but they are not easy to fix without changing the current
"acknowledge how much we accepted" behavior.

Because of that and because the same issues are present in the
existing proc_dointvec code as well I don't think its worth holding
the actual feature (port reservation) after such petty error recovery
issues.

^ permalink raw reply

* [Patch 1/3] sysctl: refactor integer handling proc code
From: Amerigo Wang @ 2010-05-05 10:26 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, ebiederm, xiaosuo, David Miller, adobriyan,
	Amerigo Wang
In-Reply-To: <20100505103033.5600.77502.sendpatchset@localhost.localdomain>

(Based on Octavian's work, and I modified a lot.)

As we are about to add another integer handling proc function a little
bit of cleanup is in order: add a few helper functions to improve code
readability and decrease code duplication.

In the process a bug is also fixed: if the user specifies a number
with more then 20 digits it will be interpreted as two integers
(e.g. 10000...13 will be interpreted as 100.... and 13).

Behavior for EFAULT handling was changed as well. Previous to this
patch, when an EFAULT error occurred in the middle of a write
operation, although some of the elements were set, that was not
acknowledged to the user (by shorting the write and returning the
number of bytes accepted). EFAULT is now treated just like any other
errors by acknowledging the amount of bytes accepted.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2040,8 +2040,122 @@ int proc_dostring(struct ctl_table *tabl
 			       buffer, lenp, ppos);
 }
 
+static size_t proc_skip_spaces(char **buf)
+{
+	size_t ret;
+	char *tmp = skip_spaces(*buf);
+	ret = tmp - *buf;
+	*buf = tmp;
+	return ret;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formated integer from a user buffer
+ *
+ * @buf - a kernel buffer
+ * @size - size of the kernel buffer
+ * @val - this is where the number will be stored
+ * @neg - set to %TRUE if number is negative
+ * @perm_tr - a vector which contains the allowed trailers
+ * @perm_tr_len - size of the perm_tr vector
+ * @tr - pointer to store the trailer character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read. If tr is non NULL and a trailing
+ * character exist (size is non zero after returning from this
+ * function) tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+			  unsigned long *val, bool *neg,
+			  const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+	int len;
+	char *p, tmp[TMPBUFLEN];
+
+	if (!*size)
+		return -EINVAL;
+
+	len = *size;
+	if (len > TMPBUFLEN - 1)
+		len = TMPBUFLEN - 1;
+
+	memcpy(tmp, *buf, len);
+
+	tmp[len] = 0;
+	p = tmp;
+	if (*p == '-' && *size > 1) {
+		*neg = true;
+		p++;
+	} else
+		*neg = false;
+	if (!isdigit(*p))
+		return -EINVAL;
+
+	*val = simple_strtoul(p, &p, 0);
 
-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+	len = p - tmp;
+
+	/* We don't know if the next char is whitespace thus we may accept
+	 * invalid integers (e.g. 1234...a) or two integers instead of one
+	 * (e.g. 123...1). So lets not allow such large numbers. */
+	if (len == TMPBUFLEN - 1)
+		return -EINVAL;
+
+	if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+		return -EINVAL;
+
+	if (tr && (len < *size))
+		*tr = *p;
+
+	*buf += len;
+	*size -= len;
+
+	return 0;
+}
+
+/**
+ * proc_put_long - coverts an integer to a decimal ASCII formated string
+ *
+ * @buf - the user buffer
+ * @size - the size of the user buffer
+ * @val - the integer to be converted
+ * @neg - sign of the number, %TRUE for negative
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read.
+ */
+static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
+			  bool neg)
+{
+	int len;
+	char tmp[TMPBUFLEN], *p = tmp;
+
+	sprintf(p, "%s%lu", neg ? "-" : "", val);
+	len = strlen(tmp);
+	if (len > *size)
+		len = *size;
+	if (copy_to_user(*buf, tmp, len))
+		return -EFAULT;
+	*size -= len;
+	*buf += len;
+	return 0;
+}
+#undef TMPBUFLEN
+
+static int proc_put_char(void __user **buf, size_t *size, char c)
+{
+	if (*size) {
+		char __user **buffer = (char __user **)buf;
+		if (put_user(c, *buffer))
+			return -EFAULT;
+		(*size)--, (*buffer)++;
+		*buf = *buffer;
+	}
+	return 0;
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 				 int *valp,
 				 int write, void *data)
 {
@@ -2050,33 +2164,31 @@ static int do_proc_dointvec_conv(int *ne
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = true;
 			*lvalp = (unsigned long)-val;
 		} else {
-			*negp = 0;
+			*negp = false;
 			*lvalp = (unsigned long)val;
 		}
 	}
 	return 0;
 }
 
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 		  int write, void __user *buffer,
 		  size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
-#define TMPBUFLEN 21
-	int *i, vleft, first = 1, neg;
-	unsigned long lval;
-	size_t left, len;
+	int *i, vleft, first = 1, err = 0;
+	unsigned long page = 0;
+	size_t left;
+	char *kbuf;
 	
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
-	
-	if (!tbl_data || !table->maxlen || !*lenp ||
-	    (*ppos && !write)) {
+	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
@@ -2088,89 +2200,69 @@ static int __do_proc_dointvec(void *tbl_
 	if (!conv)
 		conv = do_proc_dointvec_conv;
 
+	if (write) {
+		if (left > PAGE_SIZE - 1)
+			left = PAGE_SIZE - 1;
+		page = __get_free_page(GFP_TEMPORARY);
+		kbuf = (char *) page;
+		if (!kbuf)
+			return -ENOMEM;
+		if (copy_from_user(kbuf, buffer, left)) {
+			err = -EFAULT;
+			goto free;
+		}
+		kbuf[left] = 0;
+	}
+
 	for (; left && vleft--; i++, first=0) {
-		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > sizeof(buf) - 1)
-				len = sizeof(buf) - 1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
+		unsigned long lval;
+		bool neg;
 
-			lval = simple_strtoul(p, &p, 0);
+		if (write) {
+			left -= proc_skip_spaces(&kbuf);
 
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+			err = proc_get_long(&kbuf, &left, &lval, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
-			s += len;
-			left -= len;
-
-			if (conv(&neg, &lval, i, 1, data))
+			if (conv(&neg, &lval, i, 1, data)) {
+				err = -EINVAL;
 				break;
+			}
 		} else {
-			p = buf;
+			if (conv(&neg, &lval, i, 0, data)) {
+				err = -EINVAL;
+				break;
+			}
 			if (!first)
-				*p++ = '\t';
-	
-			if (conv(&neg, &lval, i, 0, data))
+				err = proc_put_char(&buffer, &left, '\t');
+			if (err)
+				break;
+			err = proc_put_long(&buffer, &left, lval, neg);
+			if (err)
 				break;
-
-			sprintf(p, "%s%lu", neg ? "-" : "", lval);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
 		}
 	}
 
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		left -= proc_skip_spaces(&kbuf);
+free:
 	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
-				break;
-			left--;
-		}
+		free_page(page);
+		if (first)
+			return err ? : -EINVAL;
 	}
-	if (write && first)
-		return -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
-	return 0;
-#undef TMPBUFLEN
+	return err;
 }
 
 static int do_proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
@@ -2238,8 +2330,8 @@ struct do_proc_dointvec_minmax_conv_para
 	int *max;
 };
 
-static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 
-					int *valp, 
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+					int *valp,
 					int write, void *data)
 {
 	struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,10 +2344,10 @@ static int do_proc_dointvec_minmax_conv(
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = true;
 			*lvalp = (unsigned long)-val;
 		} else {
-			*negp = 0;
+			*negp = false;
 			*lvalp = (unsigned long)val;
 		}
 	}
@@ -2295,102 +2387,78 @@ static int __do_proc_doulongvec_minmax(v
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-#define TMPBUFLEN 21
-	unsigned long *i, *min, *max, val;
-	int vleft, first=1, neg;
-	size_t len, left;
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
-	
-	if (!data || !table->maxlen || !*lenp ||
-	    (*ppos && !write)) {
+	unsigned long *i, *min, *max;
+	int vleft, first = 1, err = 0;
+	unsigned long page = 0;
+	size_t left;
+	char *kbuf;
+
+	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
-	
+
 	i = (unsigned long *) data;
 	min = (unsigned long *) table->extra1;
 	max = (unsigned long *) table->extra2;
 	vleft = table->maxlen / sizeof(unsigned long);
 	left = *lenp;
-	
+
+	if (write) {
+		if (left > PAGE_SIZE - 1)
+			left = PAGE_SIZE - 1;
+		page = __get_free_page(GFP_TEMPORARY);
+		kbuf = (char *) page;
+		if (!kbuf)
+			return -ENOMEM;
+		if (copy_from_user(kbuf, buffer, left)) {
+			err = -EFAULT;
+			goto free;
+		}
+		kbuf[left] = 0;
+	}
+
 	for (; left && vleft--; i++, min++, max++, first=0) {
+		unsigned long val;
+
 		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > TMPBUFLEN-1)
-				len = TMPBUFLEN-1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
-			val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+			bool neg;
+
+			left -= proc_skip_spaces(&kbuf);
+
+			err = proc_get_long(&kbuf, &left, &val, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
 			if (neg)
-				val = -val;
-			s += len;
-			left -= len;
-
-			if(neg)
 				continue;
 			if ((min && val < *min) || (max && val > *max))
 				continue;
 			*i = val;
 		} else {
-			p = buf;
+			val = convdiv * (*i) / convmul;
 			if (!first)
-				*p++ = '\t';
-			sprintf(p, "%lu", convdiv * (*i) / convmul);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
+				err = proc_put_char(&buffer, &left, '\t');
+			err = proc_put_long(&buffer, &left, val, false);
+			if (err)
+				break;
 		}
 	}
 
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		left -= proc_skip_spaces(&kbuf);
+free:
 	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
-				break;
-			left--;
-		}
+		free_page(page);
+		if (first)
+			return err ? : -EINVAL;
 	}
-	if (write && first)
-		return -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
-	return 0;
-#undef TMPBUFLEN
+	return err;
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2519,7 @@ int proc_doulongvec_ms_jiffies_minmax(st
 }
 
 
-static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
 					 int *valp,
 					 int write, void *data)
 {
@@ -2463,10 +2531,10 @@ static int do_proc_dointvec_jiffies_conv
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = true;
 			lval = (unsigned long)-val;
 		} else {
-			*negp = 0;
+			*negp = false;
 			lval = (unsigned long)val;
 		}
 		*lvalp = lval / HZ;
@@ -2474,7 +2542,7 @@ static int do_proc_dointvec_jiffies_conv
 	return 0;
 }
 
-static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
 						int *valp,
 						int write, void *data)
 {
@@ -2486,10 +2554,10 @@ static int do_proc_dointvec_userhz_jiffi
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = true;
 			lval = (unsigned long)-val;
 		} else {
-			*negp = 0;
+			*negp = false;
 			lval = (unsigned long)val;
 		}
 		*lvalp = jiffies_to_clock_t(lval);
@@ -2497,7 +2565,7 @@ static int do_proc_dointvec_userhz_jiffi
 	return 0;
 }
 
-static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
 					    int *valp,
 					    int write, void *data)
 {
@@ -2507,10 +2575,10 @@ static int do_proc_dointvec_ms_jiffies_c
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = true;
 			lval = (unsigned long)-val;
 		} else {
-			*negp = 0;
+			*negp = false;
 			lval = (unsigned long)val;
 		}
 		*lvalp = jiffies_to_msecs(lval);

^ permalink raw reply

* [Patch 2/3] sysctl: add proc_do_large_bitmap
From: Amerigo Wang @ 2010-05-05 10:26 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, ebiederm, xiaosuo, adobriyan,
	David Miller
In-Reply-To: <20100505103033.5600.77502.sendpatchset@localhost.localdomain>

From: Octavian Purdila <opurdila@ixiacom.com>

The new function can be used to read/write large bitmaps via /proc. A
comma separated range format is used for compact output and input
(e.g. 1,3-4,10-10).

Writing into the file will first reset the bitmap then update it
based on the given input.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/include/linux/sysctl.h
===================================================================
--- linux-2.6.orig/include/linux/sysctl.h
+++ linux-2.6/include/linux/sysctl.h
@@ -980,6 +980,8 @@ extern int proc_doulongvec_minmax(struct
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
+extern int proc_do_large_bitmap(struct ctl_table *, int,
+				void __user *, size_t *, loff_t *);
 
 /*
  * Register a set of sysctl names by calling register_sysctl_table
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2049,6 +2049,16 @@ static size_t proc_skip_spaces(char **bu
 	return ret;
 }
 
+static void proc_skip_char(char **buf, size_t *size, const char v)
+{
+	while (*size) {
+		if (**buf != v)
+			break;
+		(*size)--;
+		(*buf)++;
+	}
+}
+
 #define TMPBUFLEN 22
 /**
  * proc_get_long - reads an ASCII formated integer from a user buffer
@@ -2675,6 +2685,157 @@ static int proc_do_cad_pid(struct ctl_ta
 	return 0;
 }
 
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err = 0;
+	bool first = 1;
+	size_t left = *lenp;
+	unsigned long bitmap_len = table->maxlen;
+	unsigned long *bitmap = (unsigned long *) table->data;
+	unsigned long *tmp_bitmap = NULL;
+	char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+
+	if (!bitmap_len || !left || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		unsigned long page = 0;
+		char *kbuf;
+
+		if (left > PAGE_SIZE - 1)
+			left = PAGE_SIZE - 1;
+
+		page = __get_free_page(GFP_TEMPORARY);
+		kbuf = (char *) page;
+		if (!kbuf)
+			return -ENOMEM;
+		if (copy_from_user(kbuf, buffer, left)) {
+			free_page(page);
+			return -EFAULT;
+                }
+		kbuf[left] = 0;
+
+		tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
+				     GFP_KERNEL);
+		if (!tmp_bitmap) {
+			free_page(page);
+			return -ENOMEM;
+		}
+		proc_skip_char(&kbuf, &left, '\n');
+		while (!err && left) {
+			unsigned long val_a, val_b;
+			bool neg;
+
+			err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
+					     sizeof(tr_a), &c);
+			if (err)
+				break;
+			if (val_a >= bitmap_len || neg) {
+				err = -EINVAL;
+				break;
+			}
+
+			val_b = val_a;
+			if (left) {
+				kbuf++;
+				left--;
+			}
+
+			if (c == '-') {
+				err = proc_get_long(&kbuf, &left, &val_b,
+						     &neg, tr_b, sizeof(tr_b),
+						     &c);
+				if (err)
+					break;
+				if (val_b >= bitmap_len || neg ||
+				    val_a > val_b) {
+					err = -EINVAL;
+					break;
+				}
+				if (left) {
+					kbuf++;
+					left--;
+				}
+			}
+
+			while (val_a <= val_b)
+				set_bit(val_a++, tmp_bitmap);
+
+			first = 0;
+			proc_skip_char(&kbuf, &left, '\n');
+		}
+		free_page(page);
+	} else {
+		unsigned long bit_a, bit_b = 0;
+
+		while (left) {
+			bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+			if (bit_a >= bitmap_len)
+				break;
+			bit_b = find_next_zero_bit(bitmap, bitmap_len,
+						   bit_a + 1) - 1;
+
+			if (!first) {
+				err = proc_put_char(&buffer, &left, ',');
+				if (err)
+					break;
+			}
+			err = proc_put_long(&buffer, &left, bit_a, false);
+			if (err)
+				break;
+			if (bit_a != bit_b) {
+				err = proc_put_char(&buffer, &left, '-');
+				if (err)
+					break;
+				err = proc_put_long(&buffer, &left, bit_b, false);
+				if (err)
+					break;
+			}
+
+			first = 0; bit_b++;
+		}
+		if (!err)
+			err = proc_put_char(&buffer, &left, '\n');
+	}
+
+	if (!err) {
+		if (write) {
+			if (*ppos)
+				bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+			else
+				memcpy(bitmap, tmp_bitmap,
+					BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
+		}
+		kfree(tmp_bitmap);
+		*lenp -= left;
+		*ppos += *lenp;
+		return 0;
+	} else {
+		kfree(tmp_bitmap);
+		return err;
+	}
+}
+
 #else /* CONFIG_PROC_FS */
 
 int proc_dostring(struct ctl_table *table, int write,

^ permalink raw reply

* [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-05-05 10:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, xiaosuo, David Miller, adobriyan,
	ebiederm
In-Reply-To: <20100505103033.5600.77502.sendpatchset@localhost.localdomain>


(Dropped the infiniband part, because Tetsuo modified the related code,
I will send a separate patch for it once this is accepted.)

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/Documentation/networking/ip-sysctl.txt
===================================================================
--- linux-2.6.orig/Documentation/networking/ip-sysctl.txt
+++ linux-2.6/Documentation/networking/ip-sysctl.txt
@@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
 	(i.e. by default) range 1024-4999 is enough to issue up to
 	2000 connections per second to systems supporting timestamps.
 
+ip_local_reserved_ports - list of comma separated ranges
+	Specify the ports which are reserved for known third-party
+	applications. These ports will not be used by automatic port
+	assignments (e.g. when calling connect() or bind() with port
+	number 0). Explicit port allocation behavior is unchanged.
+
+	The format used for both input and output is a comma separated
+	list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+	10). Writing to the file will clear all previously reserved
+	ports and update the current list with the one given in the
+	input.
+
+	Note that ip_local_port_range and ip_local_reserved_ports
+	settings are independent and both are considered by the kernel
+	when determining which ports are available for automatic port
+	assignments.
+
+	You can reserve ports which are not in the current
+	ip_local_port_range, e.g.:
+
+	$ cat /proc/sys/net/ipv4/ip_local_port_range
+	32000	61000
+	$ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+	8080,9148
+
+	although this is redundant. However such a setting is useful
+	if later the port range is changed to a value that will
+	include the reserved ports.
+
+	Default: Empty
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
Index: linux-2.6/include/net/ip.h
===================================================================
--- linux-2.6.orig/include/net/ip.h
+++ linux-2.6/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
 } sysctl_local_ports;
 extern void inet_get_local_port_range(int *low, int *high);
 
+extern unsigned long *sysctl_local_reserved_ports;
+static inline int inet_is_reserved_local_port(int port)
+{
+	return test_bit(port, sysctl_local_reserved_ports);
+}
+
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
Index: linux-2.6/net/ipv4/af_inet.c
===================================================================
--- linux-2.6.orig/net/ipv4/af_inet.c
+++ linux-2.6/net/ipv4/af_inet.c
@@ -1552,9 +1552,13 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1653,6 +1657,8 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
Index: linux-2.6/net/ipv4/inet_connection_sock.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_connection_sock.c
+++ linux-2.6/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __
 	.range = { 32768, 61000 },
 };
 
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
@@ -108,6 +111,8 @@ again:
 
 		smallest_size = -1;
 		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
@@ -130,6 +135,7 @@ again:
 			break;
 		next:
 			spin_unlock(&head->lock);
+		next_nolock:
 			if (++rover > high)
 				rover = low;
 		} while (--remaining > 0);
Index: linux-2.6/net/ipv4/inet_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_hashtables.c
+++ linux-2.6/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_time
 		local_bh_disable();
 		for (i = 1; i <= remaining; i++) {
 			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
 			spin_lock(&head->lock);
Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -299,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_max_memberships",
@@ -736,6 +743,16 @@ static __net_initdata struct pernet_oper
 static __init int sysctl_ipv4_init(void)
 {
 	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;
+			break;
+		}
+	}
+	if (!i->procname)
+		return -EINVAL;
 
 	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
 	if (hdr == NULL)
Index: linux-2.6/net/ipv4/udp.c
===================================================================
--- linux-2.6.orig/net/ipv4/udp.c
+++ linux-2.6/net/ipv4/udp.c
@@ -233,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, un
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum >> udptable->log, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
 					goto found;
 				snum += rand;
 			} while (snum != first);
Index: linux-2.6/net/sctp/socket.c
===================================================================
--- linux-2.6.orig/net/sctp/socket.c
+++ linux-2.6/net/sctp/socket.c
@@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
 			index = sctp_phashfn(rover);
 			head = &sctp_port_hashtable[index];
 			sctp_spin_lock(&head->lock);

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: __alloc_skb() speedup
From: Eric Dumazet @ 2010-05-05 12:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, hadi, therbert
In-Reply-To: <20100505.012647.260083711.davem@davemloft.net>

Le mercredi 05 mai 2010 à 01:26 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 05 May 2010 10:22:14 +0200
> 
> > You mean memset() wont be inlined by ompiler to plain memory writes, but
> > use the custom kernel memset()  ?
> 
> I hope memset() is never inlined for a 202 byte piece of memory on
> sparc64 or powerpc.  What happens and makes sense on x86 is x86's
> business :-)
> 
> Especially since that elides the cache invalidate optimizations, and
> for anything >= 64 bytes those are absolutely critical on Niagara.
> -

Sorry, I was thinking about the shinfo part :

memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));

offsetof(struct skb_shared_info, dataref) is small enough and we dont
dirty a full cache line, so maybe I can keep prefetchw(data + size) ?

If not, in which cases can we use prefetchw() in kernel, if some arches
dont handle it well ?

Note1 : Without prefetchw(skb) (I removed it in this v2 patch), some
packets are dropped again...

Note2: If NET_SKB_PAD changed to 64, cpu0 has about 2% of free cpu
cycles (as noticed by a user application cycles burner)

-----------------------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.8% [1000Hz cycles],  (all, cpu: 0)
-----------------------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ___________
             1018.00 16.8% eth_type_trans                
              960.00 15.9% __alloc_skb                   
              757.00 12.5% __netdev_alloc_skb            
              681.00 11.3% _raw_spin_lock                
              479.00  7.9% nommu_map_page                
              424.00  7.0% tg3_poll_work                 
              209.00  3.5% get_rps_cpu                   
              205.00  3.4% _raw_spin_lock_irqsave        
              188.00  3.1% __kmalloc                    
              164.00  2.7% enqueue_to_backlog            
              119.00  2.0% tg3_alloc_rx_skb              
              112.00  1.9% kmem_cache_alloc              

Thanks !

[PATCH v2 net-next-2.6] net: __alloc_skb() speedup

With following patch I can reach maximum rate of my pktgen+udpsink
simulator :
- 'old' machine : dual quad core E5450  @3.00GHz
- 64 UDP rx flows (only differ by destination port)
- RPS enabled, NIC interrupts serviced on cpu0
- rps dispatched on 7 other cores. (~130.000 IPI per second)
- SLAB allocator (faster than SLUB in this workload)
- tg3 NIC [BCM5715S Gigabit Ethernet (rev a3)]
- 1.080.000 pps with few drops (~150 packets per second) at NIC level.
- 32bit kernel

Idea is to add one prefetchw() call in __alloc_skb() to hint cpu we are
about to clear part of skb_shared_info.

Also using one memset() to initialize all skb_shared_info fields instead
of one by one to reduce number of instructions, using long word moves.

All skb_shared_info fields before 'dataref' are cleared in 
__alloc_skb().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/skbuff.h |    7 ++++++-
 net/core/skbuff.c      |   21 +++++----------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 746a652..f32ccc9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -187,7 +187,6 @@ union skb_shared_tx {
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	atomic_t	dataref;
 	unsigned short	nr_frags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -197,6 +196,12 @@ struct skb_shared_info {
 	union skb_shared_tx tx_flags;
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
+
+	/*
+	 * Warning : all fields before dataref are cleared in __alloc_skb()
+	 */
+	atomic_t	dataref;
+	
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 	/* Intermediate layers must ensure that destructor_arg
 	 * remains valid until skb destructor */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8b9c109..7cafe50 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -187,6 +187,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			gfp_mask, node);
 	if (!data)
 		goto nodata;
+	/* prepare shinfo initialization */
+	prefetchw(data + size);
 
 	/*
 	 * Only clear those fields we need to clear, not those that we will
@@ -208,15 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
 	/* make sure we initialize shinfo sequentially */
 	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
-	shinfo->nr_frags  = 0;
-	shinfo->gso_size = 0;
-	shinfo->gso_segs = 0;
-	shinfo->gso_type = 0;
-	shinfo->ip6_frag_id = 0;
-	shinfo->tx_flags.flags = 0;
-	skb_frag_list_init(skb);
-	memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
 
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
@@ -505,16 +500,10 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
 		return 0;
 
 	skb_release_head_state(skb);
+
 	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
-	shinfo->nr_frags = 0;
-	shinfo->gso_size = 0;
-	shinfo->gso_segs = 0;
-	shinfo->gso_type = 0;
-	shinfo->ip6_frag_id = 0;
-	shinfo->tx_flags.flags = 0;
-	skb_frag_list_init(skb);
-	memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->data = skb->head + NET_SKB_PAD;



^ permalink raw reply related

* Re: 3 packet TCP window limit?
From: Brian Bloniarz @ 2010-05-05 13:26 UTC (permalink / raw)
  To: dormando; +Cc: netdev
In-Reply-To: <alpine.LNX.2.00.1005050210230.8544@d>

dormando wrote:
> Hey,
> 
> Noticed in Linux that no matter what sysctl variable I twiddle, or what
> TCP congestion algorithm is running, TCP will wait for remote acks after
> sending the first 3 packets. After that it's normal.
> 
> Apologies, it's hard ot describe:
> 
> Linux server listening.
> 
> Remote -> SYN
> (RTT wait)
> Linux -> SYN/ACK
> Remote -> ACK
> Remote -> Packet (small HTTP request)
> (RTT wait)
> Linux -> Packet (x 3)
> Remote -> (returning acks per packet)
> (RTT wait)
> Linux -> More packets (up to window size)
> 
> If the request response fits in 3 packets or less, that third RTT wait
> never happens. The remote client gets all its data, and sends back all the
> FIN/ACK packets for closing the connection.
> 
> What's bizarre is that this 3 packet/4 packet barrier is regardless of how
> much data there is to send. I can cause the extra RTT to flip on or off by
> sending exactly +/- 1 byte to cause an extra packet.
> 
> Holding the connection open and repeating the request any number of times
> runs just fine, after the initial request.
> 
> You can pretty easily see this by:
> tc qdisc add dev eth0 root netem delay 100ms
> ... then fetching a 3k file, then 4k file from an http server running
> linux. Well. at least I can see this easily. I tried on a half dozen boxes
> (2.6.11 through 2.6.32).
> 
> I'm trying to track down where in the code this is, or why my sysctl
> tuning isn't affecting it. I can't discern its purpose. The lag it causes
> is pretty awful for far away clients; adding 300ms of latency will make a
> small request take a full second, instead of 600ms.
> 
> I'm slugging through the code but any insight would be greatly
> appreciated!

This sounds like TCP slow start.

http://en.wikipedia.org/wiki/Slow-start

As far as tunables you might want to play with the initcwnd route
flag (see "ip route help")

> 
> -Dormando
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [PATCH] bonding: fix arp_validate on bonds inside a bridge
From: Jiri Bohac @ 2010-05-05 13:33 UTC (permalink / raw)
  To: David Miller; +Cc: fubar, jbohac, bonding-devel, netdev
In-Reply-To: <20100504.161815.71114704.davem@davemloft.net>

On Tue, May 04, 2010 at 04:18:15PM -0700, David Miller wrote:
> From: Jay Vosburgh <fubar@us.ibm.com>
> > 	Tested and it looks to work as advertised.  I see only one minor
> > nit, there's a pr_debug that missed being renamed to the new function
> > name; here's the whole patch with that fixed.
> 
> I don't think you need the ugly arp hook.
> 
> Instead, it's much cleaner to provide a way for packet type taps to
> see the packet before bridge et al. decapsulation.  In fact this makes
> a lot of sense, wanting to see the device as __netif_receive_skb() saw
> it, with no changes whatsoever.
> 
> In fact ptype_all runs before bridging, ING, and MACVLAN decap the
> thing, so we could have a 'ptype_base_predecap[]' that we run over
> right after those.

I was considering exactly this, but I thought it would be
rejected because of the overhead for all packets received.

In fact, bonding could register the ARP handler in the ptype_all
list and check itself whether the packets were ARPs. This
would require no changes to __netif_receive_skb() at all, but
would cause an extra fuction call and a condition for _every_
packet once a bond with arp_validate would be up.

Having a ptype_base_predecap[] hashtable would still cause at
least a comparision for _every_ packet, even without bonding
being loaded (!).

The current patch causes an extra comparison only for packets
arriving on boding slaves.

If either of the ptype_all or ptype_base_predecap[] method is
preferred, I'll be happy to re-work the patch. I just thought
performance had bigger priority here.

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply

* [RFC 3/5] net/mv643xx: use generic recycling infrastructure
From: Sebastian Andrzej Siewior @ 2010-05-05 14:47 UTC (permalink / raw)
  To: netdev; +Cc: tglx, Sebastian Andrzej Siewior
In-Reply-To: <1273070870-7821-1-git-send-email-sebastian@breakpoint.cc>

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/net/mv643xx_eth.c |   27 +++++++++------------------
 1 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 4ee9d04..f56aaac 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -404,8 +404,6 @@ struct mv643xx_eth_private {
 	u8 work_rx_refill;
 
 	int skb_size;
-	struct sk_buff_head rx_recycle;
-
 	/*
 	 * RX state.
 	 */
@@ -649,6 +647,7 @@ err:
 static int rxq_refill(struct rx_queue *rxq, int budget)
 {
 	struct mv643xx_eth_private *mp = rxq_to_mp(rxq);
+	struct net_device *dev = mp->dev;
 	int refilled;
 
 	refilled = 0;
@@ -658,10 +657,7 @@ static int rxq_refill(struct rx_queue *rxq, int budget)
 		struct rx_desc *rx_desc;
 		int size;
 
-		skb = __skb_dequeue(&mp->rx_recycle);
-		if (skb == NULL)
-			skb = dev_alloc_skb(mp->skb_size);
-
+		skb = net_recycle_get(dev);
 		if (skb == NULL) {
 			mp->oom = 1;
 			goto oom;
@@ -922,6 +918,7 @@ out:
 static int txq_reclaim(struct tx_queue *txq, int budget, int force)
 {
 	struct mv643xx_eth_private *mp = txq_to_mp(txq);
+	struct net_device *dev = mp->dev;
 	struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index);
 	int reclaimed;
 
@@ -968,14 +965,8 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force)
 				       desc->byte_cnt, DMA_TO_DEVICE);
 		}
 
-		if (skb != NULL) {
-			if (skb_queue_len(&mp->rx_recycle) <
-					mp->rx_ring_size &&
-			    skb_recycle_check(skb, mp->skb_size))
-				__skb_queue_head(&mp->rx_recycle, skb);
-			else
-				dev_kfree_skb(skb);
-		}
+		if (skb)
+			net_recycle_add(dev, skb);
 	}
 
 	__netif_tx_unlock(nq);
@@ -1564,7 +1555,7 @@ mv643xx_eth_set_ringparam(struct net_device *dev, struct ethtool_ringparam *er)
 
 	mp->rx_ring_size = er->rx_pending < 4096 ? er->rx_pending : 4096;
 	mp->tx_ring_size = er->tx_pending < 4096 ? er->tx_pending : 4096;
-
+	net_recycle_qlen(dev, mp->rx_ring_size);
 	if (netif_running(dev)) {
 		mv643xx_eth_stop(dev);
 		if (mv643xx_eth_open(dev)) {
@@ -2340,9 +2331,9 @@ static int mv643xx_eth_open(struct net_device *dev)
 
 	mv643xx_eth_recalc_skb_size(mp);
 
-	napi_enable(&mp->napi);
+	net_recycle_init(mp->dev, mp->rx_ring_size, mp->skb_size);
 
-	skb_queue_head_init(&mp->rx_recycle);
+	napi_enable(&mp->napi);
 
 	mp->int_mask = INT_EXT;
 
@@ -2438,7 +2429,7 @@ static int mv643xx_eth_stop(struct net_device *dev)
 	mib_counters_update(mp);
 	del_timer_sync(&mp->mib_counters_timer);
 
-	skb_queue_purge(&mp->rx_recycle);
+	net_recycle_cleanup(dev);
 
 	for (i = 0; i < mp->rxq_count; i++)
 		rxq_deinit(mp->rxq + i);
-- 
1.6.6.1


^ permalink raw reply related

* [RFC 1/5] net: implement generic rx recycling
From: Sebastian Andrzej Siewior @ 2010-05-05 14:47 UTC (permalink / raw)
  To: netdev; +Cc: tglx, Sebastian Andrzej Siewior
In-Reply-To: <1273070870-7821-1-git-send-email-sebastian@breakpoint.cc>

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/netdevice.h |   70 +++++++++++++++++++++++++++++++++++++--------
 1 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 98112fb..70d385d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -588,6 +588,18 @@ struct netdev_rx_queue {
 } ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RPS */
 
+/* Use this variant when it is known for sure that it
+ * is executing from hardware interrupt context or with hardware interrupts
+ * disabled.
+ */
+extern void dev_kfree_skb_irq(struct sk_buff *skb);
+
+/* Use this variant in places where it could be invoked
+ * from either hardware interrupt or other context, with hardware interrupts
+ * either disabled or enabled.
+ */
+extern void dev_kfree_skb_any(struct sk_buff *skb);
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1044,9 +1056,55 @@ struct net_device {
 #endif
 	/* n-tuple filter list attached to this device */
 	struct ethtool_rx_ntuple_list ethtool_ntuple_list;
+	struct sk_buff_head rx_recycle;
+	u32 rx_rec_skbs_max;
+	u32 rx_rec_skb_size;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
+static inline void net_recycle_init(struct net_device *dev, u32 qlen, u32 size)
+{
+	skb_queue_head_init(&dev->rx_recycle);
+	dev->rx_rec_skbs_max = qlen;
+	dev->rx_rec_skb_size = size;
+}
+
+static inline void net_recycle_cleanup(struct net_device *dev)
+{
+	skb_queue_purge(&dev->rx_recycle);
+}
+
+static inline void net_recycle_add(struct net_device *dev, struct sk_buff *skb)
+{
+	if (skb_queue_len(&dev->rx_recycle) < dev->rx_rec_skbs_max &&
+			skb_recycle_check(skb, dev->rx_rec_skb_size))
+		skb_queue_head(&dev->rx_recycle, skb);
+	else
+		dev_kfree_skb_any(skb);
+}
+
+static inline struct sk_buff *net_recycle_get(struct net_device *dev)
+{
+	struct sk_buff *skb;
+
+	skb = skb_dequeue(&dev->rx_recycle);
+	if (skb)
+		return skb;
+	return netdev_alloc_skb(dev, dev->rx_rec_skb_size);
+}
+
+static inline void net_recycle_size(struct net_device *dev, u32 size)
+{
+	if (dev->rx_rec_skb_size < size)
+		net_recycle_cleanup(dev);
+	dev->rx_rec_skb_size = size;
+}
+
+static inline void net_recycle_qlen(struct net_device *dev, u32 qlen)
+{
+	dev->rx_rec_skbs_max = qlen;
+}
+
 #define	NETDEV_ALIGN		32
 
 static inline
@@ -1635,18 +1693,6 @@ static inline int netif_is_multiqueue(const struct net_device *dev)
 	return (dev->num_tx_queues > 1);
 }
 
-/* Use this variant when it is known for sure that it
- * is executing from hardware interrupt context or with hardware interrupts
- * disabled.
- */
-extern void dev_kfree_skb_irq(struct sk_buff *skb);
-
-/* Use this variant in places where it could be invoked
- * from either hardware interrupt or other context, with hardware interrupts
- * either disabled or enabled.
- */
-extern void dev_kfree_skb_any(struct sk_buff *skb);
-
 #define HAVE_NETIF_RX 1
 extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
-- 
1.6.6.1


^ permalink raw reply related

* [RFC 0/5] generic rx recycling
From: Sebastian Andrzej Siewior @ 2010-05-05 14:47 UTC (permalink / raw)
  To: netdev; +Cc: tglx

This series merges the rx recycling code trying to come up with generic
code. Recycling skbs from the tx path for incomming rx skips the memory
allocater and improves latency during memory pressure.
This is now used by just by just four drivers in the tree which were doing
this on their own.

Sebastian


^ permalink raw reply

* [RFC 2/5] net/gianfar: use generic recycling infrasstructure
From: Sebastian Andrzej Siewior @ 2010-05-05 14:47 UTC (permalink / raw)
  To: netdev; +Cc: tglx, Sebastian Andrzej Siewior
In-Reply-To: <1273070870-7821-1-git-send-email-sebastian@breakpoint.cc>

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/net/gianfar.c |   28 ++++++++--------------------
 drivers/net/gianfar.h |    2 --
 2 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index c6df2ba..af98675 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -1102,6 +1102,9 @@ static int gfar_probe(struct of_device *ofdev,
 		priv->rx_queue[i]->rxic = DEFAULT_RXIC;
 	}
 
+	net_recycle_init(dev, DEFAULT_RX_RING_SIZE,
+			priv->rx_buffer_size + RXBUF_ALIGNMENT);
+
 	/* enable filer if using multiple RX queues*/
 	if(priv->num_rx_queues > 1)
 		priv->rx_filer_enable = 1;
@@ -1705,7 +1708,7 @@ static void free_skb_resources(struct gfar_private *priv)
 			sizeof(struct rxbd8) * priv->total_rx_ring_size,
 			priv->tx_queue[0]->tx_bd_base,
 			priv->tx_queue[0]->tx_bd_dma_base);
-	skb_queue_purge(&priv->rx_recycle);
+	net_recycle_cleanup(priv->ndev);
 }
 
 void gfar_start(struct net_device *dev)
@@ -1886,8 +1889,6 @@ static int gfar_enet_open(struct net_device *dev)
 
 	enable_napi(priv);
 
-	skb_queue_head_init(&priv->rx_recycle);
-
 	/* Initialize a bunch of registers */
 	init_registers(dev);
 
@@ -2291,6 +2292,7 @@ static int gfar_change_mtu(struct net_device *dev, int new_mtu)
 		stop_gfar(dev);
 
 	priv->rx_buffer_size = tempsize;
+	net_recycle_size(dev, tempsize + RXBUF_ALIGNMENT);
 
 	dev->mtu = new_mtu;
 
@@ -2422,16 +2424,7 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 			bdp = next_txbd(bdp, base, tx_ring_size);
 		}
 
-		/*
-		 * If there's room in the queue (limit it to rx_buffer_size)
-		 * we add this skb back into the pool, if it's the right size
-		 */
-		if (skb_queue_len(&priv->rx_recycle) < rx_queue->rx_ring_size &&
-				skb_recycle_check(skb, priv->rx_buffer_size +
-					RXBUF_ALIGNMENT))
-			__skb_queue_head(&priv->rx_recycle, skb);
-		else
-			dev_kfree_skb_any(skb);
+		net_recycle_add(dev, skb);
 
 		tx_queue->tx_skbuff[skb_dirtytx] = NULL;
 
@@ -2497,14 +2490,9 @@ static void gfar_new_rxbdp(struct gfar_priv_rx_q *rx_queue, struct rxbd8 *bdp,
 struct sk_buff * gfar_new_skb(struct net_device *dev)
 {
 	unsigned int alignamount;
-	struct gfar_private *priv = netdev_priv(dev);
 	struct sk_buff *skb = NULL;
 
-	skb = __skb_dequeue(&priv->rx_recycle);
-	if (!skb)
-		skb = netdev_alloc_skb(dev,
-				priv->rx_buffer_size + RXBUF_ALIGNMENT);
-
+	skb = net_recycle_get(dev);
 	if (!skb)
 		return NULL;
 
@@ -2673,7 +2661,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
 				 * recycle list.
 				 */
 				skb_reserve(skb, -GFAR_CB(skb)->alignamount);
-				__skb_queue_head(&priv->rx_recycle, skb);
+				net_recycle_add(dev, skb);
 			}
 		} else {
 			/* Increment the number of packets */
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index ac4a92e..99f5a9b 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -1061,8 +1061,6 @@ struct gfar_private {
 
 	u32 cur_filer_idx;
 
-	struct sk_buff_head rx_recycle;
-
 	struct vlan_group *vlgrp;
 
 
-- 
1.6.6.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox