Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 2/3] net: TCP thin linear timeouts
From: Eric Dumazet @ 2009-10-27 16:56 UTC (permalink / raw)
  To: Andreas Petlund; +Cc: netdev, linux-kernel, shemminger, ilpo.jarvinen, davem
In-Reply-To: <4AE72079.4030504@simula.no>

Andreas Petlund a écrit :
> This patch will make TCP use only linear timeouts if the stream is thin. This will help to avoid the very high latencies that thin streams suffer because of exponential backoff. This mechanism is only active if enabled by iocontrol or syscontrol and the stream is identified as thin.
> 

Won't this reduce the session timeout to something very small, i.e. 15 retransmits, way under a minute?

^ permalink raw reply

* [net-next-2.6 PATCH 0/6] net: Speedup netdevice unregisters
From: Eric Dumazet @ 2009-10-27 17:02 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

netdevice unregisters are serialized and call synchronize_{net|rcu}()
three times per device.

This means it can take a long time to remove a device, if many virtual
devices are attached. (vlan, macvlan, tunnels, ...)

This patch series partially solves the problem by batching several devices in a list,
so that two synchronize_net() calls can be factorized.

Some results on my dev machine (2x4 cpus, HZ=1000, PREEMPT):

modprobe dummy numdummies=10000
# unregister 10000 DOWN netdevices
time rmmod dummy

Before patches :

real    2m0.303s  (12 ms per device)
user    0m0.001s
sys     0m0.117s

After patches 1,2,3 :

real	0m1.111s
user	0m0.001s
sys	0m0.922s (mostly sysfs overhead)


modprobe dummy
ip link set dummy0 up
for f in `seq 1 1000`
do
 ip link add link dummy0 dummy0.$f type vlan id $f
 ip link set dummy0.$f up
done
# Dismantle of 1000 UP vlans (but no IP address)
time rmmod dummy

Before patches :

real    0m40.410s  (40 ms per vlan)
user    0m0.000s
sys     0m0.022s

After patches 1,2,3 :

real	0m20.990s
user	0m0.001s
sys	0m0.011s

After patch 4 (vlan: Use unregister_netdevice_many())

real	0m4.392s (-> 4.3 ms per vlan)
user	0m0.000s
sys	0m0.067s

We still have a synchronize_rcu() in dev_deactivate() that could be
factorized in followup patches if there is some interest.


PATCH 1/6 : net: Introduce unregister_netdevice_queue()
PATCH 2/6 : net: Introduce unregister_netdevice_many()
PATCH 3/6 : net: Add a list_head parameter to dellink() method
PATCH 4/6 : vlan: Optimize multiple unregistration
PATCH 5/6 : ipip: Optimize multiple unregistration
PATCH 6/6 : gre: Optimize multiple unregistration


^ permalink raw reply

* [PATCH 1/6] net: Introduce unregister_netdevice_queue()
From: Eric Dumazet @ 2009-10-27 17:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

This patch adds an unreg_list anchor to struct net_device, and
introduces an unregister_netdevice_queue() function, able to queue
a net_device to a list instead of immediately unregistering it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    9 ++++++++-
 net/core/dev.c            |   20 +++++++++++++-------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8380009..0ded0a4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -683,6 +683,7 @@ struct net_device
 
 	struct list_head	dev_list;
 	struct list_head	napi_list;
+	struct list_head	unreg_list;
 
 	/* Net device features */
 	unsigned long		features;
@@ -1116,7 +1117,13 @@ extern int		dev_close(struct net_device *dev);
 extern void		dev_disable_lro(struct net_device *dev);
 extern int		dev_queue_xmit(struct sk_buff *skb);
 extern int		register_netdevice(struct net_device *dev);
-extern void		unregister_netdevice(struct net_device *dev);
+extern void		unregister_netdevice_queue(struct net_device *dev,
+						   struct list_head *head);
+static inline void unregister_netdevice(struct net_device *dev)
+{
+	unregister_netdevice_queue(dev, NULL);
+}
+
 extern void		free_netdev(struct net_device *dev);
 extern void		synchronize_net(void);
 extern int 		register_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 950c13f..ff94e2b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5245,25 +5245,31 @@ void synchronize_net(void)
 EXPORT_SYMBOL(synchronize_net);
 
 /**
- *	unregister_netdevice - remove device from the kernel
+ *	unregister_netdevice_queue - remove device from the kernel
  *	@dev: device
- *
+ *	@head: list
+
  *	This function shuts down a device interface and removes it
  *	from the kernel tables.
+ *	If head not NULL, device is queued to be unregistered later.
  *
  *	Callers must hold the rtnl semaphore.  You may want
  *	unregister_netdev() instead of this.
  */
 
-void unregister_netdevice(struct net_device *dev)
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 {
 	ASSERT_RTNL();
 
-	rollback_registered(dev);
-	/* Finish processing unregister after unlock */
-	net_set_todo(dev);
+	if (head) {
+		list_add_tail(&dev->unreg_list, head);
+	} else {
+		rollback_registered(dev);
+		/* Finish processing unregister after unlock */
+		net_set_todo(dev);
+	}
 }
-EXPORT_SYMBOL(unregister_netdevice);
+EXPORT_SYMBOL(unregister_netdevice_queue);
 
 /**
  *	unregister_netdev - remove device from the kernel

^ permalink raw reply related

* [PATCH 2/6] net: Introduce unregister_netdevice_many()
From: Eric Dumazet @ 2009-10-27 17:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

Introduce rollback_registered_many() and unregister_netdevice_many()

rollback_registered_many() is able to perform necessary steps at device dismantle
time, factorizing two expensive synchronize_net() calls.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    1 
 net/core/dev.c            |   97 ++++++++++++++++++++++++------------
 2 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0ded0a4..e7c227d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1119,6 +1119,7 @@ extern int		dev_queue_xmit(struct sk_buff *skb);
 extern int		register_netdevice(struct net_device *dev);
 extern void		unregister_netdevice_queue(struct net_device *dev,
 						   struct list_head *head);
+extern void		unregister_netdevice_many(struct list_head *head);
 static inline void unregister_netdevice(struct net_device *dev)
 {
 	unregister_netdevice_queue(dev, NULL);
diff --git a/net/core/dev.c b/net/core/dev.c
index ff94e2b..15d1453 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4637,59 +4637,76 @@ static void net_set_todo(struct net_device *dev)
 	list_add_tail(&dev->todo_list, &net_todo_list);
 }
 
-static void rollback_registered(struct net_device *dev)
+static void rollback_registered_many(struct list_head *head)
 {
+	struct net_device *dev;
+
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-	/* Some devices call without registering for initialization unwind. */
-	if (dev->reg_state == NETREG_UNINITIALIZED) {
-		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
-				  "was registered\n", dev->name, dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Some devices call without registering
+		 * for initialization unwind.
+		 */
+		if (dev->reg_state == NETREG_UNINITIALIZED) {
+			pr_debug("unregister_netdevice: device %s/%p never "
+				 "was registered\n", dev->name, dev);
 
-		WARN_ON(1);
-		return;
-	}
+			WARN_ON(1);
+			return;
+		}
 
-	BUG_ON(dev->reg_state != NETREG_REGISTERED);
+		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 
-	/* If device is running, close it first. */
-	dev_close(dev);
+		/* If device is running, close it first. */
+		dev_close(dev);
 
-	/* And unlink it from device chain. */
-	unlist_netdevice(dev);
+		/* And unlink it from device chain. */
+		unlist_netdevice(dev);
 
-	dev->reg_state = NETREG_UNREGISTERING;
+		dev->reg_state = NETREG_UNREGISTERING;
+	}
 
 	synchronize_net();
 
-	/* Shutdown queueing discipline. */
-	dev_shutdown(dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Shutdown queueing discipline. */
+		dev_shutdown(dev);
 
 
-	/* Notify protocols, that we are about to destroy
-	   this device. They should clean all the things.
-	*/
-	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+		/* Notify protocols, that we are about to destroy
+		   this device. They should clean all the things.
+		*/
+		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
-	/*
-	 *	Flush the unicast and multicast chains
-	 */
-	dev_unicast_flush(dev);
-	dev_addr_discard(dev);
+		/*
+		 *	Flush the unicast and multicast chains
+		 */
+		dev_unicast_flush(dev);
+		dev_addr_discard(dev);
 
-	if (dev->netdev_ops->ndo_uninit)
-		dev->netdev_ops->ndo_uninit(dev);
+		if (dev->netdev_ops->ndo_uninit)
+			dev->netdev_ops->ndo_uninit(dev);
 
-	/* Notifier chain MUST detach us from master device. */
-	WARN_ON(dev->master);
+		/* Notifier chain MUST detach us from master device. */
+		WARN_ON(dev->master);
 
-	/* Remove entries from kobject tree */
-	netdev_unregister_kobject(dev);
+		/* Remove entries from kobject tree */
+		netdev_unregister_kobject(dev);
+	}
 
 	synchronize_net();
 
-	dev_put(dev);
+	list_for_each_entry(dev, head, unreg_list)
+		dev_put(dev);
+}
+
+static void rollback_registered(struct net_device *dev)
+{
+	LIST_HEAD(single);
+	
+	list_add(&dev->unreg_list, &single);
+	rollback_registered_many(&single);
 }
 
 static void __netdev_init_queue_locks_one(struct net_device *dev,
@@ -5272,6 +5289,22 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 EXPORT_SYMBOL(unregister_netdevice_queue);
 
 /**
+ *	unregister_netdevice_many - unregister many devices
+ *	@head: list of devices
+ *
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+	struct net_device *dev;
+
+	if (!list_empty(head)) {
+		rollback_registered_many(head);
+		list_for_each_entry(dev, head, unreg_list)
+			net_set_todo(dev);
+	}
+}
+
+/**
  *	unregister_netdev - remove device from the kernel
  *	@dev: device
  *

^ permalink raw reply related

* [PATCH 3/6] net: add a list_head parameter to dellink() method
From: Eric Dumazet @ 2009-10-27 17:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

Adding a list_head parameter to rtnl_link_ops->dellink() methods
allows us to queue devices on a list, in order to dismantle
them all at once.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/macvlan.c   |    6 +++---
 drivers/net/veth.c      |    2 +-
 include/net/rtnetlink.h |    3 ++-
 net/8021q/vlan.c        |    8 ++++----
 net/8021q/vlan.h        |    2 +-
 net/core/dev.c          |    2 +-
 net/core/rtnetlink.c    |   14 +++++++-------
 7 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 3aabfd9..20b7707 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -555,13 +555,13 @@ static int macvlan_newlink(struct net_device *dev,
 	return 0;
 }
 
-static void macvlan_dellink(struct net_device *dev)
+static void macvlan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvlan_port *port = vlan->port;
 
 	list_del(&vlan->list);
-	unregister_netdevice(dev);
+	unregister_netdevice_queue(dev, head);
 
 	if (list_empty(&port->vlans))
 		macvlan_port_destroy(port->dev);
@@ -601,7 +601,7 @@ static int macvlan_device_event(struct notifier_block *unused,
 		break;
 	case NETDEV_UNREGISTER:
 		list_for_each_entry_safe(vlan, next, &port->vlans, list)
-			macvlan_dellink(vlan->dev);
+			macvlan_dellink(vlan->dev, NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index ade5b34..ffb502d 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -442,7 +442,7 @@ err_register_peer:
 	return err;
 }
 
-static void veth_dellink(struct net_device *dev)
+static void veth_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct veth_priv *priv;
 	struct net_device *peer;
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index c3aa044..cd5af1f 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -61,7 +61,8 @@ struct rtnl_link_ops {
 	int			(*changelink)(struct net_device *dev,
 					      struct nlattr *tb[],
 					      struct nlattr *data[]);
-	void			(*dellink)(struct net_device *dev);
+	void			(*dellink)(struct net_device *dev,
+					   struct list_head *head);
 
 	size_t			(*get_size)(const struct net_device *dev);
 	int			(*fill_info)(struct sk_buff *skb,
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 8836575..6b5c9dd 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -140,7 +140,7 @@ static void vlan_rcu_free(struct rcu_head *rcu)
 	vlan_group_free(container_of(rcu, struct vlan_group, rcu));
 }
 
-void unregister_vlan_dev(struct net_device *dev)
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 {
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	struct net_device *real_dev = vlan->real_dev;
@@ -164,7 +164,7 @@ void unregister_vlan_dev(struct net_device *dev)
 
 	synchronize_net();
 
-	unregister_netdevice(dev);
+	unregister_netdevice_queue(dev, head);
 
 	/* If the group is now empty, kill off the group. */
 	if (grp->nr_vlans == 0) {
@@ -535,7 +535,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			if (grp->nr_vlans == 1)
 				i = VLAN_GROUP_ARRAY_LEN;
 
-			unregister_vlan_dev(vlandev);
+			unregister_vlan_dev(vlandev, NULL);
 		}
 		break;
 	}
@@ -642,7 +642,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 		err = -EPERM;
 		if (!capable(CAP_NET_ADMIN))
 			break;
-		unregister_vlan_dev(dev);
+		unregister_vlan_dev(dev, NULL);
 		err = 0;
 		break;
 
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 4ade5ed..68f9290 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -82,7 +82,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
 int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id);
 void vlan_setup(struct net_device *dev);
 int register_vlan_dev(struct net_device *dev);
-void unregister_vlan_dev(struct net_device *dev);
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
 
 static inline u32 vlan_get_ingress_priority(struct net_device *dev,
 					    u16 vlan_tci)
diff --git a/net/core/dev.c b/net/core/dev.c
index 15d1453..dedacd8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5629,7 +5629,7 @@ restart:
 
 		/* Delete virtual devices */
 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
-			dev->rtnl_link_ops->dellink(dev);
+			dev->rtnl_link_ops->dellink(dev, NULL);
 			goto restart;
 		}
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 52ea418..391a62c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -248,7 +248,7 @@ static LIST_HEAD(link_ops);
 int __rtnl_link_register(struct rtnl_link_ops *ops)
 {
 	if (!ops->dellink)
-		ops->dellink = unregister_netdevice;
+		ops->dellink = unregister_netdevice_queue;
 
 	list_add_tail(&ops->list, &link_ops);
 	return 0;
@@ -277,13 +277,13 @@ EXPORT_SYMBOL_GPL(rtnl_link_register);
 static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
 {
 	struct net_device *dev;
-restart:
+	LIST_HEAD(list_kill);
+
 	for_each_netdev(net, dev) {
-		if (dev->rtnl_link_ops == ops) {
-			ops->dellink(dev);
-			goto restart;
-		}
+		if (dev->rtnl_link_ops == ops)
+			ops->dellink(dev, &list_kill);
 	}
+	unregister_netdevice_many(&list_kill);
 }
 
 void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
@@ -972,7 +972,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	if (!ops)
 		return -EOPNOTSUPP;
 
-	ops->dellink(dev);
+	ops->dellink(dev, NULL);
 	return 0;
 }
 

^ permalink raw reply related

* [PATCH 4/6] vlan: Optimize multiple unregistration
From: Eric Dumazet @ 2009-10-27 17:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

Use unregister_netdevice_many() to speedup master device unregister.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/if_vlan.h |    1 
 net/8021q/vlan.c        |   49 +++++++++++++++++++++++++-------------
 net/core/dev.c          |    1 
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 8898cbe..71a4870 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -85,6 +85,7 @@ struct vlan_group {
 					    * the vlan is attached to.
 					    */
 	unsigned int		nr_vlans;
+	int			killall;
 	struct hlist_node	hlist;	/* linked list */
 	struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS];
 	struct rcu_head		rcu;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 6b5c9dd..511afe7 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -159,11 +159,12 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 	if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
 		ops->ndo_vlan_rx_kill_vid(real_dev, vlan_id);
 
-	vlan_group_set_device(grp, vlan_id, NULL);
 	grp->nr_vlans--;
 
-	synchronize_net();
-
+	if (!grp->killall) {
+		vlan_group_set_device(grp, vlan_id, NULL);
+		synchronize_net();
+	}
 	unregister_netdevice_queue(dev, head);
 
 	/* If the group is now empty, kill off the group. */
@@ -183,6 +184,34 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 	dev_put(real_dev);
 }
 
+void unregister_vlan_dev_alls(struct vlan_group *grp)
+{
+	LIST_HEAD(list);
+	int i;
+	struct net_device *vlandev;
+	struct vlan_group save;
+
+	memcpy(&save, grp, sizeof(save));
+	memset(&grp->vlan_devices_arrays, 0, sizeof(grp->vlan_devices_arrays));
+	grp->killall = 1;
+
+	synchronize_net();
+
+	/* Delete all VLANs for this dev. */
+	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		vlandev = vlan_group_get_device(&save, i);
+		if (!vlandev)
+			continue;
+
+		unregister_vlan_dev(vlandev, &list);
+		if (grp->nr_vlans == 0)
+			break;
+	}
+	unregister_netdevice_many(&list);
+	for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++)
+		kfree(save.vlan_devices_arrays[i]);
+}
+
 static void vlan_transfer_operstate(const struct net_device *dev,
 				    struct net_device *vlandev)
 {
@@ -524,19 +553,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		break;
 
 	case NETDEV_UNREGISTER:
-		/* Delete all VLANs for this dev. */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-			vlandev = vlan_group_get_device(grp, i);
-			if (!vlandev)
-				continue;
-
-			/* unregistration of last vlan destroys group, abort
-			 * afterwards */
-			if (grp->nr_vlans == 1)
-				i = VLAN_GROUP_ARRAY_LEN;
-
-			unregister_vlan_dev(vlandev, NULL);
-		}
+		unregister_vlan_dev_alls(grp);
 		break;
 	}
 
diff --git a/net/core/dev.c b/net/core/dev.c
index dedacd8..82a3bb9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5303,6 +5303,7 @@ void unregister_netdevice_many(struct list_head *head)
 			net_set_todo(dev);
 	}
 }
+EXPORT_SYMBOL(unregister_netdevice_many);
 
 /**
  *	unregister_netdev - remove device from the kernel

^ permalink raw reply related

* [PATCH 5/6] ipip: Optimize multiple unregistration
From: Eric Dumazet @ 2009-10-27 17:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

Speedup module unloading by factorizing synchronize_rcu() calls

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/ipip.c |   17 +++++++++++------
 1 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3bd6998..a2ca53d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -754,16 +754,19 @@ static struct xfrm_tunnel ipip_handler = {
 static const char banner[] __initconst =
 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
 
-static void ipip_destroy_tunnels(struct ipip_net *ipn)
+static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
 {
 	int prio;
 
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-			while ((t = ipn->tunnels[prio][h]) != NULL)
-				unregister_netdevice(t->dev);
+			struct ip_tunnel *t = ipn->tunnels[prio][h];
+
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = t->next;
+			}
 		}
 	}
 }
@@ -816,11 +819,13 @@ err_alloc:
 static void ipip_exit_net(struct net *net)
 {
 	struct ipip_net *ipn;
+	LIST_HEAD(list);
 
 	ipn = net_generic(net, ipip_net_id);
 	rtnl_lock();
-	ipip_destroy_tunnels(ipn);
-	unregister_netdevice(ipn->fb_tunnel_dev);
+	ipip_destroy_tunnels(ipn, &list);
+	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
 	kfree(ipn);
 }

^ permalink raw reply related

* [PATCH 6/6] gre: Optimize multiple unregistration
From: Eric Dumazet @ 2009-10-27 17:07 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

Speedup module unloading by factorizing synchronize_rcu() calls

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/ip_gre.c |   15 ++++++++++-----
 1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 40f0439..a77807d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1290,16 +1290,19 @@ static const struct net_protocol ipgre_protocol = {
 	.netns_ok	=	1,
 };
 
-static void ipgre_destroy_tunnels(struct ipgre_net *ign)
+static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
 {
 	int prio;
 
 	for (prio = 0; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-			while ((t = ign->tunnels[prio][h]) != NULL)
-				unregister_netdevice(t->dev);
+			struct ip_tunnel *t = ign->tunnels[prio][h];
+
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = t->next;
+			}
 		}
 	}
 }
@@ -1347,10 +1350,12 @@ err_alloc:
 static void ipgre_exit_net(struct net *net)
 {
 	struct ipgre_net *ign;
+	LIST_HEAD(list);
 
 	ign = net_generic(net, ipgre_net_id);
 	rtnl_lock();
-	ipgre_destroy_tunnels(ign);
+	ipgre_destroy_tunnels(ign, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
 	kfree(ign);
 }

^ permalink raw reply related

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-27 17:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Andrew Morton, Linus Torvalds,
	Octavian Purdila, netdev, linux-kernel, Al Viro
In-Reply-To: <4AE6A16F.4020002@gmail.com>

On Tue, 27 Oct 2009 08:29:51 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Eric Dumazet a écrit :
> > 
> > 
> > 511 value on 64bit, and 1023 on 32bit arches are nice because
> > hashsz * sizeof(pointer) <= 4096, wasting space for one pointer only.
> > 
> > Conclusion : jhash and 511/1023 hashsize for netdevices,
> > no divides, only one multiply for the fold.
> 
> Just forget about 511 & 1023, as power of two works too.
> 
> -> 512 & 1024 + jhash
> 
> Guess what, David already said this :)


Rather than wasting space or doing an expensive modulus, just folding
the higher bits back with XOR redistributes the bits better.


On fast machine (Nehalem):

100000000 Iterations
256 Slots (order 8)
Algorithm             Time       Ratio       Max   StdDev
string10             2.505290       1.00    390628   0.00
xor                  2.521329       1.00    392120   2.14
SuperFastHash        2.781745       1.00    397027   4.43
fnv32                2.847892       1.00    392139   0.98
djb2                 2.886342       1.00    390827   0.12
string_hash31        2.900980       1.00    391001   0.20
string_hash17        2.938708       1.00    391122   0.20
full_name_hash       3.080886       1.00    390860   0.10
jhash_string         3.092161       1.00    392775   1.08
fnv64                5.340740       1.00    392854   0.88
kr_hash              2.395757       7.30   4379091 1568.25

On slow machine (CULV):
100000000 Iterations
256 Slots (order 8)
Algorithm             Time       Ratio       Max   StdDev
string10             10.807174       1.00    390628   0.00
SuperFastHash        11.397303       1.00    397027   4.43
xor                  11.660968       1.00    392120   2.14
djb2                 11.674707       1.00    390827   0.12
jhash_string         11.997104       1.00    392775   1.08
fnv32                12.289086       1.00    392139   0.98
string_hash17        12.863864       1.00    391122   0.20
full_name_hash       13.249483       1.00    390860   0.10
string_hash31        13.668270       1.00    391001   0.20
fnv64                39.808964       1.00    392854   0.88
kr_hash              10.316305       7.30   4379091 1568.25

So Eric's string10 is fastest for special case of fooNNN style names.
But probably isn't best for general strings. Original function
is >20% slower, which is surprising, probably because of overhead
of 2 shifts and multiply. jenkins and fnv are both 10% slower.

The following seems to give best results (combination of 16bit trick
and string17).


/*
 * xor17 - fold a byte string into a 32-bit hash, two bytes per step.
 * @key: start of the bytes to hash
 * @len: number of bytes at @key
 *
 * Every step scales the running hash by 17 (h + (h << 4)) and XORs in
 * the next value.  Full pairs of bytes are fetched with
 * get_unaligned16() (presumably an alignment-safe 16-bit load; it is
 * not defined in this message -- confirm); a trailing odd byte, if
 * any, is mixed in by itself the same way.
 *
 * Returns the accumulated hash; 0 when @len is 0.
 */
static unsigned int xor17(const unsigned char *key, unsigned int len)
{
	uint32_t hash = 0;
	unsigned int pairs = len >> 1;
	unsigned int i;

	/* Consume the input one 16-bit unit at a time. */
	for (i = 0; i < pairs; i++) {
		hash = (hash * 17) ^ get_unaligned16(key);
		key += sizeof(uint16_t);
	}

	/* Odd length: one byte is left over after the pairs. */
	if (len & 1)
		hash = (hash * 17) ^ *key;

	return hash;
}

^ permalink raw reply

* [PATCH] net: fold network name hash
From: Stephen Hemminger @ 2009-10-27 17:22 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-kernel, eric.dumazet, akpm, torvalds, opurdila,
	netdev, linux-kernel, viro
In-Reply-To: <20091026.222428.80364204.davem@davemloft.net>

The full_name_hash does not produce a value that is evenly distributed
over the lower 8 bits. This causes name hash to be unbalanced with large
number of names.  A simple fix is to just fold in the higher bits
with XOR.

This is independent of possible improvements to full_name_hash()
in future.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/core/dev.c	2009-10-27 09:21:46.127252547 -0700
+++ b/net/core/dev.c	2009-10-27 09:25:14.593313378 -0700
@@ -199,7 +199,11 @@ EXPORT_SYMBOL(dev_base_lock);
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 {
 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+
+	hash ^= (hash >> NETDEV_HASHBITS);
+	hash &= NETDEV_HASHENTRIES - 1;
+
+	return &net->dev_name_head[hash];
 }
 
 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)

^ permalink raw reply

* Re: [PATCH 2/6] net: Introduce unregister_netdevice_many()
From: Octavian Purdila @ 2009-10-27 17:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, Linux Netdev List
In-Reply-To: <4AE72813.5080709@gmail.com>

On Tuesday 27 October 2009 19:04:19 you wrote:

> +void unregister_netdevice_many(struct list_head *head)
> +{
> +       struct net_device *dev;
> +
> +       if (!list_empty(head)) {
> +               rollback_registered_many(head);
> +               list_for_each_entry(dev, head, unreg_list)
> +                       net_set_todo(dev);
> +       }
> +}
> 

Looks like EXPORT_SYMBOL(unregister_netdevice_many) is missing?

Thanks,
tavi

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Linus Torvalds @ 2009-10-27 17:32 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Stephen Hemminger, Andrew Morton, Octavian Purdila,
	netdev, linux-kernel, Al Viro
In-Reply-To: <20091027100736.5303f1ab@nehalam>



On Tue, 27 Oct 2009, Stephen Hemminger wrote:
> 
> Rather than wasting space or doing an expensive modulus, just folding
> the higher bits back with XOR redistributes the bits better.

Please don't make up any new hash functions without having a better input 
set than the one you seem to use.

The 'fnv' function I can believe in, because the whole "multiply by big 
prime number" thing to spread out the bits is a very traditional model. 
But making up a new hash function based on essentially consecutive names 
is absolutely the wrong thing to do. You need a much better corpus of path 
component names for testing.

> The following seems to give best results (combination of 16bit trick
> and string17).

.. and these kinds of games are likely to work badly on some 
architectures. Don't use 16-bit values, and don't use 'get_unaligned()'. 
Both tend to work fine on x86, but likely suck on some other 
architectures.

Also remember that the critical hash function needs to check for '/' and 
'\0' while at it, which is one reason why it does things byte-at-a-time. 
If you try to be smart, you'd need to be smart about the end condition 
too.

The loop to optimize is _not_ based on 'name+len', it is this code:

                this.name = name;
                c = *(const unsigned char *)name;

                hash = init_name_hash();
                do {
                        name++;
                        hash = partial_name_hash(c, hash);
                        c = *(const unsigned char *)name;
                } while (c && (c != '/'));
                this.len = name - (const char *) this.name;
                this.hash = end_name_hash(hash);

(which depends on us having already removed all slashes at the head, and 
knowing that the string is not zero-sized)

So doing things multiple bytes at a time is certainly still possible, but 
you would always have to find the slashes/NUL's in there first. Doing that 
efficiently and portably is not trivial - especially since a lot of 
critical path components are short.

(Remember: there may be just a few 'bin' directory names, but if you do 
performance analysis, 'bin' as a path component is probably hashed a lot 
more than 'five_slutty_bimbos_and_a_donkey.jpg'. So the relative weighting 
of importance of the filename should probably include the frequency it 
shows up in pathname lookup)

		Linus

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-27 17:35 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <4AE72B91.7040700@gmail.com>

On Tue, 27 Oct 2009 18:19:13 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Stephen Hemminger a écrit :
> 
> > So Eric's string10 is fastest for special case of fooNNN style names.
> > But probably isn't best for general strings. Original function
> > is >20% slower, which is surprising, probably because of overhead
> > of 2 shifts and multiply. jenkins and fnv are both 10% slower.
> > 
> 
> 
> jhash() is faster when strings are longer, being able to process 12 bytes per loop.
> 

But jhash is not amenable to usage in namei (with partial_name_hash).

name_hash is rarely done on long strings, the average length of a filename
is fairly short (probably leftover Unix legacy). On my system, average
path component length in /usr is 13 characters; therefore jhash has
no big benefit here.

^ permalink raw reply

* Re: [PATCH 2/6] net: Introduce unregister_netdevice_many()
From: Eric Dumazet @ 2009-10-27 17:40 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: David S. Miller, Linux Netdev List
In-Reply-To: <200910271926.41734.opurdila@ixiacom.com>

Octavian Purdila a écrit :

> Looks like EXPORT_SYMBOL(unregister_netdevice_many) is missing?
> 

I added it in following patch, when really needed/used


^ permalink raw reply

* Re: wanPMC-CxT1E1
From: Bob Beers @ 2009-10-27 17:48 UTC (permalink / raw)
  To: Greg KH; +Cc: netdev
In-Reply-To: <20091026204144.GA28436@kroah.com>

On Mon, Oct 26, 2009 at 4:41 PM, Greg KH <greg@kroah.com> wrote:
> Getting it to build on 2.6.31 is more important than RHEL5, we can't do
> anything with an old kernel like that.

ok, so where do I start, I have a system ready to start
 git cloning, and creating patches. I googled for a while
 but didn't find a nice recipe for participating in the -staging
 process.

I'll try to add something here[1] if I get some good advice.

thanks,

-Bob Beers

[1] <http://www.linuxdriverproject.org/foswiki/bin/view/Main/OutOfTreeDrivers>

^ permalink raw reply

* [PATCH 2/3] [RFC] Add c/r support for connected INET sockets (v3)
From: Dan Smith @ 2009-10-27 17:53 UTC (permalink / raw)
  To: containers; +Cc: netdev, Oren Laadan, John Dykstra
In-Reply-To: <1256666008-8231-1-git-send-email-danms@us.ibm.com>

This patch adds basic support for C/R of open INET sockets.  I think that
all the important bits of the TCP and ICSK socket structures are saved,
but I think there is still some additional IPv6 stuff that needs to be
handled.

With this patch applied, the following script can be used to demonstrate
the functionality:

  https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html

It shows that this enables migration of a sendmail process with open
connections from one machine to another without dropping.

We probably need comments from the netdev people about the quality of
sanity checking we do on the values in the ckpt_hdr_socket_inet
structure on restart.

Note that this still doesn't address lingering sockets yet.

Changes in v3:
 - Prevent restart from allowing a bind on a <1024 port unless the
   user is granted that capability
 - Add some sanity checking in the inet_precheck() function to make sure
   the values read from the checkpoint image are within acceptable ranges
 - Check the result of sock_restore_header_info() and fail if needed

Changes in v2:
 - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
   structure instead of saving them separately
 - Fix 'sock' naming in sock_cptrst()
 - Don't take the queue lock before skb_queue_tail() since it is
   done for us
 - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
   flag is specified on sys_restart()
 - Pull the implementation of the list of listening sockets back into
   this patch
 - Fix dangling printk
 - Add some comments around the parent/child restore logic

Cc: netdev@vger.kernel.org
Cc: Oren Laadan <orenl@librato.com>
Cc: John Dykstra <jdykstra72@gmail.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 checkpoint/sys.c                 |    4 +
 include/linux/checkpoint.h       |    5 +-
 include/linux/checkpoint_hdr.h   |   95 ++++++++++
 include/linux/checkpoint_types.h |    2 +
 net/checkpoint.c                 |   23 ++--
 net/ipv4/checkpoint.c            |  349 +++++++++++++++++++++++++++++++++++++-
 6 files changed, 463 insertions(+), 15 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 260a1ee..df00973 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -221,6 +221,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
 	kfree(ctx->pids_arr);
 
+	sock_listening_list_free(&ctx->listen_sockets);
+
 	kfree(ctx);
 }
 
@@ -249,6 +251,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	spin_lock_init(&ctx->lock);
 #endif
 
+	INIT_LIST_HEAD(&ctx->listen_sockets);
+
 	err = -EBADF;
 	ctx->file = fget(fd);
 	if (!ctx->file)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index c2832ac..8702307 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -19,6 +19,7 @@
 #define RESTART_TASKSELF	0x1
 #define RESTART_FROZEN		0x2
 #define RESTART_GHOST		0x4
+#define RESTART_SOCK_LISTENONLY 0x8
 
 #ifdef __KERNEL__
 #ifdef CONFIG_CHECKPOINT
@@ -48,7 +49,8 @@
 #define RESTART_USER_FLAGS  \
 	(RESTART_TASKSELF | \
 	 RESTART_FROZEN | \
-	 RESTART_GHOST)
+	 RESTART_GHOST | \
+	 RESTART_SOCK_LISTENONLY)
 
 extern int walk_task_subtree(struct task_struct *task,
 			     int (*func)(struct task_struct *, void *),
@@ -102,6 +104,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct sockaddr *rem, unsigned *rem_len);
 int sock_restore_header_info(struct sk_buff *skb,
 			     struct ckpt_hdr_socket_buffer *h);
+void sock_listening_list_free(struct list_head *head);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 3e6cab1..e0f6d25 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -20,6 +20,7 @@
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #else
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -569,6 +570,100 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_socket_inet {
 	struct ckpt_hdr h;
+	__u32 daddr;
+	__u32 rcv_saddr;
+	__u32 saddr;
+	__u16 dport;
+	__u16 num;
+	__u16 sport;
+	__s16 uc_ttl;
+	__u16 cmsg_flags;
+
+	struct {
+		__u64 timeout;
+		__u32 ato;
+		__u32 lrcvtime;
+		__u16 last_seg_size;
+		__u16 rcv_mss;
+		__u8 pending;
+		__u8 quick;
+		__u8 pingpong;
+		__u8 blocked;
+	} icsk_ack __attribute__ ((aligned(8)));
+
+	/* FIXME: Skipped opt, tos, multicast, cork settings */
+
+	struct {
+		__u32 rcv_nxt;
+		__u32 copied_seq;
+		__u32 rcv_wup;
+		__u32 snd_nxt;
+		__u32 snd_una;
+		__u32 snd_sml;
+		__u32 rcv_tstamp;
+		__u32 lsndtime;
+
+		__u32 snd_wl1;
+		__u32 snd_wnd;
+		__u32 max_window;
+		__u32 mss_cache;
+		__u32 window_clamp;
+		__u32 rcv_ssthresh;
+		__u32 frto_highmark;
+
+		__u32 srtt;
+		__u32 mdev;
+		__u32 mdev_max;
+		__u32 rttvar;
+		__u32 rtt_seq;
+
+		__u32 packets_out;
+		__u32 retrans_out;
+
+		__u32 snd_up;
+		__u32 rcv_wnd;
+		__u32 write_seq;
+		__u32 pushed_seq;
+		__u32 lost_out;
+		__u32 sacked_out;
+		__u32 fackets_out;
+		__u32 tso_deferred;
+		__u32 bytes_acked;
+
+		__s32 lost_cnt_hint;
+		__u32 retransmit_high;
+
+		__u32 lost_retrans_low;
+
+		__u32 prior_ssthresh;
+		__u32 high_seq;
+
+		__u32 retrans_stamp;
+		__u32 undo_marker;
+		__s32 undo_retrans;
+		__u32 total_retrans;
+
+		__u32 urg_seq;
+		__u32 keepalive_time;
+		__u32 keepalive_intvl;
+
+		__u16 urg_data;
+		__u16 advmss;
+		__u8 frto_counter;
+		__u8 nonagle;
+
+		__u8 ecn_flags;
+		__u8 reordering;
+
+		__u8 keepalive_probes;
+	} tcp __attribute__ ((aligned(8)));
+
+	struct {
+		struct in6_addr saddr;
+		struct in6_addr rcv_saddr;
+		struct in6_addr daddr;
+	} inet6 __attribute__ ((aligned(8)));
+
 	__u32 laddr_len;
 	__u32 raddr_len;
 	struct sockaddr_in laddr;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index fa57cdc..91c141b 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -65,6 +65,8 @@ struct ckpt_ctx {
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
+	struct list_head listen_sockets;/* listening parent sockets */
+
 	/* [multi-process checkpoint] */
 	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
 	int nr_tasks;                   /* size of tasks array */
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 50baea9..98ba083 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -134,6 +134,7 @@ int sock_restore_header_info(struct sk_buff *skb,
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
+				uint16_t family,
 				int dst_objref)
 {
 	struct sk_buff *skb;
@@ -142,11 +143,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		struct ckpt_hdr_socket_buffer *h;
 		int ret = 0;
 
-		/* FIXME: This could be a false positive for non-unix
-		 *        buffers, so add a type check here in the
-		 *        future
-		 */
-		if (UNIXCB(skb).fp) {
+		if ((family == AF_UNIX) && UNIXCB(skb).fp) {
 			ckpt_write_err(ctx, "TE", "af_unix: pass fd", -EBUSY);
 			return -EBUSY;
 		}
@@ -186,6 +183,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 
 static int sock_write_buffers(struct ckpt_ctx *ctx,
 			      struct sk_buff_head *queue,
+			      uint16_t family,
 			      int dst_objref)
 {
 	struct ckpt_hdr_socket_queue *h;
@@ -205,7 +203,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
 	h->skb_count = ret;
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (!ret)
-		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
+		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
 
  out:
 	ckpt_hdr_put(ctx, h);
@@ -227,12 +225,14 @@ int sock_deferred_write_buffers(void *data)
 		return dst_objref;
 	}
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write recv buffers: %i\n", ret);
 	if (ret < 0)
 		return ret;
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write send buffers: %i\n", ret);
 
 	return ret;
@@ -757,10 +757,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 		goto err;
 
 	if ((h->sock_common.family == AF_INET) &&
-	    (h->sock.state != TCP_LISTEN)) {
-		/* Temporary hack to enable restore of TCP_LISTEN sockets
-		 * while forcing anything else to a closed state
-		 */
+	    (h->sock.state != TCP_LISTEN) &&
+	    (ctx->uflags & RESTART_SOCK_LISTENONLY)) {
+		ckpt_debug("Forcing open socket closed\n");
 		sock->sk->sk_state = TCP_CLOSE;
 		sock->state = SS_UNCONNECTED;
 	}
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
index 9cbbf5e..671014b 100644
--- a/net/ipv4/checkpoint.c
+++ b/net/ipv4/checkpoint.c
@@ -17,6 +17,7 @@
 #include <linux/deferqueue.h>
 #include <net/tcp_states.h>
 #include <net/tcp.h>
+#include <net/ipv6.h>
 
 struct dq_sock {
 	struct ckpt_ctx *ctx;
@@ -28,6 +29,233 @@ struct dq_buffers {
 	struct sock *sk;
 };
 
+struct listen_item {
+	struct sock *sk;
+	struct list_head list;
+};
+
+void sock_listening_list_free(struct list_head *head)
+{
+	struct listen_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, head, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+}
+
+static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->sk = sk;
+	list_add(&item->list, &ctx->listen_sockets);
+
+	return 0;
+}
+
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	list_for_each_entry(item, &ctx->listen_sockets, list) {
+		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
+			return item->sk;
+	}
+
+	return NULL;
+}
+
+static int sock_hash_parent(void *data)
+{
+	struct dq_sock *dq = (struct dq_sock *)data;
+	struct sock *parent;
+
+	ckpt_debug("INET post-restart hash\n");
+
+	dq->sk->sk_prot->hash(dq->sk);
+
+	/* If there is a listening socket with the same source port,
+	 * then become a child of that socket [we are the result of an
+	 * accept()].  Otherwise hash ourselves directly in [we are
+	 * the result of a connect()]
+	 */
+
+	parent = sock_get_parent(dq->ctx, dq->sk);
+	if (parent) {
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+		local_bh_disable();
+		__inet_inherit_port(parent, dq->sk);
+		local_bh_enable();
+	} else {
+		inet_sk(dq->sk)->num = 0;
+		inet_hash_connect(&tcp_death_row, dq->sk);
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+	}
+
+	return 0;
+}
+
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+	struct dq_sock dq;
+
+	dq.sk = sock;
+	dq.ctx = ctx;
+
+	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+			      sock_hash_parent, NULL);
+}
+
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+				struct tcp_sock *sk,
+				struct ckpt_hdr_socket_inet *hh,
+				int op)
+{
+	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
+	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
+	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
+	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
+	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
+	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
+	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
+	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
+
+	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
+	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
+	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
+	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
+	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
+	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
+	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
+	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
+	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
+	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
+
+	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
+	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
+	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
+	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
+	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
+
+	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
+	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
+
+	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
+	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
+	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
+	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
+
+	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
+
+	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
+	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
+	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
+	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
+	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
+	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
+	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
+	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
+
+	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
+	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
+
+	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
+
+	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
+	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
+
+	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
+	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
+	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
+	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
+
+	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
+	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
+	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
+
+	return 0;
+}
+
+static int sock_inet_restore_addrs(struct inet_sock *inet,
+				   struct ckpt_hdr_socket_inet *hh)
+{
+	inet->daddr = hh->raddr.sin_addr.s_addr;
+	inet->saddr = hh->laddr.sin_addr.s_addr;
+	inet->rcv_saddr = inet->saddr;
+
+	inet->dport = hh->raddr.sin_port;
+	inet->sport = hh->laddr.sin_port;
+
+	return 0;
+}
+
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+			    struct sock *sk,
+			    struct ckpt_hdr_socket_inet *hh,
+			    int op)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret;
+
+	if (op == CKPT_CPT) {
+		CKPT_COPY(op, hh->daddr, inet->daddr);
+		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
+		CKPT_COPY(op, hh->dport, inet->dport);
+		CKPT_COPY(op, hh->saddr, inet->saddr);
+		CKPT_COPY(op, hh->sport, inet->sport);
+	} else {
+		ret = sock_inet_restore_addrs(inet, hh);
+		if (ret)
+			return ret;
+	}
+
+	CKPT_COPY(op, hh->num, inet->num);
+	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
+	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
+
+	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+	CKPT_COPY(op,
+		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+
+	if (sk->sk_protocol == IPPROTO_TCP)
+		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
+	else if (sk->sk_protocol == IPPROTO_UDP)
+		ret = 0;
+	else {
+		ckpt_write_err(ctx, "T", "unknown socket protocol %d",
+			       sk->sk_protocol);
+		ret = -EINVAL;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *inet6 = inet6_sk(sk);
+		if (op == CKPT_CPT) {
+			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
+			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
+			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
+		} else {
+			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
+			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
+			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
+		}
+	}
+
+	return ret;
+}
+
 int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct ckpt_hdr_socket_inet *in;
@@ -43,6 +271,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 	if (ret)
 		goto out;
 
+	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
+	if (ret < 0)
+		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
  out:
 	ckpt_hdr_put(ctx, in);
@@ -87,9 +319,11 @@ static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
 	if (ret < 0)
 		goto out;
 
-	spin_lock(&queue->lock);
+	ret = sock_restore_header_info(skb, h);
+	if (!ret)
+		goto out;
+
 	skb_queue_tail(queue, skb);
-	spin_unlock(&queue->lock);
  out:
 	ckpt_hdr_put(ctx, h);
 
@@ -162,6 +396,19 @@ static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
 
 static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
 {
+	__u8 icsk_ack_mask = ICSK_ACK_SCHED | ICSK_ACK_TIMER |
+		ICSK_ACK_PUSHED | ICSK_ACK_PUSHED2;
+	__u16 urg_mask = TCP_URG_VALID | TCP_URG_NOTYET | TCP_URG_READ;
+	__u8 nonagle_mask = TCP_NAGLE_OFF | TCP_NAGLE_CORK | TCP_NAGLE_PUSH;
+	__u8 ecn_mask = TCP_ECN_OK | TCP_ECN_QUEUE_CWR | TCP_ECN_DEMAND_CWR;
+
+	if ((htons(in->laddr.sin_port) < PROT_SOCK) &&
+	    !capable(CAP_NET_BIND_SERVICE)) {
+		ckpt_debug("unable to bind to port %hu\n",
+			   htons(in->laddr.sin_port));
+		return -EINVAL;
+	}
+
 	if (in->laddr_len > sizeof(struct sockaddr_in)) {
 		ckpt_debug("laddr_len is too big\n");
 		return -EINVAL;
@@ -172,6 +419,77 @@ static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
 		return -EINVAL;
 	}
 
+	/* Set ato to the default */
+	in->icsk_ack.ato = TCP_ATO_MIN;
+
+	/* No quick acks are scheduled after a restart */
+	in->icsk_ack.quick = 0;
+
+	if (in->icsk_ack.pending & ~icsk_ack_mask) {
+		ckpt_debug("invalid pending flags 0x%x\n",
+			   in->icsk_ack.pending & ~icsk_ack_mask);
+		return -EINVAL;
+	}
+
+	if (in->icsk_ack.pingpong > 1) {
+		ckpt_debug("invalid icsk_ack.pingpong value\n");
+		return -EINVAL;
+	}
+
+	if (in->icsk_ack.blocked > 1) {
+		ckpt_debug("invalid icsk_ack.blocked value\n");
+		return -EINVAL;
+	}
+
+	/* do_tcp_setsockopt() quietly makes this coercion */
+	if (in->tcp.window_clamp < (SOCK_MIN_RCVBUF / 2))
+		in->tcp.window_clamp = SOCK_MIN_RCVBUF / 2;
+	else if (in->tcp.window_clamp > 65535U) {
+		ckpt_debug("invalid window_clamp value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.rcv_ssthresh > (4U * in->tcp.advmss))
+		in->tcp.rcv_ssthresh = 4U * in->tcp.advmss;
+
+	/* These will all be recalculated on the next call to
+	 * tcp_rtt_estimator()
+	 */
+	in->tcp.srtt = in->tcp.mdev = in->tcp.mdev_max = 0;
+	in->tcp.rttvar = in->tcp.rtt_seq = 0;
+
+	/* Might want to set packets_out to zero ? */
+
+	if (in->tcp.rcv_wnd > MAX_TCP_WINDOW)
+		in->tcp.rcv_wnd = MAX_TCP_WINDOW;
+
+	if (in->tcp.keepalive_intvl > MAX_TCP_KEEPINTVL) {
+		ckpt_debug("keepalive_intvl %i out of range\n",
+			   in->tcp.keepalive_intvl);
+		return -EINVAL;
+	}
+
+	if (in->tcp.keepalive_probes > MAX_TCP_KEEPCNT) {
+		ckpt_debug("Invalid keepalive_probes value %i\n",
+			   in->tcp.keepalive_probes);
+		return -EINVAL;
+	}
+
+	if (in->tcp.urg_data & ~urg_mask) {
+		ckpt_debug("Invalid urg_data value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.nonagle & ~nonagle_mask) {
+		ckpt_debug("Invalid nonagle value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.ecn_flags & ~ecn_mask) {
+		ckpt_debug("Invalid ecn_flags value\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -209,8 +527,35 @@ int inet_restore(struct ckpt_ctx *ctx,
 			ckpt_debug("inet listen: %i\n", ret);
 			if (ret < 0)
 				goto out;
+
+			/* We are a listening socket, so add ourselves
+			 * to the list of parent sockets.  This will
+			 * allow our children to find us later and
+			 * link up
+			 */
+
+			ret = sock_listening_list_add(ctx, sock->sk);
+			if (ret < 0)
+				goto out;
 		}
 	} else {
+		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
+		if (ret)
+			goto out;
+
+		if ((h->sock.state == TCP_ESTABLISHED) &&
+		    (h->sock.protocol == IPPROTO_TCP)) {
+			/* A connected socket that was spawned from an
+			 * accept() needs to be hashed with its parent
+			 * listening socket in order to receive
+			 * traffic on the original port.  Since we may
+			 * not have restarted the parent yet, we defer
+			 * this until later when we know we have all
+			 * the listening sockets accounted for.
+			 */
+			ret = sock_defer_hash(ctx, sock->sk);
+		}
+
 		if (!sock_flag(sock->sk, SOCK_DEAD))
 			ret = inet_defer_restore_buffers(ctx, sock->sk);
 	}
-- 
1.6.2.5


^ permalink raw reply related

* Re: [PATCH] net: fold network name hash
From: Octavian Purdila @ 2009-10-27 18:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, netdev, linux-kernel, eric.dumazet, akpm, torvalds,
	viro
In-Reply-To: <20091027102251.244ee681@nehalam>

On Tuesday 27 October 2009 19:22:51 you wrote:

> The full_name_hash does not produce a value that is evenly distributed
> over the lower 8 bits. This causes name hash to be unbalanced with large
> number of names.  A simple fix is to just fold in the higher bits
> with XOR.
> 
> This is independent of possible improvements to full_name_hash()
> in future.
> 

I can confirm that the distribution looks good now for our most common cases. 

Thanks,
tavi

^ permalink raw reply

* Re: [PATCH 3/9] pcmcia: use pcmcia_loop_config in misc pcmcia drivers
From: John W. Linville @ 2009-10-27 19:28 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: linux-pcmcia-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, David S. Miller,
	Jiri Kosina, David Sterba, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1255907255-28297-3-git-send-email-linux-X3ehHDuj6sIIGcDfoQAp7OTW4wlIGRCZ@public.gmane.org>

On Mon, Oct 19, 2009 at 01:07:29AM +0200, Dominik Brodowski wrote:
> Use pcmcia_loop_config() in a few drivers missed during the first
> round. On fmvj18x_cs.c it -- strangely -- only requires us to set
> conf.ConfigIndex, which is done by the core, so include an empty
> loop function which returns 0 unconditionally.
> 
> CC: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> CC: John W. Linville <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
> CC: Jiri Kosina <jkosina-AlSwsSmVLrQ@public.gmane.org>
> CC: David Sterba <dsterba-AlSwsSmVLrQ@public.gmane.org>
> CC: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> CC: linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Signed-off-by: Dominik Brodowski <linux-X3ehHDuj6sIIGcDfoQAp7OTW4wlIGRCZ@public.gmane.org>

Acked-by: John W. Linville <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

-- 
John W. Linville		Someday the world will need a hero, and you
linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org			might be all we have.  Be ready.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 2/9] pcmcia: use pre-determined values
From: John W. Linville @ 2009-10-27 19:28 UTC (permalink / raw)
  To: Dominik Brodowski; +Cc: linux-pcmcia, David S. Miller, netdev, linux-wireless
In-Reply-To: <1255907255-28297-2-git-send-email-linux@dominikbrodowski.net>

On Mon, Oct 19, 2009 at 01:07:28AM +0200, Dominik Brodowski wrote:
> A few PCMCIA network drivers can make use of values provided by the pcmcia
> core, instead of tedious, independent CIS parsing.
> 
> xirc32ps_cs.c: manf_id
> 
> hostap_cs.c: multifunction count
> 
> b43/pcmcia.c: ConfigBase address and "Present"
> 
> smc91c92_cs.c:  By default, mhz_setup() can use VERS_1 as it is stored
> in struct pcmcia_device. Only some cards require workarounds, such as
> reading out VERS_1 twice.
> 
> CC: David S. Miller <davem@davemloft.net>
> CC: John W. Linville <linville@tuxdriver.com>
> CC: netdev@vger.kernel.org
> CC: linux-wireless@vger.kernel.org
> Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>

Acked-by: John W. Linville <linville@tuxdriver.com>

-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* [PATCH net-2.6] sfc: Really allow RX checksum offload to be disabled
From: Ben Hutchings @ 2009-10-27 19:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1256655057.2794.4.camel@achroite>

We have never checked the efx_nic::rx_checksum_enabled flag everywhere
we should, and since the switch to GRO we don't check it anywhere.
It's simplest to check it in the one place where we initialise the
per-packet checksummed flag.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Cc: stable@kernel.org
---
I'm not sure whether this is serious enough to merit a stable update.
It's not a recent regression.

Ben.

 drivers/net/sfc/falcon.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c
index 8776432..865638b 100644
--- a/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@ -869,8 +869,9 @@ static void falcon_handle_rx_event(struct efx_channel *channel,
 		 * UDP/IPv4, then we can rely on the hardware checksum.
 		 */
 		checksummed =
-			rx_ev_hdr_type == FSE_AB_RX_EV_HDR_TYPE_IPV4_TCP ||
-			rx_ev_hdr_type == FSE_AB_RX_EV_HDR_TYPE_IPV4_UDP;
+			efx->rx_checksum_enabled &&
+			(rx_ev_hdr_type == FSE_AB_RX_EV_HDR_TYPE_IPV4_TCP ||
+			 rx_ev_hdr_type == FSE_AB_RX_EV_HDR_TYPE_IPV4_UDP);
 	} else {
 		falcon_handle_rx_not_ok(rx_queue, event, &rx_ev_pkt_ok,
 					&discard);

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: [PATCH] udev: create empty regular files to represent net interfaces
From: Matt Domsch @ 2009-10-27 20:55 UTC (permalink / raw)
  To: dann frazier
  Cc: linux-hotplug, Narendra_K, netdev, Jordan_Hargrave, Charles_Rose,
	Ben Hutchings
In-Reply-To: <20091022063619.GB6321@ldl.fc.hp.com>

On Thu, Oct 22, 2009 at 12:36:20AM -0600, dann frazier wrote:
> Here's a proof of concept to further the discussion..
> 
> The default filename uses the format:
>   /dev/netdev/by-ifindex/$ifindex
> 
> This provides the infrastructure to permit udev rules to create aliases for
> network devices using symlinks, for example:
> 
>   /dev/netdev/by-name/eth0 -> ../by-ifindex/1
>   /dev/netdev/by-biosname/LOM0 -> ../by-ifindex/3
> 
> A library (such as the proposed libnetdevname) could use this information
> to provide an alias->realname mapping for network utilities.

yes, this could work, as IFINDEX is already exported in the uevents,
and that's the primary value udev needs to set up the mapping.

While I like the little ifindex2name script you've got, I think udev
could simply call if_indextoname() to get this, and not call an
external program?  I suppose it could be a really really simple
external program too.

I'd be fine with this approach.  It has the advantages of not
requiring a kernel change at all, and not creating a whole character
device which would be useless.  And it doesn't preclude someone in the
future from creating a char device for network devices should they so
choose.

Kay, what say you as udev owner?

Thanks,
Matt

-- 
Matt Domsch
Technology Strategist, Dell Office of the CTO
linux.dell.com & www.dell.com/linux

^ permalink raw reply

* Re: [PATCH 2/2] page allocator: Direct reclaim should always obey watermarks
From: Frans Pop @ 2009-10-27 21:00 UTC (permalink / raw)
  To: Mel Gorman
  Cc: David Rientjes, Andrew Morton, stable, Rafael J. Wysocki,
	David Miller, reinette chatre, Kalle Valo, John W. Linville,
	Pekka Enberg, Bartlomiej Zolnierkiewicz, Karol Lewandowski,
	netdev, linux-kernel, linux-mm@kvack.org
In-Reply-To: <200910170128.29086.elendil@planet.nl>

On Saturday 17 October 2009, Frans Pop wrote:
> On Saturday 17 October 2009, Mel Gorman wrote:
> > Frans, you reported that both patches in combination reduced the
> > number of failures. Was it in fact just the kswapd change that made
> > the difference?
>
> I will retest both patches (as I already mailed you privately
> yesterday), but not today. The improvement with the combination was
> real, but I'm not sure which patch is the reason. I think the second,
> but I need to verify.
>
> I've done another 30 boots or so today, mainly in the "akpm" merge, and
> I've found new patterns that will help me nail down the regression. But
> ATM I can't see straight anymore, so it will have to wait until
> tomorrow.

Again sorry for the delay, but I needed to retest these with various 
kernels as the results were inconclusive. AFAICT neither of the two 
patches makes a significant difference for my test case.

Not sure if my initial test was broken or that it was just a case where the 
timings worked out favorably.

Cheers,
FJP

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [RFC PATCH] fib_hash: improve route deletion scaling on interface drop with lots of interfaces
From: Julian Anastasov @ 2009-10-27 21:07 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: netdev
In-Reply-To: <20091027000302.GA3141@kvack.org>


	Hello,

On Mon, 26 Oct 2009, Benjamin LaHaise wrote:

> Hi folks,
> 
> Below is a patch to improve the scaling of interface destruction in 
> fib_hash.  The general idea is to tie the fib_alias structure into a 
> list off of net_device and walk that list during a fib_flush() caused 
> by an interface drop.  This makes the resulting flush only have to walk 
> the number of routes attached to an interface rather than the number of 
> routes attached to all interfaces at the expense of a couple of additional 
> pointers in struct fib_alias.

	Maybe this cannot work for multipath routes because
you consider only the first device (fib_dev). Also, nh_dev is
optional; not every route has a device, so you should add checks
for dev != NULL.

> @@ -516,6 +517,10 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
>  	new_fa->fa_type = cfg->fc_type;
>  	new_fa->fa_scope = cfg->fc_scope;
>  	new_fa->fa_state = 0;
> +	new_fa->fa_fib_node = f;
> +	new_fa->fa_fz = fz;
> +
> +	dev = fi->fib_dev;

Regards

--
Julian Anastasov <ja@ssi.bg>


^ permalink raw reply

* [PATCH] net: fold network name hash (v2)
From: Stephen Hemminger @ 2009-10-27 22:04 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, netdev, linux-kernel, eric.dumazet, akpm, torvalds,
	opurdila, viro
In-Reply-To: <20091027102251.244ee681@nehalam>

The full_name_hash does not produce a value that is evenly distributed
over the lower 8 bits. This causes the name hash to be unbalanced with a
large number of names. There is a standard function to fold in the upper
bits, so use that.

This is independent of possible improvements to full_name_hash()
in future.
 
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/core/dev.c	2009-10-27 14:54:21.922563076 -0700
+++ b/net/core/dev.c	2009-10-27 15:04:16.733813459 -0700
@@ -86,6 +86,7 @@
 #include <linux/socket.h>
 #include <linux/sockios.h>
 #include <linux/errno.h>
+#include <linux/hash.h>
 #include <linux/interrupt.h>
 #include <linux/if_ether.h>
 #include <linux/netdevice.h>
@@ -199,7 +200,7 @@ EXPORT_SYMBOL(dev_base_lock);
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 {
 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+	return &net->dev_name_head[hash_long(hash, NETDEV_HASHBITS)];
 }
 
 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)

^ permalink raw reply

* RE: [PATCH -next] netxen: fix builds for SYSFS=n or MODULES=n
From: Dhananjay Phadke @ 2009-10-27 22:09 UTC (permalink / raw)
  To: Randy Dunlap, Stephen Rothwell, netdev
  Cc: linux-next@vger.kernel.org, LKML, davem@davemloft.net
In-Reply-To: <20091026150945.3f35a811.randy.dunlap@oracle.com>

Sorry, it seems like I am keeping you busy with the build errors.
Thanks for fixing it again (maybe I need to set up all the different build configs).

Acked-by: Dhananjay Phadke <dhananjay@netxen.com>

-----Original Message-----
From: Randy Dunlap [mailto:randy.dunlap@oracle.com] 
Sent: Monday, October 26, 2009 3:10 PM
To: Stephen Rothwell; netdev
Cc: linux-next@vger.kernel.org; LKML; davem@davemloft.net; Dhananjay Phadke
Subject: [PATCH -next] netxen: fix builds for SYSFS=n or MODULES=n

From: Randy Dunlap <randy.dunlap@oracle.com>

When CONFIG_MODULES=n:
drivers/net/netxen/netxen_nic_main.c:2751: error: dereferencing pointer to incomplete type
drivers/net/netxen/netxen_nic_main.c:2764: error: dereferencing pointer to incomplete type

Also needs addition of <linux/sysfs.h> for sysfs function prototypes or
stubs when CONFIG_SYSFS=n.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 drivers/net/netxen/netxen_nic_main.c |    9 +++++++++
 1 file changed, 9 insertions(+)

--- linux-next-20091026.orig/drivers/net/netxen/netxen_nic_main.c
+++ linux-next-20091026/drivers/net/netxen/netxen_nic_main.c
@@ -34,6 +34,7 @@
 #include <net/ip.h>
 #include <linux/ipv6.h>
 #include <linux/inetdevice.h>
+#include <linux/sysfs.h>
 
 MODULE_DESCRIPTION("NetXen Multi port (1/10) Gigabit Network Driver");
 MODULE_LICENSE("GPL");
@@ -2500,6 +2501,7 @@ static struct bin_attribute bin_attr_mem
 	.write = netxen_sysfs_write_mem,
 };
 
+#ifdef CONFIG_MODULES
 static ssize_t
 netxen_store_auto_fw_reset(struct module_attribute *mattr,
 		struct module *mod, const char *buf, size_t count)
@@ -2534,6 +2536,7 @@ static struct module_attribute mod_attr_
 	.show = netxen_show_auto_fw_reset,
 	.store = netxen_store_auto_fw_reset,
 };
+#endif
 
 static void
 netxen_create_sysfs_entries(struct netxen_adapter *adapter)
@@ -2739,7 +2742,9 @@ static struct pci_driver netxen_driver =
 
 static int __init netxen_init_module(void)
 {
+#ifdef CONFIG_MODULES
 	struct module *mod = THIS_MODULE;
+#endif
 
 	printk(KERN_INFO "%s\n", netxen_nic_driver_string);
 
@@ -2748,9 +2753,11 @@ static int __init netxen_init_module(voi
 	register_inetaddr_notifier(&netxen_inetaddr_cb);
 #endif
 
+#ifdef CONFIG_MODULES
 	if (sysfs_create_file(&mod->mkobj.kobj, &mod_attr_fw_reset.attr))
 		printk(KERN_ERR "%s: Failed to create auto_fw_reset "
 				"sysfs entry.", netxen_nic_driver_name);
+#endif
 
 	return pci_register_driver(&netxen_driver);
 }
@@ -2759,9 +2766,11 @@ module_init(netxen_init_module);
 
 static void __exit netxen_exit_module(void)
 {
+#ifdef CONFIG_MODULES
 	struct module *mod = THIS_MODULE;
 
 	sysfs_remove_file(&mod->mkobj.kobj, &mod_attr_fw_reset.attr);
+#endif
 
 	pci_unregister_driver(&netxen_driver);
 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox