* [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
@ 2004-06-14 17:03 Arthur Kepner
From: Arthur Kepner @ 2004-06-14 17:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1419 bytes --]



About three weeks ago I sent a patch to address lock contention on
the loopback device. I received several constructive criticisms and
now have a second version which is attached. The patch is against
2.6.

Why this is useful
------------------

Lock contention on the loopback device can be a serious performance
problem on large (>32 CPU) multiprocessor systems, and can even lead to
near-livelock.

There is supporting lockstat data in the email which accompanied my first
patch.


Issues addressed in the second version of the patch
---------------------------------------------------

Here's a summary of the comments on the first patch and the actions that
were taken (let me know if I missed anything):


1) It prevented the use of queueing disciplines on the loopback.

Ugh, that it did.

The new patch makes struct Qdisc an RCU-protected structure: it isn't
freed until it's known that no references to it exist (or at least that
none should exist). This allows lockless read access (to a possibly
stale Qdisc), which gives the performance benefits of the previous
patch without preventing the use of queueing disciplines on the
loopback.

(I'd especially appreciate comments on the interaction with Qdiscs.)
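
For anyone who'd rather not dig through the whole diff first, the Qdisc
interaction boils down to this (condensed from the attached patch; the
lockless-TX path and error handling are omitted):

	/* read side, in dev_queue_xmit(): the Qdisc pointer is read
	 * under rcu_read_lock() and may be stale, but it cannot be
	 * freed out from under us */
	rcu_read_lock();
	q = dev->qdisc;
	if (q->enqueue) {
		/* queue_lock still serializes access to the device queue */
		spin_lock_bh(&dev->queue_lock);
		rc = q->enqueue(skb, q);
		qdisc_run(dev);
		spin_unlock_bh(&dev->queue_lock);
	}
	rcu_read_unlock();

	/* update side, in qdisc_destroy() (called under queue_lock):
	 * teardown and kfree() are deferred to an RCU callback, which
	 * runs only once all readers have finished */
	call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc);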


2) Should use per-cpu stats rather than slots in an array.

Done.
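
Condensed from the attached patch, the stats path now works like this:
a per-cpu struct net_device_stats is allocated at init time, the xmit
path bumps only the local CPU's counters (no lock needed), and
get_stats() folds the per-cpu counters together on demand:

	static struct net_device_stats *loopback_stats;

	/* at init time: allocate one counter block per CPU */
	loopback_stats = alloc_percpu(struct net_device_stats);

	/* in loopback_xmit(): touch only this CPU's counters */
	per_cpu_ptr(loopback_stats, smp_processor_id())->rx_packets++;

	/* in get_stats(): sum over all possible CPUs */
	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible(i))
			continue;
		stats->rx_packets += per_cpu_ptr(loopback_stats, i)->rx_packets;
	}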

3) It's ugly

It's now beautiful ;-)

There was also the suggestion of creating multiple loopback devices, but
I preferred not to do that.


--

Arthur

[-- Attachment #2: lockless_loopback.patch.2 --]
[-- Type: TEXT/PLAIN, Size: 8643 bytes --]

--- linux.orig//drivers/net/loopback.c	2004-06-08 15:36:50.000000000 -0700
+++ linux//drivers/net/loopback.c	2004-06-11 14:04:33.000000000 -0700
@@ -56,6 +56,7 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 
+static struct net_device_stats *loopback_stats;
 
 #define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)
 
@@ -117,13 +118,17 @@
 	dev_kfree_skb(skb);
 }
 
+#define LOOPBACK_STAT_INC(field)				\
+	(per_cpu_ptr(loopback_stats, smp_processor_id())->field++)
+#define LOOPBACK_STAT_ADD(field, n)				\
+	(per_cpu_ptr(loopback_stats, smp_processor_id())->field += n)
+
 /*
  * The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct net_device_stats *stats = dev->priv;
 
 	skb_orphan(skb);
 
@@ -142,11 +147,11 @@
 	}
 
 	dev->last_rx = jiffies;
-	if (likely(stats)) {
-		stats->rx_bytes+=skb->len;
-		stats->tx_bytes+=skb->len;
-		stats->rx_packets++;
-		stats->tx_packets++;
+	if (likely(loopback_stats)) {
+		LOOPBACK_STAT_ADD(rx_bytes, skb->len);
+		LOOPBACK_STAT_ADD(tx_bytes, skb->len);
+		LOOPBACK_STAT_INC(rx_packets);
+		LOOPBACK_STAT_INC(tx_packets);
 	}
 
 	netif_rx(skb);
@@ -156,7 +161,28 @@
 
 static struct net_device_stats *get_stats(struct net_device *dev)
 {
-	return (struct net_device_stats *)dev->priv;
+	struct net_device_stats *stats = dev->priv;
+	int i;
+
+	if (unlikely(!stats)) {
+		return NULL;
+	}
+
+	memset(stats, 0, sizeof(struct net_device_stats));
+	if (unlikely(!loopback_stats)) {
+		return stats;
+	}
+
+	for (i=0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i)) 
+			continue;
+		stats->rx_bytes   += per_cpu_ptr(loopback_stats, i)->rx_bytes;
+		stats->tx_bytes   += per_cpu_ptr(loopback_stats, i)->tx_bytes;
+		stats->rx_packets += per_cpu_ptr(loopback_stats, i)->rx_packets;
+		stats->tx_packets += per_cpu_ptr(loopback_stats, i)->tx_packets;
+	}
+				
+	return stats;
 }
 
 struct net_device loopback_dev = {
@@ -173,7 +199,8 @@
 	.rebuild_header		= eth_rebuild_header,
 	.flags			= IFF_LOOPBACK,
 	.features 		= NETIF_F_SG|NETIF_F_FRAGLIST
-				  |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA,
+				  |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA
+				  |NETIF_F_LLTX,
 };
 
 /* Setup and register the of the LOOPBACK device. */
@@ -188,6 +215,8 @@
 		loopback_dev.priv = stats;
 		loopback_dev.get_stats = &get_stats;
 	}
+
+	loopback_stats = alloc_percpu(struct net_device_stats);
 	
 	return register_netdev(&loopback_dev);
 };
--- linux.orig//include/linux/netdevice.h	2004-06-08 15:36:51.000000000 -0700
+++ linux//include/linux/netdevice.h	2004-06-14 09:20:26.000000000 -0700
@@ -403,6 +403,7 @@
 #define NETIF_F_HW_VLAN_FILTER	512	/* Receive filtering on VLAN */
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
 #define NETIF_F_TSO		2048	/* Can offload TCP/IP segmentation */
+#define NETIF_F_LLTX		4096	/* LockLess TX */
 
 	/* Called after device is detached from network. */
 	void			(*uninit)(struct net_device *dev);
--- linux.orig//include/net/pkt_sched.h	2004-06-08 15:36:51.000000000 -0700
+++ linux//include/net/pkt_sched.h	2004-06-11 13:46:25.000000000 -0700
@@ -11,6 +11,7 @@
 #include <linux/netdevice.h>
 #include <linux/types.h>
 #include <linux/pkt_sched.h>
+#include <linux/rcupdate.h>
 #include <net/pkt_cls.h>
 
 #ifdef CONFIG_X86_TSC
@@ -92,6 +93,7 @@
 	struct net_device	*dev;
 
 	struct tc_stats		stats;
+	struct rcu_head 	q_rcu;
 	int			(*reshape_fail)(struct sk_buff *skb, struct Qdisc *q);
 
 	/* This field is deprecated, but it is still used by CBQ
--- linux.orig//net/core/dev.c	2004-06-08 15:36:53.000000000 -0700
+++ linux//net/core/dev.c	2004-06-14 09:26:18.000000000 -0700
@@ -107,6 +107,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/netpoll.h>
+#include <linux/rcupdate.h>
 #ifdef CONFIG_NET_RADIO
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
 #include <net/iw_handler.h>
@@ -1277,6 +1278,20 @@
 	return 0;
 }
 
+#define HARD_TX_LOCK_BH(dev, cpu) {			\
+	if ((dev->features & NETIF_F_LLTX) == 0) {	\
+		spin_lock_bh(&dev->xmit_lock);		\
+		dev->xmit_lock_owner = cpu;		\
+	}						\
+}
+
+#define HARD_TX_UNLOCK_BH(dev) {			\
+	if ((dev->features & NETIF_F_LLTX) == 0) {	\
+		dev->xmit_lock_owner = -1;		\
+		spin_unlock_bh(&dev->xmit_lock);	\
+	}						\
+}
+
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
@@ -1321,18 +1336,34 @@
 			goto out;
 	}
 
-	/* Grab device queue */
-	spin_lock_bh(&dev->queue_lock);
+	rcu_read_lock();
+	/* Updates of dev->qdisc are serialized by queue_lock.
+	 * The struct Qdisc pointed to by dev->qdisc is now an RCU
+	 * structure - it may be read without acquiring a lock (but
+	 * the structure may be stale). The freeing of the qdisc is
+	 * deferred until it's known that there are no more
+	 * references to it.
+	 *
+	 * If the qdisc has an enqueue function, we still need to
+	 * hold queue_lock before calling it, since queue_lock also
+	 * serializes access to the device queue.
+	 */
+
 	q = dev->qdisc;
 	if (q->enqueue) {
+		/* Grab device queue */
+		spin_lock_bh(&dev->queue_lock);
+
 		rc = q->enqueue(skb, q);
 
 		qdisc_run(dev);
 
 		spin_unlock_bh(&dev->queue_lock);
+		rcu_read_unlock();
 		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
 		goto out;
 	}
+	rcu_read_unlock();
 
 	/* The device has no queue. Common case for software devices:
 	   loopback, all the sorts of tunnels...
@@ -1347,17 +1378,12 @@
 	   Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
+		preempt_disable();
 		int cpu = smp_processor_id();
 
 		if (dev->xmit_lock_owner != cpu) {
-			/*
-			 * The spin_lock effectivly does a preempt lock, but 
-			 * we are about to drop that...
-			 */
-			preempt_disable();
-			spin_unlock(&dev->queue_lock);
-			spin_lock(&dev->xmit_lock);
-			dev->xmit_lock_owner = cpu;
+
+			HARD_TX_LOCK_BH(dev, cpu);
 			preempt_enable();
 
 			if (!netif_queue_stopped(dev)) {
@@ -1366,18 +1392,17 @@
 
 				rc = 0;
 				if (!dev->hard_start_xmit(skb, dev)) {
-					dev->xmit_lock_owner = -1;
-					spin_unlock_bh(&dev->xmit_lock);
+					HARD_TX_UNLOCK_BH(dev);
 					goto out;
 				}
 			}
-			dev->xmit_lock_owner = -1;
-			spin_unlock_bh(&dev->xmit_lock);
+			HARD_TX_UNLOCK_BH(dev);
 			if (net_ratelimit())
 				printk(KERN_CRIT "Virtual device %s asks to "
 				       "queue packet!\n", dev->name);
 			goto out_enetdown;
 		} else {
+			preempt_enable();
 			/* Recursion is detected! It is possible,
 			 * unfortunately */
 			if (net_ratelimit())
@@ -1385,7 +1410,6 @@
 				       "%s, fix it urgently!\n", dev->name);
 		}
 	}
-	spin_unlock_bh(&dev->queue_lock);
 out_enetdown:
 	rc = -ENETDOWN;
 out_kfree_skb:
--- linux.orig//net/sched/sch_generic.c	2004-06-08 15:36:53.000000000 -0700
+++ linux//net/sched/sch_generic.c	2004-06-11 13:52:06.000000000 -0700
@@ -30,6 +30,7 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
+#include <linux/rcupdate.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
@@ -404,18 +405,36 @@
 		ops->reset(qdisc);
 }
 
+/* RCU callback to clean up a qdisc once there are no further
+ * references to it */
+
+static void __qdisc_destroy(void *arg)
+{
+	struct Qdisc    *qdisc = (struct Qdisc *) arg;
+	struct Qdisc_ops  *ops = qdisc->ops;
+
+#ifdef CONFIG_NET_ESTIMATOR
+	qdisc_kill_estimator(&qdisc->stats);
+#endif
+	if (ops->reset)
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);
+	module_put(ops->owner);
+
+	if (!(qdisc->flags&TCQ_F_BUILTIN))
+		kfree(qdisc);
+}
+
 /* Under dev->queue_lock and BH! */
 
 void qdisc_destroy(struct Qdisc *qdisc)
 {
-	struct Qdisc_ops *ops = qdisc->ops;
-	struct net_device *dev;
+	struct net_device *dev = qdisc->dev;
 
 	if (!atomic_dec_and_test(&qdisc->refcnt))
 		return;
 
-	dev = qdisc->dev;
-
 	if (dev) {
 		struct Qdisc *q, **qp;
 		for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
@@ -425,16 +444,9 @@
 			}
 		}
 	}
-#ifdef CONFIG_NET_ESTIMATOR
-	qdisc_kill_estimator(&qdisc->stats);
-#endif
-	if (ops->reset)
-		ops->reset(qdisc);
-	if (ops->destroy)
-		ops->destroy(qdisc);
-	module_put(ops->owner);
-	if (!(qdisc->flags&TCQ_F_BUILTIN))
-		kfree(qdisc);
+
+	call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc);
+
 }
 
 
