* [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
@ 2004-06-14 17:03 Arthur Kepner
2004-06-14 18:23 ` Andi Kleen
2004-06-14 18:23 ` Mitchell Blank Jr
0 siblings, 2 replies; 6+ messages in thread
From: Arthur Kepner @ 2004-06-14 17:03 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: Type: TEXT/PLAIN, Size: 1419 bytes --]
About three weeks ago I sent a patch to address lock contention on
the loopback device. I received several constructive criticisms and
now have a second version which is attached. The patch is against
2.6.
Why this is useful
------------------
Lock contention on the loopback can be a serious performance problem
on large (>32 cpu) multiprocessor systems, even leading to near-livelock.
There is supporting lockstat data in the email which accompanied my first
patch.
Issues addressed in the second version of the patch
---------------------------------------------------
Here's a summary of the comments on the first patch and the actions that
were taken (let me know if I missed anything):
1) It prevented the use of queueing disciplines on the loopback.
Ugh, that it did.
The new patch makes struct Qdisc an "rcu structure". It isn't deleted
until it's known that no references to it exist (or at least none should
exist.) This allows lockless read access (to a possibly stale Qdisc.)
This gives the performance benefits of the previous patch, and doesn't
prevent the use of queueing disciplines on the loopback.
(I'd especially appreciate comments on the interaction with Qdiscs.)
2) Should use per-cpu stats rather than slots in an array.
Done.
3) It's ugly
It's now beautiful ;-)
There was also the suggestion of creating multiple loopback devices, but
I preferred not to do that.
--
Arthur
[-- Attachment #2: lockless_loopback.patch.2 --]
[-- Type: TEXT/PLAIN, Size: 8643 bytes --]
--- linux.orig//drivers/net/loopback.c 2004-06-08 15:36:50.000000000 -0700
+++ linux//drivers/net/loopback.c 2004-06-11 14:04:33.000000000 -0700
@@ -56,6 +56,7 @@
#include <linux/ip.h>
#include <linux/tcp.h>
+static struct net_device_stats *loopback_stats;
#define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)
@@ -117,13 +118,17 @@
dev_kfree_skb(skb);
}
+#define LOOPBACK_STAT_INC(field) \
+ (per_cpu_ptr(loopback_stats, smp_processor_id())->field++)
+#define LOOPBACK_STAT_ADD(field, n) \
+ (per_cpu_ptr(loopback_stats, smp_processor_id())->field += n)
+
/*
* The higher levels take care of making this non-reentrant (it's
* called with bh's disabled).
*/
static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct net_device_stats *stats = dev->priv;
skb_orphan(skb);
@@ -142,11 +147,11 @@
}
dev->last_rx = jiffies;
- if (likely(stats)) {
- stats->rx_bytes+=skb->len;
- stats->tx_bytes+=skb->len;
- stats->rx_packets++;
- stats->tx_packets++;
+ if (likely(loopback_stats)) {
+ LOOPBACK_STAT_ADD(rx_bytes, skb->len);
+ LOOPBACK_STAT_ADD(tx_bytes, skb->len);
+ LOOPBACK_STAT_INC(rx_packets);
+ LOOPBACK_STAT_INC(tx_packets);
}
netif_rx(skb);
@@ -156,7 +161,28 @@
static struct net_device_stats *get_stats(struct net_device *dev)
{
- return (struct net_device_stats *)dev->priv;
+ struct net_device_stats *stats = dev->priv;
+ int i;
+
+ if (unlikely(!stats)) {
+ return NULL;
+ }
+
+ memset(stats, 0, sizeof(struct net_device_stats));
+ if (unlikely(!loopback_stats)) {
+ return stats;
+ }
+
+ for (i=0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ stats->rx_bytes += per_cpu_ptr(loopback_stats, i)->rx_bytes;
+ stats->tx_bytes += per_cpu_ptr(loopback_stats, i)->tx_bytes;
+ stats->rx_packets += per_cpu_ptr(loopback_stats, i)->rx_packets;
+ stats->tx_packets += per_cpu_ptr(loopback_stats, i)->tx_packets;
+ }
+
+ return stats;
}
struct net_device loopback_dev = {
@@ -173,7 +199,8 @@
.rebuild_header = eth_rebuild_header,
.flags = IFF_LOOPBACK,
.features = NETIF_F_SG|NETIF_F_FRAGLIST
- |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA,
+ |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA
+ |NETIF_F_LLTX,
};
/* Setup and register the of the LOOPBACK device. */
@@ -188,6 +215,8 @@
loopback_dev.priv = stats;
loopback_dev.get_stats = &get_stats;
}
+
+ loopback_stats = alloc_percpu(struct net_device_stats);
return register_netdev(&loopback_dev);
};
--- linux.orig//include/linux/netdevice.h 2004-06-08 15:36:51.000000000 -0700
+++ linux//include/linux/netdevice.h 2004-06-14 09:20:26.000000000 -0700
@@ -403,6 +403,7 @@
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
+#define NETIF_F_LLTX 4096 /* LockLess TX */
/* Called after device is detached from network. */
void (*uninit)(struct net_device *dev);
--- linux.orig//include/net/pkt_sched.h 2004-06-08 15:36:51.000000000 -0700
+++ linux//include/net/pkt_sched.h 2004-06-11 13:46:25.000000000 -0700
@@ -11,6 +11,7 @@
#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/pkt_sched.h>
+#include <linux/rcupdate.h>
#include <net/pkt_cls.h>
#ifdef CONFIG_X86_TSC
@@ -92,6 +93,7 @@
struct net_device *dev;
struct tc_stats stats;
+ struct rcu_head q_rcu;
int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q);
/* This field is deprecated, but it is still used by CBQ
--- linux.orig//net/core/dev.c 2004-06-08 15:36:53.000000000 -0700
+++ linux//net/core/dev.c 2004-06-14 09:26:18.000000000 -0700
@@ -107,6 +107,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/netpoll.h>
+#include <linux/rcupdate.h>
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
#include <net/iw_handler.h>
@@ -1277,6 +1278,20 @@
return 0;
}
+#define HARD_TX_LOCK_BH(dev, cpu) { \
+ if ( dev->features && NETIF_F_LLTX == 0 ) { \
+ spin_lock_bh(&dev->xmit_lock); \
+ dev->xmit_lock_owner = cpu; \
+ } \
+}
+
+#define HARD_TX_UNLOCK_BH(dev) { \
+ if ( dev->features && NETIF_F_LLTX == 0 ) { \
+ dev->xmit_lock_owner = -1; \
+ spin_unlock_bh(&dev->xmit_lock); \
+ } \
+}
+
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
@@ -1321,18 +1336,34 @@
goto out;
}
- /* Grab device queue */
- spin_lock_bh(&dev->queue_lock);
+ rcu_read_lock();
+ /* Updates of qdisc are serialized by queue_lock.
+ * The struct Qdisc which is pointed to by qdisc is now a
+ * rcu structure - it may be accessed without acquiring
+ * a lock (but the structure may be stale.) The freeing of the
+ * qdisc will be deferred until it's known that there are no
+ * more references to it.
+ *
+ * If the qdisc has an enqueue function, we still need to
+ * hold the queue_lock before calling it, since queue_lock
+ * also serializes access to the device queue.
+ */
+
q = dev->qdisc;
if (q->enqueue) {
+ /* Grab device queue */
+ spin_lock_bh(&dev->queue_lock);
+
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock_bh(&dev->queue_lock);
+ rcu_read_unlock();
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
+ rcu_read_unlock();
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
@@ -1347,17 +1378,12 @@
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
+ preempt_disable();
int cpu = smp_processor_id();
if (dev->xmit_lock_owner != cpu) {
- /*
- * The spin_lock effectivly does a preempt lock, but
- * we are about to drop that...
- */
- preempt_disable();
- spin_unlock(&dev->queue_lock);
- spin_lock(&dev->xmit_lock);
- dev->xmit_lock_owner = cpu;
+
+ HARD_TX_LOCK_BH(dev, cpu);
preempt_enable();
if (!netif_queue_stopped(dev)) {
@@ -1366,18 +1392,17 @@
rc = 0;
if (!dev->hard_start_xmit(skb, dev)) {
- dev->xmit_lock_owner = -1;
- spin_unlock_bh(&dev->xmit_lock);
+ HARD_TX_UNLOCK_BH(dev);
goto out;
}
}
- dev->xmit_lock_owner = -1;
- spin_unlock_bh(&dev->xmit_lock);
+ HARD_TX_UNLOCK_BH(dev);
if (net_ratelimit())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n", dev->name);
goto out_enetdown;
} else {
+ preempt_enable();
/* Recursion is detected! It is possible,
* unfortunately */
if (net_ratelimit())
@@ -1385,7 +1410,6 @@
"%s, fix it urgently!\n", dev->name);
}
}
- spin_unlock_bh(&dev->queue_lock);
out_enetdown:
rc = -ENETDOWN;
out_kfree_skb:
--- linux.orig//net/sched/sch_generic.c 2004-06-08 15:36:53.000000000 -0700
+++ linux//net/sched/sch_generic.c 2004-06-11 13:52:06.000000000 -0700
@@ -30,6 +30,7 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
+#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
@@ -404,18 +405,36 @@
ops->reset(qdisc);
}
+/* this is the rcu callback function to clean up a qdisc when there
+ * are no further references to it */
+
+static void __qdisc_destroy (void * arg)
+{
+ struct Qdisc *qdisc = (struct Qdisc *) arg;
+ struct Qdisc_ops *ops = qdisc->ops;
+
+#ifdef CONFIG_NET_ESTIMATOR
+ qdisc_kill_estimator(&qdisc->stats);
+#endif
+ if (ops->reset)
+ ops->reset(qdisc);
+ if (ops->destroy)
+ ops->destroy(qdisc);
+ module_put(ops->owner);
+
+ if (!(qdisc->flags&TCQ_F_BUILTIN))
+ kfree(qdisc);
+}
+
/* Under dev->queue_lock and BH! */
void qdisc_destroy(struct Qdisc *qdisc)
{
- struct Qdisc_ops *ops = qdisc->ops;
- struct net_device *dev;
+ struct net_device *dev = qdisc->dev;
if (!atomic_dec_and_test(&qdisc->refcnt))
return;
- dev = qdisc->dev;
-
if (dev) {
struct Qdisc *q, **qp;
for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
@@ -425,16 +444,9 @@
}
}
}
-#ifdef CONFIG_NET_ESTIMATOR
- qdisc_kill_estimator(&qdisc->stats);
-#endif
- if (ops->reset)
- ops->reset(qdisc);
- if (ops->destroy)
- ops->destroy(qdisc);
- module_put(ops->owner);
- if (!(qdisc->flags&TCQ_F_BUILTIN))
- kfree(qdisc);
+
+ call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc);
+
}
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
2004-06-14 17:03 [RFC/PATCH] lockless loopback patch for 2.6 (version 2) Arthur Kepner
@ 2004-06-14 18:23 ` Andi Kleen
2004-06-18 20:12 ` Arthur Kepner
2004-06-14 18:23 ` Mitchell Blank Jr
1 sibling, 1 reply; 6+ messages in thread
From: Andi Kleen @ 2004-06-14 18:23 UTC (permalink / raw)
To: Arthur Kepner; +Cc: David S. Miller, netdev
> +#define LOOPBACK_STAT_INC(field) \
> + (per_cpu_ptr(loopback_stats, smp_processor_id())->field++)
> +#define LOOPBACK_STAT_ADD(field, n) \
> + (per_cpu_ptr(loopback_stats, smp_processor_id())->field += n)
This is too complicated and not preempt safe. Use
__get_cpu_var(loopback_stats).field++;
I would also remove the macros and do this directly.
> + struct net_device_stats *stats = dev->priv;
> + int i;
> +
> + if (unlikely(!stats)) {
Tests for NULL don't need an unlikely, because gcc does that by
default for itself. But why can the stats here be NULL anyways?
> #ifdef CONFIG_NET_RADIO
> #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
> #include <net/iw_handler.h>
> @@ -1277,6 +1278,20 @@
> return 0;
> }
>
> +#define HARD_TX_LOCK_BH(dev, cpu) { \
> + if ( dev->features && NETIF_F_LLTX == 0 ) { \
&& instead of & and missing brackets.
> + spin_lock_bh(&dev->xmit_lock); \
> + dev->xmit_lock_owner = cpu; \
> + } \
> +}
> +
> +#define HARD_TX_UNLOCK_BH(dev) { \
> + if ( dev->features && NETIF_F_LLTX == 0 ) { \
Same.
> - if (ops->reset)
> - ops->reset(qdisc);
> - if (ops->destroy)
> - ops->destroy(qdisc);
> - module_put(ops->owner);
> - if (!(qdisc->flags&TCQ_F_BUILTIN))
> - kfree(qdisc);
> +
> + call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc);
I think you need at least a wmb() after
if (q == qdisc) {
*qp = q->next;
break;
}
Otherwise the order of updates to the readers is not guaranteed.
Also if you want to support alpha there will need to be
smp_read_barrier_depends() in the reader walking this list.
-Andi
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
2004-06-14 17:03 [RFC/PATCH] lockless loopback patch for 2.6 (version 2) Arthur Kepner
2004-06-14 18:23 ` Andi Kleen
@ 2004-06-14 18:23 ` Mitchell Blank Jr
2004-06-14 19:45 ` Arthur Kepner
1 sibling, 1 reply; 6+ messages in thread
From: Mitchell Blank Jr @ 2004-06-14 18:23 UTC (permalink / raw)
To: Arthur Kepner; +Cc: netdev
Arthur Kepner wrote:
> +#define HARD_TX_LOCK_BH(dev, cpu) { \
> + if ( dev->features && NETIF_F_LLTX == 0 ) { \
^^
Don't you mean '&' instead of '&&' here? It looks like that condition is
always false, so you've killed the TX locking for all devices with this
patch.
> + if ( dev->features && NETIF_F_LLTX == 0 ) { \
ditto
-Mitch
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
2004-06-14 18:23 ` Mitchell Blank Jr
@ 2004-06-14 19:45 ` Arthur Kepner
0 siblings, 0 replies; 6+ messages in thread
From: Arthur Kepner @ 2004-06-14 19:45 UTC (permalink / raw)
To: Mitchell Blank Jr; +Cc: netdev
On Mon, 14 Jun 2004, Mitchell Blank Jr wrote:
> Arthur Kepner wrote:
> > +#define HARD_TX_LOCK_BH(dev, cpu) { \
> > + if ( dev->features && NETIF_F_LLTX == 0 ) { \
> ^^
>
> Don't you mean '&' instead of '&&' here? It looks like that condition is
> always false, so you've killed the TX locking for all devices with this
> patch.
Ummm, yes. (I believe the appropriate response here is "D'oh!") Thanks.
--
Arthur
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
2004-06-14 18:23 ` Andi Kleen
@ 2004-06-18 20:12 ` Arthur Kepner
2004-06-21 0:39 ` David S. Miller
0 siblings, 1 reply; 6+ messages in thread
From: Arthur Kepner @ 2004-06-18 20:12 UTC (permalink / raw)
To: Andi Kleen; +Cc: David S. Miller, netdev
[-- Attachment #1: Type: TEXT/PLAIN, Size: 2967 bytes --]
On Mon, 14 Jun 2004, Andi Kleen wrote:
> > +#define LOOPBACK_STAT_INC(field) \
> > + (per_cpu_ptr(loopback_stats, smp_processor_id())->field++)
> > +#define LOOPBACK_STAT_ADD(field, n) \
> > + (per_cpu_ptr(loopback_stats, smp_processor_id())->field += n)
>
> This is too complicated and not preempt safe. Use
> __get_cpu_var(loopback_stats).field++;
This is too complicated (and preemptible). But I think you mean to use:
get_cpu_ptr(loopback_stats)->field1 += blah;
get_cpu_ptr(loopback_stats)->field2++;
....
put_cpu_var(loopback_stats);
(loopback_stats is allocated with alloc_percpu().)
>
> I would also remove the macros and do this directly.
With this simplification the macros are not helpful. I'll kill 'em.
>
> > + struct net_device_stats *stats = dev->priv;
> > + int i;
> > +
> > + if (unlikely(!stats)) {
>
> Tests for NULL don't need an unlikely, because gcc does that by
> default for itself.
OK, I didn't know that.
> But why can the stats here be NULL anyways?
>
stats can be NULL. loopback_init() might have failed when it tried to
kmalloc() it.
> > #ifdef CONFIG_NET_RADIO
> > #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
> > #include <net/iw_handler.h>
> > @@ -1277,6 +1278,20 @@
> > return 0;
> > }
> >
> > +#define HARD_TX_LOCK_BH(dev, cpu) { \
> > + if ( dev->features && NETIF_F_LLTX == 0 ) { \
>
> && instead of & and missing brackets.
> ....
Fixed (both instances.)
>
> > - if (ops->reset)
> > - ops->reset(qdisc);
> > - if (ops->destroy)
> > - ops->destroy(qdisc);
> > - module_put(ops->owner);
> > - if (!(qdisc->flags&TCQ_F_BUILTIN))
> > - kfree(qdisc);
> > +
> > + call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc);
>
> I think you need at least a wmb() after
>
> if (q == qdisc) {
> *qp = q->next;
> break;
> }
>
> Otherwise the order of updates to the readers is no guaranteed.
> Also if you want to support alpha there will need to be
> smp_read_barrier_depends() in the reader walking this list.
Hmmm, I'm not sure.
The only place where the struct Qdisc is accessed locklessly
is in dev_queue_xmit(). (Or at least that's the only place
where the locking behavior when accessing the Qdisc _changes_
with this patch.)
It *does* look as if a smp_read_barrier_depends() is appropriate
in dev_queue_xmit() as follows:
rcu_read_lock();
....
q = dev->qdisc;
+ smp_read_barrier_depends();
if (q->enqueue) {
....
spin_lock_bh(&dev->queue_lock);
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock_bh(&dev->queue_lock);
....
}
rcu_read_unlock();
Also, there should be a memory barrier after the Qdisc's
"enqueue" pointer is modified but before the netdevice's qdisc
pointer is modified (although it looks as if there's always a
spinlock acquisition which does this implicitly.) But I've
added the smp_wmb()s to qdisc_create() and qdisc_create_dflt()
to be safe.
A patch with these changes is attached.
--
Arthur
[-- Attachment #2: lockless loopback patch 3 --]
[-- Type: TEXT/PLAIN, Size: 9172 bytes --]
--- linux.orig/drivers/net/loopback.c 2004-06-08 15:36:50.000000000 -0700
+++ linux/drivers/net/loopback.c 2004-06-15 15:39:56.000000000 -0700
@@ -56,6 +56,7 @@
#include <linux/ip.h>
#include <linux/tcp.h>
+static struct net_device_stats *loopback_stats;
#define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)
@@ -123,7 +124,6 @@
*/
static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct net_device_stats *stats = dev->priv;
skb_orphan(skb);
@@ -142,11 +142,12 @@
}
dev->last_rx = jiffies;
- if (likely(stats)) {
- stats->rx_bytes+=skb->len;
- stats->tx_bytes+=skb->len;
- stats->rx_packets++;
- stats->tx_packets++;
+ if (likely(loopback_stats)) {
+ get_cpu_ptr(loopback_stats)->rx_bytes += skb->len;
+ get_cpu_ptr(loopback_stats)->tx_bytes += skb->len;
+ get_cpu_ptr(loopback_stats)->rx_packets++;
+ get_cpu_ptr(loopback_stats)->tx_packets++;
+ put_cpu_ptr(loopback_stats);
}
netif_rx(skb);
@@ -156,7 +157,28 @@
static struct net_device_stats *get_stats(struct net_device *dev)
{
- return (struct net_device_stats *)dev->priv;
+ struct net_device_stats *stats = dev->priv;
+ int i;
+
+ if (!stats) {
+ return NULL;
+ }
+
+ memset(stats, 0, sizeof(struct net_device_stats));
+ if (!loopback_stats) {
+ return stats;
+ }
+
+ for (i=0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ stats->rx_bytes += per_cpu_ptr(loopback_stats, i)->rx_bytes;
+ stats->tx_bytes += per_cpu_ptr(loopback_stats, i)->tx_bytes;
+ stats->rx_packets += per_cpu_ptr(loopback_stats, i)->rx_packets;
+ stats->tx_packets += per_cpu_ptr(loopback_stats, i)->tx_packets;
+ }
+
+ return stats;
}
struct net_device loopback_dev = {
@@ -173,7 +195,8 @@
.rebuild_header = eth_rebuild_header,
.flags = IFF_LOOPBACK,
.features = NETIF_F_SG|NETIF_F_FRAGLIST
- |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA,
+ |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA
+ |NETIF_F_LLTX,
};
/* Setup and register the of the LOOPBACK device. */
@@ -188,6 +211,8 @@
loopback_dev.priv = stats;
loopback_dev.get_stats = &get_stats;
}
+
+ loopback_stats = alloc_percpu(struct net_device_stats);
return register_netdev(&loopback_dev);
};
--- linux.orig/include/linux/netdevice.h 2004-06-08 15:36:51.000000000 -0700
+++ linux/include/linux/netdevice.h 2004-06-14 09:20:26.000000000 -0700
@@ -403,6 +403,7 @@
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
+#define NETIF_F_LLTX 4096 /* LockLess TX */
/* Called after device is detached from network. */
void (*uninit)(struct net_device *dev);
--- linux.orig/include/net/pkt_sched.h 2004-06-08 15:36:51.000000000 -0700
+++ linux/include/net/pkt_sched.h 2004-06-11 13:46:25.000000000 -0700
@@ -11,6 +11,7 @@
#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/pkt_sched.h>
+#include <linux/rcupdate.h>
#include <net/pkt_cls.h>
#ifdef CONFIG_X86_TSC
@@ -92,6 +93,7 @@
struct net_device *dev;
struct tc_stats stats;
+ struct rcu_head q_rcu;
int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q);
/* This field is deprecated, but it is still used by CBQ
--- linux.orig/net/core/dev.c 2004-06-08 15:36:53.000000000 -0700
+++ linux/net/core/dev.c 2004-06-15 16:19:39.000000000 -0700
@@ -107,6 +107,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/netpoll.h>
+#include <linux/rcupdate.h>
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
#include <net/iw_handler.h>
@@ -1277,6 +1278,20 @@
return 0;
}
+#define HARD_TX_LOCK_BH(dev, cpu) { \
+ if ((dev->features & NETIF_F_LLTX) == 0) { \
+ spin_lock_bh(&dev->xmit_lock); \
+ dev->xmit_lock_owner = cpu; \
+ } \
+}
+
+#define HARD_TX_UNLOCK_BH(dev) { \
+ if ((dev->features & NETIF_F_LLTX) == 0) { \
+ dev->xmit_lock_owner = -1; \
+ spin_unlock_bh(&dev->xmit_lock); \
+ } \
+}
+
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
@@ -1321,18 +1336,35 @@
goto out;
}
- /* Grab device queue */
- spin_lock_bh(&dev->queue_lock);
+ rcu_read_lock();
+ /* Updates of qdisc are serialized by queue_lock.
+ * The struct Qdisc which is pointed to by qdisc is now a
+ * rcu structure - it may be accessed without acquiring
+ * a lock (but the structure may be stale.) The freeing of the
+ * qdisc will be deferred until it's known that there are no
+ * more references to it.
+ *
+ * If the qdisc has an enqueue function, we still need to
+ * hold the queue_lock before calling it, since queue_lock
+ * also serializes access to the device queue.
+ */
+
q = dev->qdisc;
+ smp_read_barrier_depends();
if (q->enqueue) {
+ /* Grab device queue */
+ spin_lock_bh(&dev->queue_lock);
+
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock_bh(&dev->queue_lock);
+ rcu_read_unlock();
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
+ rcu_read_unlock();
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
@@ -1347,17 +1379,12 @@
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
+ preempt_disable();
int cpu = smp_processor_id();
if (dev->xmit_lock_owner != cpu) {
- /*
- * The spin_lock effectivly does a preempt lock, but
- * we are about to drop that...
- */
- preempt_disable();
- spin_unlock(&dev->queue_lock);
- spin_lock(&dev->xmit_lock);
- dev->xmit_lock_owner = cpu;
+
+ HARD_TX_LOCK_BH(dev, cpu);
preempt_enable();
if (!netif_queue_stopped(dev)) {
@@ -1366,18 +1393,17 @@
rc = 0;
if (!dev->hard_start_xmit(skb, dev)) {
- dev->xmit_lock_owner = -1;
- spin_unlock_bh(&dev->xmit_lock);
+ HARD_TX_UNLOCK_BH(dev);
goto out;
}
}
- dev->xmit_lock_owner = -1;
- spin_unlock_bh(&dev->xmit_lock);
+ HARD_TX_UNLOCK_BH(dev);
if (net_ratelimit())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n", dev->name);
goto out_enetdown;
} else {
+ preempt_enable();
/* Recursion is detected! It is possible,
* unfortunately */
if (net_ratelimit())
@@ -1385,7 +1411,6 @@
"%s, fix it urgently!\n", dev->name);
}
}
- spin_unlock_bh(&dev->queue_lock);
out_enetdown:
rc = -ENETDOWN;
out_kfree_skb:
--- linux.orig/net/sched/sch_generic.c 2004-06-08 15:36:53.000000000 -0700
+++ linux/net/sched/sch_generic.c 2004-06-18 12:48:57.000000000 -0700
@@ -30,6 +30,7 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
+#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
@@ -387,6 +388,9 @@
sch->dev = dev;
sch->stats.lock = &dev->queue_lock;
atomic_set(&sch->refcnt, 1);
+ /* enqueue is accessed locklessly - make sure it's visible
+ * before we set a netdevice's qdisc pointer to sch */
+ smp_wmb();
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
@@ -404,18 +408,36 @@
ops->reset(qdisc);
}
+/* this is the rcu callback function to clean up a qdisc when there
+ * are no further references to it */
+
+static void __qdisc_destroy (void * arg)
+{
+ struct Qdisc *qdisc = (struct Qdisc *) arg;
+ struct Qdisc_ops *ops = qdisc->ops;
+
+#ifdef CONFIG_NET_ESTIMATOR
+ qdisc_kill_estimator(&qdisc->stats);
+#endif
+ if (ops->reset)
+ ops->reset(qdisc);
+ if (ops->destroy)
+ ops->destroy(qdisc);
+ module_put(ops->owner);
+
+ if (!(qdisc->flags&TCQ_F_BUILTIN))
+ kfree(qdisc);
+}
+
/* Under dev->queue_lock and BH! */
void qdisc_destroy(struct Qdisc *qdisc)
{
- struct Qdisc_ops *ops = qdisc->ops;
- struct net_device *dev;
+ struct net_device *dev = qdisc->dev;
if (!atomic_dec_and_test(&qdisc->refcnt))
return;
- dev = qdisc->dev;
-
if (dev) {
struct Qdisc *q, **qp;
for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
@@ -425,16 +447,9 @@
}
}
}
-#ifdef CONFIG_NET_ESTIMATOR
- qdisc_kill_estimator(&qdisc->stats);
-#endif
- if (ops->reset)
- ops->reset(qdisc);
- if (ops->destroy)
- ops->destroy(qdisc);
- module_put(ops->owner);
- if (!(qdisc->flags&TCQ_F_BUILTIN))
- kfree(qdisc);
+
+ call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc);
+
}
--- linux.orig/net/sched/sch_api.c 2004-06-08 15:36:53.000000000 -0700
+++ linux/net/sched/sch_api.c 2004-06-18 12:51:53.000000000 -0700
@@ -450,6 +450,9 @@
if (!try_module_get(ops->owner))
goto err_out;
+ /* enqueue is accessed locklessly - make sure it's visible
+ * before we set a netdevice's qdisc pointer to sch */
+ smp_wmb();
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
write_lock(&qdisc_tree_lock);
sch->next = dev->qdisc_list;
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC/PATCH] lockless loopback patch for 2.6 (version 2)
2004-06-18 20:12 ` Arthur Kepner
@ 2004-06-21 0:39 ` David S. Miller
0 siblings, 0 replies; 6+ messages in thread
From: David S. Miller @ 2004-06-21 0:39 UTC (permalink / raw)
To: Arthur Kepner; +Cc: ak, netdev
On Fri, 18 Jun 2004 13:12:23 -0700
Arthur Kepner <akepner@sgi.com> wrote:
> A patch with these changes is attached.
This looks fine to me. I'm going to add this to my tree
and push upstream.
Thanks Arthur.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2004-06-21 0:39 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-06-14 17:03 [RFC/PATCH] lockless loopback patch for 2.6 (version 2) Arthur Kepner
2004-06-14 18:23 ` Andi Kleen
2004-06-18 20:12 ` Arthur Kepner
2004-06-21 0:39 ` David S. Miller
2004-06-14 18:23 ` Mitchell Blank Jr
2004-06-14 19:45 ` Arthur Kepner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).