[RFC] loopback: optimization

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC] loopback: optimization
@ 2008-11-04  5:37 Stephen Hemminger
  2008-11-04  6:36 ` Eric Dumazet
  2008-11-05 20:36 ` Stephen Hemminger
  0 siblings, 2 replies; 6+ messages in thread
From: Stephen Hemminger @ 2008-11-04  5:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

This is something I was trying out to improve loopback performance.
It moves loopback processing from the generic per-cpu backlog queues
to its own set of queues. I thought this might help cache locality,
and the loopback device no longer has to do relatively expensive
local_irq_disable() operations.

BUT it didn't seem to help with tbench; still, it might help others who
want to go further with it. The code runs, but treat it AS IS at this point.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/drivers/net/loopback.c	2008-11-03 10:13:31.000000000 -0800
+++ b/drivers/net/loopback.c	2008-11-03 11:05:52.000000000 -0800
@@ -59,51 +59,79 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 
-struct pcpu_lstats {
+struct loopback_per_cpu {
+	struct sk_buff_head rxq;
+	struct napi_struct napi;
+
 	unsigned long packets;
 	unsigned long bytes;
 };
 
+
 /*
  * The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+	struct loopback_per_cpu *lo = dev->ml_priv, *pcpu;
 
 	skb_orphan(skb);
 
-	skb->protocol = eth_type_trans(skb,dev);
+	skb->protocol = eth_type_trans(skb, dev);
 
 	dev->last_rx = jiffies;
 
 	/* it's OK to use per_cpu_ptr() because BHs are off */
-	pcpu_lstats = dev->ml_priv;
-	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
-	lb_stats->bytes += skb->len;
-	lb_stats->packets++;
+	pcpu = per_cpu_ptr(lo, smp_processor_id());
 
-	netif_rx(skb);
+	if (likely(pcpu->rxq.qlen < netdev_max_backlog)) {
+		if (pcpu->rxq.qlen == 0)
+			__napi_schedule(&lo->napi);
+
+		__skb_queue_tail(&pcpu->rxq, skb);
+		pcpu->bytes += skb->len;
+		pcpu->packets++;
+
+		return NET_XMIT_SUCCESS;
+	} else {
+		dev->stats.rx_dropped++;
+		dev_kfree_skb_any(skb);
+		return NET_XMIT_DROP;
+	}
+}
 
-	return 0;
+static int loopback_poll(struct napi_struct *napi, int work_limit)
+{
+	struct loopback_per_cpu *pcpu
+		= container_of(napi, struct loopback_per_cpu, napi);
+	struct sk_buff *skb;
+	int work_done = 0;
+
+	while ( (skb = __skb_dequeue(&pcpu->rxq)) ) {
+		netif_receive_skb(skb);
+
+		if (++work_done >= work_limit)
+			goto done;
+	}
+
+	__napi_complete(napi);
+done:
+	return work_done;
 }
 
 static struct net_device_stats *get_stats(struct net_device *dev)
 {
-	const struct pcpu_lstats *pcpu_lstats;
+	const struct loopback_per_cpu *lo = dev->ml_priv;
 	struct net_device_stats *stats = &dev->stats;
 	unsigned long bytes = 0;
 	unsigned long packets = 0;
 	int i;
 
-	pcpu_lstats = dev->ml_priv;
 	for_each_possible_cpu(i) {
-		const struct pcpu_lstats *lb_stats;
-
-		lb_stats = per_cpu_ptr(pcpu_lstats, i);
-		bytes   += lb_stats->bytes;
-		packets += lb_stats->packets;
+		const struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
+		bytes   += pcpu->bytes;
+		packets += pcpu->packets;
 	}
 	stats->rx_packets = packets;
 	stats->tx_packets = packets;
@@ -127,21 +155,43 @@ static const struct ethtool_ops loopback
 
 static int loopback_dev_init(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats;
+	struct loopback_per_cpu *lo;
+	int i;
 
-	lstats = alloc_percpu(struct pcpu_lstats);
-	if (!lstats)
+	lo = alloc_percpu(struct loopback_per_cpu);
+	if (!lo)
 		return -ENOMEM;
 
-	dev->ml_priv = lstats;
+	dev->ml_priv = lo;
+
+	for_each_possible_cpu(i) {
+		struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
+		skb_queue_head_init(&pcpu->rxq);
+		netif_napi_add(dev, &pcpu->napi, loopback_poll, 64);
+	}
+
+	return 0;
+}
+
+static int loopback_dev_stop(struct net_device *dev)
+{
+	struct loopback_per_cpu *lo = dev->ml_priv;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
+
+		napi_synchronize(&pcpu->napi);
+		__skb_queue_purge(&pcpu->rxq);
+	}
 	return 0;
 }
 
 static void loopback_dev_free(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats = dev->ml_priv;
+	struct loopback_per_cpu *lo = dev->ml_priv;
 
-	free_percpu(lstats);
+	free_percpu(lo);
 	free_netdev(dev);
 }
 
@@ -169,6 +219,7 @@ static void loopback_setup(struct net_de
 	dev->header_ops		= &eth_header_ops;
 	dev->init = loopback_dev_init;
 	dev->destructor = loopback_dev_free;
+	dev->stop = loopback_dev_stop;
 }
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC] loopback: optimization
  2008-11-04  5:37 [RFC] loopback: optimization Stephen Hemminger
@ 2008-11-04  6:36 ` Eric Dumazet
  2008-11-05  9:49   ` David Miller
  2008-11-05 20:36 ` Stephen Hemminger
  1 sibling, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2008-11-04  6:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> This is something I was trying out to try improving loopback performance.
> It moves loopback processing from the generic backlog per-cpu queues,
> to its own set of queues. I thought this might help the cache locality
> and the loopback doesn't have to do relatively expensive local_irq_disable()
> operations.
> 
> BUT it didn't seem to help with tbench, but it might help others who
> want to go farther with it. Code runs but treat it ASIS at this point.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> 
> --- a/drivers/net/loopback.c	2008-11-03 10:13:31.000000000 -0800
> +++ b/drivers/net/loopback.c	2008-11-03 11:05:52.000000000 -0800
> @@ -59,51 +59,79 @@
>  #include <linux/percpu.h>
>  #include <net/net_namespace.h>
>  
> -struct pcpu_lstats {
> +struct loopback_per_cpu {
> +	struct sk_buff_head rxq;
> +	struct napi_struct napi;
> +
>  	unsigned long packets;
>  	unsigned long bytes;
>  };
>  
> +
>  /*
>   * The higher levels take care of making this non-reentrant (it's
>   * called with bh's disabled).
>   */
>  static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
> -	struct pcpu_lstats *pcpu_lstats, *lb_stats;
> +	struct loopback_per_cpu *lo = dev->ml_priv, *pcpu;
>  
>  	skb_orphan(skb);
>  
> -	skb->protocol = eth_type_trans(skb,dev);
> +	skb->protocol = eth_type_trans(skb, dev);
>  
>  	dev->last_rx = jiffies;
>  
>  	/* it's OK to use per_cpu_ptr() because BHs are off */
> -	pcpu_lstats = dev->ml_priv;
> -	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
> -	lb_stats->bytes += skb->len;
> -	lb_stats->packets++;
> +	pcpu = per_cpu_ptr(lo, smp_processor_id());
>  
> -	netif_rx(skb);
> +	if (likely(pcpu->rxq.qlen < netdev_max_backlog)) {

> +		if (pcpu->rxq.qlen == 0)
> +			__napi_schedule(&lo->napi);

Hum...

Most of the time I suspect we take this __napi_schedule() thing.

So are you sure you use the right one (&lo->napi, not &pcpu->napi) ?

This is the point where we still do a cache line ping-pong.

> +
> +		__skb_queue_tail(&pcpu->rxq, skb);
> +		pcpu->bytes += skb->len;
> +		pcpu->packets++;
> +
> +		return NET_XMIT_SUCCESS;
> +	} else {
> +		dev->stats.rx_dropped++;
> +		dev_kfree_skb_any(skb);
> +		return NET_XMIT_DROP;
> +	}
> +}
>  
> -	return 0;
> +static int loopback_poll(struct napi_struct *napi, int work_limit)
> +{
> +	struct loopback_per_cpu *pcpu
> +		= container_of(napi, struct loopback_per_cpu, napi);
> +	struct sk_buff *skb;
> +	int work_done = 0;
> +
> +	while ( (skb = __skb_dequeue(&pcpu->rxq)) ) {
> +		netif_receive_skb(skb);
> +
> +		if (++work_done >= work_limit)
> +			goto done;
> +	}
> +
> +	__napi_complete(napi);
> +done:
> +	return work_done;
>  }
>  
>  static struct net_device_stats *get_stats(struct net_device *dev)
>  {
> -	const struct pcpu_lstats *pcpu_lstats;
> +	const struct loopback_per_cpu *lo = dev->ml_priv;
>  	struct net_device_stats *stats = &dev->stats;
>  	unsigned long bytes = 0;
>  	unsigned long packets = 0;
>  	int i;
>  
> -	pcpu_lstats = dev->ml_priv;
>  	for_each_possible_cpu(i) {
> -		const struct pcpu_lstats *lb_stats;
> -
> -		lb_stats = per_cpu_ptr(pcpu_lstats, i);
> -		bytes   += lb_stats->bytes;
> -		packets += lb_stats->packets;
> +		const struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
> +		bytes   += pcpu->bytes;
> +		packets += pcpu->packets;
>  	}
>  	stats->rx_packets = packets;
>  	stats->tx_packets = packets;
> @@ -127,21 +155,43 @@ static const struct ethtool_ops loopback
>  
>  static int loopback_dev_init(struct net_device *dev)
>  {
> -	struct pcpu_lstats *lstats;
> +	struct loopback_per_cpu *lo;
> +	int i;
>  
> -	lstats = alloc_percpu(struct pcpu_lstats);
> -	if (!lstats)
> +	lo = alloc_percpu(struct loopback_per_cpu);
> +	if (!lo)
>  		return -ENOMEM;
>  
> -	dev->ml_priv = lstats;
> +	dev->ml_priv = lo;
> +
> +	for_each_possible_cpu(i) {
> +		struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
> +		skb_queue_head_init(&pcpu->rxq);
> +		netif_napi_add(dev, &pcpu->napi, loopback_poll, 64);
> +	}
> +
> +	return 0;
> +}
> +
> +static int loopback_dev_stop(struct net_device *dev)
> +{
> +	struct loopback_per_cpu *lo = dev->ml_priv;
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
> +
> +		napi_synchronize(&pcpu->napi);
> +		__skb_queue_purge(&pcpu->rxq);
> +	}
>  	return 0;
>  }
>  
>  static void loopback_dev_free(struct net_device *dev)
>  {
> -	struct pcpu_lstats *lstats = dev->ml_priv;
> +	struct loopback_per_cpu *lo = dev->ml_priv;
>  
> -	free_percpu(lstats);
> +	free_percpu(lo);
>  	free_netdev(dev);
>  }
>  
> @@ -169,6 +219,7 @@ static void loopback_setup(struct net_de
>  	dev->header_ops		= &eth_header_ops;
>  	dev->init = loopback_dev_init;
>  	dev->destructor = loopback_dev_free;
> +	dev->stop = loopback_dev_stop;
>  }
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC] loopback: optimization
  2008-11-04  6:36 ` Eric Dumazet
@ 2008-11-05  9:49   ` David Miller
  0 siblings, 0 replies; 6+ messages in thread
From: David Miller @ 2008-11-05  9:49 UTC (permalink / raw)
  To: dada1; +Cc: shemminger, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 04 Nov 2008 07:36:48 +0100

> So are you sure you use the right one (&lo->napi, not &pcpu->napi) ?

Yes, I think this logic error in Stephen's patch might
explain why it had no effect.

Stephen, please try with lo->napi replaced with pcpu->napi
here.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [RFC] loopback: optimization
  2008-11-04  5:37 [RFC] loopback: optimization Stephen Hemminger
  2008-11-04  6:36 ` Eric Dumazet
@ 2008-11-05 20:36 ` Stephen Hemminger
  2008-11-05 23:14   ` Eric Dumazet
  1 sibling, 1 reply; 6+ messages in thread
From: Stephen Hemminger @ 2008-11-05 20:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 4899 bytes --]

Convert loopback device from using common network queues to a per-cpu
receive queue with NAPI. This gives a small 1% performance gain when
measured over 5 runs of tbench. Not sure if it's worth bothering
though.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/drivers/net/loopback.c	2008-11-04 15:36:29.000000000 -0800
+++ b/drivers/net/loopback.c	2008-11-05 10:00:20.000000000 -0800
@@ -59,7 +59,10 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 
-struct pcpu_lstats {
+struct loopback_queue {
+	struct sk_buff_head rxq;
+	struct napi_struct napi;
+
 	unsigned long packets;
 	unsigned long bytes;
 };
@@ -70,36 +73,60 @@ struct pcpu_lstats {
  */
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+	struct loopback_queue *pcpu;
 
 	skb_orphan(skb);
 
 	skb->protocol = eth_type_trans(skb,dev);
 
-	/* it's OK to use per_cpu_ptr() because BHs are off */
-	pcpu_lstats = dev->ml_priv;
-	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
-	lb_stats->bytes += skb->len;
-	lb_stats->packets++;
-
-	netif_rx(skb);
+	pcpu = per_cpu_ptr(dev->ml_priv, smp_processor_id());
+	if (likely(pcpu->rxq.qlen <= netdev_max_backlog)) {
+		__skb_queue_tail(&pcpu->rxq, skb);
+		pcpu->bytes += skb->len;
+		pcpu->packets++;
+		napi_schedule_irq(&pcpu->napi);
+
+		return NET_XMIT_SUCCESS;
+	} else {
+		dev->stats.rx_dropped++;
+		dev_kfree_skb_any(skb);
+		return NET_XMIT_DROP;
+	}
 
 	return 0;
 }
 
+static int loopback_poll(struct napi_struct *arg, int quota)
+{
+	struct loopback_queue *pcpu = container_of(arg, struct loopback_queue, napi);
+	int work = 0;
+
+	do {
+		struct sk_buff *skb = __skb_dequeue(&pcpu->rxq);
+
+		if (!skb) {
+			__napi_complete(arg);
+			break;
+		}
+
+		netif_receive_skb(skb);
+	} while (++work < quota);
+
+	return work;
+}
+
+
 static struct net_device_stats *get_stats(struct net_device *dev)
 {
-	const struct pcpu_lstats *pcpu_lstats;
 	struct net_device_stats *stats = &dev->stats;
 	unsigned long bytes = 0;
 	unsigned long packets = 0;
 	int i;
 
-	pcpu_lstats = dev->ml_priv;
 	for_each_possible_cpu(i) {
-		const struct pcpu_lstats *lb_stats;
+		const struct loopback_queue *lb_stats;
 
-		lb_stats = per_cpu_ptr(pcpu_lstats, i);
+		lb_stats = per_cpu_ptr(dev->ml_priv, i);
 		bytes   += lb_stats->bytes;
 		packets += lb_stats->packets;
 	}
@@ -125,21 +152,57 @@ static const struct ethtool_ops loopback
 
 static int loopback_dev_init(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats;
+	void *p;
+	int i;
 
-	lstats = alloc_percpu(struct pcpu_lstats);
-	if (!lstats)
+	p = alloc_percpu(struct loopback_queue);
+	if (!p)
 		return -ENOMEM;
 
-	dev->ml_priv = lstats;
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(p, i);
+		skb_queue_head_init(&pcpu->rxq);
+		netif_napi_add(dev, &pcpu->napi, loopback_poll, 64);
+	}
+
+	dev->ml_priv = p;
+
+	return 0;
+}
+
+static int loopback_dev_start(struct net_device *dev)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(dev->ml_priv, i);
+		napi_enable(&pcpu->napi);
+	}
+	return 0;
+}
+
+static int loopback_dev_stop(struct net_device *dev)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(dev->ml_priv, i);
+		napi_disable(&pcpu->napi);
+		__skb_queue_purge(&pcpu->rxq);
+	}
 	return 0;
 }
 
 static void loopback_dev_free(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats = dev->ml_priv;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(dev->ml_priv, i);
+		napi_disable(&pcpu->napi);
+	}
 
-	free_percpu(lstats);
+	free_percpu(dev->ml_priv);
 	free_netdev(dev);
 }
 
@@ -166,6 +229,8 @@ static void loopback_setup(struct net_de
 	dev->ethtool_ops	= &loopback_ethtool_ops;
 	dev->header_ops		= &eth_header_ops;
 	dev->init = loopback_dev_init;
+	dev->open = loopback_dev_start;
+	dev->stop = loopback_dev_stop;
 	dev->destructor = loopback_dev_free;
 }
 
--- a/include/linux/netdevice.h	2008-11-05 08:18:01.000000000 -0800
+++ b/include/linux/netdevice.h	2008-11-05 08:18:19.000000000 -0800
@@ -366,6 +366,8 @@ static inline int napi_reschedule(struct
 	return 0;
 }
 
+extern void napi_schedule_irq(struct napi_struct *n);
+
 /**
  *	napi_complete - NAPI processing complete
  *	@n: napi context
--- a/net/core/dev.c	2008-11-05 08:17:32.000000000 -0800
+++ b/net/core/dev.c	2008-11-05 09:54:36.000000000 -0800
@@ -2369,6 +2369,15 @@ void __napi_schedule(struct napi_struct 
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+/* Special case version of napi_schedule since loopback device has no hard irq */
+void napi_schedule_irq(struct napi_struct *n)
+{
+	if (napi_schedule_prep(n)) {
+		list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
+		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	}
+}
+
 
 static void net_rx_action(struct softirq_action *h)
 {

[-- Attachment #2: loopback-napi.patch --]
[-- Type: text/x-patch, Size: 4933 bytes --]

Convert loopback device from using common network queues to a per-cpu
receive queue with NAPI. This gives a small 1% performance gain when
measured over 5 runs of tbench. It does make the code larger and more space
needs to be allocated as well.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/drivers/net/loopback.c	2008-11-04 15:36:29.000000000 -0800
+++ b/drivers/net/loopback.c	2008-11-05 10:00:20.000000000 -0800
@@ -59,7 +59,10 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 
-struct pcpu_lstats {
+struct loopback_queue {
+	struct sk_buff_head rxq;
+	struct napi_struct napi;
+
 	unsigned long packets;
 	unsigned long bytes;
 };
@@ -70,36 +73,60 @@ struct pcpu_lstats {
  */
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+	struct loopback_queue *pcpu;
 
 	skb_orphan(skb);
 
 	skb->protocol = eth_type_trans(skb,dev);
 
-	/* it's OK to use per_cpu_ptr() because BHs are off */
-	pcpu_lstats = dev->ml_priv;
-	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
-	lb_stats->bytes += skb->len;
-	lb_stats->packets++;
-
-	netif_rx(skb);
+	pcpu = per_cpu_ptr(dev->ml_priv, smp_processor_id());
+	if (likely(pcpu->rxq.qlen <= netdev_max_backlog)) {
+		__skb_queue_tail(&pcpu->rxq, skb);
+		pcpu->bytes += skb->len;
+		pcpu->packets++;
+		napi_schedule_irq(&pcpu->napi);
+
+		return NET_XMIT_SUCCESS;
+	} else {
+		dev->stats.rx_dropped++;
+		dev_kfree_skb_any(skb);
+		return NET_XMIT_DROP;
+	}
 
 	return 0;
 }
 
+static int loopback_poll(struct napi_struct *arg, int quota)
+{
+	struct loopback_queue *pcpu = container_of(arg, struct loopback_queue, napi);
+	int work = 0;
+
+	do {
+		struct sk_buff *skb = __skb_dequeue(&pcpu->rxq);
+
+		if (!skb) {
+			__napi_complete(arg);
+			break;
+		}
+
+		netif_receive_skb(skb);
+	} while (++work < quota);
+
+	return work;
+}
+
+
 static struct net_device_stats *get_stats(struct net_device *dev)
 {
-	const struct pcpu_lstats *pcpu_lstats;
 	struct net_device_stats *stats = &dev->stats;
 	unsigned long bytes = 0;
 	unsigned long packets = 0;
 	int i;
 
-	pcpu_lstats = dev->ml_priv;
 	for_each_possible_cpu(i) {
-		const struct pcpu_lstats *lb_stats;
+		const struct loopback_queue *lb_stats;
 
-		lb_stats = per_cpu_ptr(pcpu_lstats, i);
+		lb_stats = per_cpu_ptr(dev->ml_priv, i);
 		bytes   += lb_stats->bytes;
 		packets += lb_stats->packets;
 	}
@@ -125,21 +152,57 @@ static const struct ethtool_ops loopback
 
 static int loopback_dev_init(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats;
+	void *p;
+	int i;
 
-	lstats = alloc_percpu(struct pcpu_lstats);
-	if (!lstats)
+	p = alloc_percpu(struct loopback_queue);
+	if (!p)
 		return -ENOMEM;
 
-	dev->ml_priv = lstats;
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(p, i);
+		skb_queue_head_init(&pcpu->rxq);
+		netif_napi_add(dev, &pcpu->napi, loopback_poll, 64);
+	}
+
+	dev->ml_priv = p;
+
+	return 0;
+}
+
+static int loopback_dev_start(struct net_device *dev)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(dev->ml_priv, i);
+		napi_enable(&pcpu->napi);
+	}
+	return 0;
+}
+
+static int loopback_dev_stop(struct net_device *dev)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(dev->ml_priv, i);
+		napi_disable(&pcpu->napi);
+		__skb_queue_purge(&pcpu->rxq);
+	}
 	return 0;
 }
 
 static void loopback_dev_free(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats = dev->ml_priv;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_queue *pcpu = per_cpu_ptr(dev->ml_priv, i);
+		napi_disable(&pcpu->napi);
+	}
 
-	free_percpu(lstats);
+	free_percpu(dev->ml_priv);
 	free_netdev(dev);
 }
 
@@ -166,6 +229,8 @@ static void loopback_setup(struct net_de
 	dev->ethtool_ops	= &loopback_ethtool_ops;
 	dev->header_ops		= &eth_header_ops;
 	dev->init = loopback_dev_init;
+	dev->open = loopback_dev_start;
+	dev->stop = loopback_dev_stop;
 	dev->destructor = loopback_dev_free;
 }
 
--- a/include/linux/netdevice.h	2008-11-05 08:18:01.000000000 -0800
+++ b/include/linux/netdevice.h	2008-11-05 08:18:19.000000000 -0800
@@ -366,6 +366,8 @@ static inline int napi_reschedule(struct
 	return 0;
 }
 
+extern void napi_schedule_irq(struct napi_struct *n);
+
 /**
  *	napi_complete - NAPI processing complete
  *	@n: napi context
--- a/net/core/dev.c	2008-11-05 08:17:32.000000000 -0800
+++ b/net/core/dev.c	2008-11-05 09:54:36.000000000 -0800
@@ -2369,6 +2369,15 @@ void __napi_schedule(struct napi_struct 
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+/* Special case version of napi_schedule since loopback device has no hard irq */
+void napi_schedule_irq(struct napi_struct *n)
+{
+	if (napi_schedule_prep(n)) {
+		list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
+		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	}
+}
+
 
 static void net_rx_action(struct softirq_action *h)
 {

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC] loopback: optimization
  2008-11-05 20:36 ` Stephen Hemminger
@ 2008-11-05 23:14   ` Eric Dumazet
  2008-11-06  0:42     ` Stephen Hemminger
  0 siblings, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2008-11-05 23:14 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> Convert loopback device from using common network queues to a per-cpu
> receive queue with NAPI. This gives a small 1% performance gain when
> measured over 5 runs of tbench. Not sure if it's worth bothering
> though.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> 
> --- a/drivers/net/loopback.c	2008-11-04 15:36:29.000000000 -0800
> +++ b/drivers/net/loopback.c	2008-11-05 10:00:20.000000000 -0800
> @@ -59,7 +59,10 @@
>  
> +/* Special case version of napi_schedule since loopback device has no hard irq */
> +void napi_schedule_irq(struct napi_struct *n)
> +{
> +	if (napi_schedule_prep(n)) {
> +		list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
> +		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
> +	}
> +}
> +

Stephen, I don't get it.

Sure, the loopback device cannot generate hard irqs, but what prevents a real hardware
interrupt from calling a NIC driver that can call napi_schedule() and corrupt softnet_data.poll_list?

Why not use a queue dedicated to loopback directly in cpu_var(softnet_data)?

(ie not using a napi structure for each cpu and each loopback dev)

Yes, this queue would be irq safe.

net_rx_action could handle this list without local_irq_disable()/local_irq_enable() games.

Hum, it may be complex for loopback_dev_stop() to purge all queues without interfering with other namespaces.





^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC] loopback: optimization
  2008-11-05 23:14   ` Eric Dumazet
@ 2008-11-06  0:42     ` Stephen Hemminger
  0 siblings, 0 replies; 6+ messages in thread
From: Stephen Hemminger @ 2008-11-06  0:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Thu, 06 Nov 2008 00:14:16 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger a écrit :
> > Convert loopback device from using common network queues to a per-cpu
> > receive queue with NAPI. This gives a small 1% performance gain when
> > measured over 5 runs of tbench. Not sure if it's worth bothering
> > though.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > 
> > --- a/drivers/net/loopback.c	2008-11-04 15:36:29.000000000 -0800
> > +++ b/drivers/net/loopback.c	2008-11-05 10:00:20.000000000 -0800
> > @@ -59,7 +59,10 @@
> >  
> > +/* Special case version of napi_schedule since loopback device has no hard irq */
> > +void napi_schedule_irq(struct napi_struct *n)
> > +{
> > +	if (napi_schedule_prep(n)) {
> > +		list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
> > +		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
> > +	}
> > +}
> > +
> 
> Stephen, I dont get it.
> 
> Sure loopback device cannot generate hard irqs, but what prevent's a real hardware
> interrupt to call NIC driver that can call napi_schedule() and corrupt softnet_data.poll_list ?
> 
> Why not using a queue dedicated on loopback directly in cpu_var(softnet_data) ?
> 
> (ie not using a napi structure for each cpu and each loopback dev)
> 
> This queue would be irq safe yes.
> 
> net_rx_action could handle this list without local_irq_disable()/local_irq_enable() games.
> 
> Hum, maybe complex for loopback_dev_stop() to purge all queues without interfering with other namespaces.

I did try a workqueue and a kthread version previously, but they both had much worse
performance. I forgot that the NAPI schedule list is shared, so yes, that would have to be locked.

Doing it purely for loopback would mean using a tasklet or another softirq.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-11-06  0:42 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-04  5:37 [RFC] loopback: optimization Stephen Hemminger
2008-11-04  6:36 ` Eric Dumazet
2008-11-05  9:49   ` David Miller
2008-11-05 20:36 ` Stephen Hemminger
2008-11-05 23:14   ` Eric Dumazet
2008-11-06  0:42     ` Stephen Hemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).