netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: xiaosuo@gmail.com
Cc: "David S. Miller" <davem@davemloft.net>,
	netdev@vger.kernel.org, Tom Herbert <therbert@google.com>
Subject: Re: [PATCH] ifb: add multi-queue support
Date: Tue, 10 Nov 2009 10:07:57 +0100	[thread overview]
Message-ID: <4AF92D6D.8060300@gmail.com> (raw)
In-Reply-To: <4AF924A5.1050303@gmail.com>

Changli Gao a écrit :
> ifb: add multi-queue support
> 
> Add multi-queue support, and one kernel thread is created for per queue.
> It can used to emulate multi-queue NIC in software, and distribute work
> among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0

Seems pretty cool !

I have some comments 

> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
> drivers/net/ifb.c | 309
> ++++++++++++++++++++++++++++++++----------------------
> 1 file changed, 186 insertions(+), 123 deletions(-)
> 
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 030913f..6e04188 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> @@ -33,139 +33,101 @@
>  #include <linux/etherdevice.h>
>  #include <linux/init.h>
>  #include <linux/moduleparam.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/kthread.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <net/ip.h>
>  #include <net/pkt_sched.h>
>  #include <net/net_namespace.h>
>  
> -#define TX_TIMEOUT  (2*HZ)
> -
>  #define TX_Q_LIMIT    32
> +
>  struct ifb_private {
> -	struct tasklet_struct   ifb_tasklet;
> -	int     tasklet_pending;
> -	/* mostly debug stats leave in for now */
> -	unsigned long   st_task_enter; /* tasklet entered */
> -	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
> -	unsigned long   st_rxq_enter; /* receive queue entered */
> -	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
> -	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
> -	unsigned long   st_rx_frm_egr; /* received from egress path */
> -	unsigned long   st_rx_frm_ing; /* received from ingress path */
> -	unsigned long   st_rxq_check;
> -	unsigned long   st_rxq_rsch;
> -	struct sk_buff_head     rq;
> -	struct sk_buff_head     tq;
> +	struct net_device	*dev;
> +	struct sk_buff_head	rq;
> +	struct sk_buff_head	tq;
> +	wait_queue_head_t	wq;
> +	struct task_struct	*task;
>  };

Maybe you should allocate true per_cpu structure, to avoid cache line sharing
and get appropriate NUMA properties.

At least use a __cacheline_aligned_in_smp ...

>  
> +/* Number of ifb devices to be set up by this module. */
>  static int numifbs = 2;
> +module_param(numifbs, int, 0444);
> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>  
> -static void ri_tasklet(unsigned long dev);
> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
> -static int ifb_open(struct net_device *dev);
> -static int ifb_close(struct net_device *dev);
> +/* Number of TX queues per ifb */
> +static int numtxqs = 1;
> +module_param(numtxqs, int, 0444);
> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>  
> -static void ri_tasklet(unsigned long dev)
> +static int ifb_thread(void *priv)
>  {
> -
> -	struct net_device *_dev = (struct net_device *)dev;
> -	struct ifb_private *dp = netdev_priv(_dev);
> -	struct net_device_stats *stats = &_dev->stats;
> -	struct netdev_queue *txq;

> +	struct ifb_private *dp = (struct ifb_private*)priv;

A space is required before * : (struct ifb_private *)priv;
(in many places in your patch)

> +	struct net_device *dev = dp->dev;


> +	struct net_device_stats *stats = &dev->stats;

Here you use a net_device_stats that is shared by all your queues,
your updates wont be protected and some will be lost.
You should use txq->tx_ counters.

> +	unsigned int num = dp - (struct ifb_private*)netdev_priv(dev);
> +	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
>  	struct sk_buff *skb;
> -
> -	txq = netdev_get_tx_queue(_dev, 0);
> -	dp->st_task_enter++;
> -	if ((skb = skb_peek(&dp->tq)) == NULL) {
> -		dp->st_txq_refl_try++;
> -		if (__netif_tx_trylock(txq)) {
> -			dp->st_rxq_enter++;
> -			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
> +	DEFINE_WAIT(wait);
> +
> +	while (1) {
> +		/* move skb from rq to tq */
> +		while (1) {
> +			prepare_to_wait(&dp->wq, &wait, TASK_UNINTERRUPTIBLE);
> +			while (!__netif_tx_trylock(txq))
> +				yield();
> +			while ((skb = skb_dequeue(&dp->rq)) != NULL)
>  				skb_queue_tail(&dp->tq, skb);
> -				dp->st_rx2tx_tran++;
> -			}
> +			if (netif_queue_stopped(dev))
> +				netif_wake_queue(dev);
>  			__netif_tx_unlock(txq);
> -		} else {
> -			/* reschedule */
> -			dp->st_rxq_notenter++;
> -			goto resched;
> +			if (kthread_should_stop() || !skb_queue_empty(&dp->tq))
> +				break;
> +			schedule();
>  		}
> -	}
> -
> -	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
> -		u32 from = G_TC_FROM(skb->tc_verd);
> -
> -		skb->tc_verd = 0;
> -		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
> -		stats->tx_packets++;
> -		stats->tx_bytes +=skb->len;
> -
> -		skb->dev = dev_get_by_index(&init_net, skb->iif);
> -		if (!skb->dev) {
> -			dev_kfree_skb(skb);
> -			stats->tx_dropped++;
> +		finish_wait(&dp->wq, &wait);
> +		if (kthread_should_stop())
>  			break;
> -		}
> -		dev_put(skb->dev);
> -		skb->iif = _dev->ifindex;
> -
> -		if (from & AT_EGRESS) {
> -			dp->st_rx_frm_egr++;
> -			dev_queue_xmit(skb);
> -		} else if (from & AT_INGRESS) {
> -			dp->st_rx_frm_ing++;
> -			skb_pull(skb, skb->dev->hard_header_len);
> -			netif_rx(skb);
> -		} else
> -			BUG();
> -	}
>  
> -	if (__netif_tx_trylock(txq)) {
> -		dp->st_rxq_check++;
> -		if ((skb = skb_peek(&dp->rq)) == NULL) {
> -			dp->tasklet_pending = 0;
> -			if (netif_queue_stopped(_dev))
> -				netif_wake_queue(_dev);
> -		} else {
> -			dp->st_rxq_rsch++;
> -			__netif_tx_unlock(txq);
> -			goto resched;
> +		/* transfer packets */
> +		while ((skb = skb_dequeue(&dp->tq)) != NULL) {
> +			u32 from = G_TC_FROM(skb->tc_verd);
> +	
> +			skb->tc_verd = 0;
> +			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);

> +			stats->tx_packets++;
> +			stats->tx_bytes +=skb->len;

Use :			txq->tx_packets++
			txq->tx_bytes += skb->len;
> +	
> +			skb->dev = dev_get_by_index(&init_net, skb->iif);
> +			if (!skb->dev) {
> +				dev_kfree_skb(skb);

> +				stats->tx_dropped++;
			txq->tx_dropped ?

> +				break;
> +			}
> +			dev_put(skb->dev);
> +			skb->iif = dev->ifindex;
> +	
> +			if (from & AT_EGRESS) {
> +				dev_queue_xmit(skb);
> +			} else if (from & AT_INGRESS) {
> +				skb_pull(skb, skb->dev->hard_header_len);
> +				netif_rx_ni(skb);
> +			} else
> +				BUG();
>  		}
> -		__netif_tx_unlock(txq);
> -	} else {
> -resched:
> -		dp->tasklet_pending = 1;
> -		tasklet_schedule(&dp->ifb_tasklet);
>  	}
>  
> -}
> -
> -static const struct net_device_ops ifb_netdev_ops = {
> -	.ndo_open	= ifb_open,
> -	.ndo_stop	= ifb_close,
> -	.ndo_start_xmit	= ifb_xmit,
> -	.ndo_validate_addr = eth_validate_addr,
> -};
> -
> -static void ifb_setup(struct net_device *dev)
> -{
> -	/* Initialize the device structure. */
> -	dev->destructor = free_netdev;
> -	dev->netdev_ops = &ifb_netdev_ops;
> -
> -	/* Fill in device structure with ethernet-generic values. */
> -	ether_setup(dev);
> -	dev->tx_queue_len = TX_Q_LIMIT;
> -
> -	dev->flags |= IFF_NOARP;
> -	dev->flags &= ~IFF_MULTICAST;
> -	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
> -	random_ether_addr(dev->dev_addr);
> +	return 0;
>  }
>  
>  static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
> -	struct ifb_private *dp = netdev_priv(dev);
>  	struct net_device_stats *stats = &dev->stats;
>  	u32 from = G_TC_FROM(skb->tc_verd);
> +	int num = skb_get_queue_mapping(skb);
> +	struct ifb_private *dp = ((struct ifb_private*)netdev_priv(dev)) + num;
>  

>  	stats->rx_packets++;
>  	stats->rx_bytes+=skb->len;

Not sure how to solve this problem (several cpus can updates counter in //)

Thanks

  reply	other threads:[~2009-11-10  9:08 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-11-10  8:30 [PATCH] ifb: add multi-queue support Changli Gao
2009-11-10  9:07 ` Eric Dumazet [this message]
2009-11-10  9:43   ` Changli Gao
2009-11-10 10:57     ` Eric Dumazet
2009-11-10 11:14       ` Changli Gao
2009-11-10 11:41         ` Patrick McHardy
2009-11-10 12:14           ` Changli Gao
2009-11-10 12:19             ` Patrick McHardy
2009-11-10 12:37               ` Changli Gao
2009-11-10 12:45                 ` Patrick McHardy
2009-11-10 13:06                   ` Changli Gao
2009-11-10 13:34                     ` Eric Dumazet
2009-11-10 13:49                       ` Changli Gao
2009-11-10 16:45                         ` Stephen Hemminger
2009-11-11  6:30                           ` Changli Gao
2009-11-10 10:29 ` Patrick McHardy
2009-11-10 10:48   ` Changli Gao
2009-11-10 10:55     ` Eric Dumazet
  -- strict thread matches above, loose matches on Subject: below --
2009-11-11  9:51 Changli Gao
2009-11-11  9:56 ` Changli Gao
2009-11-11 10:30 ` Eric Dumazet
2009-11-11 10:57   ` Changli Gao
2009-11-11 15:59 ` Patrick McHardy
2009-11-12  3:12   ` Changli Gao
2009-11-12  8:52     ` Jarek Poplawski
2009-11-12  9:32       ` Changli Gao
2009-11-12 15:10     ` Patrick McHardy
2009-11-13  1:28       ` Changli Gao
2009-11-12  9:44 ` Changli Gao
2009-11-12  9:48   ` Changli Gao
2009-11-12 15:11     ` Patrick McHardy
2009-11-13  1:32       ` Changli Gao
2009-11-13  7:18         ` Patrick McHardy
2009-11-12 12:48   ` Eric Dumazet
2009-11-13  1:26     ` Changli Gao
2009-11-13  5:56       ` Eric Dumazet
2009-11-13  6:16         ` Changli Gao
2009-11-13  7:45           ` Jarek Poplawski
2009-11-13  8:54             ` Changli Gao
2009-11-13  9:18               ` Jarek Poplawski
2009-11-13  9:38                 ` Changli Gao
2009-11-13  9:57                   ` Jarek Poplawski
2009-11-13 11:25                     ` Changli Gao
2009-11-13 12:32                       ` Jarek Poplawski
2009-11-13 13:10                       ` Eric Dumazet
2009-11-13 16:15                   ` Stephen Hemminger
2009-11-13 23:28                     ` Changli Gao
2009-11-13 23:32                       ` Stephen Hemminger
2009-11-13 23:42                         ` Changli Gao
2009-11-14 12:53                           ` Eric Dumazet
2009-11-14 13:30                             ` Changli Gao
2009-11-13 13:55               ` Eric Dumazet
2009-11-13  4:37   ` Changli Gao
2009-11-16 16:39     ` Stephen Hemminger
2009-11-17  3:10       ` David Miller
2009-11-17  5:38         ` Changli Gao
2009-11-17  6:02           ` Stephen Hemminger
2009-11-13  4:42 Changli Gao
2009-11-13  4:46 ` Changli Gao
2009-11-16  7:31 Changli Gao
2009-11-16  8:19 ` Eric Dumazet
2009-11-16  8:43   ` Changli Gao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4AF92D6D.8060300@gmail.com \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=therbert@google.com \
    --cc=xiaosuo@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).