netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Rusty Russell <rusty@rustcorp.com.au>
Cc: Herbert Xu <herbert@gondor.apana.org.au>,
	Sridhar Samudrala <sri@us.ibm.com>,
	netdev@vger.kernel.org
Subject: Re: [RFC PATCH] Regression in linux 2.6.32 virtio_net seen with vhost-net
Date: Thu, 17 Dec 2009 11:25:31 +0200	[thread overview]
Message-ID: <20091217092531.GA4905@redhat.com> (raw)
In-Reply-To: <200912171232.26743.rusty@rustcorp.com.au>

On Thu, Dec 17, 2009 at 12:32:26PM +1030, Rusty Russell wrote:
> On Wed, 16 Dec 2009 11:52:18 pm Michael S. Tsirkin wrote:
> > On Wed, Dec 16, 2009 at 11:15:38PM +1030, Rusty Russell wrote:
> > > +	struct virtnet_info *vi =
> > > +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> > > +
> > > +	if (netif_queue_stopped(vi->dev)) {
> > 
> > I am a bit concerned here: for example, on link down
> > you do netif_stop_queue, and start on link up.
> > So is it enough to check netif_queue_stopped
> > to verify that tx is not running and that this is because
> > it was out of capacity?
> > 
> > It would be very bad if this run in parallel with TX ...
> 
> Yeah, I wasn't happy.  This version uses the tx lock (we're single-queued,
> so I used the __ version)
> 
> virtio_net: use NAPI for xmit (UNTESTED)
> 
> This is closer to the way tg3 and ixgbe do it: use the NAPI framework to
> free transmitted packets.  It neatens things a little as well.
> 
> Changes since last version:
> 
> 1) Use the tx lock for the xmit_poll to synchronize against
>    start_xmit; it might be overkill, but it's simple.
> 2) Don't wake queue if the carrier is gone.
> 
> (Note: a side effect of this is that we are lazier in freeing old xmit skbs.
>  This might be a slight win).
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

That's very clean. Some questions below:

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -47,6 +47,9 @@ struct virtnet_info
>  	struct napi_struct napi;
>  	unsigned int status;
>  
> +	/* We free packets and decide whether to restart xmit here. */
> +	struct napi_struct xmit_napi;
> +
>  	/* Number of input buffers, and max we've ever had. */
>  	unsigned int num, max;
>  
> @@ -60,6 +63,9 @@ struct virtnet_info
>  	struct sk_buff_head recv;
>  	struct sk_buff_head send;
>  
> +	/* Capacity left in xmit queue. */
> +	unsigned int capacity;
> +
>  	/* Work struct for refilling if we run low on memory. */
>  	struct delayed_work refill;
>  
> @@ -111,11 +117,8 @@ static void skb_xmit_done(struct virtque
>  {
>  	struct virtnet_info *vi = svq->vdev->priv;
>  
> -	/* Suppress further interrupts. */
> -	svq->vq_ops->disable_cb(svq);
> -
>  	/* We were probably waiting for more output buffers. */
> -	netif_wake_queue(vi->dev);
> +	napi_schedule(&vi->xmit_napi);
>  }
>  
>  static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> @@ -455,6 +458,29 @@ static unsigned int free_old_xmit_skbs(s
>  	return tot_sgs;
>  }
>  
> +static int virtnet_xmit_poll(struct napi_struct *xmit_napi, int budget)
> +{
> +	struct virtnet_info *vi =
> +		container_of(xmit_napi, struct virtnet_info, xmit_napi);
> +
> +	/* Don't access vq/capacity at same time as start_xmit. */
> +	__netif_tx_lock(netdev_get_tx_queue(vi->dev, 0), smp_processor_id());

So now that we are locking, we could build a variant of this
without NAPI (maybe with trylock: we can't spin on xmit lock
from hard irq context, can we?)? Possibly, if we do, that would be
a small enough change to be applicable in 2.6.32.

> +
> +	vi->capacity += free_old_xmit_skbs(vi);

Should we build a variant of free_old_xmit_skbs
that takes a budget, to avoid starving others
while we poll the vq?

> +	if (vi->capacity >= 2 + MAX_SKB_FRAGS) {
> +		/* Suppress further xmit interrupts. */
> +		vi->svq->vq_ops->disable_cb(vi->svq);
> +		napi_complete(xmit_napi);
> +
> +		/* Don't wake it if link is down. */
> +		if (likely(netif_carrier_ok(vi->vdev)))
> +			netif_wake_queue(vi->dev);
> +	}
> +
> +	__netif_tx_unlock(netdev_get_tx_queue(vi->dev, 0));
> +	return 1;
> +}
> +
>  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
>  {
>  	struct scatterlist sg[2+MAX_SKB_FRAGS];
> @@ -509,10 +535,6 @@ static netdev_tx_t start_xmit(struct sk_
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int capacity;
>  
> -again:
> -	/* Free up any pending old buffers before queueing new ones. */
> -	free_old_xmit_skbs(vi);
> -
>  	/* Try to transmit */
>  	capacity = xmit_skb(vi, skb);
>  
> @@ -520,14 +542,13 @@ again:
>  	if (unlikely(capacity < 0)) {
>  		netif_stop_queue(dev);
>  		dev_warn(&dev->dev, "Unexpected full queue\n");
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			vi->svq->vq_ops->disable_cb(vi->svq);
> -			netif_start_queue(dev);
> -			goto again;
> -		}
> +		/* If we missed an interrupt, we let virtnet_xmit_poll deal. */
> +		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +			napi_schedule(&vi->xmit_napi);
>  		return NETDEV_TX_BUSY;
>  	}
>  	vi->svq->vq_ops->kick(vi->svq);
> +	vi->capacity = capacity;
>  
>  	/*
>  	 * Put new one in send queue.  You'd expect we'd need this before
> @@ -545,14 +566,13 @@ again:
>  	/* Apparently nice girls don't return TX_BUSY; stop the queue
>  	 * before it gets out of hand.  Naturally, this wastes entries. */
>  	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			capacity += free_old_xmit_skbs(vi);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				vi->svq->vq_ops->disable_cb(vi->svq);
> -			}
> +		/* Free old skbs; might make more capacity. */
> +		vi->capacity = capacity + free_old_xmit_skbs(vi);
> +		if (unlikely(vi->capacity < 2+MAX_SKB_FRAGS)) {
> +			netif_stop_queue(dev);
> +			/* Missed xmit irq? virtnet_xmit_poll will deal. */
> +			if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq)))
> +				napi_schedule(&vi->xmit_napi);
>  		}
>  	}
>  
> @@ -590,6 +610,7 @@ static int virtnet_open(struct net_devic
>  	struct virtnet_info *vi = netdev_priv(dev);
>  
>  	napi_enable(&vi->napi);
> +	napi_enable(&vi->xmit_napi);
>  
>  	/* If all buffers were filled by other side before we napi_enabled, we
>  	 * won't get another interrupt, so process any outstanding packets
> @@ -652,6 +673,7 @@ static int virtnet_close(struct net_devi
>  	struct virtnet_info *vi = netdev_priv(dev);
>  
>  	napi_disable(&vi->napi);
> +	napi_disable(&vi->xmit_napi);
>  
>  	return 0;
>  }
> @@ -818,9 +840,13 @@ static void virtnet_update_status(struct
>  
>  	if (vi->status & VIRTIO_NET_S_LINK_UP) {
>  		netif_carrier_on(vi->dev);
> -		netif_wake_queue(vi->dev);
> +		/* Make sure virtnet_xmit_poll sees carrier enabled. */
> +		wmb();

I think this should be smp_wmb(), since we are not synchronising with
hardware here. Right?

> +		napi_schedule(&vi->xmit_napi);
>  	} else {
>  		netif_carrier_off(vi->dev);
> +		/* Make sure virtnet_xmit_poll sees carrier disabled. */
> +		wmb();

And here.

>  		netif_stop_queue(vi->dev);
>  	}
>  }
> @@ -883,6 +909,7 @@ static int virtnet_probe(struct virtio_d
>  	/* Set up our device-specific information */
>  	vi = netdev_priv(dev);
>  	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
> +	netif_napi_add(dev, &vi->xmit_napi, virtnet_xmit_poll, 64);
>  	vi->dev = dev;
>  	vi->vdev = vdev;
>  	vdev->priv = vi;

  reply	other threads:[~2009-12-17  9:28 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-12-08 22:50 [RFC PATCH] Regression in linux 2.6.32 virtio_net seen with vhost-net Sridhar Samudrala
2009-12-13 12:25 ` Herbert Xu
2009-12-13 23:40   ` Michael S. Tsirkin
2009-12-15 14:42     ` Herbert Xu
2009-12-15 16:26       ` Sridhar Samudrala
2009-12-16  1:21         ` Herbert Xu
2009-12-15 23:32       ` Michael S. Tsirkin
2009-12-16  1:58         ` Herbert Xu
2009-12-16  4:37         ` Rusty Russell
2009-12-16 10:37           ` Michael S. Tsirkin
2009-12-16  2:41   ` Rusty Russell
2009-12-16  2:53     ` Herbert Xu
2009-12-16 12:45       ` Rusty Russell
2009-12-16 13:22         ` Michael S. Tsirkin
2009-12-16 13:35           ` Herbert Xu
2009-12-16 13:38             ` Michael S. Tsirkin
2009-12-16 13:48               ` Herbert Xu
2009-12-17  2:02           ` Rusty Russell
2009-12-17  9:25             ` Michael S. Tsirkin [this message]
2009-12-18  1:55               ` Rusty Russell
2009-12-16 13:30         ` Herbert Xu
2009-12-17  1:43         ` Sridhar Samudrala
2009-12-17  3:12           ` Herbert Xu
2009-12-17  5:02             ` Sridhar Samudrala
2009-12-17  3:15           ` Herbert Xu
2009-12-17  5:05             ` Sridhar Samudrala
2009-12-17  6:28               ` Herbert Xu
2009-12-17  6:45                 ` Sridhar Samudrala
2009-12-17 10:03                   ` Krishna Kumar2
2009-12-17 11:27                     ` Jarek Poplawski
2009-12-17 11:45                       ` Herbert Xu
2009-12-17 11:49                         ` Herbert Xu
2009-12-17 12:08                           ` Herbert Xu
2009-12-17 12:27                             ` Krishna Kumar2
2009-12-17 12:42                               ` Jarek Poplawski
2009-12-17 12:56                                 ` Herbert Xu
2009-12-17 13:22                                   ` Krishna Kumar2
2009-12-17 13:04                                 ` Krishna Kumar2
2009-12-17 13:44                               ` Herbert Xu
2009-12-17 14:35                                 ` Krishna Kumar2
2009-12-17 14:36                                   ` Herbert Xu
2009-12-17 21:50                                     ` Sridhar Samudrala
2009-12-17 22:28                                       ` Sridhar Samudrala
2009-12-17 22:41                                         ` Jarek Poplawski
2009-12-18 13:46                                       ` Krishna Kumar2
2009-12-18 19:13                                       ` Sridhar Samudrala
2009-12-17 11:59                         ` Krishna Kumar2
2009-12-17 12:19                         ` Jarek Poplawski
2009-12-17 11:56                       ` Krishna Kumar2
2009-12-17 13:17                         ` Jarek Poplawski
2009-12-17 14:10                           ` Krishna Kumar2
2009-12-17 14:16                             ` Herbert Xu
2009-12-16 17:42     ` Sridhar Samudrala
  -- strict thread matches above, loose matches on Subject: below --
2009-12-17 11:20 Krishna Kumar
2009-12-17 19:28 ` Jarek Poplawski
     [not found] <20091217111219.9809.27432.sendpatchset@krkumar2.in.ibm.com>
     [not found] ` <20091217123153.GA31131@gondor.apana.org.au>
2009-12-17 12:56   ` Krishna Kumar2
2009-12-17 13:40     ` Herbert Xu
2009-12-17 13:56       ` Krishna Kumar2

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091217092531.GA4905@redhat.com \
    --to=mst@redhat.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=netdev@vger.kernel.org \
    --cc=rusty@rustcorp.com.au \
    --cc=sri@us.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).