Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC PATCH net-next 6/6] virtio-net: enable tx interrupt only for the final skb in the chain
From: Michael S. Tsirkin @ 2014-10-15 10:22 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <1413357930-45302-7-git-send-email-jasowang@redhat.com>

On Wed, Oct 15, 2014 at 03:25:30PM +0800, Jason Wang wrote:
> With the help of xmit_more and virtqueue_enable_cb_avail(), this patch
> enable tx interrupt only for the final skb in the chain if
> needed. This will help to mitigate tx interrupts.
> 
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

I think you should split xmit_more stuff out.



> ---
>  drivers/net/virtio_net.c |   10 +++++++---
>  1 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 2afc2e2..5943f3f 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -993,12 +993,16 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  				virtqueue_disable_cb(sq->vq);
>  			}
>  		}
> -	} else if (virtqueue_enable_cb(sq->vq)) {
> -		free_old_xmit_skbs(sq, qsize);
>  	}
>  
> -	if (__netif_subqueue_stopped(dev, qnum) || !skb->xmit_more)
> +	if (__netif_subqueue_stopped(dev, qnum) || !skb->xmit_more) {
>  		virtqueue_kick(sq->vq);
> +		if (sq->vq->num_free >= 2 +MAX_SKB_FRAGS &&
> +		    unlikely(virtqueue_enable_cb_avail(sq->vq))) {
> +			free_old_xmit_skbs(sq, qsize);
> +			virtqueue_disable_cb(sq->vq);
> +		}
> +	}
>  
>  	return NETDEV_TX_OK;
>  }
> -- 
> 1.7.1

^ permalink raw reply

* Re: [RFC PATCH net-next 5/6] virtio-net: enable tx interrupt
From: Jason Wang @ 2014-10-15 10:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20141015101826.GD25776@redhat.com>

On 10/15/2014 06:18 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 15, 2014 at 03:25:29PM +0800, Jason Wang wrote:
>> > Orphan skb in ndo_start_xmit() breaks socket accounting and packet
>> > queuing. This in fact breaks lots of things such as pktgen and several
>> > TCP optimizations. And also make BQL can't be implemented for
>> > virtio-net.
>> > 
>> > This patch tries to solve this issue by enabling tx interrupt. To
>> > avoid introducing extra spinlocks, a tx napi was scheduled to free
>> > those packets.
>> > 
>> > More tx interrupt mitigation method could be used on top.
>> > 
>> > Cc: Rusty Russell <rusty@rustcorp.com.au>
>> > Cc: Michael S. Tsirkin <mst@redhat.com>
>> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>> > ---
>> >  drivers/net/virtio_net.c |  125 +++++++++++++++++++++++++++++++---------------
>> >  1 files changed, 85 insertions(+), 40 deletions(-)
>> > 
>> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> > index ccf98f9..2afc2e2 100644
>> > --- a/drivers/net/virtio_net.c
>> > +++ b/drivers/net/virtio_net.c
>> > @@ -72,6 +72,8 @@ struct send_queue {
>> >  
>> >  	/* Name of the send queue: output.$index */
>> >  	char name[40];
>> > +
>> > +	struct napi_struct napi;
>> >  };
>> >  
>> >  /* Internal representation of a receive virtqueue */
>> > @@ -217,15 +219,40 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
>> >  	return p;
>> >  }
>> >  
>> > +static int free_old_xmit_skbs(struct send_queue *sq, int budget)
>> > +{
>> > +	struct sk_buff *skb;
>> > +	unsigned int len;
>> > +	struct virtnet_info *vi = sq->vq->vdev->priv;
>> > +	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
>> > +	u64 tx_bytes = 0, tx_packets = 0;
>> > +
>> > +	while (tx_packets < budget &&
>> > +	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>> > +		pr_debug("Sent skb %p\n", skb);
>> > +
>> > +		tx_bytes += skb->len;
>> > +		tx_packets++;
>> > +
>> > +		dev_kfree_skb_any(skb);
>> > +	}
>> > +
>> > +	u64_stats_update_begin(&stats->tx_syncp);
>> > +	stats->tx_bytes += tx_bytes;
>> > +	stats->tx_packets =+ tx_packets;
>> > +	u64_stats_update_end(&stats->tx_syncp);
>> > +
>> > +	return tx_packets;
>> > +}
>> > +
>> >  static void skb_xmit_done(struct virtqueue *vq)
>> >  {
>> >  	struct virtnet_info *vi = vq->vdev->priv;
>> > +	struct send_queue *sq = &vi->sq[vq2txq(vq)];
>> >  
>> > -	/* Suppress further interrupts. */
>> > -	virtqueue_disable_cb(vq);
>> > -
>> > -	/* We were probably waiting for more output buffers. */
>> > -	netif_wake_subqueue(vi->dev, vq2txq(vq));
>> > +	if (napi_schedule_prep(&sq->napi)) {
>> > +		__napi_schedule(&sq->napi);
>> > +	}
>> >  }
>> >  
>> >  static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
>> > @@ -774,7 +801,39 @@ again:
>> >  	return received;
>> >  }
>> >  
>> > +static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>> > +{
>> > +	struct send_queue *sq =
>> > +		container_of(napi, struct send_queue, napi);
>> > +	struct virtnet_info *vi = sq->vq->vdev->priv;
>> > +	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
>> > +	unsigned int r, sent = 0;
>> > +
>> > +again:
>> > +	__netif_tx_lock(txq, smp_processor_id());
>> > +	virtqueue_disable_cb(sq->vq);
>> > +	sent += free_old_xmit_skbs(sq, budget - sent);
>> > +
>> > +	if (sent < budget) {
>> > +		r = virtqueue_enable_cb_prepare(sq->vq);
>> > +		napi_complete(napi);
>> > +		__netif_tx_unlock(txq);
>> > +		if (unlikely(virtqueue_poll(sq->vq, r)) &&
> So you are enabling callback on the next packet,
> which is almost sure to cause an interrupt storm
> on the guest.
>
>
> I think it's a bad idea, this is why I used
> enable_cb_delayed in my patch.

Right, will do this, but may also need to make sure used event never
goes back since we may call virtqueue_enable_cb_avail().
>
>
>> > +		    napi_schedule_prep(napi)) {
>> > +			virtqueue_disable_cb(sq->vq);
>> > +			__napi_schedule(napi);
>> > +			goto again;
>> > +		}
>> > +	} else {
>> > +		__netif_tx_unlock(txq);
>> > +	}
>> > +
>> > +	netif_wake_subqueue(vi->dev, vq2txq(sq->vq));
>> > +	return sent;
>> > +}
>> > +
>> >  #ifdef CONFIG_NET_RX_BUSY_POLL
>> > +
>> >  /* must be called with local_bh_disable()d */
>> >  static int virtnet_busy_poll(struct napi_struct *napi)
>> >  {
>> > @@ -822,36 +881,12 @@ static int virtnet_open(struct net_device *dev)
>> >  			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
>> >  				schedule_delayed_work(&vi->refill, 0);
>> >  		virtnet_napi_enable(&vi->rq[i]);
>> > +		napi_enable(&vi->sq[i].napi);
>> >  	}
>> >  
>> >  	return 0;
>> >  }
>> >  
>> > -static int free_old_xmit_skbs(struct send_queue *sq)
>> > -{
>> > -	struct sk_buff *skb;
>> > -	unsigned int len;
>> > -	struct virtnet_info *vi = sq->vq->vdev->priv;
>> > -	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
>> > -	u64 tx_bytes = 0, tx_packets = 0;
>> > -
>> > -	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>> > -		pr_debug("Sent skb %p\n", skb);
>> > -
>> > -		tx_bytes += skb->len;
>> > -		tx_packets++;
>> > -
>> > -		dev_kfree_skb_any(skb);
>> > -	}
>> > -
>> > -	u64_stats_update_begin(&stats->tx_syncp);
>> > -	stats->tx_bytes += tx_bytes;
>> > -	stats->tx_packets =+ tx_packets;
>> > -	u64_stats_update_end(&stats->tx_syncp);
>> > -
>> > -	return tx_packets;
>> > -}
>> > -
> So you end up moving it all anyway, why bother splitting out
> minor changes in previous patches?

To make review easier, but if you think this complicated it in fact,
will pack them into a single patch.

^ permalink raw reply

* Re: [RFC PATCH net-next 0/6] Always use tx interrupt for virtio-net
From: Michael S. Tsirkin @ 2014-10-15 10:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <1413357930-45302-1-git-send-email-jasowang@redhat.com>

On Wed, Oct 15, 2014 at 03:25:24PM +0800, Jason Wang wrote:
> According to David, proper accounting and queueing (at all levels, not
> just TCP sockets) is more important than trying to skim a bunch of
> cycles by avoiding TX interrupts.

He also mentioned we should find other ways to batch

> Having an event to free the SKB is
> absolutely essential for the stack to operate correctly.
> 
> This series tries to enable tx interrupt for virtio-net. The idea is
> simple: enable tx interrupt and schedule a tx napi to free old xmit
> skbs.
> 
> Several notes:
> - Tx interrupt storm avoidance when queue is about to be full is
>   kept.

But queue is typically *not* full. More important to avoid interrupt
storms in that case IMO.

> Since we may enable callbacks in both ndo_start_xmit() and tx
>   napi, patch 1 adds a check to make sure used event never go
>   back. This will let the napi not enable the callbacks wrongly after
>   delayed callbacks was used.

So why not just use delayed callbacks?


> - For bulk dequeuing, there's no need to enable tx interrupt for each
>   packet. The last patch only enable tx interrupt for the final skb in
>   the chain through xmit_more and a new helper to publish current avail
>   idx as used event.
> 
> This series fixes several issues of original rfc pointed out by Michael.

Could you list the issues, for ease of review?


> 
> Smoking test is done, and will do complete netperf test. Send the
> seires for early review.
> 
> Thanks
> 
> Jason Wang (6):
>   virtio: make sure used event never go backwards
>   virtio: introduce virtio_enable_cb_avail()
>   virtio-net: small optimization on free_old_xmit_skbs()
>   virtio-net: return the number of packets sent in free_old_xmit_skbs()
>   virtio-net: enable tx interrupt
>   virtio-net: enable tx interrupt only for the final skb in the chain
> 
>  drivers/net/virtio_net.c     |  125 ++++++++++++++++++++++++++++++------------
>  drivers/virtio/virtio_ring.c |   25 ++++++++-
>  include/linux/virtio.h       |    2 +
>  3 files changed, 115 insertions(+), 37 deletions(-)

^ permalink raw reply

* Re: [RFC PATCH net-next 6/6] virtio-net: enable tx interrupt only for the final skb in the chain
From: Jason Wang @ 2014-10-15 10:31 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20141015102251.GE25776@redhat.com>

On 10/15/2014 06:22 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 15, 2014 at 03:25:30PM +0800, Jason Wang wrote:
>> With the help of xmit_more and virtqueue_enable_cb_avail(), this patch
>> enable tx interrupt only for the final skb in the chain if
>> needed. This will help to mitigate tx interrupts.
>>
>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>> Cc: Michael S. Tsirkin <mst@redhat.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> I think you should split xmit_more stuff out.

No much difference but if you prefer I will do this.

^ permalink raw reply

* Re: [RFC PATCH net-next 1/6] virtio: make sure used event never go backwards
From: Michael S. Tsirkin @ 2014-10-15 10:32 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <543E48BF.4080502@redhat.com>

On Wed, Oct 15, 2014 at 06:13:19PM +0800, Jason Wang wrote:
> On 10/15/2014 05:34 PM, Michael S. Tsirkin wrote:
> > On Wed, Oct 15, 2014 at 03:25:25PM +0800, Jason Wang wrote:
> >> This patch checks the new event idx to make sure used event idx never
> >> goes back. This is used to synchronize the calls between
> >> virtqueue_enable_cb_delayed() and virtqueue_enable_cb().
> >>
> >> Cc: Rusty Russell <rusty@rustcorp.com.au>
> >> Cc: Michael S. Tsirkin <mst@redhat.com>
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > the implication being that moving event idx back might cause some race
> > condition?  
> 
> This will cause race condition when tx interrupt is enabled. Consider
> the following cases
> 
> 1) tx napi was scheduled
> 2) start_xmit() call virtqueue_enable_cb_delayed() and disable cb, [used
> event is vq->last_used_idx + 3/4 pendg bufs]
> 3) tx napi enable the callback by virtqueue_enable_cb() [ used event is
> vq->last_used_idx ]
>  
> After step 3, used event was moved back, unnecessary tx interrupt was
> triggered.

Well unnecessary interrupts are safe.
With your patch caller of virtqueue_enable_cb will not get an
interrupt on the next buffer which is not safe.

If you don't want an interrupt on the next buffer, don't
call virtqueue_enable_cb.


> > If yes but please describe the race explicitly.
> > Is there a bug we need to fix on stable?
> 
> Looks not, current code does not have such race condition.
> > Please also explicitly describe a configuration that causes event idx
> > to go back.
> >
> > All this info should go in the commit log.
> 
> Will do this.
> >> ---
> >>  drivers/virtio/virtio_ring.c |    7 +++++--
> >>  1 files changed, 5 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >> index 3b1f89b..1b3929f 100644
> >> --- a/drivers/virtio/virtio_ring.c
> >> +++ b/drivers/virtio/virtio_ring.c
> >> @@ -559,14 +559,17 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> >>  	u16 last_used_idx;
> >>  
> >>  	START_USE(vq);
> >> -
> >> +	last_used_idx = vq->last_used_idx;
> >>  	/* We optimistically turn back on interrupts, then check if there was
> >>  	 * more to do. */
> >>  	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> >>  	 * either clear the flags bit or point the event index at the next
> >>  	 * entry. Always do both to keep code simple. */
> >>  	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
> >> -	vring_used_event(&vq->vring) = last_used_idx = vq->last_used_idx;
> >> +	/* Make sure used event never go backwards */
> > s/go/goes/
> >
> >> +	if (!vring_need_event(vring_used_event(&vq->vring),
> >> +			      vq->vring.avail->idx, last_used_idx))
> >> +		vring_used_event(&vq->vring) = last_used_idx;
> > The result will be that driver will *not* get an interrupt
> > on the next consumed buffer, which is likely not what driver
> > intended when it called virtqueue_enable_cb.
> 
> This will only happen when we want to delay the interrupt for next few
> consumed buffers (virtqueue_enable_cb_delayed() was called). For the
> other case, vq->last_used_idx should be ahead of previous used event. Do
> you see any other case?

Call virtqueue_enable_cb_delayed, later call virtqueue_enable_cb.  If
event index is not updated in virtqueue_enable_cb, driver will not get
an interrupt on the next buffer.


> >
> > Instead, how about we simply document the requirement that drivers either
> > always call virtqueue_enable_cb_delayed or virtqueue_enable_cb
> > but not both?
> 
> We need call them both when tx interrupt is enabled I believe.

Can you pls reply to my patch and document issues you see?

> >
> >>  	END_USE(vq);
> >>  	return last_used_idx;
> >>  }
> >> -- 
> >> 1.7.1

^ permalink raw reply

* [PATCH] tcp: refine autocork condition check
From: Weiping Pan @ 2014-10-15 10:34 UTC (permalink / raw)
  To: netdev; +Cc: edumazet

Inspired by commit b2532eb9abd8 (tcp: fix ooo_okay setting vs Small Queues).

The last check in tcp_should_autocork() was meant to check that whether we
only have an ACK in Qdisc/NIC queues, or if TX completion was delayed after we
processed ACK packet, if so, we should push the packet immediately instead of
corking it.
Therefore we should compare sk_wmem_alloc with SKB_TRUESIZE(1) instead of
skb->truesize.

After this patch, tcp should have more chances to be corked, and the
performance should be a little better.  And netperf shows that this patch
works as expected.

./super_netperf.sh 300 -H 10.16.42.249 -t TCP_STREAM -- -m 1 -M 1
	      speed	TCPAutoCorking
Before patch: 169.38    222278
After patch:  173.27    232988

Signed-off-by: Weiping Pan <panweiping3@gmail.com>
---
 net/ipv4/tcp.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 461003d..44f0bc6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -647,7 +647,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
 	return skb->len < size_goal &&
 	       sysctl_tcp_autocorking &&
 	       skb != tcp_write_queue_head(sk) &&
-	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+	       atomic_read(&sk->sk_wmem_alloc) > SKB_TRUESIZE(1);
 }

 static void tcp_push(struct sock *sk, int flags, int mss_now,
@@ -675,7 +675,7 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED.
 		 */
-		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+		if (atomic_read(&sk->sk_wmem_alloc) > SKB_TRUESIZE(1))
 			return;
 	}

-- 
1.7.4

^ permalink raw reply related

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Krzysztof Kolasa @ 2014-10-15 10:35 UTC (permalink / raw)
  To: Eric Dumazet, Cong Wang; +Cc: David Miller, Eric Dumazet, netdev
In-Reply-To: <1413323024.17109.11.camel@edumazet-glaptop2.roam.corp.google.com>

W dniu 14.10.2014 o 23:43, Eric Dumazet pisze:
> On Tue, 2014-10-14 at 14:24 -0700, Cong Wang wrote:
>
>> Since we are still in merge window, I don't think we have to use
>> a one-line fix for a bug introduced in this merge window.
> You are clearly refactoring here. Its a nice cleanup.
>
> If I was the maintainer, I would prefer the one line fix.
>
> Then when net-next is open, you refactor.
>
> As I said, I wont argue, do whatever you want.
>
> Thanks
>
>
>

one-line patch not resolve problem, fix created by Cong Wang resolves 
problem !!!

tested on 64bit system and working OK

^ permalink raw reply

* Re: [RFC PATCH net-next 2/6] virtio: introduce virtio_enable_cb_avail()
From: Michael S. Tsirkin @ 2014-10-15 10:41 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <543E4A23.7060903@redhat.com>

On Wed, Oct 15, 2014 at 06:19:15PM +0800, Jason Wang wrote:
> On 10/15/2014 05:28 PM, Michael S. Tsirkin wrote:
> > On Wed, Oct 15, 2014 at 03:25:26PM +0800, Jason Wang wrote:
> >> This patch introduces virtio_enable_cb_avail() to publish avail idx
> >> and used event. This could be used by batched buffer submitting to
> >> reduce the number of tx interrupts.
> >>
> >> Cc: Rusty Russell <rusty@rustcorp.com.au>
> >> Cc: Michael S. Tsirkin <mst@redhat.com>
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >> ---
> >>  drivers/virtio/virtio_ring.c |   22 ++++++++++++++++++++--
> >>  include/linux/virtio.h       |    2 ++
> >>  2 files changed, 22 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >> index 1b3929f..d67fbf8 100644
> >> --- a/drivers/virtio/virtio_ring.c
> >> +++ b/drivers/virtio/virtio_ring.c
> >> @@ -567,14 +567,32 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> >>  	 * entry. Always do both to keep code simple. */
> >>  	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
> >>  	/* Make sure used event never go backwards */
> >> -	if (!vring_need_event(vring_used_event(&vq->vring),
> >> -			      vq->vring.avail->idx, last_used_idx))
> >> +	if (vq->vring.avail->idx != vring_used_event(&vq->vring) &&
> >> +	    !vring_need_event(vring_used_event(&vq->vring),
> >> +			      vq->vring.avail->idx, last_used_idx)) {
> >>  		vring_used_event(&vq->vring) = last_used_idx;
> >> +	}
> >>  	END_USE(vq);
> >>  	return last_used_idx;
> >>  }
> >>  EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
> >>
> > I see you are also changing virtqueue_enable_cb_prepare, why?
> 
> This is also used to prevent it from moving the used event backwards.
> This may happens when we handle tx napi after we publish avail idx as
> used event (virtqueue_enable_cb_avail() was called).

So it's wrong exactly in the same way.

But also, please document this stuff, don't put
unrelated changes in a patch called "introduce
virtqueue_enable_cb_avail".


> >
> >> +bool virtqueue_enable_cb_avail(struct virtqueue *_vq)
> >> +{
> >> +	struct vring_virtqueue *vq = to_vvq(_vq);
> >> +	bool ret;
> >> +
> >> +	START_USE(vq);
> >> +	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
> >> +	vring_used_event(&vq->vring) = vq->vring.avail->idx;
> >> +	ret = vring_need_event(vq->vring.avail->idx,
> >> +			       vq->last_used_idx, vq->vring.used->idx);
> >> +	END_USE(vq);
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_avail);
> >> +
> >>  /**
> >>   * virtqueue_poll - query pending used buffers
> >>   * @vq: the struct virtqueue we're talking about.
> > Could not figure out what this does.
> > Please add documentation.
> >
> 
> Sure, does something like below explain what does this function do?
> 
> /**                                                                            
> 
>  * virtqueue_enable_cb_avail - restart callbacks after
> disable_cb.           
>  * @vq: the struct virtqueue we're talking
> about.                              
>  *                                                                             
> 
>  * This re-enables callbacks but hints to the other side to
> delay              
>  * interrupts all of the available buffers have been processed;         


So this is like virtqueue_enable_cb_delayed but even more
aggressive?
I think it's too agressive: it's better to wake up guest
after you are through most of buffers, but not all,
so guest and host can work in parallel.


>  * it returns "false" if there are at least one pending buffer in the
> queue,          
>  * to detect a possible race between the driver checking for more
> work,        
>  * and enabling
> callbacks.                                                     
>  *                                                                             
> 
>  * Caller must ensure we don't call this with other
> virtqueue                  
>  * operations at the same time (except where
> noted).                           
>  */
> 
> >> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> >> index b46671e..bfaf058 100644
> >> --- a/include/linux/virtio.h
> >> +++ b/include/linux/virtio.h
> >> @@ -65,6 +65,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
> >>  
> >>  unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
> >>  
> >> +bool virtqueue_enable_cb_avail(struct virtqueue *vq);
> >> +
> >>  bool virtqueue_poll(struct virtqueue *vq, unsigned);
> >>  
> >>  bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
> >> -- 
> >> 1.7.1
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [RFC PATCH net-next 5/6] virtio-net: enable tx interrupt
From: Michael S. Tsirkin @ 2014-10-15 10:43 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <543E4B95.2040209@redhat.com>

On Wed, Oct 15, 2014 at 06:25:25PM +0800, Jason Wang wrote:
> On 10/15/2014 06:18 PM, Michael S. Tsirkin wrote:
> > On Wed, Oct 15, 2014 at 03:25:29PM +0800, Jason Wang wrote:
> >> > Orphan skb in ndo_start_xmit() breaks socket accounting and packet
> >> > queuing. This in fact breaks lots of things such as pktgen and several
> >> > TCP optimizations. And also make BQL can't be implemented for
> >> > virtio-net.
> >> > 
> >> > This patch tries to solve this issue by enabling tx interrupt. To
> >> > avoid introducing extra spinlocks, a tx napi was scheduled to free
> >> > those packets.
> >> > 
> >> > More tx interrupt mitigation method could be used on top.
> >> > 
> >> > Cc: Rusty Russell <rusty@rustcorp.com.au>
> >> > Cc: Michael S. Tsirkin <mst@redhat.com>
> >> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> >> > ---
> >> >  drivers/net/virtio_net.c |  125 +++++++++++++++++++++++++++++++---------------
> >> >  1 files changed, 85 insertions(+), 40 deletions(-)
> >> > 
> >> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> > index ccf98f9..2afc2e2 100644
> >> > --- a/drivers/net/virtio_net.c
> >> > +++ b/drivers/net/virtio_net.c
> >> > @@ -72,6 +72,8 @@ struct send_queue {
> >> >  
> >> >  	/* Name of the send queue: output.$index */
> >> >  	char name[40];
> >> > +
> >> > +	struct napi_struct napi;
> >> >  };
> >> >  
> >> >  /* Internal representation of a receive virtqueue */
> >> > @@ -217,15 +219,40 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
> >> >  	return p;
> >> >  }
> >> >  
> >> > +static int free_old_xmit_skbs(struct send_queue *sq, int budget)
> >> > +{
> >> > +	struct sk_buff *skb;
> >> > +	unsigned int len;
> >> > +	struct virtnet_info *vi = sq->vq->vdev->priv;
> >> > +	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
> >> > +	u64 tx_bytes = 0, tx_packets = 0;
> >> > +
> >> > +	while (tx_packets < budget &&
> >> > +	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> >> > +		pr_debug("Sent skb %p\n", skb);
> >> > +
> >> > +		tx_bytes += skb->len;
> >> > +		tx_packets++;
> >> > +
> >> > +		dev_kfree_skb_any(skb);
> >> > +	}
> >> > +
> >> > +	u64_stats_update_begin(&stats->tx_syncp);
> >> > +	stats->tx_bytes += tx_bytes;
> >> > +	stats->tx_packets =+ tx_packets;
> >> > +	u64_stats_update_end(&stats->tx_syncp);
> >> > +
> >> > +	return tx_packets;
> >> > +}
> >> > +
> >> >  static void skb_xmit_done(struct virtqueue *vq)
> >> >  {
> >> >  	struct virtnet_info *vi = vq->vdev->priv;
> >> > +	struct send_queue *sq = &vi->sq[vq2txq(vq)];
> >> >  
> >> > -	/* Suppress further interrupts. */
> >> > -	virtqueue_disable_cb(vq);
> >> > -
> >> > -	/* We were probably waiting for more output buffers. */
> >> > -	netif_wake_subqueue(vi->dev, vq2txq(vq));
> >> > +	if (napi_schedule_prep(&sq->napi)) {
> >> > +		__napi_schedule(&sq->napi);
> >> > +	}
> >> >  }
> >> >  
> >> >  static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
> >> > @@ -774,7 +801,39 @@ again:
> >> >  	return received;
> >> >  }
> >> >  
> >> > +static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> >> > +{
> >> > +	struct send_queue *sq =
> >> > +		container_of(napi, struct send_queue, napi);
> >> > +	struct virtnet_info *vi = sq->vq->vdev->priv;
> >> > +	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
> >> > +	unsigned int r, sent = 0;
> >> > +
> >> > +again:
> >> > +	__netif_tx_lock(txq, smp_processor_id());
> >> > +	virtqueue_disable_cb(sq->vq);
> >> > +	sent += free_old_xmit_skbs(sq, budget - sent);
> >> > +
> >> > +	if (sent < budget) {
> >> > +		r = virtqueue_enable_cb_prepare(sq->vq);
> >> > +		napi_complete(napi);
> >> > +		__netif_tx_unlock(txq);
> >> > +		if (unlikely(virtqueue_poll(sq->vq, r)) &&
> > So you are enabling callback on the next packet,
> > which is almost sure to cause an interrupt storm
> > on the guest.
> >
> >
> > I think it's a bad idea, this is why I used
> > enable_cb_delayed in my patch.
> 
> Right, will do this, but may also need to make sure used event never
> goes back since we may call virtqueue_enable_cb_avail().

That's why my patch always calls virtqueue_enable_cb_delayed.
So no need for hacks.

Maybe you can review my patch and comment?




> >
> >
> >> > +		    napi_schedule_prep(napi)) {
> >> > +			virtqueue_disable_cb(sq->vq);
> >> > +			__napi_schedule(napi);
> >> > +			goto again;
> >> > +		}
> >> > +	} else {
> >> > +		__netif_tx_unlock(txq);
> >> > +	}
> >> > +
> >> > +	netif_wake_subqueue(vi->dev, vq2txq(sq->vq));
> >> > +	return sent;
> >> > +}
> >> > +
> >> >  #ifdef CONFIG_NET_RX_BUSY_POLL
> >> > +
> >> >  /* must be called with local_bh_disable()d */
> >> >  static int virtnet_busy_poll(struct napi_struct *napi)
> >> >  {
> >> > @@ -822,36 +881,12 @@ static int virtnet_open(struct net_device *dev)
> >> >  			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> >> >  				schedule_delayed_work(&vi->refill, 0);
> >> >  		virtnet_napi_enable(&vi->rq[i]);
> >> > +		napi_enable(&vi->sq[i].napi);
> >> >  	}
> >> >  
> >> >  	return 0;
> >> >  }
> >> >  
> >> > -static int free_old_xmit_skbs(struct send_queue *sq)
> >> > -{
> >> > -	struct sk_buff *skb;
> >> > -	unsigned int len;
> >> > -	struct virtnet_info *vi = sq->vq->vdev->priv;
> >> > -	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
> >> > -	u64 tx_bytes = 0, tx_packets = 0;
> >> > -
> >> > -	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> >> > -		pr_debug("Sent skb %p\n", skb);
> >> > -
> >> > -		tx_bytes += skb->len;
> >> > -		tx_packets++;
> >> > -
> >> > -		dev_kfree_skb_any(skb);
> >> > -	}
> >> > -
> >> > -	u64_stats_update_begin(&stats->tx_syncp);
> >> > -	stats->tx_bytes += tx_bytes;
> >> > -	stats->tx_packets =+ tx_packets;
> >> > -	u64_stats_update_end(&stats->tx_syncp);
> >> > -
> >> > -	return tx_packets;
> >> > -}
> >> > -
> > So you end up moving it all anyway, why bother splitting out
> > minor changes in previous patches?
> 
> To make review easier, but if you think this complicated it in fact,
> will pack them into a single patch.

^ permalink raw reply

* Re: [RFC PATCH net-next 1/6] virtio: make sure used event never go backwards
From: Jason Wang @ 2014-10-15 10:44 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20141015103232.GG25776@redhat.com>

On 10/15/2014 06:32 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 15, 2014 at 06:13:19PM +0800, Jason Wang wrote:
>> On 10/15/2014 05:34 PM, Michael S. Tsirkin wrote:
>>> On Wed, Oct 15, 2014 at 03:25:25PM +0800, Jason Wang wrote:
>>>> This patch checks the new event idx to make sure used event idx never
>>>> goes back. This is used to synchronize the calls between
>>>> virtqueue_enable_cb_delayed() and virtqueue_enable_cb().
>>>>
>>>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>>>> Cc: Michael S. Tsirkin <mst@redhat.com>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> the implication being that moving event idx back might cause some race
>>> condition?  
>> This will cause race condition when tx interrupt is enabled. Consider
>> the following cases
>>
>> 1) tx napi was scheduled
>> 2) start_xmit() call virtqueue_enable_cb_delayed() and disable cb, [used
>> event is vq->last_used_idx + 3/4 pendg bufs]
>> 3) tx napi enable the callback by virtqueue_enable_cb() [ used event is
>> vq->last_used_idx ]
>>  
>> After step 3, used event was moved back, unnecessary tx interrupt was
>> triggered.
> Well unnecessary interrupts are safe.

But it that is what we want to reduce.
> With your patch caller of virtqueue_enable_cb will not get an
> interrupt on the next buffer which is not safe.
>
> If you don't want an interrupt on the next buffer, don't
> call virtqueue_enable_cb.

So something like this patch should be done in virtio core somewhere
else. Virtio-net can not do this since it does not have the knowledge of
event index.
>
>>> If yes but please describe the race explicitly.
>>> Is there a bug we need to fix on stable?
>> Looks not, current code does not have such race condition.
>>> Please also explicitly describe a configuration that causes event idx
>>> to go back.
>>>
>>> All this info should go in the commit log.
>> Will do this.
>>>> ---
>>>>  drivers/virtio/virtio_ring.c |    7 +++++--
>>>>  1 files changed, 5 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>>>> index 3b1f89b..1b3929f 100644
>>>> --- a/drivers/virtio/virtio_ring.c
>>>> +++ b/drivers/virtio/virtio_ring.c
>>>> @@ -559,14 +559,17 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>>>>  	u16 last_used_idx;
>>>>  
>>>>  	START_USE(vq);
>>>> -
>>>> +	last_used_idx = vq->last_used_idx;
>>>>  	/* We optimistically turn back on interrupts, then check if there was
>>>>  	 * more to do. */
>>>>  	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
>>>>  	 * either clear the flags bit or point the event index at the next
>>>>  	 * entry. Always do both to keep code simple. */
>>>>  	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
>>>> -	vring_used_event(&vq->vring) = last_used_idx = vq->last_used_idx;
>>>> +	/* Make sure used event never go backwards */
>>> s/go/goes/
>>>
>>>> +	if (!vring_need_event(vring_used_event(&vq->vring),
>>>> +			      vq->vring.avail->idx, last_used_idx))
>>>> +		vring_used_event(&vq->vring) = last_used_idx;
>>> The result will be that driver will *not* get an interrupt
>>> on the next consumed buffer, which is likely not what driver
>>> intended when it called virtqueue_enable_cb.
>> This will only happen when we want to delay the interrupt for next few
>> consumed buffers (virtqueue_enable_cb_delayed() was called). For the
>> other case, vq->last_used_idx should be ahead of previous used event. Do
>> you see any other case?
> Call virtqueue_enable_cb_delayed, later call virtqueue_enable_cb.  If
> event index is not updated in virtqueue_enable_cb, driver will not get
> an interrupt on the next buffer.

This is just what we want I think. The interrupt was not lost but fired
after 3/4 pending buffers were consumed. Do you see any real issue on this?
>
>>> Instead, how about we simply document the requirement that drivers either
>>> always call virtqueue_enable_cb_delayed or virtqueue_enable_cb
>>> but not both?
>> We need call them both when tx interrupt is enabled I believe.
> Can you pls reply to my patch and document issues you see?
>

In the previous reply you said you're using
virtuqueue_enable_cb_delayed(), so no race in your patch.

^ permalink raw reply

* Re: [RFC PATCH net-next 6/6] virtio-net: enable tx interrupt only for the final skb in the chain
From: Michael S. Tsirkin @ 2014-10-15 10:46 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <543E4CF7.1040405@redhat.com>

On Wed, Oct 15, 2014 at 06:31:19PM +0800, Jason Wang wrote:
> On 10/15/2014 06:22 PM, Michael S. Tsirkin wrote:
> > On Wed, Oct 15, 2014 at 03:25:30PM +0800, Jason Wang wrote:
> >> With the help of xmit_more and virtqueue_enable_cb_avail(), this patch
> >> enable tx interrupt only for the final skb in the chain if
> >> needed. This will help to mitigate tx interrupts.
> >>
> >> Cc: Rusty Russell <rusty@rustcorp.com.au>
> >> Cc: Michael S. Tsirkin <mst@redhat.com>
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > I think you should split xmit_more stuff out.
> 
> No much difference but if you prefer I will do this.

In fact I did it exactly like this already :)

-- 
MST

^ permalink raw reply

* Re: [RFC PATCH net-next 3/6] virtio-net: small optimization on free_old_xmit_skbs()
From: Michael S. Tsirkin @ 2014-10-15 10:48 UTC (permalink / raw)
  To: David Laight
  Cc: eric.dumazet@gmail.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, davem@davemloft.net
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6D1C9D752A@AcuExch.aculab.com>

On Wed, Oct 15, 2014 at 09:49:01AM +0000, David Laight wrote:
> From: Of Michael S. Tsirkin
> > On Wed, Oct 15, 2014 at 03:25:27PM +0800, Jason Wang wrote:
> > > Accumulate the sent packets and sent bytes in local variables and perform a
> > > single u64_stats_update_begin/end() after.
> > >
> > > Cc: Rusty Russell <rusty@rustcorp.com.au>
> > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > 
> > Not sure how much it's worth but since Eric suggested it ...
> 
> Probably depends on the actual cost of u64_stats_update_begin/end
> against the likely extra saving of the tx_bytes and tx_packets
> values onto the stack across the call to dev_kfree_skb_any().
> (Which depends on the number of caller saved registers.)

Yea, some benchmark results would be nice to see.


> > Acked-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > > ---
> > >  drivers/net/virtio_net.c |   12 ++++++++----
> > >  1 files changed, 8 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 3d0ce44..a4d56b8 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -833,17 +833,21 @@ static void free_old_xmit_skbs(struct send_queue *sq)
> > >  	unsigned int len;
> > >  	struct virtnet_info *vi = sq->vq->vdev->priv;
> > >  	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
> > > +	u64 tx_bytes = 0, tx_packets = 0;
> 
> tx_packets need only be 'unsigned int'.
> The same is almost certainly true of tx_bytes.
> 
> 	David
> 
> > >  	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > >  		pr_debug("Sent skb %p\n", skb);
> > >
> > > -		u64_stats_update_begin(&stats->tx_syncp);
> > > -		stats->tx_bytes += skb->len;
> > > -		stats->tx_packets++;
> > > -		u64_stats_update_end(&stats->tx_syncp);
> > > +		tx_bytes += skb->len;
> > > +		tx_packets++;
> > >
> > >  		dev_kfree_skb_any(skb);
> > >  	}
> > > +
> > > +	u64_stats_update_begin(&stats->tx_syncp);
> > > +	stats->tx_bytes += tx_bytes;
> > > +	stats->tx_packets =+ tx_packets;
> > > +	u64_stats_update_end(&stats->tx_syncp);
> > >  }
> > >
> > >  static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > > --
> > > 1.7.1
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* RE: [RFC PATCH net-next 3/6] virtio-net: small optimization on free_old_xmit_skbs()
From: David Laight @ 2014-10-15 10:51 UTC (permalink / raw)
  To: 'Michael S. Tsirkin'
  Cc: eric.dumazet@gmail.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, davem@davemloft.net
In-Reply-To: <20141015104806.GK25776@redhat.com>

From: Michael S. Tsirkin
> On Wed, Oct 15, 2014 at 09:49:01AM +0000, David Laight wrote:
> > From: Of Michael S. Tsirkin
> > > On Wed, Oct 15, 2014 at 03:25:27PM +0800, Jason Wang wrote:
> > > > Accumulate the sent packets and sent bytes in local variables and perform a
> > > > single u64_stats_update_begin/end() after.
> > > >
> > > > Cc: Rusty Russell <rusty@rustcorp.com.au>
> > > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >
> > > Not sure how much it's worth but since Eric suggested it ...
> >
> > Probably depends on the actual cost of u64_stats_update_begin/end
> > against the likely extra saving of the tx_bytes and tx_packets
> > values onto the stack across the call to dev_kfree_skb_any().
> > (Which depends on the number of caller saved registers.)
> 
> Yea, some benchmark results would be nice to see.

I there are likely to be multiple skb on the queue the fastest
code would probably do one 'virtqueue_get_all()' that returned
a linked list of buffers, then follow the list to get the stats,
and follow it again to free the skb.

	David

> > > Acked-by: Michael S. Tsirkin <mst@redhat.com>
> > >
> > > > ---
> > > >  drivers/net/virtio_net.c |   12 ++++++++----
> > > >  1 files changed, 8 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 3d0ce44..a4d56b8 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -833,17 +833,21 @@ static void free_old_xmit_skbs(struct send_queue *sq)
> > > >  	unsigned int len;
> > > >  	struct virtnet_info *vi = sq->vq->vdev->priv;
> > > >  	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
> > > > +	u64 tx_bytes = 0, tx_packets = 0;
> >
> > tx_packets need only be 'unsigned int'.
> > The same is almost certainly true of tx_bytes.
> >
> > 	David
> >
> > > >  	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > > >  		pr_debug("Sent skb %p\n", skb);
> > > >
> > > > -		u64_stats_update_begin(&stats->tx_syncp);
> > > > -		stats->tx_bytes += skb->len;
> > > > -		stats->tx_packets++;
> > > > -		u64_stats_update_end(&stats->tx_syncp);
> > > > +		tx_bytes += skb->len;
> > > > +		tx_packets++;
> > > >
> > > >  		dev_kfree_skb_any(skb);
> > > >  	}
> > > > +
> > > > +	u64_stats_update_begin(&stats->tx_syncp);
> > > > +	stats->tx_bytes += tx_bytes;
> > > > +	stats->tx_packets =+ tx_packets;
> > > > +	u64_stats_update_end(&stats->tx_syncp);
> > > >  }
> > > >
> > > >  static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > > > --
> > > > 1.7.1
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >

^ permalink raw reply

* performance of two 82599 nics at a host
From: Bokhan Artem @ 2014-10-15 10:52 UTC (permalink / raw)
  To: netdev

Hello, everyone.

I have two 82599 nics connected to different numa nodes. Each nic can 
recieve up to 12Mpps 64-byte packets.

But when I send traffic to both nics only 5Mpps+5Mpps can be recieved.

May someboby help if it's driver, kernel or hardware bottleneck?

Configuration:

2 x E5-2667 v2@3.30GHz
3.14.19 kernel, 3.22.3 ixgbe
8 queues at each nic with affinity set

NIC1
--pkt/eth2- --pkt/eth5-
#recv #send:#recv #send
    0     0 :12.6M    0
    0     0 :12.7M    0
    0     0 :12.7M    0
    0     0 :12.7M    0
    0     0 :12.7M    0

NIC2
--pkt/eth2- --pkt/eth5-
#recv #send:#recv #send
12.7M    0 :   0     0
12.7M    0 :   0     0
12.6M    0 :   0     0
12.7M    0 :   0     0
12.6M    0 :   0     0

both
--pkt/eth2- --pkt/eth5-
#recv #send:#recv #send
5099k    0 :5758k    0
5467k    0 :5325k    0
5050k    0 :5432k    0
5206k    0 :5133k    0
5162k    0 :5164k    0

^ permalink raw reply

* Re: [RFC PATCH net-next 2/6] virtio: introduce virtio_enable_cb_avail()
From: Jason Wang @ 2014-10-15 10:58 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20141015104130.GH25776@redhat.com>

On 10/15/2014 06:41 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 15, 2014 at 06:19:15PM +0800, Jason Wang wrote:
>> On 10/15/2014 05:28 PM, Michael S. Tsirkin wrote:
>>> On Wed, Oct 15, 2014 at 03:25:26PM +0800, Jason Wang wrote:
>>>> This patch introduces virtio_enable_cb_avail() to publish avail idx
>>>> and used event. This could be used by batched buffer submitting to
>>>> reduce the number of tx interrupts.
>>>>
>>>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>>>> Cc: Michael S. Tsirkin <mst@redhat.com>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>> ---
>>>>  drivers/virtio/virtio_ring.c |   22 ++++++++++++++++++++--
>>>>  include/linux/virtio.h       |    2 ++
>>>>  2 files changed, 22 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>>>> index 1b3929f..d67fbf8 100644
>>>> --- a/drivers/virtio/virtio_ring.c
>>>> +++ b/drivers/virtio/virtio_ring.c
>>>> @@ -567,14 +567,32 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>>>>  	 * entry. Always do both to keep code simple. */
>>>>  	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
>>>>  	/* Make sure used event never go backwards */
>>>> -	if (!vring_need_event(vring_used_event(&vq->vring),
>>>> -			      vq->vring.avail->idx, last_used_idx))
>>>> +	if (vq->vring.avail->idx != vring_used_event(&vq->vring) &&
>>>> +	    !vring_need_event(vring_used_event(&vq->vring),
>>>> +			      vq->vring.avail->idx, last_used_idx)) {
>>>>  		vring_used_event(&vq->vring) = last_used_idx;
>>>> +	}
>>>>  	END_USE(vq);
>>>>  	return last_used_idx;
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
>>>>
>>> I see you are also changing virtqueue_enable_cb_prepare, why?
>> This is also used to prevent it from moving the used event backwards.
>> This may happens when we handle tx napi after we publish avail idx as
>> used event (virtqueue_enable_cb_avail() was called).
> So it's wrong exactly in the same way.
>
> But also, please document this stuff, don't put
> unrelated changes in a patch called "introduce
> virtqueue_enable_cb_avail".
>
>
>>>> +bool virtqueue_enable_cb_avail(struct virtqueue *_vq)
>>>> +{
>>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>>> +	bool ret;
>>>> +
>>>> +	START_USE(vq);
>>>> +	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
>>>> +	vring_used_event(&vq->vring) = vq->vring.avail->idx;
>>>> +	ret = vring_need_event(vq->vring.avail->idx,
>>>> +			       vq->last_used_idx, vq->vring.used->idx);
>>>> +	END_USE(vq);
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_avail);
>>>> +
>>>>  /**
>>>>   * virtqueue_poll - query pending used buffers
>>>>   * @vq: the struct virtqueue we're talking about.
>>> Could not figure out what this does.
>>> Please add documentation.
>>>
>> Sure, does something like below explain what does this function do?
>>
>> /**                                                                            
>>
>>  * virtqueue_enable_cb_avail - restart callbacks after
>> disable_cb.           
>>  * @vq: the struct virtqueue we're talking
>> about.                              
>>  *                                                                             
>>
>>  * This re-enables callbacks but hints to the other side to
>> delay              
>>  * interrupts all of the available buffers have been processed;         
>
> So this is like virtqueue_enable_cb_delayed but even more
> aggressive?
> I think it's too agressive: it's better to wake up guest
> after you are through most of buffers, but not all,
> so guest and host can work in parallel.

Note that:

- it was only used when there are still few of free slots (which is
greater than 2 + MAX_SKB_FRAGS)
- my patch keeps the free_old_xmit_skbs() in the beginning of
start_xmit(), so the tx skb reclaiming does not depends totally on tx
interrupt. If more packets comes, we'd expect some of them were freed in
ndo_start_xmit(). If not, finally we may trigger the
virtqueue_enable_cb_delayed().

So probably not as aggressive as it looks. I will do benchmark on this.
>
>
>>  * it returns "false" if there are at least one pending buffer in the
>> queue,          
>>  * to detect a possible race between the driver checking for more
>> work,        
>>  * and enabling
>> callbacks.                                                     
>>  *                                                                             
>>
>>  * Caller must ensure we don't call this with other
>> virtqueue                  
>>  * operations at the same time (except where
>> noted).                           
>>  */
>>
>>>> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
>>>> index b46671e..bfaf058 100644
>>>> --- a/include/linux/virtio.h
>>>> +++ b/include/linux/virtio.h
>>>> @@ -65,6 +65,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
>>>>  
>>>>  unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
>>>>  
>>>> +bool virtqueue_enable_cb_avail(struct virtqueue *vq);
>>>> +
>>>>  bool virtqueue_poll(struct virtqueue *vq, unsigned);
>>>>  
>>>>  bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
>>>> -- 
>>>> 1.7.1
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [RFC PATCH net-next 5/6] virtio-net: enable tx interrupt
From: Jason Wang @ 2014-10-15 11:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20141015104309.GI25776@redhat.com>

On 10/15/2014 06:43 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 15, 2014 at 06:25:25PM +0800, Jason Wang wrote:
>> > On 10/15/2014 06:18 PM, Michael S. Tsirkin wrote:
>>> > > On Wed, Oct 15, 2014 at 03:25:29PM +0800, Jason Wang wrote:
>>>>> > >> > Orphan skb in ndo_start_xmit() breaks socket accounting and packet
>>>>> > >> > queuing. This in fact breaks lots of things such as pktgen and several
>>>>> > >> > TCP optimizations. And also make BQL can't be implemented for
>>>>> > >> > virtio-net.
>>>>> > >> > 
>>>>> > >> > This patch tries to solve this issue by enabling tx interrupt. To
>>>>> > >> > avoid introducing extra spinlocks, a tx napi was scheduled to free
>>>>> > >> > those packets.
>>>>> > >> > 
>>>>> > >> > More tx interrupt mitigation method could be used on top.
>>>>> > >> > 
>>>>> > >> > Cc: Rusty Russell <rusty@rustcorp.com.au>
>>>>> > >> > Cc: Michael S. Tsirkin <mst@redhat.com>
>>>>> > >> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>>> > >> > ---
>>>>> > >> >  drivers/net/virtio_net.c |  125 +++++++++++++++++++++++++++++++---------------
>>>>> > >> >  1 files changed, 85 insertions(+), 40 deletions(-)
>>>>> > >> > 
>>>>> > >> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>>> > >> > index ccf98f9..2afc2e2 100644
>>>>> > >> > --- a/drivers/net/virtio_net.c
>>>>> > >> > +++ b/drivers/net/virtio_net.c
>>>>> > >> > @@ -72,6 +72,8 @@ struct send_queue {
>>>>> > >> >  
>>>>> > >> >  	/* Name of the send queue: output.$index */
>>>>> > >> >  	char name[40];
>>>>> > >> > +
>>>>> > >> > +	struct napi_struct napi;
>>>>> > >> >  };
>>>>> > >> >  
>>>>> > >> >  /* Internal representation of a receive virtqueue */
>>>>> > >> > @@ -217,15 +219,40 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
>>>>> > >> >  	return p;
>>>>> > >> >  }
>>>>> > >> >  
>>>>> > >> > +static int free_old_xmit_skbs(struct send_queue *sq, int budget)
>>>>> > >> > +{
>>>>> > >> > +	struct sk_buff *skb;
>>>>> > >> > +	unsigned int len;
>>>>> > >> > +	struct virtnet_info *vi = sq->vq->vdev->priv;
>>>>> > >> > +	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
>>>>> > >> > +	u64 tx_bytes = 0, tx_packets = 0;
>>>>> > >> > +
>>>>> > >> > +	while (tx_packets < budget &&
>>>>> > >> > +	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>>>>> > >> > +		pr_debug("Sent skb %p\n", skb);
>>>>> > >> > +
>>>>> > >> > +		tx_bytes += skb->len;
>>>>> > >> > +		tx_packets++;
>>>>> > >> > +
>>>>> > >> > +		dev_kfree_skb_any(skb);
>>>>> > >> > +	}
>>>>> > >> > +
>>>>> > >> > +	u64_stats_update_begin(&stats->tx_syncp);
>>>>> > >> > +	stats->tx_bytes += tx_bytes;
>>>>> > >> > +	stats->tx_packets =+ tx_packets;
>>>>> > >> > +	u64_stats_update_end(&stats->tx_syncp);
>>>>> > >> > +
>>>>> > >> > +	return tx_packets;
>>>>> > >> > +}
>>>>> > >> > +
>>>>> > >> >  static void skb_xmit_done(struct virtqueue *vq)
>>>>> > >> >  {
>>>>> > >> >  	struct virtnet_info *vi = vq->vdev->priv;
>>>>> > >> > +	struct send_queue *sq = &vi->sq[vq2txq(vq)];
>>>>> > >> >  
>>>>> > >> > -	/* Suppress further interrupts. */
>>>>> > >> > -	virtqueue_disable_cb(vq);
>>>>> > >> > -
>>>>> > >> > -	/* We were probably waiting for more output buffers. */
>>>>> > >> > -	netif_wake_subqueue(vi->dev, vq2txq(vq));
>>>>> > >> > +	if (napi_schedule_prep(&sq->napi)) {
>>>>> > >> > +		__napi_schedule(&sq->napi);
>>>>> > >> > +	}
>>>>> > >> >  }
>>>>> > >> >  
>>>>> > >> >  static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
>>>>> > >> > @@ -774,7 +801,39 @@ again:
>>>>> > >> >  	return received;
>>>>> > >> >  }
>>>>> > >> >  
>>>>> > >> > +static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>>>> > >> > +{
>>>>> > >> > +	struct send_queue *sq =
>>>>> > >> > +		container_of(napi, struct send_queue, napi);
>>>>> > >> > +	struct virtnet_info *vi = sq->vq->vdev->priv;
>>>>> > >> > +	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
>>>>> > >> > +	unsigned int r, sent = 0;
>>>>> > >> > +
>>>>> > >> > +again:
>>>>> > >> > +	__netif_tx_lock(txq, smp_processor_id());
>>>>> > >> > +	virtqueue_disable_cb(sq->vq);
>>>>> > >> > +	sent += free_old_xmit_skbs(sq, budget - sent);
>>>>> > >> > +
>>>>> > >> > +	if (sent < budget) {
>>>>> > >> > +		r = virtqueue_enable_cb_prepare(sq->vq);
>>>>> > >> > +		napi_complete(napi);
>>>>> > >> > +		__netif_tx_unlock(txq);
>>>>> > >> > +		if (unlikely(virtqueue_poll(sq->vq, r)) &&
>>> > > So you are enabling callback on the next packet,
>>> > > which is almost sure to cause an interrupt storm
>>> > > on the guest.
>>> > >
>>> > >
>>> > > I think it's a bad idea, this is why I used
>>> > > enable_cb_delayed in my patch.
>> > 
>> > Right, will do this, but may also need to make sure used event never
>> > goes back since we may call virtqueue_enable_cb_avail().
> That's why my patch always calls virtqueue_enable_cb_delayed.
> So no need for hacks.
>
> Maybe you can review my patch and comment?

I think I miss the virtqueue_enable_cb_delayed() in your patch when I'm
reviewing it. Will do.

^ permalink raw reply

* Re: [PATCH v3 net] ixgbe: check adapter->vfinfo before dereference
From: Jeff Kirsher @ 2014-10-15 11:00 UTC (permalink / raw)
  To: Thierry Herbelot; +Cc: Jesse Brandeburg, Bruce Allan, netdev, emil.s.tantilov
In-Reply-To: <1413367080-31540-1-git-send-email-thierry.herbelot@6wind.com>

[-- Attachment #1: Type: text/plain, Size: 811 bytes --]

On Wed, 2014-10-15 at 11:58 +0200, Thierry Herbelot wrote:
> this protects against the following panic:
> (before a VF was actually created on p96p1 PF Ethernet port)
> 
> ip link set p96p1 vf 0 spoofchk off
> BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000052
> IP: [<ffffffffa044a1c1>] ixgbe_ndo_set_vf_spoofchk+0x51/0x150 [ixgbe]
> 
> Signed-off-by: Thierry Herbelot <thierry.herbelot@6wind.com>
> ---
> 
> v2:
>   compilation fixes
> 
> v3:
>   remove checks in functions where vfinfo is known not to be NULL
>   return -EINVAL as error code
> 
>  drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |   42
> ++++++++++++++++++++++--
>  1 file changed, 40 insertions(+), 2 deletions(-)

Thanks Thierry, I have added this patch to my queue (and dropped your
v2).

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH RFC] virtio_net: enable tx interrupt
From: Jason Wang @ 2014-10-15 11:04 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel; +Cc: netdev, virtualization
In-Reply-To: <1413323524-23380-1-git-send-email-mst@redhat.com>

On 10/15/2014 05:53 AM, Michael S. Tsirkin wrote:
> On newer hosts that support delayed tx interrupts,
> we probably don't have much to gain from orphaning
> packets early.
>
> Based on patch by Jason Wang.
>
> Note: this will likely degrade performance for hosts without event idx
> support.  Various fallback options are available, including
> orphaning conditionally.
> Testing TBD.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/net/virtio_net.c | 119 +++++++++++++++++++++++++++++++++--------------
>  1 file changed, 83 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6b6e136..62c059d 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -72,6 +72,8 @@ struct send_queue {
>  
>  	/* Name of the send queue: output.$index */
>  	char name[40];
> +
> +	struct napi_struct napi;
>  };
>  
>  /* Internal representation of a receive virtqueue */
> @@ -211,15 +213,38 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
>  	return p;
>  }
>  
> +static int free_old_xmit_skbs(struct send_queue *sq, int budget)
> +{
> +	struct sk_buff *skb;
> +	unsigned int len;
> +	struct virtnet_info *vi = sq->vq->vdev->priv;
> +	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
> +	int sent = 0;
> +
> +	while (sent < budget &&
> +	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> +		pr_debug("Sent skb %p\n", skb);
> +
> +		u64_stats_update_begin(&stats->tx_syncp);
> +		stats->tx_bytes += skb->len;
> +		stats->tx_packets++;
> +		u64_stats_update_end(&stats->tx_syncp);
> +
> +		dev_kfree_skb_any(skb);
> +		sent++;
> +	}
> +
> +	return sent;
> +}
> +
>  static void skb_xmit_done(struct virtqueue *vq)
>  {
>  	struct virtnet_info *vi = vq->vdev->priv;
> +	struct send_queue *sq = &vi->sq[vq2txq(vq)];
>  
> -	/* Suppress further interrupts. */
> -	virtqueue_disable_cb(vq);
> -
> -	/* We were probably waiting for more output buffers. */
> -	netif_wake_subqueue(vi->dev, vq2txq(vq));
> +	if (napi_schedule_prep(&sq->napi)) {
> +		__napi_schedule(&sq->napi);
> +	}
>  }
>  
>  static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
> @@ -766,6 +791,37 @@ again:
>  	return received;
>  }
>  
> +static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> +{
> +	struct send_queue *sq =
> +		container_of(napi, struct send_queue, napi);
> +	struct virtnet_info *vi = sq->vq->vdev->priv;
> +	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
> +	unsigned int r, sent = 0;
> +
> +again:
> +	__netif_tx_lock(txq, smp_processor_id());
> +	virtqueue_disable_cb(sq->vq);
> +	sent += free_old_xmit_skbs(sq, budget - sent);
> +
> +	if (sent < budget) {
> +		r = virtqueue_enable_cb_prepare(sq->vq);

So even virtqueue_enable_cb_delayed() was used in start_xmit(). This can
move used index backwards to trigger unnecessary interrupts.
> +		napi_complete(napi);
> +		__netif_tx_unlock(txq);
> +		if (unlikely(virtqueue_poll(sq->vq, r)) &&
> +		    napi_schedule_prep(napi)) {
> +			virtqueue_disable_cb(sq->vq);
> +			__napi_schedule(napi);
> +			goto again;
> +		}
> +	} else {
> +		__netif_tx_unlock(txq);
> +	}
> +
> +	netif_wake_subqueue(vi->dev, vq2txq(sq->vq));
> +	return sent;
> +}
> +
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>  /* must be called with local_bh_disable()d */
>  static int virtnet_busy_poll(struct napi_struct *napi)
> @@ -814,30 +870,12 @@ static int virtnet_open(struct net_device *dev)
>  			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
>  				schedule_delayed_work(&vi->refill, 0);
>  		virtnet_napi_enable(&vi->rq[i]);
> +		napi_enable(&vi->sq[i].napi);
>  	}
>  
>  	return 0;
>  }
>  
> -static void free_old_xmit_skbs(struct send_queue *sq)
> -{
> -	struct sk_buff *skb;
> -	unsigned int len;
> -	struct virtnet_info *vi = sq->vq->vdev->priv;
> -	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
> -
> -	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		pr_debug("Sent skb %p\n", skb);
> -
> -		u64_stats_update_begin(&stats->tx_syncp);
> -		stats->tx_bytes += skb->len;
> -		stats->tx_packets++;
> -		u64_stats_update_end(&stats->tx_syncp);
> -
> -		dev_kfree_skb_any(skb);
> -	}
> -}
> -
>  static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>  {
>  	struct skb_vnet_hdr *hdr;
> @@ -902,7 +940,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>  		sg_set_buf(sq->sg, hdr, hdr_len);
>  		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
>  	}
> -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> +
> +	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb,
> +				    GFP_ATOMIC);
>  }
>  
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> @@ -910,10 +950,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int qnum = skb_get_queue_mapping(skb);
>  	struct send_queue *sq = &vi->sq[qnum];
> -	int err;
> +	int err, qsize = virtqueue_get_vring_size(sq->vq);
>  
> -	/* Free up any pending old buffers before queueing new ones. */
> -	free_old_xmit_skbs(sq);

I think we'd better keep this. Since it may the tx skb freeing not
totally depends on the tx interrupt, delayed interrupt may work better
without damage the latency.
> +	virtqueue_disable_cb(sq->vq);
>  
>  	/* Try to transmit */
>  	err = xmit_skb(sq, skb);
> @@ -930,22 +969,20 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	}
>  	virtqueue_kick(sq->vq);
>  
> -	/* Don't wait up for transmitted skbs to be freed. */
> -	skb_orphan(skb);
> -	nf_reset(skb);
> -
>  	/* Apparently nice girls don't return TX_BUSY; stop the queue
>  	 * before it gets out of hand.  Naturally, this wastes entries. */
>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
>  		netif_stop_subqueue(dev, qnum);
>  		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
> -			free_old_xmit_skbs(sq);
> +			free_old_xmit_skbs(sq, qsize);
>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
>  			}
>  		}
> +	} else if (virtqueue_enable_cb_delayed(sq->vq)) {
> +		free_old_xmit_skbs(sq, qsize);
>  	}
>  
>  	return NETDEV_TX_OK;
> @@ -1124,8 +1161,10 @@ static int virtnet_close(struct net_device *dev)
>  	/* Make sure refill_work doesn't re-enable napi! */
>  	cancel_delayed_work_sync(&vi->refill);
>  
> -	for (i = 0; i < vi->max_queue_pairs; i++)
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		napi_disable(&vi->rq[i].napi);
> +		napi_disable(&vi->sq[i].napi);
> +	}
>  
>  	return 0;
>  }
> @@ -1438,8 +1477,10 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>  {
>  	int i;
>  
> -	for (i = 0; i < vi->max_queue_pairs; i++)
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		netif_napi_del(&vi->rq[i].napi);
> +		netif_napi_del(&vi->sq[i].napi);
> +	}
>  
>  	kfree(vi->rq);
>  	kfree(vi->sq);
> @@ -1593,6 +1634,8 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
>  		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
>  			       napi_weight);
>  		napi_hash_add(&vi->rq[i].napi);
> +		netif_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
> +			       napi_weight);
>  
>  		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
>  		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
> @@ -1893,8 +1936,10 @@ static int virtnet_freeze(struct virtio_device *vdev)
>  	if (netif_running(vi->dev)) {
>  		for (i = 0; i < vi->max_queue_pairs; i++) {
>  			napi_disable(&vi->rq[i].napi);
> +			napi_disable(&vi->sq[i].napi);
>  			napi_hash_del(&vi->rq[i].napi);
>  			netif_napi_del(&vi->rq[i].napi);
> +			netif_napi_del(&vi->sq[i].napi);
>  		}
>  	}
>  
> @@ -1919,8 +1964,10 @@ static int virtnet_restore(struct virtio_device *vdev)
>  			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
>  				schedule_delayed_work(&vi->refill, 0);
>  
> -		for (i = 0; i < vi->max_queue_pairs; i++)
> +		for (i = 0; i < vi->max_queue_pairs; i++) {
>  			virtnet_napi_enable(&vi->rq[i]);
> +			napi_enable(&vi->sq[i].napi);
> +		}
>  	}
>  
>  	netif_device_attach(vi->dev);

^ permalink raw reply

* Re: [PATCH RFC net-next] sfc: add support for skb->xmit_more
From: Edward Cree @ 2014-10-15 11:05 UTC (permalink / raw)
  To: David Miller; +Cc: dborkman, nikolay, netdev, sshah, jcooper, linux-net-drivers
In-Reply-To: <20141014.171501.448630399243434370.davem@davemloft.net>

On 14/10/14 22:15, David Miller wrote:
> From: Edward Cree <ecree@solarflare.com>
> Date: Tue, 14 Oct 2014 19:41:37 +0100
>
>> Don't ring the doorbell, and don't do PIO.  This will also prevent
>>  TX Push, because there will be more than one buffer waiting when
>>  the doorbell is rung.
>>
>> Signed-off-by: Edward Cree <ecree@solarflare.com>
> This looks good to me, mind if I apply this now?
I'd rather wait until Jon Cooper's had a chance to look at it; he
understands our TX path better than I.

-Edward

^ permalink raw reply

* Re: [RFC PATCH net-next 0/6] Always use tx interrupt for virtio-net
From: Jason Wang @ 2014-10-15 11:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20141015102553.GF25776@redhat.com>

On 10/15/2014 06:25 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 15, 2014 at 03:25:24PM +0800, Jason Wang wrote:
>> According to David, proper accounting and queueing (at all levels, not
>> just TCP sockets) is more important than trying to skim a bunch of
>> cycles by avoiding TX interrupts.
> He also mentioned we should find other ways to batch
>

Right.
>> Having an event to free the SKB is
>> absolutely essential for the stack to operate correctly.
>>
>> This series tries to enable tx interrupt for virtio-net. The idea is
>> simple: enable tx interrupt and schedule a tx napi to free old xmit
>> skbs.
>>
>> Several notes:
>> - Tx interrupt storm avoidance when queue is about to be full is
>>   kept.
> But queue is typically *not* full. More important to avoid interrupt
> storms in that case IMO.

Yes.
>> Since we may enable callbacks in both ndo_start_xmit() and tx
>>   napi, patch 1 adds a check to make sure used event never go
>>   back. This will let the napi not enable the callbacks wrongly after
>>   delayed callbacks was used.
> So why not just use delayed callbacks?

This means the tx interrupt are coalesced in a somewhat adaptive way.
Need benchmark to see its effect.
>
>> - For bulk dequeuing, there's no need to enable tx interrupt for each
>>   packet. The last patch only enable tx interrupt for the final skb in
>>   the chain through xmit_more and a new helper to publish current avail
>>   idx as used event.
>>
>> This series fixes several issues of original rfc pointed out by Michael.
> Could you list the issues, for ease of review?

Probably just one:

- Move the virtqueue_disable_cb() from skb_xmit_done() into
virtnet_poll_tx() under tx lock.

^ permalink raw reply

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Eric Dumazet @ 2014-10-15 11:27 UTC (permalink / raw)
  To: Krzysztof Kolasa; +Cc: Cong Wang, David Miller, Eric Dumazet, netdev
In-Reply-To: <543E4DD8.80203@winsoft.pl>

On Wed, 2014-10-15 at 12:35 +0200, Krzysztof Kolasa wrote:
> W dniu 14.10.2014 o 23:43, Eric Dumazet pisze:
> > On Tue, 2014-10-14 at 14:24 -0700, Cong Wang wrote:
> >
> >> Since we are still in merge window, I don't think we have to use
> >> a one-line fix for a bug introduced in this merge window.
> > You are clearly refactoring here. Its a nice cleanup.
> >
> > If I was the maintainer, I would prefer the one line fix.
> >
> > Then when net-next is open, you refactor.
> >
> > As I said, I wont argue, do whatever you want.
> >
> > Thanks
> >
> >
> >
> 
> one-line patch not resolve problem, fix created by Cong Wang resolves 
> problem !!!

Hmm, there should be no difference with either patch.

tcp_v4_rcv() 
...
memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
        sizeof(struct inet_skb_parm));
...
-> tcp_v4_do_rcv() 
   ->  tcp_v4_hnd_req()
      -> cookie_v4_check(... , &TCP_SKB_CB(skb)->header.h4.opt)

Hmm...

^ permalink raw reply

* Re: [PATCH v2] ipv4: dst_entry leak in ip_append_data()
From: Vasily Averin @ 2014-10-15 11:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy
In-Reply-To: <1413365433.12304.53.camel@edumazet-glaptop2.roam.corp.google.com>

On 15.10.2014 13:30, Eric Dumazet wrote:
> On Wed, 2014-10-15 at 10:56 +0400, Vasily Averin wrote:
>> On 15.10.2014 08:46, Eric Dumazet wrote:
>>> On Tue, 2014-10-14 at 08:57 +0400, Vasily Averin wrote:
>>>> v2: adjust the indentation of the arguments __ip_append_data() call
>>>>
>>>> Fixes: 2e77d89b2fa8 ("net: avoid a pair of dst_hold()/dst_release() in ip_append_data()")
>>>>
>>>> If sk_write_queue is empty ip_append_data() executes ip_setup_cork()
>>>> that "steals" dst entry from rt to cork. Later it calls __ip_append_data()
>>>> that creates skb and adds it to sk_write_queue.
>>>>
>>>> If skb was added successfully following ip_push_pending_frames() call
>>>> reassign dst entries from cork to skb, and kfree_skb frees dst_entry.
>>>>
>>>> However nobody frees stolen dst_entry if skb was not added into sk_write_queue.
>>>
>>> I thought this was done by ip_flush_pending_frames() ?
>>
>> Take look at ip_send_unicast_reply():
> 
> So maybe the bug is here ?

Thank you, I'll remake my patch.

^ permalink raw reply

* Re: [RFC PATCH net-next 1/6] virtio: make sure used event never go backwards
From: Michael S. Tsirkin @ 2014-10-15 11:38 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <543E5019.9020507@redhat.com>

On Wed, Oct 15, 2014 at 06:44:41PM +0800, Jason Wang wrote:
> On 10/15/2014 06:32 PM, Michael S. Tsirkin wrote:
> > On Wed, Oct 15, 2014 at 06:13:19PM +0800, Jason Wang wrote:
> >> On 10/15/2014 05:34 PM, Michael S. Tsirkin wrote:
> >>> On Wed, Oct 15, 2014 at 03:25:25PM +0800, Jason Wang wrote:
> >>>> This patch checks the new event idx to make sure used event idx never
> >>>> goes back. This is used to synchronize the calls between
> >>>> virtqueue_enable_cb_delayed() and virtqueue_enable_cb().
> >>>>
> >>>> Cc: Rusty Russell <rusty@rustcorp.com.au>
> >>>> Cc: Michael S. Tsirkin <mst@redhat.com>
> >>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >>> the implication being that moving event idx back might cause some race
> >>> condition?  
> >> This will cause race condition when tx interrupt is enabled. Consider
> >> the following cases
> >>
> >> 1) tx napi was scheduled
> >> 2) start_xmit() call virtqueue_enable_cb_delayed() and disable cb, [used
> >> event is vq->last_used_idx + 3/4 pendg bufs]
> >> 3) tx napi enable the callback by virtqueue_enable_cb() [ used event is
> >> vq->last_used_idx ]
> >>  
> >> After step 3, used event was moved back, unnecessary tx interrupt was
> >> triggered.
> > Well unnecessary interrupts are safe.
> 
> But it that is what we want to reduce.

It's all about correctness. I don't think mixing enable_cb
and enable_cb_delayed makes sense, let's just make
virtio behave correctly if that happens, no need to
optimize for that.


> > With your patch caller of virtqueue_enable_cb will not get an
> > interrupt on the next buffer which is not safe.
> >
> > If you don't want an interrupt on the next buffer, don't
> > call virtqueue_enable_cb.
> 
> So something like this patch should be done in virtio core somewhere
> else. Virtio-net can not do this since it does not have the knowledge of
> event index.

Take a look at my patch - no calls to enable_cb, only
enable_cb_delayed, so we should be fine.

> >
> >>> If yes but please describe the race explicitly.
> >>> Is there a bug we need to fix on stable?
> >> Looks not, current code does not have such race condition.
> >>> Please also explicitly describe a configuration that causes event idx
> >>> to go back.
> >>>
> >>> All this info should go in the commit log.
> >> Will do this.
> >>>> ---
> >>>>  drivers/virtio/virtio_ring.c |    7 +++++--
> >>>>  1 files changed, 5 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >>>> index 3b1f89b..1b3929f 100644
> >>>> --- a/drivers/virtio/virtio_ring.c
> >>>> +++ b/drivers/virtio/virtio_ring.c
> >>>> @@ -559,14 +559,17 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> >>>>  	u16 last_used_idx;
> >>>>  
> >>>>  	START_USE(vq);
> >>>> -
> >>>> +	last_used_idx = vq->last_used_idx;
> >>>>  	/* We optimistically turn back on interrupts, then check if there was
> >>>>  	 * more to do. */
> >>>>  	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> >>>>  	 * either clear the flags bit or point the event index at the next
> >>>>  	 * entry. Always do both to keep code simple. */
> >>>>  	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
> >>>> -	vring_used_event(&vq->vring) = last_used_idx = vq->last_used_idx;
> >>>> +	/* Make sure used event never go backwards */
> >>> s/go/goes/
> >>>
> >>>> +	if (!vring_need_event(vring_used_event(&vq->vring),
> >>>> +			      vq->vring.avail->idx, last_used_idx))
> >>>> +		vring_used_event(&vq->vring) = last_used_idx;
> >>> The result will be that driver will *not* get an interrupt
> >>> on the next consumed buffer, which is likely not what driver
> >>> intended when it called virtqueue_enable_cb.
> >> This will only happen when we want to delay the interrupt for next few
> >> consumed buffers (virtqueue_enable_cb_delayed() was called). For the
> >> other case, vq->last_used_idx should be ahead of previous used event. Do
> >> you see any other case?
> > Call virtqueue_enable_cb_delayed, later call virtqueue_enable_cb.  If
> > event index is not updated in virtqueue_enable_cb, driver will not get
> > an interrupt on the next buffer.
> 
> This is just what we want I think. The interrupt was not lost but fired
> after 3/4 pending buffers were consumed. Do you see any real issue on this?

Yes, this violates the API. For example device might never
consume the rest of buffers.


> >
> >>> Instead, how about we simply document the requirement that drivers either
> >>> always call virtqueue_enable_cb_delayed or virtqueue_enable_cb
> >>> but not both?
> >> We need call them both when tx interrupt is enabled I believe.
> > Can you pls reply to my patch and document issues you see?
> >
> 
> In the previous reply you said you're using
> virtuqueue_enable_cb_delayed(), so no race in your patch.

OK so you think my patch is also correct, but that yours gives better
efficiency?

-- 
MST

^ permalink raw reply

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Krzysztof Kolasa @ 2014-10-15 11:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Cong Wang, David Miller, Eric Dumazet, netdev
In-Reply-To: <1413372422.17365.10.camel@edumazet-glaptop2.roam.corp.google.com>

W dniu 15.10.2014 o 13:27, Eric Dumazet pisze:
> On Wed, 2014-10-15 at 12:35 +0200, Krzysztof Kolasa wrote:
>> W dniu 14.10.2014 o 23:43, Eric Dumazet pisze:
>>> On Tue, 2014-10-14 at 14:24 -0700, Cong Wang wrote:
>>>
>>>> Since we are still in merge window, I don't think we have to use
>>>> a one-line fix for a bug introduced in this merge window.
>>> You are clearly refactoring here. Its a nice cleanup.
>>>
>>> If I was the maintainer, I would prefer the one line fix.
>>>
>>> Then when net-next is open, you refactor.
>>>
>>> As I said, I wont argue, do whatever you want.
>>>
>>> Thanks
>>>
>>>
>>>
>> one-line patch not resolve problem, fix created by Cong Wang resolves
>> problem !!!
> Hmm, there should be no difference with either patch.
>
> tcp_v4_rcv()
> ...
> memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
>          sizeof(struct inet_skb_parm));
> ...
> -> tcp_v4_do_rcv()
>     ->  tcp_v4_hnd_req()
>        -> cookie_v4_check(... , &TCP_SKB_CB(skb)->header.h4.opt)
>
> Hmm...
>
>

on a 32bit system, the patch did not solve the problem :(
I have exactly the same problem as before the patch

I do not understand this, perhaps the problem is hidden somewhere else,
one thing is certain after revert commit 971f10eca1 everything works 
correctly

^ permalink raw reply

* Re: [RFC PATCH net-next 2/6] virtio: introduce virtio_enable_cb_avail()
From: Michael S. Tsirkin @ 2014-10-15 11:43 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, virtualization, davem
In-Reply-To: <543E5347.7060106@redhat.com>

On Wed, Oct 15, 2014 at 06:58:15PM +0800, Jason Wang wrote:
> On 10/15/2014 06:41 PM, Michael S. Tsirkin wrote:
> > On Wed, Oct 15, 2014 at 06:19:15PM +0800, Jason Wang wrote:
> >> On 10/15/2014 05:28 PM, Michael S. Tsirkin wrote:
> >>> On Wed, Oct 15, 2014 at 03:25:26PM +0800, Jason Wang wrote:
> >>>> This patch introduces virtio_enable_cb_avail() to publish avail idx
> >>>> and used event. This could be used by batched buffer submitting to
> >>>> reduce the number of tx interrupts.
> >>>>
> >>>> Cc: Rusty Russell <rusty@rustcorp.com.au>
> >>>> Cc: Michael S. Tsirkin <mst@redhat.com>
> >>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >>>> ---
> >>>>  drivers/virtio/virtio_ring.c |   22 ++++++++++++++++++++--
> >>>>  include/linux/virtio.h       |    2 ++
> >>>>  2 files changed, 22 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >>>> index 1b3929f..d67fbf8 100644
> >>>> --- a/drivers/virtio/virtio_ring.c
> >>>> +++ b/drivers/virtio/virtio_ring.c
> >>>> @@ -567,14 +567,32 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> >>>>  	 * entry. Always do both to keep code simple. */
> >>>>  	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
> >>>>  	/* Make sure used event never go backwards */
> >>>> -	if (!vring_need_event(vring_used_event(&vq->vring),
> >>>> -			      vq->vring.avail->idx, last_used_idx))
> >>>> +	if (vq->vring.avail->idx != vring_used_event(&vq->vring) &&
> >>>> +	    !vring_need_event(vring_used_event(&vq->vring),
> >>>> +			      vq->vring.avail->idx, last_used_idx)) {
> >>>>  		vring_used_event(&vq->vring) = last_used_idx;
> >>>> +	}
> >>>>  	END_USE(vq);
> >>>>  	return last_used_idx;
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
> >>>>
> >>> I see you are also changing virtqueue_enable_cb_prepare, why?
> >> This is also used to prevent it from moving the used event backwards.
> >> This may happens when we handle tx napi after we publish avail idx as
> >> used event (virtqueue_enable_cb_avail() was called).
> > So it's wrong exactly in the same way.
> >
> > But also, please document this stuff, don't put
> > unrelated changes in a patch called "introduce
> > virtqueue_enable_cb_avail".
> >
> >
> >>>> +bool virtqueue_enable_cb_avail(struct virtqueue *_vq)
> >>>> +{
> >>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
> >>>> +	bool ret;
> >>>> +
> >>>> +	START_USE(vq);
> >>>> +	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
> >>>> +	vring_used_event(&vq->vring) = vq->vring.avail->idx;
> >>>> +	ret = vring_need_event(vq->vring.avail->idx,
> >>>> +			       vq->last_used_idx, vq->vring.used->idx);
> >>>> +	END_USE(vq);
> >>>> +
> >>>> +	return ret;
> >>>> +}
> >>>> +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_avail);
> >>>> +
> >>>>  /**
> >>>>   * virtqueue_poll - query pending used buffers
> >>>>   * @vq: the struct virtqueue we're talking about.
> >>> Could not figure out what this does.
> >>> Please add documentation.
> >>>
> >> Sure, does something like below explain what does this function do?
> >>
> >> /**                                                                            
> >>
> >>  * virtqueue_enable_cb_avail - restart callbacks after
> >> disable_cb.           
> >>  * @vq: the struct virtqueue we're talking
> >> about.                              
> >>  *                                                                             
> >>
> >>  * This re-enables callbacks but hints to the other side to
> >> delay              
> >>  * interrupts all of the available buffers have been processed;         
> >
> > So this is like virtqueue_enable_cb_delayed but even more
> > aggressive?
> > I think it's too agressive: it's better to wake up guest
> > after you are through most of buffers, but not all,
> > so guest and host can work in parallel.
> 
> Note that:
> 
> - it was only used when there are still few of free slots (which is
> greater than 2 + MAX_SKB_FRAGS)
> - my patch keeps the free_old_xmit_skbs() in the beginning of
> start_xmit(), so the tx skb reclaiming does not depends totally on tx
> interrupt. If more packets comes, we'd expect some of them were freed in
> ndo_start_xmit(). If not, finally we may trigger the
> virtqueue_enable_cb_delayed().
> 
> So probably not as aggressive as it looks. I will do benchmark on this.

Mine too:
        } else if (virtqueue_enable_cb_delayed(sq->vq)) {
                free_old_xmit_skbs(txq, sq, qsize);
        }



> >
> >
> >>  * it returns "false" if there are at least one pending buffer in the
> >> queue,          
> >>  * to detect a possible race between the driver checking for more
> >> work,        
> >>  * and enabling
> >> callbacks.                                                     
> >>  *                                                                             
> >>
> >>  * Caller must ensure we don't call this with other
> >> virtqueue                  
> >>  * operations at the same time (except where
> >> noted).                           
> >>  */
> >>
> >>>> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> >>>> index b46671e..bfaf058 100644
> >>>> --- a/include/linux/virtio.h
> >>>> +++ b/include/linux/virtio.h
> >>>> @@ -65,6 +65,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
> >>>>  
> >>>>  unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
> >>>>  
> >>>> +bool virtqueue_enable_cb_avail(struct virtqueue *vq);
> >>>> +
> >>>>  bool virtqueue_poll(struct virtqueue *vq, unsigned);
> >>>>  
> >>>>  bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
> >>>> -- 
> >>>> 1.7.1
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >>> the body of a message to majordomo@vger.kernel.org
> >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>> Please read the FAQ at  http://www.tux.org/lkml/
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox