Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-07-02 12:57 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530536228-17462-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Factor out generic busy polling logic and will be
used for in tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 39 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..2790959 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
+static int sk_has_rx_data(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (sock->ops->peek_len)
+		return sock->ops->peek_len(sock);
+
+	return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+				struct vhost_virtqueue *rvq,
+				struct vhost_virtqueue *tvq,
+				bool rx)
+{
+	unsigned long uninitialized_var(endtime);
+	unsigned long busyloop_timeout;
+	struct socket *sock;
+	struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+
+	vhost_disable_notify(&net->dev, vq);
+	sock = rvq->private_data;
+	busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
+
+	preempt_disable();
+	endtime = busy_clock() + busyloop_timeout;
+	while (vhost_can_busy_poll(tvq->dev, endtime) &&
+	       !(sock && sk_has_rx_data(sock->sk)) &&
+	       vhost_vq_avail_empty(tvq->dev, tvq))
+		cpu_relax();
+	preempt_enable();
+
+	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+		vhost_poll_queue(&vq->poll);
+	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+		vhost_disable_notify(&net->dev, vq);
+		vhost_poll_queue(&vq->poll);
+	}
+
+	mutex_unlock(&vq->mutex);
+}
+
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
 				    struct iovec iov[], unsigned int iov_size,
@@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	return len;
 }
 
-static int sk_has_rx_data(struct sock *sk)
-{
-	struct socket *sock = sk->sk_socket;
-
-	if (sock->ops->peek_len)
-		return sock->ops->peek_len(sock);
-
-	return skb_queue_empty(&sk->sk_receive_queue);
-}
-
 static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
-	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
-	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
-	struct vhost_virtqueue *vq = &nvq->vq;
-	unsigned long uninitialized_var(endtime);
-	int len = peek_head_len(rvq, sk);
+	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
 
-	if (!len && vq->busyloop_timeout) {
-		/* Flush batched heads first */
-		vhost_rx_signal_used(rvq);
-		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
-		vhost_disable_notify(&net->dev, vq);
+	int len = peek_head_len(rnvq, sk);
 
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-
-		while (vhost_can_busy_poll(&net->dev, endtime) &&
-		       !sk_has_rx_data(sk) &&
-		       vhost_vq_avail_empty(&net->dev, vq))
-			cpu_relax();
-
-		preempt_enable();
-
-		if (!vhost_vq_avail_empty(&net->dev, vq))
-			vhost_poll_queue(&vq->poll);
-		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-			vhost_disable_notify(&net->dev, vq);
-			vhost_poll_queue(&vq->poll);
-		}
+	if (!len && rnvq->vq.busyloop_timeout) {
+		/* Flush batched heads first */
+		vhost_rx_signal_used(rnvq);
 
-		mutex_unlock(&vq->mutex);
+		/* Both tx vq and rx socket were polled here */
+		vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
 
-		len = peek_head_len(rvq, sk);
+		len = peek_head_len(rnvq, sk);
 	}
 
 	return len;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v4 2/4] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-07-02 12:57 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530536228-17462-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..62bb8e8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
-	mutex_lock(&vq->mutex);
+	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
@@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 		/* Flush batched heads first */
 		vhost_rx_signal_used(rvq);
 		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, 1);
+		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
 		vhost_disable_notify(&net->dev, vq);
 
 		preempt_disable();
@@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
 	__virtio16 num_buffers;
 	int recv_pkts = 0;
 
-	mutex_lock_nested(&vq->mutex, 0);
+	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v4 1/4] vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-07-02 12:57 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530536228-17462-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patch changes the way that lock all vqs
at the same, to lock them one by one. It will
be used for next patch to avoid the deadlock.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 895eaa2..4ca9383 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 {
 	int i;
 
-	for (i = 0; i < d->nvqs; ++i)
+	for (i = 0; i < d->nvqs; ++i) {
+		mutex_lock(&d->vqs[i]->mutex);
 		__vhost_vq_meta_reset(d->vqs[i]);
+		mutex_unlock(&d->vqs[i]->mutex);
+	}
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 #define vhost_get_used(vq, x, ptr) \
 	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_unlock(&d->vqs[i]->mutex);
-}
-
 static int vhost_new_umem_range(struct vhost_umem *umem,
 				u64 start, u64 size, u64 end,
 				u64 userspace_addr, int perm)
@@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
 		if (msg->iova <= vq_msg->iova &&
 		    msg->iova + msg->size - 1 > vq_msg->iova &&
 		    vq_msg->type == VHOST_IOTLB_MISS) {
+			mutex_lock(&node->vq->mutex);
 			vhost_poll_queue(&node->vq->poll);
+			mutex_unlock(&node->vq->mutex);
+
 			list_del(&node->node);
 			kfree(node);
 		}
@@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 	int ret = 0;
 
 	mutex_lock(&dev->mutex);
-	vhost_dev_lock_vqs(dev);
 	switch (msg->type) {
 	case VHOST_IOTLB_UPDATE:
 		if (!dev->iotlb) {
@@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 		break;
 	}
 
-	vhost_dev_unlock_vqs(dev);
 	mutex_unlock(&dev->mutex);
 
 	return ret;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v4 0/4] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-07-02 12:57 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, virtualization, mst

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patches improve the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the same time.
handle_rx do that in the same way.

For more performance report, see patch 4.

v3 -> v4:
fix some issues

v2 -> v3:
This patches are splited from previous big patch:
http://patchwork.ozlabs.org/patch/934673/

Tonghao Zhang (4):
  vhost: lock the vqs one by one
  net: vhost: replace magic number of lock annotation
  net: vhost: factor out busy polling logic to vhost_net_busy_poll()
  net: vhost: add rx busy polling in tx path

 drivers/vhost/net.c   | 108 ++++++++++++++++++++++++++++----------------------
 drivers/vhost/vhost.c |  24 ++++-------
 2 files changed, 67 insertions(+), 65 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  8:54 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin
  Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <c81775fd-c74f-d334-cbba-90decb55dfc2@lab.ntt.co.jp>



On 2018年07月02日 16:05, Toshiaki Makita wrote:
> On 2018/07/02 16:52, Jason Wang wrote:
>> On 2018年07月02日 15:11, Toshiaki Makita wrote:
>>> On 2018/07/02 15:17, Jason Wang wrote:
>>>> On 2018年07月02日 12:37, Toshiaki Makita wrote:
>>>>> On 2018/07/02 11:54, Jason Wang wrote:
>>>>>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>>>>>> Hi Jason,
>>>>>>>
>>>>>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>>>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>>>>>> ...
>>>>>>>>> To fix this, poll the work instead of enabling notification when
>>>>>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>>>>>> interruption, save the endtime in vq field and use it when
>>>>>>>>> reentering
>>>>>>>>> the work function.
>>>>>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>>>>>> I just want to keep it user-controllable. Unless memorizing it
>>>>>>> busypoll
>>>>>>> can run unexpectedly long.
>>>>>> I think the total amount of time for busy polling is bounded. If I was
>>>>>> wrong, it should be a bug somewhere.
>>>>> Consider this kind of scenario:
>>>>> 0. Set 100us busypoll for example.
>>>>> 1. handle_tx() runs busypoll.
>>>>> 2. Something like zerocopy queues tx_work within 100us.
>>>>> 3. busypoll exits and call handle_tx() again.
>>>>> 4. Repeat 1-3.
>>>>>
>>>>> In this case handle_tx() does not process packets but busypoll
>>>>> essentially runs beyond 100us without endtime memorized. This may be
>>>>> just a theoretical problem, but I was worried that more code to poll tx
>>>>> queue can be added in the future and it becomes realistic.
>>>> Yes, but consider zerocopy tends to batch 16 used packets and we will
>>>> finally finish all processing of packets. The above won't be endless, so
>>>> it was probably tolerable.
>>> Right. So endtime memorization is more like a future-proof thing.
>>> Would you like to keep it or change something?
>> I think we'd better introduce it only when we meet real bugs.
> I'll change it to a flag to indicate the previous busypoll is interrupted.
>
>>>>>>>>> Performance numbers:
>>>>>>>>>
>>>>>>>>> - Bulk transfer from guest to external physical server.
>>>>>>>>>          [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>>>>>> [Server]
>>>>>>>> Just to confirm in this case since zerocopy is enabled, we are in
>>>>>>>> fact
>>>>>>>> use the generic XDP datapath?
>>>>>>> For some reason zerocopy was not applied for most packets, so in most
>>>>>>> cases driver XDP was used. I was going to dig into it but not yet.
>>>>>> Right, just to confirm this. This is expected.
>>>>>>
>>>>>> In tuntap, we do native XDP only for small and non zerocopy
>>>>>> packets. See
>>>>>> tun_can_build_skb(). The reason is XDP may adjust packet header
>>>>>> which is
>>>>>> not supported by zercopy. We can only use XDP generic for zerocopy in
>>>>>> this case.
>>>>> I think I understand when driver XDP can be used. What I'm not sure and
>>>>> was going to narrow down is why zerocopy is mostly not applied.
>>>>>
>>>> I see, any touch to the zerocopy packet (clone, header expansion or
>>>> segmentation) that lead a userspace copy will increase the error counter
>>>> in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
>>>> zerocopy. So it was probably something in your setup or a bug somewhere.
>>> Thanks for the hint!
> Seems zerocopy packets are always nonlinear and
> netif_receive_generic_xdp() calls skb_linearize() in which
> __pskb_pull_tail() calls skb_zcopy_clear(). Looks like tx_zcopy_err is
> always counted when zerocopy is used with XDP in my env.
>

Right, since it needs do copy there, so we tend to disable zerocopy in 
this case.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  8:05 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <3244a627-f70d-a7c6-46fb-24b5e3995ce2@redhat.com>

On 2018/07/02 16:52, Jason Wang wrote:
> On 2018年07月02日 15:11, Toshiaki Makita wrote:
>> On 2018/07/02 15:17, Jason Wang wrote:
>>> On 2018年07月02日 12:37, Toshiaki Makita wrote:
>>>> On 2018/07/02 11:54, Jason Wang wrote:
>>>>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>>>>> Hi Jason,
>>>>>>
>>>>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>>>>> ...
>>>>>>>> To fix this, poll the work instead of enabling notification when
>>>>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>>>>> interruption, save the endtime in vq field and use it when
>>>>>>>> reentering
>>>>>>>> the work function.
>>>>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>>>>> I just want to keep it user-controllable. Unless memorizing it
>>>>>> busypoll
>>>>>> can run unexpectedly long.
>>>>> I think the total amount of time for busy polling is bounded. If I was
>>>>> wrong, it should be a bug somewhere.
>>>> Consider this kind of scenario:
>>>> 0. Set 100us busypoll for example.
>>>> 1. handle_tx() runs busypoll.
>>>> 2. Something like zerocopy queues tx_work within 100us.
>>>> 3. busypoll exits and call handle_tx() again.
>>>> 4. Repeat 1-3.
>>>>
>>>> In this case handle_tx() does not process packets but busypoll
>>>> essentially runs beyond 100us without endtime memorized. This may be
>>>> just a theoretical problem, but I was worried that more code to poll tx
>>>> queue can be added in the future and it becomes realistic.
>>> Yes, but consider zerocopy tends to batch 16 used packets and we will
>>> finally finish all processing of packets. The above won't be endless, so
>>> it was probably tolerable.
>> Right. So endtime memorization is more like a future-proof thing.
>> Would you like to keep it or change something?
> 
> I think we'd better introduce it only when we meet real bugs.

I'll change it to a flag to indicate the previous busypoll is interrupted.

>>>>>>>> Performance numbers:
>>>>>>>>
>>>>>>>> - Bulk transfer from guest to external physical server.
>>>>>>>>         [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>>>>> [Server]
>>>>>>> Just to confirm in this case since zerocopy is enabled, we are in
>>>>>>> fact
>>>>>>> use the generic XDP datapath?
>>>>>> For some reason zerocopy was not applied for most packets, so in most
>>>>>> cases driver XDP was used. I was going to dig into it but not yet.
>>>>> Right, just to confirm this. This is expected.
>>>>>
>>>>> In tuntap, we do native XDP only for small and non zerocopy
>>>>> packets. See
>>>>> tun_can_build_skb(). The reason is XDP may adjust packet header
>>>>> which is
>>>>> not supported by zercopy. We can only use XDP generic for zerocopy in
>>>>> this case.
>>>> I think I understand when driver XDP can be used. What I'm not sure and
>>>> was going to narrow down is why zerocopy is mostly not applied.
>>>>
>>> I see, any touch to the zerocopy packet (clone, header expansion or
>>> segmentation) that lead a userspace copy will increase the error counter
>>> in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
>>> zerocopy. So it was probably something in your setup or a bug somewhere.
>> Thanks for the hint!

Seems zerocopy packets are always nonlinear and
netif_receive_generic_xdp() calls skb_linearize() in which
__pskb_pull_tail() calls skb_zcopy_clear(). Looks like tx_zcopy_err is
always counted when zerocopy is used with XDP in my env.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  7:52 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin
  Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <0d7a285a-5fa2-0e51-8907-e8bfc508214b@lab.ntt.co.jp>



On 2018年07月02日 15:11, Toshiaki Makita wrote:
> On 2018/07/02 15:17, Jason Wang wrote:
>> On 2018年07月02日 12:37, Toshiaki Makita wrote:
>>> On 2018/07/02 11:54, Jason Wang wrote:
>>>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>>>> Hi Jason,
>>>>>
>>>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>>>> ...
>>>>>>> To fix this, poll the work instead of enabling notification when
>>>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>>>> interruption, save the endtime in vq field and use it when reentering
>>>>>>> the work function.
>>>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>>>> I just want to keep it user-controllable. Unless memorizing it busypoll
>>>>> can run unexpectedly long.
>>>> I think the total amount of time for busy polling is bounded. If I was
>>>> wrong, it should be a bug somewhere.
>>> Consider this kind of scenario:
>>> 0. Set 100us busypoll for example.
>>> 1. handle_tx() runs busypoll.
>>> 2. Something like zerocopy queues tx_work within 100us.
>>> 3. busypoll exits and call handle_tx() again.
>>> 4. Repeat 1-3.
>>>
>>> In this case handle_tx() does not process packets but busypoll
>>> essentially runs beyond 100us without endtime memorized. This may be
>>> just a theoretical problem, but I was worried that more code to poll tx
>>> queue can be added in the future and it becomes realistic.
>> Yes, but consider zerocopy tends to batch 16 used packets and we will
>> finally finish all processing of packets. The above won't be endless, so
>> it was probably tolerable.
> Right. So endtime memorization is more like a future-proof thing.
> Would you like to keep it or change something?

I think we'd better introduce it only when we meet real bugs.

Thanks

>
>>>>>>> Performance numbers:
>>>>>>>
>>>>>>> - Bulk transfer from guest to external physical server.
>>>>>>>         [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>>>> [Server]
>>>>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>>>>> use the generic XDP datapath?
>>>>> For some reason zerocopy was not applied for most packets, so in most
>>>>> cases driver XDP was used. I was going to dig into it but not yet.
>>>> Right, just to confirm this. This is expected.
>>>>
>>>> In tuntap, we do native XDP only for small and non zerocopy packets. See
>>>> tun_can_build_skb(). The reason is XDP may adjust packet header which is
>>>> not supported by zercopy. We can only use XDP generic for zerocopy in
>>>> this case.
>>> I think I understand when driver XDP can be used. What I'm not sure and
>>> was going to narrow down is why zerocopy is mostly not applied.
>>>
>> I see, any touch to the zerocopy packet (clone, header expansion or
>> segmentation) that lead a userspace copy will increase the error counter
>> in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
>> zerocopy. So it was probably something in your setup or a bug somewhere.
> Thanks for the hint!
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  7:11 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <b0670083-d938-8b96-56d2-65d41d6b7c8c@redhat.com>

On 2018/07/02 15:17, Jason Wang wrote:
> On 2018年07月02日 12:37, Toshiaki Makita wrote:
>> On 2018/07/02 11:54, Jason Wang wrote:
>>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>>> Hi Jason,
>>>>
>>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>>> ...
>>>>>> To fix this, poll the work instead of enabling notification when
>>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>>> interruption, save the endtime in vq field and use it when reentering
>>>>>> the work function.
>>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>>> I just want to keep it user-controllable. Unless memorizing it busypoll
>>>> can run unexpectedly long.
>>> I think the total amount of time for busy polling is bounded. If I was
>>> wrong, it should be a bug somewhere.
>> Consider this kind of scenario:
>> 0. Set 100us busypoll for example.
>> 1. handle_tx() runs busypoll.
>> 2. Something like zerocopy queues tx_work within 100us.
>> 3. busypoll exits and call handle_tx() again.
>> 4. Repeat 1-3.
>>
>> In this case handle_tx() does not process packets but busypoll
>> essentially runs beyond 100us without endtime memorized. This may be
>> just a theoretical problem, but I was worried that more code to poll tx
>> queue can be added in the future and it becomes realistic.
> 
> Yes, but consider zerocopy tends to batch 16 used packets and we will
> finally finish all processing of packets. The above won't be endless, so
> it was probably tolerable.

Right. So endtime memorization is more like a future-proof thing.
Would you like to keep it or change something?

>>>>>> Performance numbers:
>>>>>>
>>>>>> - Bulk transfer from guest to external physical server.
>>>>>>        [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>>> [Server]
>>>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>>>> use the generic XDP datapath?
>>>> For some reason zerocopy was not applied for most packets, so in most
>>>> cases driver XDP was used. I was going to dig into it but not yet.
>>> Right, just to confirm this. This is expected.
>>>
>>> In tuntap, we do native XDP only for small and non zerocopy packets. See
>>> tun_can_build_skb(). The reason is XDP may adjust packet header which is
>>> not supported by zercopy. We can only use XDP generic for zerocopy in
>>> this case.
>> I think I understand when driver XDP can be used. What I'm not sure and
>> was going to narrow down is why zerocopy is mostly not applied.
>>
> 
> I see, any touch to the zerocopy packet (clone, header expansion or
> segmentation) that lead a userspace copy will increase the error counter
> in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
> zerocopy. So it was probably something in your setup or a bug somewhere.

Thanks for the hint!

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  6:17 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin
  Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <23eaea23-2288-e00a-88df-f13eeb890e89@lab.ntt.co.jp>



On 2018年07月02日 12:37, Toshiaki Makita wrote:
> On 2018/07/02 11:54, Jason Wang wrote:
>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>> Hi Jason,
>>>
>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>> ...
>>>>> To fix this, poll the work instead of enabling notification when
>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>> interruption, save the endtime in vq field and use it when reentering
>>>>> the work function.
>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>> I just want to keep it user-controllable. Unless memorizing it busypoll
>>> can run unexpectedly long.
>> I think the total amount of time for busy polling is bounded. If I was
>> wrong, it should be a bug somewhere.
> Consider this kind of scenario:
> 0. Set 100us busypoll for example.
> 1. handle_tx() runs busypoll.
> 2. Something like zerocopy queues tx_work within 100us.
> 3. busypoll exits and call handle_tx() again.
> 4. Repeat 1-3.
>
> In this case handle_tx() does not process packets but busypoll
> essentially runs beyond 100us without endtime memorized. This may be
> just a theoretical problem, but I was worried that more code to poll tx
> queue can be added in the future and it becomes realistic.

Yes, but consider zerocopy tends to batch 16 used packets and we will 
finally finish all processing of packets. The above won't be endless, so 
it was probably tolerable.

>
>>>>> Performance numbers:
>>>>>
>>>>> - Bulk transfer from guest to external physical server.
>>>>>        [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>> [Server]
>>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>>> use the generic XDP datapath?
>>> For some reason zerocopy was not applied for most packets, so in most
>>> cases driver XDP was used. I was going to dig into it but not yet.
>> Right, just to confirm this. This is expected.
>>
>> In tuntap, we do native XDP only for small and non zerocopy packets. See
>> tun_can_build_skb(). The reason is XDP may adjust packet header which is
>> not supported by zercopy. We can only use XDP generic for zerocopy in
>> this case.
> I think I understand when driver XDP can be used. What I'm not sure and
> was going to narrow down is why zerocopy is mostly not applied.
>

I see, any touch to the zerocopy packet (clone, header expansion or 
segmentation) that lead a userspace copy will increase the error counter 
in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use 
zerocopy. So it was probably something in your setup or a bug somewhere.

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  4:37 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <4dc58c49-f804-b720-6e50-5867dc32d7ec@redhat.com>

On 2018/07/02 11:54, Jason Wang wrote:
> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>> Hi Jason,
>>
>> On 2018/06/29 18:30, Jason Wang wrote:
>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>> ...
>>>> To fix this, poll the work instead of enabling notification when
>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>> completely cancel the busypoll, so let's run busypoll after the
>>>> necessary work is done. In order to avoid too long busyloop due to
>>>> interruption, save the endtime in vq field and use it when reentering
>>>> the work function.
>>> I think we don't care long busyloop unless e.g tx can starve rx?
>> I just want to keep it user-controllable. Unless memorizing it busypoll
>> can run unexpectedly long.
> 
> I think the total amount of time for busy polling is bounded. If I was
> wrong, it should be a bug somewhere.

Consider this kind of scenario:
0. Set 100us busypoll for example.
1. handle_tx() runs busypoll.
2. Something like zerocopy queues tx_work within 100us.
3. busypoll exits and call handle_tx() again.
4. Repeat 1-3.

In this case handle_tx() does not process packets but busypoll
essentially runs beyond 100us without endtime memorized. This may be
just a theoretical problem, but I was worried that more code to poll tx
queue can be added in the future and it becomes realistic.

>>>> Performance numbers:
>>>>
>>>> - Bulk transfer from guest to external physical server.
>>>>       [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>> [Server]
>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>> use the generic XDP datapath?
>> For some reason zerocopy was not applied for most packets, so in most
>> cases driver XDP was used. I was going to dig into it but not yet.
> 
> Right, just to confirm this. This is expected.
> 
> In tuntap, we do native XDP only for small and non zerocopy packets. See
> tun_can_build_skb(). The reason is XDP may adjust packet header which is
> not supported by zercopy. We can only use XDP generic for zerocopy in
> this case.

I think I understand when driver XDP can be used. What I'm not sure and
was going to narrow down is why zerocopy is mostly not applied.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Tonghao Zhang @ 2018-07-02  4:05 UTC (permalink / raw)
  To: jasowang
  Cc: Linux Kernel Network Developers, virtualization, Tonghao Zhang,
	mst
In-Reply-To: <fa6a368a-7a01-2467-ba63-322eef4f544a@redhat.com>

On Mon, Jul 2, 2018 at 10:29 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > Factor out generic busy polling logic and will be
> > used for tx path in the next patch. And with the patch,
> > qemu can set differently the busyloop_timeout for rx queue.
> >
> > Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> > ---
> >   drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
> >   1 file changed, 53 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 62bb8e8..458f81d 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> >       return vhost_poll_start(poll, sock->file);
> >   }
> >
> > +static int sk_has_rx_data(struct sock *sk)
> > +{
> > +     struct socket *sock = sk->sk_socket;
> > +
> > +     if (sock->ops->peek_len)
> > +             return sock->ops->peek_len(sock);
> > +
> > +     return skb_queue_empty(&sk->sk_receive_queue);
> > +}
> > +
> > +static void vhost_net_busy_poll(struct vhost_net *net,
> > +                             struct vhost_virtqueue *rvq,
> > +                             struct vhost_virtqueue *tvq,
> > +                             bool rx)
> > +{
> > +     unsigned long uninitialized_var(endtime);
> > +     struct socket *sock = rvq->private_data;
> > +     struct vhost_virtqueue *vq = rx ? tvq : rvq;
> > +     unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
> > +                                           tvq->busyloop_timeout;
>
> As simple as vq->busyloop_timeout?
maybe we should allow user set busyloop_timeout for rx or tx
differently. this code should be moved under mutex.

> > +
> > +     mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>
> We need move sock = rvq->private_data under the protection of vq mutex
> if rx is false.
yes, thanks for your review.

> > +     vhost_disable_notify(&net->dev, vq);
> > +
> > +     preempt_disable();
> > +     endtime = busy_clock() + busyloop_timeout;
> > +     while (vhost_can_busy_poll(tvq->dev, endtime) &&
> > +            !(sock && sk_has_rx_data(sock->sk)) &&
> > +            vhost_vq_avail_empty(tvq->dev, tvq))
> > +             cpu_relax();
> > +     preempt_enable();
> > +
> > +     if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> > +         (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> > +             vhost_poll_queue(&vq->poll);
> > +     } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > +             vhost_disable_notify(&net->dev, vq);
> > +             vhost_poll_queue(&vq->poll);
> > +     }
> > +
> > +     mutex_unlock(&vq->mutex);
> > +}
> > +
> > +
> >   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >                                   struct vhost_virtqueue *vq,
> >                                   struct iovec iov[], unsigned int iov_size,
> > @@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> >       return len;
> >   }
> >
> > -static int sk_has_rx_data(struct sock *sk)
> > -{
> > -     struct socket *sock = sk->sk_socket;
> > -
> > -     if (sock->ops->peek_len)
> > -             return sock->ops->peek_len(sock);
> > -
> > -     return skb_queue_empty(&sk->sk_receive_queue);
> > -}
> > -
> >   static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >   {
> >       struct vhost_virtqueue *vq = &nvq->vq;
> > @@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >
> >   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> >   {
> > -     struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> > -     struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > -     struct vhost_virtqueue *vq = &nvq->vq;
> > -     unsigned long uninitialized_var(endtime);
> > -     int len = peek_head_len(rvq, sk);
> > +     struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> > +     struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
>
> It looks to me rnvq and tnvq is slightly better.
yes. patch 4 will also update.

> Other looks good to me.
>
> Thanks
>
> >
> > -     if (!len && vq->busyloop_timeout) {
> > -             /* Flush batched heads first */
> > -             vhost_rx_signal_used(rvq);
> > -             /* Both tx vq and rx socket were polled here */
> > -             mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> > -             vhost_disable_notify(&net->dev, vq);
> > +     int len = peek_head_len(nvq_rx, sk);
> >
> > -             preempt_disable();
> > -             endtime = busy_clock() + vq->busyloop_timeout;
> > -
> > -             while (vhost_can_busy_poll(&net->dev, endtime) &&
> > -                    !sk_has_rx_data(sk) &&
> > -                    vhost_vq_avail_empty(&net->dev, vq))
> > -                     cpu_relax();
> > -
> > -             preempt_enable();
> > -
> > -             if (!vhost_vq_avail_empty(&net->dev, vq))
> > -                     vhost_poll_queue(&vq->poll);
> > -             else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > -                     vhost_disable_notify(&net->dev, vq);
> > -                     vhost_poll_queue(&vq->poll);
> > -             }
> > +     if (!len && nvq_rx->vq.busyloop_timeout) {
> > +             /* Flush batched heads first */
> > +             vhost_rx_signal_used(nvq_rx);
> >
> > -             mutex_unlock(&vq->mutex);
> > +             /* Both tx vq and rx socket were polled here */
> > +             vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
> >
> > -             len = peek_head_len(rvq, sk);
> > +             len = peek_head_len(nvq_rx, sk);
> >       }
> >
> >       return len;
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  3:05 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <c6ef8c74-7efa-707b-d62d-471636dd0a7f@lab.ntt.co.jp>



On 2018年07月02日 10:52, Toshiaki Makita wrote:
> On 2018/07/02 11:41, Jason Wang wrote:
>> On 2018年06月30日 00:38, Michael S. Tsirkin wrote:
>>> On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
>>>> Under heavy load vhost busypoll may run without suppressing
>>>> notification. For example tx zerocopy callback can push tx work while
>>>> handle_tx() is running, then busyloop exits due to vhost_has_work()
>>>> condition and enables notification but immediately reenters handle_tx()
>>>> because the pushed work was tx. In this case handle_tx() tries to
>>>> disable notification again, but when using event_idx it by design
>>>> cannot. Then busyloop will run without suppressing notification.
>>>> Another example is the case where handle_tx() tries to enable
>>>> notification but avail idx is advanced so disables it again. This case
>>>> also lead to the same situation with event_idx.
>>>>
>>>> The problem is that once we enter this situation busyloop does not work
>>>> under heavy load for considerable amount of time, because notification
>>>> is likely to happen during busyloop and handle_tx() immediately enables
>>>> notification after notification happens. Specifically busyloop detects
>>>> notification by vhost_has_work() and then handle_tx() calls
>>>> vhost_enable_notify().
>>> I'd like to understand the problem a bit better.
>>> Why does this happen?
>>> Doesn't this only happen if ring is empty?
>>>
>> My understanding is:
>>
>> vhost_zerocopy_callback() try to poll vhost virtqueue. This will cause
>> the busy loop in vhost_net_tx_get_vq_desc() to exit because of
>> vhost_has_work() return true. Then handle_tx() tends to enable
>> notification. Then guest may kick us even if handle_tx() call
>> vhost_disable_notify() which in fact did nothing for even index.
> Yes.
>
>> Maybe we can try to call vhost_zerocopy_signal_used() if we found
>> there's pending used from zerocopy instead.
> Note that even when zerocopy is disabled the problem happens as I wrote.
> When vhost_enable_notify() detects avail_idx advanced it tries to
> disable notification again but it fails.
>

Yes, and the main reason is need_resched() and rx work. (polling RX will 
be addressed by Tonghao's patch I think).

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  2:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <20180629193448-mutt-send-email-mst@kernel.org>

On 2018/06/30 1:38, Michael S. Tsirkin wrote:
...
>> Performance numbers:
>>
>> - Bulk transfer from guest to external physical server.
>>     [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
>> - Set 10us busypoll.
>> - Guest disables checksum and TSO because of host XDP.
>> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>>   (EPT_MISCONFIG event).
>>
>>                             Before              After
>>                           Mbps  kicks/s      Mbps  kicks/s
>> UDP_STREAM 1472byte              247758                 27
>>                 Send   3645.37            6958.10
>>                 Recv   3588.56            6958.10
>>               1byte                9865                 37
>>                 Send      4.34               5.43
>>                 Recv      4.17               5.26
>> TCP_STREAM             8801.03    45794   9592.77     2884
>>
>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> 
> Is this with busy poll enabled?

Yes, as I wrote "Set 10us busypoll" above.

> Are there CPU utilization #s?

I used one cpu for one vcpu and one cpu for one vhost.
Each host cpu for vcpu/vhost was like this:

- Before
 vcpu cpu : %guest 70 %sys 30
 vhost cpu: %sys 100

- After
 vcpu cpu : %guest 100
 vhost cpu: %sys 100

I think %sys before patch was caused by vring kick.

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  2:54 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin
  Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <4a5b4456-0f3e-40b3-849c-930ebb97ef3c@lab.ntt.co.jp>



On 2018年07月02日 10:45, Toshiaki Makita wrote:
> Hi Jason,
>
> On 2018/06/29 18:30, Jason Wang wrote:
>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
> ...
>>> To fix this, poll the work instead of enabling notification when
>>> busypoll is interrupted by something. IMHO signal_pending() and
>>> vhost_has_work() are kind of interruptions rather than signals to
>>> completely cancel the busypoll, so let's run busypoll after the
>>> necessary work is done. In order to avoid too long busyloop due to
>>> interruption, save the endtime in vq field and use it when reentering
>>> the work function.
>> I think we don't care long busyloop unless e.g tx can starve rx?
> I just want to keep it user-controllable. Unless memorizing it busypoll
> can run unexpectedly long.

I think the total amount of time for busy polling is bounded. If I was 
wrong, it should be a bug somewhere.

>
>>> I observed this problem makes tx performance poor but does not rx. I
>>> guess it is because rx notification from socket is not that heavy so
>>> did not impact the performance even when we failed to mask the
>>> notification.
>> I think the reason is:
>>
>> For tx, we may only process used ring under handle_tx(). Busy polling
>> code can not recognize this case.
>> For rx, we call peek_head_len() after exiting busy loop, so if the work
>> is for rx, it can be processed in handle_rx() immediately.
> Sorry but I don't see the difference. Tx busypoll calls
> vhost_get_vq_desc() immediately after busypoll exits in
> vhost_net_tx_get_vq_desc().

Yes, so for the problem of zerocopy callback issue is we don't poll used 
(done_idx != upend_idx). Maybe we can try to add this and avoid the 
check of vhost_has_worker().

>
> The scenario I described above (in 2nd paragraph) is a bit simplified.
> To be clearer what I think is happening is:
>
> 1. handle_tx() runs busypoll with notification enabled (due to zerocopy
>     callback or something).

I think it was due to we enable notification when we exit handle_tx().

> 2. Guest produces a packet in vring.
> 3. handle_tx() busypoll detects the produced packet and get it.
> 4. While vhost is processing the packet, guest kicks vring because it
>     produced the packet. Vring notification is disabled automatically by
>     event_idx and tx work is queued.
> 5. After processing the packet vhost enters busyloop again, but detects
>     tx work and exits busyloop immediately. Then handle_tx() exits after
>     enabling the notification.
> 6. Because the queued work was tx, handle_tx() is called immediately
>     again. handle_tx() runs busypoll with notification enabled.
> 7. Repeat 2-6.

Exactly what I understand.

>
>>> Anyway for consistency I fixed rx routine as well as tx.
>>>
>>> Performance numbers:
>>>
>>> - Bulk transfer from guest to external physical server.
>>>       [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
>> Just to confirm in this case since zerocopy is enabled, we are in fact
>> use the generic XDP datapath?
> For some reason zerocopy was not applied for most packets, so in most
> cases driver XDP was used. I was going to dig into it but not yet.

Right, just to confirm this. This is expected.

In tuntap, we do native XDP only for small and non zerocopy packets. See 
tun_can_build_skb(). The reason is XDP may adjust packet header which is 
not supported by zercopy. We can only use XDP generic for zerocopy in 
this case.

>
>>> - Set 10us busypoll.
>>> - Guest disables checksum and TSO because of host XDP.
>>> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>>>     (EPT_MISCONFIG event).
>>>
>>>                               Before              After
>>>                             Mbps  kicks/s      Mbps  kicks/s
>>> UDP_STREAM 1472byte              247758                 27
>>>                   Send   3645.37            6958.10
>>>                   Recv   3588.56            6958.10
>>>                 1byte                9865                 37
>>>                   Send      4.34               5.43
>>>                   Recv      4.17               5.26
>>> TCP_STREAM             8801.03    45794   9592.77     2884
>> Impressive numbers.
>>
>>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>>> ---
> ...
>>> +static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
>>> +{
>>> +    return unlikely(signal_pending(current)) || vhost_has_work(dev);
>>>    }
>>>      static void vhost_net_disable_vq(struct vhost_net *n,
>>> @@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct
>>> vhost_net *net,
>>>          if (r == vq->num && vq->busyloop_timeout) {
>>>            preempt_disable();
>>> -        endtime = busy_clock() + vq->busyloop_timeout;
>>> -        while (vhost_can_busy_poll(vq->dev, endtime) &&
>>> -               vhost_vq_avail_empty(vq->dev, vq))
>>> +        if (vq->busyloop_endtime) {
>>> +            endtime = vq->busyloop_endtime;
>>> +            vq->busyloop_endtime = 0;
>> Looks like endtime may be before current time. So we skip busy loop in
>> this case.
> Sure, I'll add a condition.
>
>>> +        } else {
>>> +            endtime = busy_clock() + vq->busyloop_timeout;
>>> +        }
>>> +        while (vhost_can_busy_poll(endtime)) {
>>> +            if (vhost_busy_poll_interrupted(vq->dev)) {
>>> +                vq->busyloop_endtime = endtime;
>>> +                break;
>>> +            }
>>> +            if (!vhost_vq_avail_empty(vq->dev, vq))
>>> +                break;
>>>                cpu_relax();
>>> +        }
>>>            preempt_enable();
>>>            r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>>>                          out_num, in_num, NULL, NULL);
> ...
>>> @@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct
>>> vhost_net_virtqueue *nvq)
>>>      static int vhost_net_rx_peek_head_len(struct vhost_net *net,
>>> struct sock *sk)
>>>    {
>>> -    struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
>>> -    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>>> -    struct vhost_virtqueue *vq = &nvq->vq;
>>> +    struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>>> +    struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>>> +    struct vhost_virtqueue *rvq = &rnvq->vq;
>>> +    struct vhost_virtqueue *tvq = &tnvq->vq;
>> NIt: I admit the variable name is confusing. It would be better to do
>> the renaming in a separate patch to ease review.
> OK.
>
>>>        unsigned long uninitialized_var(endtime);
>>> -    int len = peek_head_len(rvq, sk);
>>> +    int len = peek_head_len(rnvq, sk);
>>>    -    if (!len && vq->busyloop_timeout) {
>>> +    if (!len && tvq->busyloop_timeout) {
>>>            /* Flush batched heads first */
>>> -        vhost_rx_signal_used(rvq);
>>> +        vhost_rx_signal_used(rnvq);
>>>            /* Both tx vq and rx socket were polled here */
>>> -        mutex_lock_nested(&vq->mutex, 1);
>>> -        vhost_disable_notify(&net->dev, vq);
>>> +        mutex_lock_nested(&tvq->mutex, 1);
>>> +        vhost_disable_notify(&net->dev, tvq);
>>>              preempt_disable();
>>> -        endtime = busy_clock() + vq->busyloop_timeout;
>>> +        if (rvq->busyloop_endtime) {
>>> +            endtime = rvq->busyloop_endtime;
>>> +            rvq->busyloop_endtime = 0;
>>> +        } else {
>>> +            endtime = busy_clock() + tvq->busyloop_timeout;
>>> +        }
>>>    -        while (vhost_can_busy_poll(&net->dev, endtime) &&
>>> -               !sk_has_rx_data(sk) &&
>>> -               vhost_vq_avail_empty(&net->dev, vq))
>>> +        while (vhost_can_busy_poll(endtime)) {
>>> +            if (vhost_busy_poll_interrupted(&net->dev)) {
>>> +                rvq->busyloop_endtime = endtime;
>>> +                break;
>>> +            }
>>> +            if (sk_has_rx_data(sk) ||
>>> +                !vhost_vq_avail_empty(&net->dev, tvq))
>>> +                break;
>>>                cpu_relax();
>> Actually, I plan to poll guest rx refilling as well here to avoid rx
>> kicks. Want to draft another patch to do it as well? It's as simple as
>> add a vhost_vq_avail_empty for rvq above.
> OK. will try it.
>
>>> +        }
>>>              preempt_enable();
>>>    -        if (!vhost_vq_avail_empty(&net->dev, vq))
>>> -            vhost_poll_queue(&vq->poll);
>>> -        else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>>> -            vhost_disable_notify(&net->dev, vq);
>>> -            vhost_poll_queue(&vq->poll);
>>> +        if (!vhost_vq_avail_empty(&net->dev, tvq)) {
>>> +            vhost_poll_queue(&tvq->poll);
>>> +        } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
>>> +            vhost_disable_notify(&net->dev, tvq);
>>> +            vhost_poll_queue(&tvq->poll);
>>>            }
>>>    -        mutex_unlock(&vq->mutex);
>>> +        mutex_unlock(&tvq->mutex);
>>>    -        len = peek_head_len(rvq, sk);
>>> +        len = peek_head_len(rnvq, sk);
>>>        }
>>>          return len;
> ...
>>> @@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
>>>                goto out;
>>>            }
>>>        }
>>> -    vhost_net_enable_vq(net, vq);
>>> +    if (unlikely(vq->busyloop_endtime)) {
>>> +        /* Busy poll is interrupted. */
>>> +        vhost_poll_queue(&vq->poll);
>>> +    } else {
>>> +        vhost_net_enable_vq(net, vq);
>>> +    }
>> This is to reduce the rx wake ups. Better with another patch.
>>
>> So I suggest to split the patches into:
>>
>> 1 better naming of variable of vhost_net_rx_peek_head_len().
>> 2 avoid tx kicks (most of what this patch did)
>> 3 avoid rx wakeups (exactly the above 6 lines did)
>> 4 avoid rx kicks
> OK, I'll reorganize the patch.
> Thank you for your feedback!
>
>> Btw, tonghao is doing some refactoring of busy polling as well. Depends
>> on the order of being merged, one of you may need rebasing.
> Thanks for sharing. I'll take a look.
>

Thanks.

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  2:52 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <c21a4671-df85-4805-a4ec-2441859e4e1b@redhat.com>

On 2018/07/02 11:41, Jason Wang wrote:
> On 2018年06月30日 00:38, Michael S. Tsirkin wrote:
>> On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
>>> Under heavy load vhost busypoll may run without suppressing
>>> notification. For example tx zerocopy callback can push tx work while
>>> handle_tx() is running, then busyloop exits due to vhost_has_work()
>>> condition and enables notification but immediately reenters handle_tx()
>>> because the pushed work was tx. In this case handle_tx() tries to
>>> disable notification again, but when using event_idx it by design
>>> cannot. Then busyloop will run without suppressing notification.
>>> Another example is the case where handle_tx() tries to enable
>>> notification but avail idx is advanced so disables it again. This case
>>> also lead to the same situation with event_idx.
>>>
>>> The problem is that once we enter this situation busyloop does not work
>>> under heavy load for considerable amount of time, because notification
>>> is likely to happen during busyloop and handle_tx() immediately enables
>>> notification after notification happens. Specifically busyloop detects
>>> notification by vhost_has_work() and then handle_tx() calls
>>> vhost_enable_notify().
>> I'd like to understand the problem a bit better.
>> Why does this happen?
>> Doesn't this only happen if ring is empty?
>>
> 
> My understanding is:
> 
> vhost_zerocopy_callback() try to poll vhost virtqueue. This will cause
> the busy loop in vhost_net_tx_get_vq_desc() to exit because of
> vhost_has_work() return true. Then handle_tx() tends to enable
> notification. Then guest may kick us even if handle_tx() call
> vhost_disable_notify() which in fact did nothing for even index.

Yes.

> Maybe we can try to call vhost_zerocopy_signal_used() if we found
> there's pending used from zerocopy instead.

Note that even when zerocopy is disabled the problem happens as I wrote.
When vhost_enable_notify() detects avail_idx advanced it tries to
disable notification again but it fails.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  2:45 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <62908af8-ae65-c8d3-6f37-941471e0780e@redhat.com>

Hi Jason,

On 2018/06/29 18:30, Jason Wang wrote:
> On 2018年06月29日 16:09, Toshiaki Makita wrote:
...
>> To fix this, poll the work instead of enabling notification when
>> busypoll is interrupted by something. IMHO signal_pending() and
>> vhost_has_work() are kind of interruptions rather than signals to
>> completely cancel the busypoll, so let's run busypoll after the
>> necessary work is done. In order to avoid too long busyloop due to
>> interruption, save the endtime in vq field and use it when reentering
>> the work function.
> 
> I think we don't care long busyloop unless e.g tx can starve rx?

I just want to keep it user-controllable. Unless memorizing it busypoll
can run unexpectedly long.

>> I observed this problem makes tx performance poor but does not rx. I
>> guess it is because rx notification from socket is not that heavy so
>> did not impact the performance even when we failed to mask the
>> notification.
> 
> I think the reason is:
> 
> For tx, we may only process used ring under handle_tx(). Busy polling
> code can not recognize this case.
> For rx, we call peek_head_len() after exiting busy loop, so if the work
> is for rx, it can be processed in handle_rx() immediately.

Sorry but I don't see the difference. Tx busypoll calls
vhost_get_vq_desc() immediately after busypoll exits in
vhost_net_tx_get_vq_desc().

The scenario I described above (in 2nd paragraph) is a bit simplified.
To be clearer what I think is happening is:

1. handle_tx() runs busypoll with notification enabled (due to zerocopy
   callback or something).
2. Guest produces a packet in vring.
3. handle_tx() busypoll detects the produced packet and get it.
4. While vhost is processing the packet, guest kicks vring because it
   produced the packet. Vring notification is disabled automatically by
   event_idx and tx work is queued.
5. After processing the packet vhost enters busyloop again, but detects
   tx work and exits busyloop immediately. Then handle_tx() exits after
   enabling the notification.
6. Because the queued work was tx, handle_tx() is called immediately
   again. handle_tx() runs busypoll with notification enabled.
7. Repeat 2-6.

> 
>> Anyway for consistency I fixed rx routine as well as tx.
>>
>> Performance numbers:
>>
>> - Bulk transfer from guest to external physical server.
>>      [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
> 
> Just to confirm in this case since zerocopy is enabled, we are in fact
> use the generic XDP datapath?

For some reason zerocopy was not applied for most packets, so in most
cases driver XDP was used. I was going to dig into it but not yet.

> 
>> - Set 10us busypoll.
>> - Guest disables checksum and TSO because of host XDP.
>> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>>    (EPT_MISCONFIG event).
>>
>>                              Before              After
>>                            Mbps  kicks/s      Mbps  kicks/s
>> UDP_STREAM 1472byte              247758                 27
>>                  Send   3645.37            6958.10
>>                  Recv   3588.56            6958.10
>>                1byte                9865                 37
>>                  Send      4.34               5.43
>>                  Recv      4.17               5.26
>> TCP_STREAM             8801.03    45794   9592.77     2884
> 
> Impressive numbers.
> 
>>
>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>> ---
...
>> +static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
>> +{
>> +    return unlikely(signal_pending(current)) || vhost_has_work(dev);
>>   }
>>     static void vhost_net_disable_vq(struct vhost_net *n,
>> @@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct
>> vhost_net *net,
>>         if (r == vq->num && vq->busyloop_timeout) {
>>           preempt_disable();
>> -        endtime = busy_clock() + vq->busyloop_timeout;
>> -        while (vhost_can_busy_poll(vq->dev, endtime) &&
>> -               vhost_vq_avail_empty(vq->dev, vq))
>> +        if (vq->busyloop_endtime) {
>> +            endtime = vq->busyloop_endtime;
>> +            vq->busyloop_endtime = 0;
> 
> Looks like endtime may be before current time. So we skip busy loop in
> this case.

Sure, I'll add a condition.

> 
>> +        } else {
>> +            endtime = busy_clock() + vq->busyloop_timeout;
>> +        }
>> +        while (vhost_can_busy_poll(endtime)) {
>> +            if (vhost_busy_poll_interrupted(vq->dev)) {
>> +                vq->busyloop_endtime = endtime;
>> +                break;
>> +            }
>> +            if (!vhost_vq_avail_empty(vq->dev, vq))
>> +                break;
>>               cpu_relax();
>> +        }
>>           preempt_enable();
>>           r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>>                         out_num, in_num, NULL, NULL);
...
>> @@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct
>> vhost_net_virtqueue *nvq)
>>     static int vhost_net_rx_peek_head_len(struct vhost_net *net,
>> struct sock *sk)
>>   {
>> -    struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
>> -    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>> -    struct vhost_virtqueue *vq = &nvq->vq;
>> +    struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>> +    struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>> +    struct vhost_virtqueue *rvq = &rnvq->vq;
>> +    struct vhost_virtqueue *tvq = &tnvq->vq;
> 
> NIt: I admit the variable name is confusing. It would be better to do
> the renaming in a separate patch to ease review.

OK.

> 
>>       unsigned long uninitialized_var(endtime);
>> -    int len = peek_head_len(rvq, sk);
>> +    int len = peek_head_len(rnvq, sk);
>>   -    if (!len && vq->busyloop_timeout) {
>> +    if (!len && tvq->busyloop_timeout) {
>>           /* Flush batched heads first */
>> -        vhost_rx_signal_used(rvq);
>> +        vhost_rx_signal_used(rnvq);
>>           /* Both tx vq and rx socket were polled here */
>> -        mutex_lock_nested(&vq->mutex, 1);
>> -        vhost_disable_notify(&net->dev, vq);
>> +        mutex_lock_nested(&tvq->mutex, 1);
>> +        vhost_disable_notify(&net->dev, tvq);
>>             preempt_disable();
>> -        endtime = busy_clock() + vq->busyloop_timeout;
>> +        if (rvq->busyloop_endtime) {
>> +            endtime = rvq->busyloop_endtime;
>> +            rvq->busyloop_endtime = 0;
>> +        } else {
>> +            endtime = busy_clock() + tvq->busyloop_timeout;
>> +        }
>>   -        while (vhost_can_busy_poll(&net->dev, endtime) &&
>> -               !sk_has_rx_data(sk) &&
>> -               vhost_vq_avail_empty(&net->dev, vq))
>> +        while (vhost_can_busy_poll(endtime)) {
>> +            if (vhost_busy_poll_interrupted(&net->dev)) {
>> +                rvq->busyloop_endtime = endtime;
>> +                break;
>> +            }
>> +            if (sk_has_rx_data(sk) ||
>> +                !vhost_vq_avail_empty(&net->dev, tvq))
>> +                break;
>>               cpu_relax();
> 
> Actually, I plan to poll guest rx refilling as well here to avoid rx
> kicks. Want to draft another patch to do it as well? It's as simple as
> add a vhost_vq_avail_empty for rvq above.

OK. will try it.

> 
>> +        }
>>             preempt_enable();
>>   -        if (!vhost_vq_avail_empty(&net->dev, vq))
>> -            vhost_poll_queue(&vq->poll);
>> -        else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>> -            vhost_disable_notify(&net->dev, vq);
>> -            vhost_poll_queue(&vq->poll);
>> +        if (!vhost_vq_avail_empty(&net->dev, tvq)) {
>> +            vhost_poll_queue(&tvq->poll);
>> +        } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
>> +            vhost_disable_notify(&net->dev, tvq);
>> +            vhost_poll_queue(&tvq->poll);
>>           }
>>   -        mutex_unlock(&vq->mutex);
>> +        mutex_unlock(&tvq->mutex);
>>   -        len = peek_head_len(rvq, sk);
>> +        len = peek_head_len(rnvq, sk);
>>       }
>>         return len;
...
>> @@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
>>               goto out;
>>           }
>>       }
>> -    vhost_net_enable_vq(net, vq);
>> +    if (unlikely(vq->busyloop_endtime)) {
>> +        /* Busy poll is interrupted. */
>> +        vhost_poll_queue(&vq->poll);
>> +    } else {
>> +        vhost_net_enable_vq(net, vq);
>> +    }
> 
> This is to reduce the rx wake ups. Better with another patch.
> 
> So I suggest to split the patches into:
> 
> 1 better naming of variable of vhost_net_rx_peek_head_len().
> 2 avoid tx kicks (most of what this patch did)
> 3 avoid rx wakeups (exactly the above 6 lines did)
> 4 avoid rx kicks

OK, I'll reorganize the patch.
Thank you for your feedback!

> 
> Btw, tonghao is doing some refactoring of busy polling as well. Depends
> on the order of being merged, one of you may need rebasing.

Thanks for sharing. I'll take a look.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  2:41 UTC (permalink / raw)
  To: Michael S. Tsirkin, Toshiaki Makita; +Cc: netdev, kvm, virtualization
In-Reply-To: <20180629193448-mutt-send-email-mst@kernel.org>



On 2018年06月30日 00:38, Michael S. Tsirkin wrote:
> On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
>> Under heavy load vhost busypoll may run without suppressing
>> notification. For example tx zerocopy callback can push tx work while
>> handle_tx() is running, then busyloop exits due to vhost_has_work()
>> condition and enables notification but immediately reenters handle_tx()
>> because the pushed work was tx. In this case handle_tx() tries to
>> disable notification again, but when using event_idx it by design
>> cannot. Then busyloop will run without suppressing notification.
>> Another example is the case where handle_tx() tries to enable
>> notification but avail idx is advanced so disables it again. This case
>> also lead to the same situation with event_idx.
>>
>> The problem is that once we enter this situation busyloop does not work
>> under heavy load for considerable amount of time, because notification
>> is likely to happen during busyloop and handle_tx() immediately enables
>> notification after notification happens. Specifically busyloop detects
>> notification by vhost_has_work() and then handle_tx() calls
>> vhost_enable_notify().
> I'd like to understand the problem a bit better.
> Why does this happen?
> Doesn't this only happen if ring is empty?
>

My understanding is:

vhost_zerocopy_callback() try to poll vhost virtqueue. This will cause 
the busy loop in vhost_net_tx_get_vq_desc() to exit because of 
vhost_has_work() return true. Then handle_tx() tends to enable 
notification. Then guest may kick us even if handle_tx() call 
vhost_disable_notify() which in fact did nothing for even index.

Maybe we can try to call vhost_zerocopy_signal_used() if we found 
there's pending used from zerocopy instead.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 4/4] net: vhost: add rx busy polling in tx path
From: Jason Wang @ 2018-07-02  2:32 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-5-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> This patch improves the guest receive and transmit performance.
> On the handle_tx side, we poll the sock receive queue at the
> same time. handle_rx do that in the same way.
>
> We set the poll-us=100us and use the iperf3 to test
> its bandwidth, use the netperf to test throughput and mean
> latency. When running the tests, the vhost-net kthread of
> that VM, is alway 100% CPU. The commands are shown as below.
>
> iperf3  -s -D
> iperf3  -c IP -i 1 -P 1 -t 20 -M 1400
>
> or
> netserver
> netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"
>
> host -> guest:
> iperf3:
> * With the patch:     27.0 Gbits/sec
> * Without the patch:  14.4 Gbits/sec
>
> netperf (TCP_RR):
> * With the patch:     48039.56 trans/s, 20.64us mean latency
> * Without the patch:  46027.07 trans/s, 21.58us mean latency
>
> This patch also improves the guest transmit performance.
>
> guest -> host:
> iperf3:
> * With the patch:     27.2 Gbits/sec
> * Without the patch:  24.4 Gbits/sec
>
> netperf (TCP_RR):
> * With the patch:     47963.25 trans/s, 20.71us mean latency
> * Without the patch:  45796.70 trans/s, 21.68us mean latency
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/net.c | 10 +++-------
>   1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 458f81d..fb43d82 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -478,17 +478,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>   				    struct iovec iov[], unsigned int iov_size,
>   				    unsigned int *out_num, unsigned int *in_num)
>   {
> -	unsigned long uninitialized_var(endtime);
> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
>   	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>   				  out_num, in_num, NULL, NULL);
>   
>   	if (r == vq->num && vq->busyloop_timeout) {
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> -			cpu_relax();
> -		preempt_enable();
> +		vhost_net_busy_poll(net, &nvq_rx->vq, vq, false);
> +
>   		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>   				      out_num, in_num, NULL, NULL);
>   	}

Looks good to me.

A nit is "rnvq" looks better.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Jason Wang @ 2018-07-02  2:29 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-4-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> Factor out generic busy polling logic and will be
> used for tx path in the next patch. And with the patch,
> qemu can set differently the busyloop_timeout for rx queue.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
>   1 file changed, 53 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 62bb8e8..458f81d 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>   	return vhost_poll_start(poll, sock->file);
>   }
>   
> +static int sk_has_rx_data(struct sock *sk)
> +{
> +	struct socket *sock = sk->sk_socket;
> +
> +	if (sock->ops->peek_len)
> +		return sock->ops->peek_len(sock);
> +
> +	return skb_queue_empty(&sk->sk_receive_queue);
> +}
> +
> +static void vhost_net_busy_poll(struct vhost_net *net,
> +				struct vhost_virtqueue *rvq,
> +				struct vhost_virtqueue *tvq,
> +				bool rx)
> +{
> +	unsigned long uninitialized_var(endtime);
> +	struct socket *sock = rvq->private_data;
> +	struct vhost_virtqueue *vq = rx ? tvq : rvq;
> +	unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
> +					      tvq->busyloop_timeout;

As simple as vq->busyloop_timeout?

> +
> +	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);

We need move sock = rvq->private_data under the protection of vq mutex 
if rx is false.

> +	vhost_disable_notify(&net->dev, vq);
> +
> +	preempt_disable();
> +	endtime = busy_clock() + busyloop_timeout;
> +	while (vhost_can_busy_poll(tvq->dev, endtime) &&
> +	       !(sock && sk_has_rx_data(sock->sk)) &&
> +	       vhost_vq_avail_empty(tvq->dev, tvq))
> +		cpu_relax();
> +	preempt_enable();
> +
> +	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> +	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> +		vhost_poll_queue(&vq->poll);
> +	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> +		vhost_disable_notify(&net->dev, vq);
> +		vhost_poll_queue(&vq->poll);
> +	}
> +
> +	mutex_unlock(&vq->mutex);
> +}
> +
> +
>   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>   				    struct vhost_virtqueue *vq,
>   				    struct iovec iov[], unsigned int iov_size,
> @@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>   	return len;
>   }
>   
> -static int sk_has_rx_data(struct sock *sk)
> -{
> -	struct socket *sock = sk->sk_socket;
> -
> -	if (sock->ops->peek_len)
> -		return sock->ops->peek_len(sock);
> -
> -	return skb_queue_empty(&sk->sk_receive_queue);
> -}
> -
>   static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>   {
>   	struct vhost_virtqueue *vq = &nvq->vq;
> @@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>   
>   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>   {
> -	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> -	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> -	struct vhost_virtqueue *vq = &nvq->vq;
> -	unsigned long uninitialized_var(endtime);
> -	int len = peek_head_len(rvq, sk);
> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> +	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];

It looks to me rnvq and tnvq is slightly better.

Other looks good to me.

Thanks

>   
> -	if (!len && vq->busyloop_timeout) {
> -		/* Flush batched heads first */
> -		vhost_rx_signal_used(rvq);
> -		/* Both tx vq and rx socket were polled here */
> -		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> -		vhost_disable_notify(&net->dev, vq);
> +	int len = peek_head_len(nvq_rx, sk);
>   
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -
> -		while (vhost_can_busy_poll(&net->dev, endtime) &&
> -		       !sk_has_rx_data(sk) &&
> -		       vhost_vq_avail_empty(&net->dev, vq))
> -			cpu_relax();
> -
> -		preempt_enable();
> -
> -		if (!vhost_vq_avail_empty(&net->dev, vq))
> -			vhost_poll_queue(&vq->poll);
> -		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> -			vhost_disable_notify(&net->dev, vq);
> -			vhost_poll_queue(&vq->poll);
> -		}
> +	if (!len && nvq_rx->vq.busyloop_timeout) {
> +		/* Flush batched heads first */
> +		vhost_rx_signal_used(nvq_rx);
>   
> -		mutex_unlock(&vq->mutex);
> +		/* Both tx vq and rx socket were polled here */
> +		vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
>   
> -		len = peek_head_len(rvq, sk);
> +		len = peek_head_len(nvq_rx, sk);
>   	}
>   
>   	return len;

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 2/4] net: vhost: replace magic number of lock annotation
From: Jason Wang @ 2018-07-02  2:21 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-3-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/net.c | 6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index e7cf7d2..62bb8e8 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
>   	bool zcopy, zcopy_used;
>   	int sent_pkts = 0;
>   
> -	mutex_lock(&vq->mutex);
> +	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>   	sock = vq->private_data;
>   	if (!sock)
>   		goto out;
> @@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>   		/* Flush batched heads first */
>   		vhost_rx_signal_used(rvq);
>   		/* Both tx vq and rx socket were polled here */
> -		mutex_lock_nested(&vq->mutex, 1);
> +		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>   		vhost_disable_notify(&net->dev, vq);
>   
>   		preempt_disable();
> @@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
>   	__virtio16 num_buffers;
>   	int recv_pkts = 0;
>   
> -	mutex_lock_nested(&vq->mutex, 0);
> +	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
>   	sock = vq->private_data;
>   	if (!sock)
>   		goto out;

Acked-by: Jason Wang <jasowang@redhat.com>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 1/4] net: vhost: lock the vqs one by one
From: Jason Wang @ 2018-07-02  2:21 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-2-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> This patch changes the way that lock all vqs
> at the same, to lock them one by one. It will
> be used for next patch to avoid the deadlock.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/vhost.c | 24 +++++++-----------------
>   1 file changed, 7 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 895eaa2..4ca9383 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>   {
>   	int i;
>   
> -	for (i = 0; i < d->nvqs; ++i)
> +	for (i = 0; i < d->nvqs; ++i) {
> +		mutex_lock(&d->vqs[i]->mutex);
>   		__vhost_vq_meta_reset(d->vqs[i]);
> +		mutex_unlock(&d->vqs[i]->mutex);
> +	}
>   }
>   
>   static void vhost_vq_reset(struct vhost_dev *dev,
> @@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>   #define vhost_get_used(vq, x, ptr) \
>   	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
>   
> -static void vhost_dev_lock_vqs(struct vhost_dev *d)
> -{
> -	int i = 0;
> -	for (i = 0; i < d->nvqs; ++i)
> -		mutex_lock_nested(&d->vqs[i]->mutex, i);
> -}
> -
> -static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> -{
> -	int i = 0;
> -	for (i = 0; i < d->nvqs; ++i)
> -		mutex_unlock(&d->vqs[i]->mutex);
> -}
> -
>   static int vhost_new_umem_range(struct vhost_umem *umem,
>   				u64 start, u64 size, u64 end,
>   				u64 userspace_addr, int perm)
> @@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
>   		if (msg->iova <= vq_msg->iova &&
>   		    msg->iova + msg->size - 1 > vq_msg->iova &&
>   		    vq_msg->type == VHOST_IOTLB_MISS) {
> +			mutex_lock(&node->vq->mutex);
>   			vhost_poll_queue(&node->vq->poll);
> +			mutex_unlock(&node->vq->mutex);
> +
>   			list_del(&node->node);
>   			kfree(node);
>   		}
> @@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
>   	int ret = 0;
>   
>   	mutex_lock(&dev->mutex);
> -	vhost_dev_lock_vqs(dev);
>   	switch (msg->type) {
>   	case VHOST_IOTLB_UPDATE:
>   		if (!dev->iotlb) {
> @@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
>   		break;
>   	}
>   
> -	vhost_dev_unlock_vqs(dev);
>   	mutex_unlock(&dev->mutex);
>   
>   	return ret;

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 4/4] net: vhost: add rx busy polling in tx path
From: Tonghao Zhang @ 2018-06-30  8:48 UTC (permalink / raw)
  To: brouer; +Cc: mst, Linux Kernel Network Developers, virtualization,
	Tonghao Zhang
In-Reply-To: <20180630090303.043a06c7@redhat.com>

On Sat, Jun 30, 2018 at 3:03 PM Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Fri, 29 Jun 2018 23:33:58 -0700
> xiangxia.m.yue@gmail.com wrote:
>
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > This patch improves the guest receive and transmit performance.
> > On the handle_tx side, we poll the sock receive queue at the
> > same time. handle_rx do that in the same way.
> >
> > We set the poll-us=100us and use the iperf3 to test
>
> Where/how do you configure poll-us=100us ?
>
> Are you talking about /proc/sys/net/core/busy_poll ?
No, we set the poll-us in qemu. e.g
-netdev tap,ifname=tap0,id=hostnet0,vhost=on,script=no,downscript=no,poll-us=100

>
> p.s. Nice performance boost! :-)
>
> > its bandwidth, use the netperf to test throughput and mean
> > latency. When running the tests, the vhost-net kthread of
> > that VM, is alway 100% CPU. The commands are shown as below.
> >
> > iperf3  -s -D
> > iperf3  -c IP -i 1 -P 1 -t 20 -M 1400
> >
> > or
> > netserver
> > netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"
> >
> > host -> guest:
> > iperf3:
> > * With the patch:     27.0 Gbits/sec
> > * Without the patch:  14.4 Gbits/sec
> >
> > netperf (TCP_RR):
> > * With the patch:     48039.56 trans/s, 20.64us mean latency
> > * Without the patch:  46027.07 trans/s, 21.58us mean latency
> >
> > This patch also improves the guest transmit performance.
> >
> > guest -> host:
> > iperf3:
> > * With the patch:     27.2 Gbits/sec
> > * Without the patch:  24.4 Gbits/sec
> >
> > netperf (TCP_RR):
> > * With the patch:     47963.25 trans/s, 20.71us mean latency
> > * Without the patch:  45796.70 trans/s, 21.68us mean latency
> >
> > Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net-next v3 4/4] net: vhost: add rx busy polling in tx path
From: Jesper Dangaard Brouer @ 2018-06-30  7:03 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: mst, netdev, brouer, virtualization, Tonghao Zhang
In-Reply-To: <1530340438-3039-5-git-send-email-xiangxia.m.yue@gmail.com>

On Fri, 29 Jun 2018 23:33:58 -0700
xiangxia.m.yue@gmail.com wrote:

> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> 
> This patch improves the guest receive and transmit performance.
> On the handle_tx side, we poll the sock receive queue at the
> same time. handle_rx do that in the same way.
> 
> We set the poll-us=100us and use the iperf3 to test

Where/how do you configure poll-us=100us ?

Are you talking about /proc/sys/net/core/busy_poll ?


p.s. Nice performance boost! :-)

> its bandwidth, use the netperf to test throughput and mean
> latency. When running the tests, the vhost-net kthread of
> that VM, is alway 100% CPU. The commands are shown as below.
> 
> iperf3  -s -D
> iperf3  -c IP -i 1 -P 1 -t 20 -M 1400
> 
> or
> netserver
> netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"
> 
> host -> guest:
> iperf3:
> * With the patch:     27.0 Gbits/sec
> * Without the patch:  14.4 Gbits/sec
> 
> netperf (TCP_RR):
> * With the patch:     48039.56 trans/s, 20.64us mean latency
> * Without the patch:  46027.07 trans/s, 21.58us mean latency
> 
> This patch also improves the guest transmit performance.
> 
> guest -> host:
> iperf3:
> * With the patch:     27.2 Gbits/sec
> * Without the patch:  24.4 Gbits/sec
> 
> netperf (TCP_RR):
> * With the patch:     47963.25 trans/s, 20.71us mean latency
> * Without the patch:  45796.70 trans/s, 21.68us mean latency
> 
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH net-next v3 4/4] net: vhost: add rx busy polling in tx path
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patch improves the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx do that in the same way.

We set the poll-us=100us and use the iperf3 to test
its bandwidth, use the netperf to test throughput and mean
latency. When running the tests, the vhost-net kthread of
that VM, is alway 100% CPU. The commands are shown as below.

iperf3  -s -D
iperf3  -c IP -i 1 -P 1 -t 20 -M 1400

or
netserver
netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"

host -> guest:
iperf3:
* With the patch:     27.0 Gbits/sec
* Without the patch:  14.4 Gbits/sec

netperf (TCP_RR):
* With the patch:     48039.56 trans/s, 20.64us mean latency
* Without the patch:  46027.07 trans/s, 21.58us mean latency

This patch also improves the guest transmit performance.

guest -> host:
iperf3:
* With the patch:     27.2 Gbits/sec
* Without the patch:  24.4 Gbits/sec

netperf (TCP_RR):
* With the patch:     47963.25 trans/s, 20.71us mean latency
* Without the patch:  45796.70 trans/s, 21.68us mean latency

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 458f81d..fb43d82 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -478,17 +478,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
-	unsigned long uninitialized_var(endtime);
+	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
 	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-		while (vhost_can_busy_poll(vq->dev, endtime) &&
-		       vhost_vq_avail_empty(vq->dev, vq))
-			cpu_relax();
-		preempt_enable();
+		vhost_net_busy_poll(net, &nvq_rx->vq, vq, false);
+
 		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				      out_num, in_num, NULL, NULL);
 	}
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Factor out generic busy polling logic and will be
used for tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..458f81d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
+static int sk_has_rx_data(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (sock->ops->peek_len)
+		return sock->ops->peek_len(sock);
+
+	return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+				struct vhost_virtqueue *rvq,
+				struct vhost_virtqueue *tvq,
+				bool rx)
+{
+	unsigned long uninitialized_var(endtime);
+	struct socket *sock = rvq->private_data;
+	struct vhost_virtqueue *vq = rx ? tvq : rvq;
+	unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
+					      tvq->busyloop_timeout;
+
+	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+	vhost_disable_notify(&net->dev, vq);
+
+	preempt_disable();
+	endtime = busy_clock() + busyloop_timeout;
+	while (vhost_can_busy_poll(tvq->dev, endtime) &&
+	       !(sock && sk_has_rx_data(sock->sk)) &&
+	       vhost_vq_avail_empty(tvq->dev, tvq))
+		cpu_relax();
+	preempt_enable();
+
+	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+		vhost_poll_queue(&vq->poll);
+	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+		vhost_disable_notify(&net->dev, vq);
+		vhost_poll_queue(&vq->poll);
+	}
+
+	mutex_unlock(&vq->mutex);
+}
+
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
 				    struct iovec iov[], unsigned int iov_size,
@@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	return len;
 }
 
-static int sk_has_rx_data(struct sock *sk)
-{
-	struct socket *sock = sk->sk_socket;
-
-	if (sock->ops->peek_len)
-		return sock->ops->peek_len(sock);
-
-	return skb_queue_empty(&sk->sk_receive_queue);
-}
-
 static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
-	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
-	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
-	struct vhost_virtqueue *vq = &nvq->vq;
-	unsigned long uninitialized_var(endtime);
-	int len = peek_head_len(rvq, sk);
+	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
 
-	if (!len && vq->busyloop_timeout) {
-		/* Flush batched heads first */
-		vhost_rx_signal_used(rvq);
-		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
-		vhost_disable_notify(&net->dev, vq);
+	int len = peek_head_len(nvq_rx, sk);
 
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-
-		while (vhost_can_busy_poll(&net->dev, endtime) &&
-		       !sk_has_rx_data(sk) &&
-		       vhost_vq_avail_empty(&net->dev, vq))
-			cpu_relax();
-
-		preempt_enable();
-
-		if (!vhost_vq_avail_empty(&net->dev, vq))
-			vhost_poll_queue(&vq->poll);
-		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-			vhost_disable_notify(&net->dev, vq);
-			vhost_poll_queue(&vq->poll);
-		}
+	if (!len && nvq_rx->vq.busyloop_timeout) {
+		/* Flush batched heads first */
+		vhost_rx_signal_used(nvq_rx);
 
-		mutex_unlock(&vq->mutex);
+		/* Both tx vq and rx socket were polled here */
+		vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
 
-		len = peek_head_len(rvq, sk);
+		len = peek_head_len(nvq_rx, sk);
 	}
 
 	return len;
-- 
1.8.3.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox