* [PATCH v2 net-next 3/4] vhost_net: Avoid rx queue wake-ups during busypoll
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
We may run handle_rx() while rx work is queued. For example a packet can
push the rx work during the window before handle_rx calls
vhost_net_disable_vq().
In that case busypoll immediately exits due to vhost_has_work()
condition and enables vq again. This can lead to another unnecessary rx
wake-ups, so poll rx work instead of enabling the vq.
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
drivers/vhost/net.c | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 811c0e5..791bc8b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -653,7 +653,8 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
nvq->done_idx = 0;
}
-static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
+static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
+ bool *busyloop_intr)
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -671,11 +672,16 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
preempt_disable();
endtime = busy_clock() + tvq->busyloop_timeout;
- while (vhost_can_busy_poll(endtime) &&
- !vhost_has_work(&net->dev) &&
- !sk_has_rx_data(sk) &&
- vhost_vq_avail_empty(&net->dev, tvq))
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+ if (sk_has_rx_data(sk) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
cpu_relax();
+ }
preempt_enable();
@@ -795,6 +801,7 @@ static void handle_rx(struct vhost_net *net)
s16 headcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
+ bool busyloop_intr = false;
struct socket *sock;
struct iov_iter fixup;
__virtio16 num_buffers;
@@ -818,7 +825,9 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+ while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr))) {
+ busyloop_intr = false;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -905,7 +914,10 @@ static void handle_rx(struct vhost_net *net)
goto out;
}
}
- vhost_net_enable_vq(net, vq);
+ if (unlikely(busyloop_intr))
+ vhost_poll_queue(&vq->poll);
+ else
+ vhost_net_enable_vq(net, vq);
out:
vhost_rx_signal_used(nvq);
mutex_unlock(&vq->mutex);
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2 net-next 4/4] vhost_net: Avoid rx vring kicks during busyloop
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
We may run out of avail rx ring descriptor under heavy load but busypoll
did not detect it so busypoll may have exited prematurely. Avoid this by
checking rx ring full during busypoll.
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
drivers/vhost/net.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 791bc8b..b224036 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -658,6 +658,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
unsigned long uninitialized_var(endtime);
int len = peek_head_len(rnvq, sk);
@@ -677,7 +678,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
*busyloop_intr = true;
break;
}
- if (sk_has_rx_data(sk) ||
+ if ((sk_has_rx_data(sk) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
!vhost_vq_avail_empty(&net->dev, tvq))
break;
cpu_relax();
@@ -827,7 +829,6 @@ static void handle_rx(struct vhost_net *net)
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
&busyloop_intr))) {
- busyloop_intr = false;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -838,7 +839,9 @@ static void handle_rx(struct vhost_net *net)
goto out;
/* OK, now we need to know about added descriptors. */
if (!headcount) {
- if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
/* They have slipped one in as we were
* doing that: check again. */
vhost_disable_notify(&net->dev, vq);
@@ -848,6 +851,7 @@ static void handle_rx(struct vhost_net *net)
* they refilled. */
goto out;
}
+ busyloop_intr = false;
if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH v2 net-next 1/4] vhost_net: Rename local variables in vhost_net_rx_peek_head_len
From: Jason Wang @ 2018-07-03 9:03 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <1530603094-2476-2-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On 2018年07月03日 15:31, Toshiaki Makita wrote:
> So we can easily see which variable is for which, tx or rx.
>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
> drivers/vhost/net.c | 34 +++++++++++++++++-----------------
> 1 file changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 29756d8..3939c50 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -647,39 +647,39 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>
> static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> {
> - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> - struct vhost_virtqueue *vq = &nvq->vq;
> + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> + struct vhost_virtqueue *tvq = &tnvq->vq;
> unsigned long uninitialized_var(endtime);
> - int len = peek_head_len(rvq, sk);
> + int len = peek_head_len(rnvq, sk);
>
> - if (!len && vq->busyloop_timeout) {
> + if (!len && tvq->busyloop_timeout) {
> /* Flush batched heads first */
> - vhost_rx_signal_used(rvq);
> + vhost_rx_signal_used(rnvq);
> /* Both tx vq and rx socket were polled here */
> - mutex_lock_nested(&vq->mutex, 1);
> - vhost_disable_notify(&net->dev, vq);
> + mutex_lock_nested(&tvq->mutex, 1);
> + vhost_disable_notify(&net->dev, tvq);
>
> preempt_disable();
> - endtime = busy_clock() + vq->busyloop_timeout;
> + endtime = busy_clock() + tvq->busyloop_timeout;
>
> while (vhost_can_busy_poll(&net->dev, endtime) &&
> !sk_has_rx_data(sk) &&
> - vhost_vq_avail_empty(&net->dev, vq))
> + vhost_vq_avail_empty(&net->dev, tvq))
> cpu_relax();
>
> preempt_enable();
>
> - if (!vhost_vq_avail_empty(&net->dev, vq))
> - vhost_poll_queue(&vq->poll);
> - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> - vhost_disable_notify(&net->dev, vq);
> - vhost_poll_queue(&vq->poll);
> + if (!vhost_vq_avail_empty(&net->dev, tvq)) {
> + vhost_poll_queue(&tvq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
> + vhost_disable_notify(&net->dev, tvq);
> + vhost_poll_queue(&tvq->poll);
> }
>
> - mutex_unlock(&vq->mutex);
> + mutex_unlock(&tvq->mutex);
>
> - len = peek_head_len(rvq, sk);
> + len = peek_head_len(rnvq, sk);
> }
>
> return len;
Acked-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 2/4] vhost_net: Avoid tx vring kicks during busyloop
From: Jason Wang @ 2018-07-03 9:04 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <1530603094-2476-3-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On 2018年07月03日 15:31, Toshiaki Makita wrote:
> Under heavy load vhost busypoll may run without suppressing
> notification. For example tx zerocopy callback can push tx work while
> handle_tx() is running, then busyloop exits due to vhost_has_work()
> condition and enables notification but immediately reenters handle_tx()
> because the pushed work was tx. In this case handle_tx() tries to
> disable notification again, but when using event_idx it by design
> cannot. Then busyloop will run without suppressing notification.
> Another example is the case where handle_tx() tries to enable
> notification but avail idx is advanced so disables it again. This case
> also leads to the same situation with event_idx.
>
> The problem is that once we enter this situation busyloop does not work
> under heavy load for considerable amount of time, because notification
> is likely to happen during busyloop and handle_tx() immediately enables
> notification after notification happens. Specifically busyloop detects
> notification by vhost_has_work() and then handle_tx() calls
> vhost_enable_notify(). Because the detected work was the tx work, it
> enters handle_tx(), and enters busyloop without suppression again.
> This is likely to be repeated, so with event_idx we are almost not able
> to suppress notification in this case.
>
> To fix this, poll the work instead of enabling notification when
> busypoll is interrupted by something. IMHO vhost_has_work() is kind of
> interruption rather than a signal to completely cancel the busypoll, so
> let's run busypoll after the necessary work is done.
>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
> drivers/vhost/net.c | 35 ++++++++++++++++++++++-------------
> 1 file changed, 22 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 3939c50..811c0e5 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -396,13 +396,10 @@ static inline unsigned long busy_clock(void)
> return local_clock() >> 10;
> }
>
> -static bool vhost_can_busy_poll(struct vhost_dev *dev,
> - unsigned long endtime)
> +static bool vhost_can_busy_poll(unsigned long endtime)
> {
> - return likely(!need_resched()) &&
> - likely(!time_after(busy_clock(), endtime)) &&
> - likely(!signal_pending(current)) &&
> - !vhost_has_work(dev);
> + return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
> + !signal_pending(current));
> }
>
> static void vhost_net_disable_vq(struct vhost_net *n,
> @@ -434,7 +431,8 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num)
> + unsigned int *out_num, unsigned int *in_num,
> + bool *busyloop_intr)
> {
> unsigned long uninitialized_var(endtime);
> int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> @@ -443,9 +441,15 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> if (r == vq->num && vq->busyloop_timeout) {
> preempt_disable();
> endtime = busy_clock() + vq->busyloop_timeout;
> - while (vhost_can_busy_poll(vq->dev, endtime) &&
> - vhost_vq_avail_empty(vq->dev, vq))
> + while (vhost_can_busy_poll(endtime)) {
> + if (vhost_has_work(vq->dev)) {
> + *busyloop_intr = true;
> + break;
> + }
> + if (!vhost_vq_avail_empty(vq->dev, vq))
> + break;
> cpu_relax();
> + }
> preempt_enable();
> r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> out_num, in_num, NULL, NULL);
> @@ -501,20 +505,24 @@ static void handle_tx(struct vhost_net *net)
> zcopy = nvq->ubufs;
>
> for (;;) {
> + bool busyloop_intr;
> +
> /* Release DMAs done buffers first */
> if (zcopy)
> vhost_zerocopy_signal_used(net, vq);
>
> -
> + busyloop_intr = false;
> head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> ARRAY_SIZE(vq->iov),
> - &out, &in);
> + &out, &in, &busyloop_intr);
> /* On error, stop handling until the next kick. */
> if (unlikely(head < 0))
> break;
> /* Nothing new? Wait for eventfd to tell us they refilled. */
> if (head == vq->num) {
> - if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> + if (unlikely(busyloop_intr)) {
> + vhost_poll_queue(&vq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> vhost_disable_notify(&net->dev, vq);
> continue;
> }
> @@ -663,7 +671,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> preempt_disable();
> endtime = busy_clock() + tvq->busyloop_timeout;
>
> - while (vhost_can_busy_poll(&net->dev, endtime) &&
> + while (vhost_can_busy_poll(endtime) &&
> + !vhost_has_work(&net->dev) &&
> !sk_has_rx_data(sk) &&
> vhost_vq_avail_empty(&net->dev, tvq))
> cpu_relax();
Acked-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 4/4] vhost_net: Avoid rx vring kicks during busyloop
From: Jason Wang @ 2018-07-03 9:05 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <1530603094-2476-5-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On 2018年07月03日 15:31, Toshiaki Makita wrote:
> We may run out of avail rx ring descriptor under heavy load but busypoll
> did not detect it so busypoll may have exited prematurely. Avoid this by
> checking rx ring full during busypoll.
Actually, we're checking whether it was empty in fact?
>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
> drivers/vhost/net.c | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 791bc8b..b224036 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -658,6 +658,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
> {
> struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> + struct vhost_virtqueue *rvq = &rnvq->vq;
> struct vhost_virtqueue *tvq = &tnvq->vq;
> unsigned long uninitialized_var(endtime);
> int len = peek_head_len(rnvq, sk);
> @@ -677,7 +678,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
> *busyloop_intr = true;
> break;
> }
> - if (sk_has_rx_data(sk) ||
> + if ((sk_has_rx_data(sk) &&
> + !vhost_vq_avail_empty(&net->dev, rvq)) ||
> !vhost_vq_avail_empty(&net->dev, tvq))
> break;
> cpu_relax();
> @@ -827,7 +829,6 @@ static void handle_rx(struct vhost_net *net)
>
I thought below codes should belong to patch 3. Or I may miss something.
Thanks
> while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
> &busyloop_intr))) {
> - busyloop_intr = false;
> sock_len += sock_hlen;
> vhost_len = sock_len + vhost_hlen;
> headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
> @@ -838,7 +839,9 @@ static void handle_rx(struct vhost_net *net)
> goto out;
> /* OK, now we need to know about added descriptors. */
> if (!headcount) {
> - if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> + if (unlikely(busyloop_intr)) {
> + vhost_poll_queue(&vq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> /* They have slipped one in as we were
> * doing that: check again. */
> vhost_disable_notify(&net->dev, vq);
> @@ -848,6 +851,7 @@ static void handle_rx(struct vhost_net *net)
> * they refilled. */
> goto out;
> }
> + busyloop_intr = false;
> if (nvq->rx_ring)
> msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
> /* On overrun, truncate and discard */
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH 14/15] drm/virtio: Remove unecessary dma_fence_ops
From: Daniel Vetter @ 2018-07-03 11:14 UTC (permalink / raw)
To: DRI Development
Cc: David Airlie, Daniel Vetter, Intel Graphics Development,
virtualization
In-Reply-To: <20180503142603.28513-15-daniel.vetter@ffwll.ch>
On Thu, May 03, 2018 at 04:26:02PM +0200, Daniel Vetter wrote:
> dma_fence_default_wait is the default now, same for the trivial
> enable_signaling implementation.
>
> Reviewed-by: Eric Anholt <eric@anholt.net>
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: David Airlie <airlied@linux.ie>
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: virtualization@lists.linux-foundation.org
Ok pulled in the remaining patches here with acks/r-bs, will resend the
others.
-Daniel
> ---
> drivers/gpu/drm/virtio/virtgpu_fence.c | 7 -------
> 1 file changed, 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/virtio/virtgpu_fence.c b/drivers/gpu/drm/virtio/virtgpu_fence.c
> index 23353521f903..00c742a441bf 100644
> --- a/drivers/gpu/drm/virtio/virtgpu_fence.c
> +++ b/drivers/gpu/drm/virtio/virtgpu_fence.c
> @@ -36,11 +36,6 @@ static const char *virtio_get_timeline_name(struct dma_fence *f)
> return "controlq";
> }
>
> -static bool virtio_enable_signaling(struct dma_fence *f)
> -{
> - return true;
> -}
> -
> static bool virtio_signaled(struct dma_fence *f)
> {
> struct virtio_gpu_fence *fence = to_virtio_fence(f);
> @@ -67,9 +62,7 @@ static void virtio_timeline_value_str(struct dma_fence *f, char *str, int size)
> static const struct dma_fence_ops virtio_fence_ops = {
> .get_driver_name = virtio_get_driver_name,
> .get_timeline_name = virtio_get_timeline_name,
> - .enable_signaling = virtio_enable_signaling,
> .signaled = virtio_signaled,
> - .wait = dma_fence_default_wait,
> .fence_value_str = virtio_fence_value_str,
> .timeline_value_str = virtio_timeline_value_str,
> };
> --
> 2.17.0
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
^ permalink raw reply
* Re: [PATCH net-next 1/8] vhost: move get_rx_bufs to vhost.c
From: kbuild test robot @ 2018-07-04 1:01 UTC (permalink / raw)
To: Jason Wang
Cc: kvm, mst, netdev, linux-kernel, virtualization, maxime.coquelin,
kbuild-all, wexu
In-Reply-To: <1530596284-4101-2-git-send-email-jasowang@redhat.com>
[-- Attachment #1: Type: text/plain, Size: 12105 bytes --]
Hi Jason,
I love your patch! Yet something to improve:
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Jason-Wang/Packed-virtqueue-for-vhost/20180703-154751
config: x86_64-randconfig-s0-07032254 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
Note: the linux-review/Jason-Wang/Packed-virtqueue-for-vhost/20180703-154751 HEAD 01b902f1126212ea2597e6d09802bd9c4431bf82 builds fine.
It only hurts bisectibility.
All errors (new ones prefixed by >>):
drivers//vhost/net.c: In function 'handle_rx':
>> drivers//vhost/net.c:738:15: error: implicit declaration of function 'get_rx_bufs' [-Werror=implicit-function-declaration]
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
^~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/get_rx_bufs +738 drivers//vhost/net.c
030881372 Jason Wang 2016-03-04 687
3a4d5c94e Michael S. Tsirkin 2010-01-14 688 /* Expects to be always run from workqueue - which acts as
3a4d5c94e Michael S. Tsirkin 2010-01-14 689 * read-size critical section for our kind of RCU. */
94249369e Jason Wang 2011-01-17 690 static void handle_rx(struct vhost_net *net)
3a4d5c94e Michael S. Tsirkin 2010-01-14 691 {
81f95a558 Michael S. Tsirkin 2013-04-28 692 struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
81f95a558 Michael S. Tsirkin 2013-04-28 693 struct vhost_virtqueue *vq = &nvq->vq;
8dd014adf David Stevens 2010-07-27 694 unsigned uninitialized_var(in), log;
8dd014adf David Stevens 2010-07-27 695 struct vhost_log *vq_log;
8dd014adf David Stevens 2010-07-27 696 struct msghdr msg = {
8dd014adf David Stevens 2010-07-27 697 .msg_name = NULL,
8dd014adf David Stevens 2010-07-27 698 .msg_namelen = 0,
8dd014adf David Stevens 2010-07-27 699 .msg_control = NULL, /* FIXME: get and handle RX aux data. */
8dd014adf David Stevens 2010-07-27 700 .msg_controllen = 0,
8dd014adf David Stevens 2010-07-27 701 .msg_flags = MSG_DONTWAIT,
8dd014adf David Stevens 2010-07-27 702 };
0960b6417 Jason Wang 2015-02-15 703 struct virtio_net_hdr hdr = {
0960b6417 Jason Wang 2015-02-15 704 .flags = 0,
0960b6417 Jason Wang 2015-02-15 705 .gso_type = VIRTIO_NET_HDR_GSO_NONE
8dd014adf David Stevens 2010-07-27 706 };
8dd014adf David Stevens 2010-07-27 707 size_t total_len = 0;
910a578f7 Michael S. Tsirkin 2012-10-24 708 int err, mergeable;
f5a4941aa Jason Wang 2018-05-29 709 s16 headcount;
8dd014adf David Stevens 2010-07-27 710 size_t vhost_hlen, sock_hlen;
8dd014adf David Stevens 2010-07-27 711 size_t vhost_len, sock_len;
2e26af79b Asias He 2013-05-07 712 struct socket *sock;
ba7438aed Al Viro 2014-12-10 713 struct iov_iter fixup;
0960b6417 Jason Wang 2015-02-15 714 __virtio16 num_buffers;
db688c24e Paolo Abeni 2018-04-24 715 int recv_pkts = 0;
8dd014adf David Stevens 2010-07-27 716
aaa3149bb Jason Wang 2018-03-26 717 mutex_lock_nested(&vq->mutex, 0);
2e26af79b Asias He 2013-05-07 718 sock = vq->private_data;
2e26af79b Asias He 2013-05-07 719 if (!sock)
2e26af79b Asias He 2013-05-07 720 goto out;
6b1e6cc78 Jason Wang 2016-06-23 721
6b1e6cc78 Jason Wang 2016-06-23 722 if (!vq_iotlb_prefetch(vq))
6b1e6cc78 Jason Wang 2016-06-23 723 goto out;
6b1e6cc78 Jason Wang 2016-06-23 724
8ea8cf89e Michael S. Tsirkin 2011-05-20 725 vhost_disable_notify(&net->dev, vq);
8241a1e46 Jason Wang 2016-06-01 726 vhost_net_disable_vq(net, vq);
2e26af79b Asias He 2013-05-07 727
81f95a558 Michael S. Tsirkin 2013-04-28 728 vhost_hlen = nvq->vhost_hlen;
81f95a558 Michael S. Tsirkin 2013-04-28 729 sock_hlen = nvq->sock_hlen;
8dd014adf David Stevens 2010-07-27 730
ea16c5143 Michael S. Tsirkin 2014-06-05 731 vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
8dd014adf David Stevens 2010-07-27 732 vq->log : NULL;
ea16c5143 Michael S. Tsirkin 2014-06-05 733 mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
8dd014adf David Stevens 2010-07-27 734
030881372 Jason Wang 2016-03-04 735 while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
8dd014adf David Stevens 2010-07-27 736 sock_len += sock_hlen;
8dd014adf David Stevens 2010-07-27 737 vhost_len = sock_len + vhost_hlen;
f5a4941aa Jason Wang 2018-05-29 @738 headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
f5a4941aa Jason Wang 2018-05-29 739 vhost_len, &in, vq_log, &log,
94249369e Jason Wang 2011-01-17 740 likely(mergeable) ? UIO_MAXIOV : 1);
8dd014adf David Stevens 2010-07-27 741 /* On error, stop handling until the next kick. */
8dd014adf David Stevens 2010-07-27 742 if (unlikely(headcount < 0))
8241a1e46 Jason Wang 2016-06-01 743 goto out;
8dd014adf David Stevens 2010-07-27 744 /* OK, now we need to know about added descriptors. */
8dd014adf David Stevens 2010-07-27 745 if (!headcount) {
8ea8cf89e Michael S. Tsirkin 2011-05-20 746 if (unlikely(vhost_enable_notify(&net->dev, vq))) {
8dd014adf David Stevens 2010-07-27 747 /* They have slipped one in as we were
8dd014adf David Stevens 2010-07-27 748 * doing that: check again. */
8ea8cf89e Michael S. Tsirkin 2011-05-20 749 vhost_disable_notify(&net->dev, vq);
8dd014adf David Stevens 2010-07-27 750 continue;
8dd014adf David Stevens 2010-07-27 751 }
8dd014adf David Stevens 2010-07-27 752 /* Nothing new? Wait for eventfd to tell us
8dd014adf David Stevens 2010-07-27 753 * they refilled. */
8241a1e46 Jason Wang 2016-06-01 754 goto out;
8dd014adf David Stevens 2010-07-27 755 }
5990a3051 Jason Wang 2018-01-04 756 if (nvq->rx_ring)
6e474083f Wei Xu 2017-12-01 757 msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
6e474083f Wei Xu 2017-12-01 758 /* On overrun, truncate and discard */
6e474083f Wei Xu 2017-12-01 759 if (unlikely(headcount > UIO_MAXIOV)) {
6e474083f Wei Xu 2017-12-01 760 iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
6e474083f Wei Xu 2017-12-01 761 err = sock->ops->recvmsg(sock, &msg,
6e474083f Wei Xu 2017-12-01 762 1, MSG_DONTWAIT | MSG_TRUNC);
6e474083f Wei Xu 2017-12-01 763 pr_debug("Discarded rx packet: len %zd\n", sock_len);
6e474083f Wei Xu 2017-12-01 764 continue;
6e474083f Wei Xu 2017-12-01 765 }
8dd014adf David Stevens 2010-07-27 766 /* We don't need to be notified again. */
ba7438aed Al Viro 2014-12-10 767 iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
ba7438aed Al Viro 2014-12-10 768 fixup = msg.msg_iter;
ba7438aed Al Viro 2014-12-10 769 if (unlikely((vhost_hlen))) {
ba7438aed Al Viro 2014-12-10 770 /* We will supply the header ourselves
ba7438aed Al Viro 2014-12-10 771 * TODO: support TSO.
ba7438aed Al Viro 2014-12-10 772 */
ba7438aed Al Viro 2014-12-10 773 iov_iter_advance(&msg.msg_iter, vhost_hlen);
ba7438aed Al Viro 2014-12-10 774 }
1b7841404 Ying Xue 2015-03-02 775 err = sock->ops->recvmsg(sock, &msg,
8dd014adf David Stevens 2010-07-27 776 sock_len, MSG_DONTWAIT | MSG_TRUNC);
8dd014adf David Stevens 2010-07-27 777 /* Userspace might have consumed the packet meanwhile:
8dd014adf David Stevens 2010-07-27 778 * it's not supposed to do this usually, but might be hard
8dd014adf David Stevens 2010-07-27 779 * to prevent. Discard data we got (if any) and keep going. */
8dd014adf David Stevens 2010-07-27 780 if (unlikely(err != sock_len)) {
8dd014adf David Stevens 2010-07-27 781 pr_debug("Discarded rx packet: "
8dd014adf David Stevens 2010-07-27 782 " len %d, expected %zd\n", err, sock_len);
8dd014adf David Stevens 2010-07-27 783 vhost_discard_vq_desc(vq, headcount);
8dd014adf David Stevens 2010-07-27 784 continue;
8dd014adf David Stevens 2010-07-27 785 }
ba7438aed Al Viro 2014-12-10 786 /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
4c5a84421 Michael S. Tsirkin 2015-02-25 787 if (unlikely(vhost_hlen)) {
4c5a84421 Michael S. Tsirkin 2015-02-25 788 if (copy_to_iter(&hdr, sizeof(hdr),
4c5a84421 Michael S. Tsirkin 2015-02-25 789 &fixup) != sizeof(hdr)) {
4c5a84421 Michael S. Tsirkin 2015-02-25 790 vq_err(vq, "Unable to write vnet_hdr "
4c5a84421 Michael S. Tsirkin 2015-02-25 791 "at addr %p\n", vq->iov->iov_base);
8241a1e46 Jason Wang 2016-06-01 792 goto out;
8dd014adf David Stevens 2010-07-27 793 }
4c5a84421 Michael S. Tsirkin 2015-02-25 794 } else {
4c5a84421 Michael S. Tsirkin 2015-02-25 795 /* Header came from socket; we'll need to patch
4c5a84421 Michael S. Tsirkin 2015-02-25 796 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
4c5a84421 Michael S. Tsirkin 2015-02-25 797 */
4c5a84421 Michael S. Tsirkin 2015-02-25 798 iov_iter_advance(&fixup, sizeof(hdr));
4c5a84421 Michael S. Tsirkin 2015-02-25 799 }
8dd014adf David Stevens 2010-07-27 800 /* TODO: Should check and handle checksum. */
5201aa49b Michael S. Tsirkin 2015-02-03 801
0960b6417 Jason Wang 2015-02-15 802 num_buffers = cpu_to_vhost16(vq, headcount);
cfbdab951 Jason Wang 2011-01-17 803 if (likely(mergeable) &&
0d79a493e Michael S. Tsirkin 2015-02-25 804 copy_to_iter(&num_buffers, sizeof num_buffers,
0d79a493e Michael S. Tsirkin 2015-02-25 805 &fixup) != sizeof num_buffers) {
8dd014adf David Stevens 2010-07-27 806 vq_err(vq, "Failed num_buffers write");
8dd014adf David Stevens 2010-07-27 807 vhost_discard_vq_desc(vq, headcount);
8241a1e46 Jason Wang 2016-06-01 808 goto out;
8dd014adf David Stevens 2010-07-27 809 }
f5a4941aa Jason Wang 2018-05-29 810 nvq->done_idx += headcount;
f5a4941aa Jason Wang 2018-05-29 811 if (nvq->done_idx > VHOST_RX_BATCH)
f5a4941aa Jason Wang 2018-05-29 812 vhost_rx_signal_used(nvq);
8dd014adf David Stevens 2010-07-27 813 if (unlikely(vq_log))
8dd014adf David Stevens 2010-07-27 814 vhost_log_write(vq, vq_log, log, vhost_len);
8dd014adf David Stevens 2010-07-27 815 total_len += vhost_len;
db688c24e Paolo Abeni 2018-04-24 816 if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
db688c24e Paolo Abeni 2018-04-24 817 unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
8dd014adf David Stevens 2010-07-27 818 vhost_poll_queue(&vq->poll);
8241a1e46 Jason Wang 2016-06-01 819 goto out;
8dd014adf David Stevens 2010-07-27 820 }
8dd014adf David Stevens 2010-07-27 821 }
8241a1e46 Jason Wang 2016-06-01 822 vhost_net_enable_vq(net, vq);
2e26af79b Asias He 2013-05-07 823 out:
f5a4941aa Jason Wang 2018-05-29 824 vhost_rx_signal_used(nvq);
8dd014adf David Stevens 2010-07-27 825 mutex_unlock(&vq->mutex);
8dd014adf David Stevens 2010-07-27 826 }
8dd014adf David Stevens 2010-07-27 827
:::::: The code at line 738 was first introduced by commit
:::::: f5a4941aa6d190e676065e8f4ed35999f52a01c3 vhost_net: flush batched heads before trying to busy polling
:::::: TO: Jason Wang <jasowang@redhat.com>
:::::: CC: David S. Miller <davem@davemloft.net>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 34871 bytes --]
[-- Attachment #3: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 4/4] vhost_net: Avoid rx vring kicks during busyloop
From: Toshiaki Makita @ 2018-07-04 1:21 UTC (permalink / raw)
To: Jason Wang, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <abc4f8ce-35da-f628-811f-053250fa9b0e@redhat.com>
On 2018/07/03 18:05, Jason Wang wrote:
> On 2018年07月03日 15:31, Toshiaki Makita wrote:
>> We may run out of avail rx ring descriptor under heavy load but busypoll
>> did not detect it so busypoll may have exited prematurely. Avoid this by
>> checking rx ring full during busypoll.
>
> Actually, we're checking whether it was empty in fact?
My understanding is that on rx empty avail means no free descriptors
from the perspective of host so the ring is conceptually full.
>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>> ---
>> drivers/vhost/net.c | 10 +++++++---
>> 1 file changed, 7 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 791bc8b..b224036 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -658,6 +658,7 @@ static int vhost_net_rx_peek_head_len(struct
>> vhost_net *net, struct sock *sk,
>> {
>> struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>> struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>> + struct vhost_virtqueue *rvq = &rnvq->vq;
>> struct vhost_virtqueue *tvq = &tnvq->vq;
>> unsigned long uninitialized_var(endtime);
>> int len = peek_head_len(rnvq, sk);
>> @@ -677,7 +678,8 @@ static int vhost_net_rx_peek_head_len(struct
>> vhost_net *net, struct sock *sk,
>> *busyloop_intr = true;
>> break;
>> }
>> - if (sk_has_rx_data(sk) ||
>> + if ((sk_has_rx_data(sk) &&
>> + !vhost_vq_avail_empty(&net->dev, rvq)) ||
>> !vhost_vq_avail_empty(&net->dev, tvq))
>> break;
>> cpu_relax();
>> @@ -827,7 +829,6 @@ static void handle_rx(struct vhost_net *net)
>>
>
> I thought below codes should belong to patch 3. Or I may miss something.
That codes are added for the case that busypoll is waiting for rx vq
avail but interrupted by another work. At the point of patch 3 busypoll
does not wait for rx vq avail so I don't think we move them to patch 3.
>
> Thanks
>
>> while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
>> &busyloop_intr))) {
>> - busyloop_intr = false;
>> sock_len += sock_hlen;
>> vhost_len = sock_len + vhost_hlen;
>> headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
>> @@ -838,7 +839,9 @@ static void handle_rx(struct vhost_net *net)
>> goto out;
>> /* OK, now we need to know about added descriptors. */
>> if (!headcount) {
>> - if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>> + if (unlikely(busyloop_intr)) {
>> + vhost_poll_queue(&vq->poll);
>> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>> /* They have slipped one in as we were
>> * doing that: check again. */
>> vhost_disable_notify(&net->dev, vq);
>> @@ -848,6 +851,7 @@ static void handle_rx(struct vhost_net *net)
>> * they refilled. */
>> goto out;
>> }
>> + busyloop_intr = false;
>> if (nvq->rx_ring)
>> msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
>> /* On overrun, truncate and discard */
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Tonghao Zhang @ 2018-07-04 1:40 UTC (permalink / raw)
To: jasowang
Cc: Linux Kernel Network Developers, virtualization, Tonghao Zhang,
mst
In-Reply-To: <2f5d20a0-cc95-ed37-3f6c-3368f92a7586@redhat.com>
On Tue, Jul 3, 2018 at 1:55 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年07月03日 13:48, Tonghao Zhang wrote:
> > On Tue, Jul 3, 2018 at 10:12 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >>
> >> On 2018年07月02日 20:57, xiangxia.m.yue@gmail.com wrote:
> >>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >>>
> >>> Factor out generic busy polling logic and will be
> >>> used for in tx path in the next patch. And with the patch,
> >>> qemu can set differently the busyloop_timeout for rx queue.
> >>>
> >>> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> >>> ---
> >>> drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
> >>> 1 file changed, 55 insertions(+), 39 deletions(-)
> >>>
> >>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> >>> index 62bb8e8..2790959 100644
> >>> --- a/drivers/vhost/net.c
> >>> +++ b/drivers/vhost/net.c
> >>> @@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> >>> return vhost_poll_start(poll, sock->file);
> >>> }
> >>>
> >>> +static int sk_has_rx_data(struct sock *sk)
> >>> +{
> >>> + struct socket *sock = sk->sk_socket;
> >>> +
> >>> + if (sock->ops->peek_len)
> >>> + return sock->ops->peek_len(sock);
> >>> +
> >>> + return skb_queue_empty(&sk->sk_receive_queue);
> >>> +}
> >>> +
> >>> +static void vhost_net_busy_poll(struct vhost_net *net,
> >>> + struct vhost_virtqueue *rvq,
> >>> + struct vhost_virtqueue *tvq,
> >>> + bool rx)
> >>> +{
> >>> + unsigned long uninitialized_var(endtime);
> >>> + unsigned long busyloop_timeout;
> >>> + struct socket *sock;
> >>> + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> >>> +
> >>> + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> >>> +
> >>> + vhost_disable_notify(&net->dev, vq);
> >>> + sock = rvq->private_data;
> >>> + busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
> >>> +
> >>> + preempt_disable();
> >>> + endtime = busy_clock() + busyloop_timeout;
> >>> + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> >>> + !(sock && sk_has_rx_data(sock->sk)) &&
> >>> + vhost_vq_avail_empty(tvq->dev, tvq))
> >>> + cpu_relax();
> >>> + preempt_enable();
> >>> +
> >>> + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> >>> + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> >>> + vhost_poll_queue(&vq->poll);
> >>> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> >> One last question, do we need this for rx? This check will be always
> >> true under light or medium load.
> > will remove the 'unlikely'
>
> Not only the unlikely. We only want rx kick when packet is pending at
> socket but we're out of available buffers. This is not the case here
> (you have polled the vq above).
>
> So we probably want to do this only when rx == true.
got it. change it to:
+ } else if (vhost_enable_notify(&net->dev, vq) && rx) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
> Thanks
>
> >
> >> Thanks
> >>
> >>> + vhost_disable_notify(&net->dev, vq);
> >>> + vhost_poll_queue(&vq->poll);
> >>> + }
> >>> +
> >>> + mutex_unlock(&vq->mutex);
> >>> +}
> >>> +
> >>> +
> >>> static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >>> struct vhost_virtqueue *vq,
> >>> struct iovec iov[], unsigned int iov_size,
> >>> @@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> >>> return len;
> >>> }
> >>>
> >>> -static int sk_has_rx_data(struct sock *sk)
> >>> -{
> >>> - struct socket *sock = sk->sk_socket;
> >>> -
> >>> - if (sock->ops->peek_len)
> >>> - return sock->ops->peek_len(sock);
> >>> -
> >>> - return skb_queue_empty(&sk->sk_receive_queue);
> >>> -}
> >>> -
> >>> static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >>> {
> >>> struct vhost_virtqueue *vq = &nvq->vq;
> >>> @@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >>>
> >>> static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> >>> {
> >>> - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> >>> - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> >>> - struct vhost_virtqueue *vq = &nvq->vq;
> >>> - unsigned long uninitialized_var(endtime);
> >>> - int len = peek_head_len(rvq, sk);
> >>> + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> >>> + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> >>>
> >>> - if (!len && vq->busyloop_timeout) {
> >>> - /* Flush batched heads first */
> >>> - vhost_rx_signal_used(rvq);
> >>> - /* Both tx vq and rx socket were polled here */
> >>> - mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> >>> - vhost_disable_notify(&net->dev, vq);
> >>> + int len = peek_head_len(rnvq, sk);
> >>>
> >>> - preempt_disable();
> >>> - endtime = busy_clock() + vq->busyloop_timeout;
> >>> -
> >>> - while (vhost_can_busy_poll(&net->dev, endtime) &&
> >>> - !sk_has_rx_data(sk) &&
> >>> - vhost_vq_avail_empty(&net->dev, vq))
> >>> - cpu_relax();
> >>> -
> >>> - preempt_enable();
> >>> -
> >>> - if (!vhost_vq_avail_empty(&net->dev, vq))
> >>> - vhost_poll_queue(&vq->poll);
> >>> - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> >>> - vhost_disable_notify(&net->dev, vq);
> >>> - vhost_poll_queue(&vq->poll);
> >>> - }
> >>> + if (!len && rnvq->vq.busyloop_timeout) {
> >>> + /* Flush batched heads first */
> >>> + vhost_rx_signal_used(rnvq);
> >>>
> >>> - mutex_unlock(&vq->mutex);
> >>> + /* Both tx vq and rx socket were polled here */
> >>> + vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
> >>>
> >>> - len = peek_head_len(rvq, sk);
> >>> + len = peek_head_len(rnvq, sk);
> >>> }
> >>>
> >>> return len;
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 4/4] vhost_net: Avoid rx vring kicks during busyloop
From: Jason Wang @ 2018-07-04 3:26 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <251f4d24-f8e2-7549-064e-94b4d6c6f4bd@lab.ntt.co.jp>
On 2018年07月04日 09:21, Toshiaki Makita wrote:
> On 2018/07/03 18:05, Jason Wang wrote:
>> On 2018年07月03日 15:31, Toshiaki Makita wrote:
>>> We may run out of avail rx ring descriptor under heavy load but busypoll
>>> did not detect it so busypoll may have exited prematurely. Avoid this by
>>> checking rx ring full during busypoll.
>> Actually, we're checking whether it was empty in fact?
> My understanding is that on rx empty avail means no free descriptors
> from the perspective of host so the ring is conceptually full.
Ok.
>
>>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>>> ---
>>> drivers/vhost/net.c | 10 +++++++---
>>> 1 file changed, 7 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>>> index 791bc8b..b224036 100644
>>> --- a/drivers/vhost/net.c
>>> +++ b/drivers/vhost/net.c
>>> @@ -658,6 +658,7 @@ static int vhost_net_rx_peek_head_len(struct
>>> vhost_net *net, struct sock *sk,
>>> {
>>> struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>>> struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>>> + struct vhost_virtqueue *rvq = &rnvq->vq;
>>> struct vhost_virtqueue *tvq = &tnvq->vq;
>>> unsigned long uninitialized_var(endtime);
>>> int len = peek_head_len(rnvq, sk);
>>> @@ -677,7 +678,8 @@ static int vhost_net_rx_peek_head_len(struct
>>> vhost_net *net, struct sock *sk,
>>> *busyloop_intr = true;
>>> break;
>>> }
>>> - if (sk_has_rx_data(sk) ||
>>> + if ((sk_has_rx_data(sk) &&
>>> + !vhost_vq_avail_empty(&net->dev, rvq)) ||
>>> !vhost_vq_avail_empty(&net->dev, tvq))
>>> break;
>>> cpu_relax();
>>> @@ -827,7 +829,6 @@ static void handle_rx(struct vhost_net *net)
>>>
>> I thought below codes should belong to patch 3. Or I may miss something.
> That codes are added for the case that busypoll is waiting for rx vq
> avail but interrupted by another work. At the point of patch 3 busypoll
> does not wait for rx vq avail so I don't think we move them to patch 3.
I get this.
Thanks
>
>> Thanks
>>
>>> while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
>>> &busyloop_intr))) {
>>> - busyloop_intr = false;
>>> sock_len += sock_hlen;
>>> vhost_len = sock_len + vhost_hlen;
>>> headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
>>> @@ -838,7 +839,9 @@ static void handle_rx(struct vhost_net *net)
>>> goto out;
>>> /* OK, now we need to know about added descriptors. */
>>> if (!headcount) {
>>> - if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>>> + if (unlikely(busyloop_intr)) {
>>> + vhost_poll_queue(&vq->poll);
>>> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>>> /* They have slipped one in as we were
>>> * doing that: check again. */
>>> vhost_disable_notify(&net->dev, vq);
>>> @@ -848,6 +851,7 @@ static void handle_rx(struct vhost_net *net)
>>> * they refilled. */
>>> goto out;
>>> }
>>> + busyloop_intr = false;
>>> if (nvq->rx_ring)
>>> msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
>>> /* On overrun, truncate and discard */
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 3/4] vhost_net: Avoid rx queue wake-ups during busypoll
From: Jason Wang @ 2018-07-04 3:26 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <1530603094-2476-4-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On 2018年07月03日 15:31, Toshiaki Makita wrote:
> We may run handle_rx() while rx work is queued. For example a packet can
> push the rx work during the window before handle_rx calls
> vhost_net_disable_vq().
> In that case busypoll immediately exits due to vhost_has_work()
> condition and enables vq again. This can lead to another unnecessary rx
> wake-ups, so poll rx work instead of enabling the vq.
>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
> drivers/vhost/net.c | 26 +++++++++++++++++++-------
> 1 file changed, 19 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 811c0e5..791bc8b 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -653,7 +653,8 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> nvq->done_idx = 0;
> }
>
> -static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> +static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
> + bool *busyloop_intr)
> {
> struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> @@ -671,11 +672,16 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> preempt_disable();
> endtime = busy_clock() + tvq->busyloop_timeout;
>
> - while (vhost_can_busy_poll(endtime) &&
> - !vhost_has_work(&net->dev) &&
> - !sk_has_rx_data(sk) &&
> - vhost_vq_avail_empty(&net->dev, tvq))
> + while (vhost_can_busy_poll(endtime)) {
> + if (vhost_has_work(&net->dev)) {
> + *busyloop_intr = true;
> + break;
> + }
> + if (sk_has_rx_data(sk) ||
> + !vhost_vq_avail_empty(&net->dev, tvq))
> + break;
> cpu_relax();
> + }
>
> preempt_enable();
>
> @@ -795,6 +801,7 @@ static void handle_rx(struct vhost_net *net)
> s16 headcount;
> size_t vhost_hlen, sock_hlen;
> size_t vhost_len, sock_len;
> + bool busyloop_intr = false;
> struct socket *sock;
> struct iov_iter fixup;
> __virtio16 num_buffers;
> @@ -818,7 +825,9 @@ static void handle_rx(struct vhost_net *net)
> vq->log : NULL;
> mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
>
> - while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
> + while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
> + &busyloop_intr))) {
> + busyloop_intr = false;
> sock_len += sock_hlen;
> vhost_len = sock_len + vhost_hlen;
> headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
> @@ -905,7 +914,10 @@ static void handle_rx(struct vhost_net *net)
> goto out;
> }
> }
> - vhost_net_enable_vq(net, vq);
> + if (unlikely(busyloop_intr))
> + vhost_poll_queue(&vq->poll);
> + else
> + vhost_net_enable_vq(net, vq);
> out:
> vhost_rx_signal_used(nvq);
> mutex_unlock(&vq->mutex);
Acked-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 4/4] vhost_net: Avoid rx vring kicks during busyloop
From: Jason Wang @ 2018-07-04 3:27 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <1530603094-2476-5-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On 2018年07月03日 15:31, Toshiaki Makita wrote:
> We may run out of avail rx ring descriptor under heavy load but busypoll
> did not detect it so busypoll may have exited prematurely. Avoid this by
> checking rx ring full during busypoll.
>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
> drivers/vhost/net.c | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 791bc8b..b224036 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -658,6 +658,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
> {
> struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> + struct vhost_virtqueue *rvq = &rnvq->vq;
> struct vhost_virtqueue *tvq = &tnvq->vq;
> unsigned long uninitialized_var(endtime);
> int len = peek_head_len(rnvq, sk);
> @@ -677,7 +678,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
> *busyloop_intr = true;
> break;
> }
> - if (sk_has_rx_data(sk) ||
> + if ((sk_has_rx_data(sk) &&
> + !vhost_vq_avail_empty(&net->dev, rvq)) ||
> !vhost_vq_avail_empty(&net->dev, tvq))
> break;
> cpu_relax();
> @@ -827,7 +829,6 @@ static void handle_rx(struct vhost_net *net)
>
> while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
> &busyloop_intr))) {
> - busyloop_intr = false;
> sock_len += sock_hlen;
> vhost_len = sock_len + vhost_hlen;
> headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
> @@ -838,7 +839,9 @@ static void handle_rx(struct vhost_net *net)
> goto out;
> /* OK, now we need to know about added descriptors. */
> if (!headcount) {
> - if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> + if (unlikely(busyloop_intr)) {
> + vhost_poll_queue(&vq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> /* They have slipped one in as we were
> * doing that: check again. */
> vhost_disable_notify(&net->dev, vq);
> @@ -848,6 +851,7 @@ static void handle_rx(struct vhost_net *net)
> * they refilled. */
> goto out;
> }
> + busyloop_intr = false;
> if (nvq->rx_ring)
> msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
> /* On overrun, truncate and discard */
Acked-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-07-04 3:28 UTC (permalink / raw)
To: kbuild test robot
Cc: kvm, mst, netdev, linux-kernel, virtualization, maxime.coquelin,
kbuild-all, wexu
In-Reply-To: <201807040813.LULqRH8c%fengguang.wu@intel.com>
On 2018年07月04日 09:01, kbuild test robot wrote:
> Hi Jason,
>
> I love your patch! Yet something to improve:
>
> [auto build test ERROR on net-next/master]
>
> url:https://github.com/0day-ci/linux/commits/Jason-Wang/Packed-virtqueue-for-vhost/20180703-154751
> config: x86_64-randconfig-s0-07032254 (attached as .config)
> compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64
>
> Note: the linux-review/Jason-Wang/Packed-virtqueue-for-vhost/20180703-154751 HEAD 01b902f1126212ea2597e6d09802bd9c4431bf82 builds fine.
> It only hurts bisectibility.
>
> All errors (new ones prefixed by >>):
>
> drivers//vhost/net.c: In function 'handle_rx':
>>> drivers//vhost/net.c:738:15: error: implicit declaration of function 'get_rx_bufs' [-Werror=implicit-function-declaration]
> headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
> ^~~~~~~~~~~
> cc1: some warnings being treated as errors
>
> vim +/get_rx_bufs +738 drivers//vhost/net.c
My bad, forget to do one by one compling test.
Will post V2.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 8/8] vhost: event suppression for packed ring
From: Wei Xu @ 2018-07-04 4:13 UTC (permalink / raw)
To: Jason Wang
Cc: kvm, mst, netdev, linux-kernel, virtualization, maxime.coquelin
In-Reply-To: <1530596284-4101-9-git-send-email-jasowang@redhat.com>
On Tue, Jul 03, 2018 at 01:38:04PM +0800, Jason Wang wrote:
> This patch introduces support for event suppression. This is done by
> have a two areas: device area and driver area. One side could then try
> to disable or enable (delayed) notification from other side by using a
> boolean hint or event index interface in the areas.
>
> For more information, please refer Virtio spec.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/vhost.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++----
> drivers/vhost/vhost.h | 10 ++-
> 2 files changed, 185 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 0f3f07c..cccbc82 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1115,10 +1115,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
> struct vring_used __user *used)
> {
> struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
> + struct vring_packed_desc_event *driver_event =
> + (struct vring_packed_desc_event *)avail;
> + struct vring_packed_desc_event *device_event =
> + (struct vring_packed_desc_event *)used;
>
> - /* TODO: check device area and driver area */
> return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
> - access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
> + access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
R/W parameter doesn't make sense to most architectures and the comment in x86
says WRITE is a superset of READ, is it possible to converge them here?
/**
* access_ok: - Checks if a user space pointer is valid
* @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
* %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
* to write to a block, it is always safe to read from it.
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* Checks if a pointer to a block of memory in user space is valid.
*
* Returns true (nonzero) if the memory block may be valid, false (zero)
* if it is definitely invalid.
*
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*/
#define access_ok(type, addr, size)
......
Thanks,
Wei
> + access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
> + access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
> }
>
> static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
> @@ -1193,14 +1198,27 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
> return true;
> }
>
> -int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
> +int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
> +{
> + int num = vq->num;
> +
> + return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
> + num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
> + iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
> + num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
> + iotlb_access_ok(vq, VHOST_ACCESS_RO,
> + (u64)(uintptr_t)vq->driver_event,
> + sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
> + iotlb_access_ok(vq, VHOST_ACCESS_WO,
> + (u64)(uintptr_t)vq->device_event,
> + sizeof(*vq->device_event), VHOST_ADDR_USED);
> +}
> +
> +int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
> {
> size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
> unsigned int num = vq->num;
>
> - if (!vq->iotlb)
> - return 1;
> -
> return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
> num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
> iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
> @@ -1212,6 +1230,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
> num * sizeof(*vq->used->ring) + s,
> VHOST_ADDR_USED);
> }
> +
> +int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
> +{
> + if (!vq->iotlb)
> + return 1;
> +
> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> + return vq_iotlb_prefetch_packed(vq);
> + else
> + return vq_iotlb_prefetch_split(vq);
> +}
> EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
>
> /* Can we log writes? */
> @@ -1771,6 +1800,50 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
> return 0;
> }
>
> +static int vhost_update_device_flags(struct vhost_virtqueue *vq,
> + __virtio16 device_flags)
> +{
> + void __user *flags;
> +
> + if (vhost_put_user(vq, device_flags, &vq->device_event->flags,
> + VHOST_ADDR_USED) < 0)
> + return -EFAULT;
> + if (unlikely(vq->log_used)) {
> + /* Make sure the flag is seen before log. */
> + smp_wmb();
> + /* Log used flag write. */
> + flags = &vq->device_event->flags;
> + log_write(vq->log_base, vq->log_addr +
> + (flags - (void __user *)vq->device_event),
> + sizeof(vq->device_event->flags));
> + if (vq->log_ctx)
> + eventfd_signal(vq->log_ctx, 1);
> + }
> + return 0;
> +}
> +
> +static int vhost_update_device_off_wrap(struct vhost_virtqueue *vq,
> + __virtio16 device_off_wrap)
> +{
> + void __user *off_wrap;
> +
> + if (vhost_put_user(vq, device_off_wrap, &vq->device_event->off_wrap,
> + VHOST_ADDR_USED) < 0)
> + return -EFAULT;
> + if (unlikely(vq->log_used)) {
> + /* Make sure the flag is seen before log. */
> + smp_wmb();
> + /* Log used flag write. */
> + off_wrap = &vq->device_event->off_wrap;
> + log_write(vq->log_base, vq->log_addr +
> + (off_wrap - (void __user *)vq->device_event),
> + sizeof(vq->device_event->off_wrap));
> + if (vq->log_ctx)
> + eventfd_signal(vq->log_ctx, 1);
> + }
> + return 0;
> +}
> +
> static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
> {
> if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
> @@ -2756,16 +2829,13 @@ int vhost_add_used_n(struct vhost_virtqueue *vq,
> }
> EXPORT_SYMBOL_GPL(vhost_add_used_n);
>
> -static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> +static bool vhost_notify_split(struct vhost_dev *dev,
> + struct vhost_virtqueue *vq)
> {
> __u16 old, new;
> __virtio16 event;
> bool v;
>
> - /* TODO: check driver area */
> - if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> - return true;
> -
> /* Flush out used index updates. This is paired
> * with the barrier that the Guest executes when enabling
> * interrupts. */
> @@ -2798,6 +2868,64 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> return vring_need_event(vhost16_to_cpu(vq, event), new, old);
> }
>
> +static bool vhost_notify_packed(struct vhost_dev *dev,
> + struct vhost_virtqueue *vq)
> +{
> + __virtio16 event_off_wrap, event_flags;
> + __u16 old, new, off_wrap;
> + bool v;
> +
> + /* Flush out used descriptors updates. This is paired
> + * with the barrier that the Guest executes when enabling
> + * interrupts.
> + */
> + smp_mb();
> +
> + if (vhost_get_avail(vq, event_flags,
> + &vq->driver_event->flags) < 0) {
> + vq_err(vq, "Failed to get driver desc_event_flags");
> + return true;
> + }
> +
> + if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
> + return event_flags !=
> + cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
> +
> + old = vq->signalled_used;
> + v = vq->signalled_used_valid;
> + new = vq->signalled_used = vq->last_used_idx;
> + vq->signalled_used_valid = true;
> +
> + if (event_flags != cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC))
> + return event_flags !=
> + cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
> +
> + /* Read desc event flags before event_off and event_wrap */
> + smp_rmb();
> +
> + if (vhost_get_avail(vq, event_off_wrap,
> + &vq->driver_event->off_wrap) < 0) {
> + vq_err(vq, "Failed to get driver desc_event_off/wrap");
> + return true;
> + }
> +
> + off_wrap = vhost16_to_cpu(vq, event_off_wrap);
> +
> + if (unlikely(!v))
> + return true;
> +
> + return vhost_vring_packed_need_event(vq, vq->last_used_wrap_counter,
> + off_wrap, new, old);
> +}
> +
> +static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> +{
> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> + return vhost_notify_packed(dev, vq);
> + else
> + return vhost_notify_split(dev, vq);
> +}
> +
> /* This actually signals the guest, using eventfd. */
> void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> {
> @@ -2875,10 +3003,34 @@ static bool vhost_enable_notify_packed(struct vhost_dev *dev,
> struct vhost_virtqueue *vq)
> {
> struct vring_desc_packed *d = vq->desc_packed + vq->avail_idx;
> - __virtio16 flags;
> + __virtio16 flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
> int ret;
>
> - /* TODO: enable notification through device area */
> + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
> + return false;
> + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
> +
> + if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
> + __virtio16 off_wrap = cpu_to_vhost16(vq, vq->avail_idx |
> + vq->avail_wrap_counter << 15);
> +
> + ret = vhost_update_device_off_wrap(vq, off_wrap);
> + if (ret) {
> + vq_err(vq, "Failed to write to off warp at %p: %d\n",
> + &vq->device_event->off_wrap, ret);
> + return false;
> + }
> + /* Make sure off_wrap is wrote before flags */
> + smp_wmb();
> + flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC);
> + }
> +
> + ret = vhost_update_device_flags(vq, flags);
> + if (ret) {
> + vq_err(vq, "Failed to enable notification at %p: %d\n",
> + &vq->device_event->flags, ret);
> + return false;
> + }
>
> /* They could have slipped one in as we were doing that: make
> * sure it's written, then check again.
> @@ -2945,7 +3097,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
> static void vhost_disable_notify_packed(struct vhost_dev *dev,
> struct vhost_virtqueue *vq)
> {
> - /* TODO: disable notification through device area */
> + __virtio16 flags;
> + int r;
> +
> + if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
> + return;
> + vq->used_flags |= VRING_USED_F_NO_NOTIFY;
> +
> + flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
> + r = vhost_update_device_flags(vq, flags);
> + if (r)
> + vq_err(vq, "Failed to enable notification at %p: %d\n",
> + &vq->device_event->flags, r);
> }
>
> static void vhost_disable_notify_split(struct vhost_dev *dev,
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index db09958..d071daf 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -96,8 +96,14 @@ struct vhost_virtqueue {
> struct vring_desc __user *desc;
> struct vring_desc_packed __user *desc_packed;
> };
> - struct vring_avail __user *avail;
> - struct vring_used __user *used;
> + union {
> + struct vring_avail __user *avail;
> + struct vring_packed_desc_event __user *driver_event;
> + };
> + union {
> + struct vring_used __user *used;
> + struct vring_packed_desc_event __user *device_event;
> + };
> const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
> struct file *kick;
> struct eventfd_ctx *call_ctx;
> --
> 2.7.4
>
^ permalink raw reply
* [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-07-04 4:31 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patches improve the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the same time.
handle_rx do that in the same way.
For more performance report, see patch 4.
v4 -> v5:
fix some issues
v3 -> v4:
fix some issues
v2 -> v3:
This patches are splited from previous big patch:
http://patchwork.ozlabs.org/patch/934673/
Tonghao Zhang (4):
vhost: lock the vqs one by one
net: vhost: replace magic number of lock annotation
net: vhost: factor out busy polling logic to vhost_net_busy_poll()
net: vhost: add rx busy polling in tx path
drivers/vhost/net.c | 108 ++++++++++++++++++++++++++++----------------------
drivers/vhost/vhost.c | 24 ++++-------
2 files changed, 67 insertions(+), 65 deletions(-)
--
1.8.3.1
^ permalink raw reply
* [PATCH net-next v5 1/4] net: vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-07-04 4:31 UTC (permalink / raw)
To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530678698-33427-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch changes the way that lock all vqs
at the same, to lock them one by one. It will
be used for next patch to avoid the deadlock.
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 895eaa2..4ca9383 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
{
int i;
- for (i = 0; i < d->nvqs; ++i)
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
__vhost_vq_meta_reset(d->vqs[i]);
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
}
static void vhost_vq_reset(struct vhost_dev *dev,
@@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
#define vhost_get_used(vq, x, ptr) \
vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_unlock(&d->vqs[i]->mutex);
-}
-
static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
if (msg->iova <= vq_msg->iova &&
msg->iova + msg->size - 1 > vq_msg->iova &&
vq_msg->type == VHOST_IOTLB_MISS) {
+ mutex_lock(&node->vq->mutex);
vhost_poll_queue(&node->vq->poll);
+ mutex_unlock(&node->vq->mutex);
+
list_del(&node->node);
kfree(node);
}
@@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
int ret = 0;
mutex_lock(&dev->mutex);
- vhost_dev_lock_vqs(dev);
switch (msg->type) {
case VHOST_IOTLB_UPDATE:
if (!dev->iotlb) {
@@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
- vhost_dev_unlock_vqs(dev);
mutex_unlock(&dev->mutex);
return ret;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v5 2/4] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-07-04 4:31 UTC (permalink / raw)
To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530678698-33427-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..62bb8e8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
bool zcopy, zcopy_used;
int sent_pkts = 0;
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
sock = vq->private_data;
if (!sock)
goto out;
@@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
/* Flush batched heads first */
vhost_rx_signal_used(rvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&vq->mutex, 1);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
vhost_disable_notify(&net->dev, vq);
preempt_disable();
@@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
__virtio16 num_buffers;
int recv_pkts = 0;
- mutex_lock_nested(&vq->mutex, 0);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
sock = vq->private_data;
if (!sock)
goto out;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v5 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-07-04 4:31 UTC (permalink / raw)
To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530678698-33427-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Factor out generic busy polling logic and will be
used for in tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
1 file changed, 55 insertions(+), 39 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..2790959 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool rx)
+{
+ unsigned long uninitialized_var(endtime);
+ unsigned long busyloop_timeout;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+ mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+
+ vhost_disable_notify(&net->dev, vq);
+ sock = rvq->private_data;
+ busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+ while (vhost_can_busy_poll(tvq->dev, endtime) &&
+ !(sock && sk_has_rx_data(sock->sk)) &&
+ vhost_vq_avail_empty(tvq->dev, tvq))
+ cpu_relax();
+ preempt_enable();
+
+ if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+ (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+ vhost_poll_queue(&vq->poll);
+ } else if (vhost_enable_notify(&net->dev, vq) && rx) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+
+ mutex_unlock(&vq->mutex);
+}
+
+
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
@@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
return len;
}
-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
{
struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
- struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
- unsigned long uninitialized_var(endtime);
- int len = peek_head_len(rvq, sk);
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
- if (!len && vq->busyloop_timeout) {
- /* Flush batched heads first */
- vhost_rx_signal_used(rvq);
- /* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
- vhost_disable_notify(&net->dev, vq);
+ int len = peek_head_len(rnvq, sk);
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
-
- while (vhost_can_busy_poll(&net->dev, endtime) &&
- !sk_has_rx_data(sk) &&
- vhost_vq_avail_empty(&net->dev, vq))
- cpu_relax();
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, vq))
- vhost_poll_queue(&vq->poll);
- else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- vhost_disable_notify(&net->dev, vq);
- vhost_poll_queue(&vq->poll);
- }
+ if (!len && rnvq->vq.busyloop_timeout) {
+ /* Flush batched heads first */
+ vhost_rx_signal_used(rnvq);
- mutex_unlock(&vq->mutex);
+ /* Both tx vq and rx socket were polled here */
+ vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
- len = peek_head_len(rvq, sk);
+ len = peek_head_len(rnvq, sk);
}
return len;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v5 4/4] net: vhost: add rx busy polling in tx path
From: xiangxia.m.yue @ 2018-07-04 4:31 UTC (permalink / raw)
To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530678698-33427-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch improves the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx do that in the same way.
We set the poll-us=100us and use the iperf3 to test
its bandwidth, use the netperf to test throughput and mean
latency. When running the tests, the vhost-net kthread of
that VM, is alway 100% CPU. The commands are shown as below.
iperf3 -s -D
iperf3 -c IP -i 1 -P 1 -t 20 -M 1400
or
netserver
netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"
host -> guest:
iperf3:
* With the patch: 27.0 Gbits/sec
* Without the patch: 14.4 Gbits/sec
netperf (TCP_RR):
* With the patch: 48039.56 trans/s, 20.64us mean latency
* Without the patch: 46027.07 trans/s, 21.58us mean latency
This patch also improves the guest transmit performance.
guest -> host:
iperf3:
* With the patch: 27.2 Gbits/sec
* Without the patch: 24.4 Gbits/sec
netperf (TCP_RR):
* With the patch: 47963.25 trans/s, 20.71us mean latency
* Without the patch: 45796.70 trans/s, 21.68us mean latency
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
drivers/vhost/net.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2790959..3f26547 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -480,17 +480,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
{
- unsigned long uninitialized_var(endtime);
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
if (r == vq->num && vq->busyloop_timeout) {
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(vq->dev, endtime) &&
- vhost_vq_avail_empty(vq->dev, vq))
- cpu_relax();
- preempt_enable();
+ vhost_net_busy_poll(net, &rnvq->vq, vq, false);
+
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
}
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH net-next 6/8] virtio: introduce packed ring defines
From: Tiwei Bie @ 2018-07-04 4:48 UTC (permalink / raw)
To: Jason Wang
Cc: kvm, mst, netdev, linux-kernel, virtualization, maxime.coquelin,
wexu
In-Reply-To: <1530596284-4101-7-git-send-email-jasowang@redhat.com>
On Tue, Jul 03, 2018 at 01:38:02PM +0800, Jason Wang wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> include/uapi/linux/virtio_config.h | 2 ++
> include/uapi/linux/virtio_ring.h | 32 ++++++++++++++++++++++++++++++++
> 2 files changed, 34 insertions(+)
>
> diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
> index 449132c..947f6a3 100644
> --- a/include/uapi/linux/virtio_config.h
> +++ b/include/uapi/linux/virtio_config.h
> @@ -75,6 +75,8 @@
> */
> #define VIRTIO_F_IOMMU_PLATFORM 33
>
> +#define VIRTIO_F_RING_PACKED 34
It's better to add some comments for this macro.
> +
> /*
> * Does the device support Single Root I/O Virtualization?
> */
> diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
> index 6d5d5fa..71c7a46 100644
> --- a/include/uapi/linux/virtio_ring.h
> +++ b/include/uapi/linux/virtio_ring.h
> @@ -43,6 +43,8 @@
> #define VRING_DESC_F_WRITE 2
> /* This means the buffer contains a list of buffer descriptors. */
> #define VRING_DESC_F_INDIRECT 4
> +#define VRING_DESC_F_AVAIL 7
It's better to use tab between VRING_DESC_F_AVAIL and 7.
> +#define VRING_DESC_F_USED 15
Maybe it's better to define VRING_DESC_F_AVAIL and
VRING_DESC_F_USED as (1 << 7) and (1 << 15) or something
similar to make them consistent with VRING_DESC_F_NEXT
(1 << 0), VRING_DESC_F_WRITE (1 << 1) and
VRING_DESC_F_INDIRECT (1 << 2).
I plan to define below macros in virtio_ring.c:
#define _VRING_DESC_F_AVAIL(b) ((__u16)(b) << 7)
#define _VRING_DESC_F_USED(b) ((__u16)(b) << 15)
>
> /* The Host uses this in used->flags to advise the Guest: don't kick me when
> * you add a buffer. It's unreliable, so it's simply an optimization. Guest
> @@ -62,6 +64,36 @@
> * at the end of the used ring. Guest should ignore the used->flags field. */
> #define VIRTIO_RING_F_EVENT_IDX 29
>
> +struct vring_desc_packed {
We may have other related types named as:
struct vring_packed;
struct vring_packed_desc_event;
So maybe it's better to name this type as:
struct vring_packed_desc;
> + /* Buffer Address. */
> + __virtio64 addr;
> + /* Buffer Length. */
> + __virtio32 len;
> + /* Buffer ID. */
> + __virtio16 id;
> + /* The flags depending on descriptor type. */
> + __virtio16 flags;
> +};
> +
> +/* Enable events */
> +#define RING_EVENT_FLAGS_ENABLE 0x0
> +/* Disable events */
> +#define RING_EVENT_FLAGS_DISABLE 0x1
> +/*
> + * Enable events for a specific descriptor
> + * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
> + * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
> + */
> +#define RING_EVENT_FLAGS_DESC 0x2
For above three macros, maybe it's better to name
them as:
VRING_EVENT_FLAGS_ENABLE
VRING_EVENT_FLAGS_DISABLE
VRING_EVENT_FLAGS_DESC
or
VRING_EVENT_F_ENABLE
VRING_EVENT_F_DISABLE
VRING_EVENT_F_DESC
VRING_EVENT_F_* will be more consistent with
VIRTIO_F_*, VRING_DESC_F_*, etc
> +/* The value 0x3 is reserved */
> +
> +struct vring_packed_desc_event {
> + /* Descriptor Ring Change Event Offset and Wrap Counter */
> + __virtio16 off_wrap;
> + /* Descriptor Ring Change Event Flags */
> + __virtio16 flags;
> +};
> +
> /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
> struct vring_desc {
> /* Address (guest-physical). */
> --
> 2.7.4
>
^ permalink raw reply
* Re: [PATCH net-next 8/8] vhost: event suppression for packed ring
From: Jason Wang @ 2018-07-04 5:23 UTC (permalink / raw)
To: Wei Xu; +Cc: kvm, mst, netdev, linux-kernel, virtualization, maxime.coquelin
In-Reply-To: <20180704041352.GA9287@wei-ubt>
On 2018年07月04日 12:13, Wei Xu wrote:
> On Tue, Jul 03, 2018 at 01:38:04PM +0800, Jason Wang wrote:
>> This patch introduces support for event suppression. This is done by
>> have a two areas: device area and driver area. One side could then try
>> to disable or enable (delayed) notification from other side by using a
>> boolean hint or event index interface in the areas.
>>
>> For more information, please refer Virtio spec.
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>> ---
>> drivers/vhost/vhost.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++----
>> drivers/vhost/vhost.h | 10 ++-
>> 2 files changed, 185 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 0f3f07c..cccbc82 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -1115,10 +1115,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
>> struct vring_used __user *used)
>> {
>> struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
>> + struct vring_packed_desc_event *driver_event =
>> + (struct vring_packed_desc_event *)avail;
>> + struct vring_packed_desc_event *device_event =
>> + (struct vring_packed_desc_event *)used;
>>
>> - /* TODO: check device area and driver area */
>> return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
>> - access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
>> + access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
> R/W parameter doesn't make sense to most architectures and the comment in x86
> says WRITE is a superset of READ, is it possible to converge them here?
>
> /**
> * access_ok: - Checks if a user space pointer is valid
> * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
> * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
> * to write to a block, it is always safe to read from it.
> * @addr: User space pointer to start of block to check
> * @size: Size of block to check
> *
> * Context: User context only. This function may sleep if pagefaults are
> * enabled.
> *
> * Checks if a pointer to a block of memory in user space is valid.
> *
> * Returns true (nonzero) if the memory block may be valid, false (zero)
> * if it is definitely invalid.
> *
> * Note that, depending on architecture, this function probably just
> * checks that the pointer is in the user space range - after calling
> * this function, memory access functions may still return -EFAULT.
> */
> #define access_ok(type, addr, size)
> ......
>
> Thanks,
> Wei
>
Well, this is a question that beyond the scope of this patch.
My understanding is we should keep it unless type was meaningless on all
archs.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 6/8] virtio: introduce packed ring defines
From: Jason Wang @ 2018-07-04 5:26 UTC (permalink / raw)
To: Tiwei Bie
Cc: kvm, mst, netdev, linux-kernel, virtualization, maxime.coquelin,
wexu
In-Reply-To: <20180704044823.GA22808@debian>
On 2018年07月04日 12:48, Tiwei Bie wrote:
> On Tue, Jul 03, 2018 at 01:38:02PM +0800, Jason Wang wrote:
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> include/uapi/linux/virtio_config.h | 2 ++
>> include/uapi/linux/virtio_ring.h | 32 ++++++++++++++++++++++++++++++++
>> 2 files changed, 34 insertions(+)
>>
>> diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
>> index 449132c..947f6a3 100644
>> --- a/include/uapi/linux/virtio_config.h
>> +++ b/include/uapi/linux/virtio_config.h
>> @@ -75,6 +75,8 @@
>> */
>> #define VIRTIO_F_IOMMU_PLATFORM 33
>>
>> +#define VIRTIO_F_RING_PACKED 34
> It's better to add some comments for this macro.
Ok.
>
>> +
>> /*
>> * Does the device support Single Root I/O Virtualization?
>> */
>> diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
>> index 6d5d5fa..71c7a46 100644
>> --- a/include/uapi/linux/virtio_ring.h
>> +++ b/include/uapi/linux/virtio_ring.h
>> @@ -43,6 +43,8 @@
>> #define VRING_DESC_F_WRITE 2
>> /* This means the buffer contains a list of buffer descriptors. */
>> #define VRING_DESC_F_INDIRECT 4
>> +#define VRING_DESC_F_AVAIL 7
> It's better to use tab between VRING_DESC_F_AVAIL and 7.
Let me fix.
>
>> +#define VRING_DESC_F_USED 15
> Maybe it's better to define VRING_DESC_F_AVAIL and
> VRING_DESC_F_USED as (1 << 7) and (1 << 15) or something
> similar to make them consistent with VRING_DESC_F_NEXT
> (1 << 0), VRING_DESC_F_WRITE (1 << 1) and
> VRING_DESC_F_INDIRECT (1 << 2).
Ok.
>
> I plan to define below macros in virtio_ring.c:
>
> #define _VRING_DESC_F_AVAIL(b) ((__u16)(b) << 7)
> #define _VRING_DESC_F_USED(b) ((__u16)(b) << 15)
>
>>
>> /* The Host uses this in used->flags to advise the Guest: don't kick me when
>> * you add a buffer. It's unreliable, so it's simply an optimization. Guest
>> @@ -62,6 +64,36 @@
>> * at the end of the used ring. Guest should ignore the used->flags field. */
>> #define VIRTIO_RING_F_EVENT_IDX 29
>>
>> +struct vring_desc_packed {
> We may have other related types named as:
>
> struct vring_packed;
> struct vring_packed_desc_event;
>
> So maybe it's better to name this type as:
>
> struct vring_packed_desc;
Yes.
>
>> + /* Buffer Address. */
>> + __virtio64 addr;
>> + /* Buffer Length. */
>> + __virtio32 len;
>> + /* Buffer ID. */
>> + __virtio16 id;
>> + /* The flags depending on descriptor type. */
>> + __virtio16 flags;
>> +};
>> +
>> +/* Enable events */
>> +#define RING_EVENT_FLAGS_ENABLE 0x0
>> +/* Disable events */
>> +#define RING_EVENT_FLAGS_DISABLE 0x1
>> +/*
>> + * Enable events for a specific descriptor
>> + * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
>> + * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
>> + */
>> +#define RING_EVENT_FLAGS_DESC 0x2
> For above three macros, maybe it's better to name
> them as:
>
> VRING_EVENT_FLAGS_ENABLE
> VRING_EVENT_FLAGS_DISABLE
> VRING_EVENT_FLAGS_DESC
>
> or
>
> VRING_EVENT_F_ENABLE
> VRING_EVENT_F_DISABLE
> VRING_EVENT_F_DESC
>
> VRING_EVENT_F_* will be more consistent with
> VIRTIO_F_*, VRING_DESC_F_*, etc
True. Let me fix above in next version.
Thanks
>> +/* The value 0x3 is reserved */
>> +
>> +struct vring_packed_desc_event {
>> + /* Descriptor Ring Change Event Offset and Wrap Counter */
>> + __virtio16 off_wrap;
>> + /* Descriptor Ring Change Event Flags */
>> + __virtio16 flags;
>> +};
>> +
>> /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
>> struct vring_desc {
>> /* Address (guest-physical). */
>> --
>> 2.7.4
>>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v2 net-next 0/4] vhost_net: Avoid vq kicks during busyloop
From: Michael S. Tsirkin @ 2018-07-04 5:35 UTC (permalink / raw)
To: Toshiaki Makita; +Cc: kvm, netdev, virtualization, David S. Miller
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On Tue, Jul 03, 2018 at 04:31:30PM +0900, Toshiaki Makita wrote:
> Under heavy load vhost tx busypoll tend not to suppress vq kicks, which
> causes poor guest tx performance. The detailed scenario is described in
> commitlog of patch 2.
> Rx seems not to have that serious problem, but for consistency I made a
> similar change on rx to avoid rx wakeups (patch 3).
> Additionary patch 4 is to avoid rx kicks under heavy load during
> busypoll.
>
> Tx performance is greatly improved by this change. I don't see notable
> performance change on rx with this series though.
>
> Performance numbers (tx):
>
> - Bulk transfer from guest to external physical server.
> [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
> - Set 10us busypoll.
> - Guest disables checksum and TSO because of host XDP.
> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
> (EPT_MISCONFIG event).
>
> Before After
> Mbps kicks/s Mbps kicks/s
> UDP_STREAM 1472byte 247758 27
> Send 3645.37 6958.10
> Recv 3588.56 6958.10
> 1byte 9865 37
> Send 4.34 5.43
> Recv 4.17 5.26
> TCP_STREAM 8801.03 45794 9592.77 2884
Thanks!
Series:
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> v2:
> - Split patches into 3 parts (renaming variables, tx-kick fix, rx-wakeup
> fix).
> - Avoid rx-kicks too (patch 4).
> - Don't memorize endtime as it is not needed for now.
>
> Toshiaki Makita (4):
> vhost_net: Rename local variables in vhost_net_rx_peek_head_len
> vhost_net: Avoid tx vring kicks during busyloop
> vhost_net: Avoid rx queue wake-ups during busypoll
> vhost_net: Avoid rx vring kicks during busyloop
>
> drivers/vhost/net.c | 95 +++++++++++++++++++++++++++++++++--------------------
> 1 file changed, 60 insertions(+), 35 deletions(-)
>
> --
> 1.8.3.1
>
^ permalink raw reply
* Re: [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: Jason Wang @ 2018-07-04 6:27 UTC (permalink / raw)
To: xiangxia.m.yue; +Cc: netdev, virtualization, mst
In-Reply-To: <1530678698-33427-1-git-send-email-xiangxia.m.yue@gmail.com>
On 2018年07月04日 12:31, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> This patches improve the guest receive and transmit performance.
> On the handle_tx side, we poll the sock receive queue at the same time.
> handle_rx do that in the same way.
>
> For more performance report, see patch 4.
>
> v4 -> v5:
> fix some issues
>
> v3 -> v4:
> fix some issues
>
> v2 -> v3:
> This patches are splited from previous big patch:
> http://patchwork.ozlabs.org/patch/934673/
>
> Tonghao Zhang (4):
> vhost: lock the vqs one by one
> net: vhost: replace magic number of lock annotation
> net: vhost: factor out busy polling logic to vhost_net_busy_poll()
> net: vhost: add rx busy polling in tx path
>
> drivers/vhost/net.c | 108 ++++++++++++++++++++++++++++----------------------
> drivers/vhost/vhost.c | 24 ++++-------
> 2 files changed, 67 insertions(+), 65 deletions(-)
>
Acked-by: Jason Wang <jasowang@redhat.com>
Note: you probably need a rebase on top of Makita's recent patches. It
depends on the order of being merged, let's see.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next v5 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Toshiaki Makita @ 2018-07-04 7:59 UTC (permalink / raw)
To: xiangxia.m.yue, jasowang; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530678698-33427-4-git-send-email-xiangxia.m.yue@gmail.com>
On 2018/07/04 13:31, xiangxia.m.yue@gmail.com wrote:
...
> +static void vhost_net_busy_poll(struct vhost_net *net,
> + struct vhost_virtqueue *rvq,
> + struct vhost_virtqueue *tvq,
> + bool rx)
> +{
> + unsigned long uninitialized_var(endtime);
> + unsigned long busyloop_timeout;
> + struct socket *sock;
> + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> +
> + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> +
> + vhost_disable_notify(&net->dev, vq);
> + sock = rvq->private_data;
> + busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
> +
> + preempt_disable();
> + endtime = busy_clock() + busyloop_timeout;
> + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> + !(sock && sk_has_rx_data(sock->sk)) &&
> + vhost_vq_avail_empty(tvq->dev, tvq))
> + cpu_relax();
> + preempt_enable();
> +
> + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> + vhost_poll_queue(&vq->poll);
> + } else if (vhost_enable_notify(&net->dev, vq) && rx) {
Hmm... on tx here sock has no rx data, so you are waiting for sock
wakeup for rx and vhost_enable_notify() seems not needed. Do you want
this actually?
} else if (rx && vhost_enable_notify(&net->dev, vq)) {
> + vhost_disable_notify(&net->dev, vq);
> + vhost_poll_queue(&vq->poll);
> + }
--
Toshiaki Makita
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox