* Re: [PATCH v2 net-next 1/4] vhost_net: Rename local variables in vhost_net_rx_peek_head_len
From: Jason Wang @ 2018-07-03 9:03 UTC (permalink / raw)
To: Toshiaki Makita, David S. Miller, Michael S. Tsirkin
Cc: netdev, kvm, virtualization
In-Reply-To: <1530603094-2476-2-git-send-email-makita.toshiaki@lab.ntt.co.jp>
On 2018年07月03日 15:31, Toshiaki Makita wrote:
> So we can easily see which variable is for which, tx or rx.
>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
> drivers/vhost/net.c | 34 +++++++++++++++++-----------------
> 1 file changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 29756d8..3939c50 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -647,39 +647,39 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>
> static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> {
> - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> - struct vhost_virtqueue *vq = &nvq->vq;
> + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> + struct vhost_virtqueue *tvq = &tnvq->vq;
> unsigned long uninitialized_var(endtime);
> - int len = peek_head_len(rvq, sk);
> + int len = peek_head_len(rnvq, sk);
>
> - if (!len && vq->busyloop_timeout) {
> + if (!len && tvq->busyloop_timeout) {
> /* Flush batched heads first */
> - vhost_rx_signal_used(rvq);
> + vhost_rx_signal_used(rnvq);
> /* Both tx vq and rx socket were polled here */
> - mutex_lock_nested(&vq->mutex, 1);
> - vhost_disable_notify(&net->dev, vq);
> + mutex_lock_nested(&tvq->mutex, 1);
> + vhost_disable_notify(&net->dev, tvq);
>
> preempt_disable();
> - endtime = busy_clock() + vq->busyloop_timeout;
> + endtime = busy_clock() + tvq->busyloop_timeout;
>
> while (vhost_can_busy_poll(&net->dev, endtime) &&
> !sk_has_rx_data(sk) &&
> - vhost_vq_avail_empty(&net->dev, vq))
> + vhost_vq_avail_empty(&net->dev, tvq))
> cpu_relax();
>
> preempt_enable();
>
> - if (!vhost_vq_avail_empty(&net->dev, vq))
> - vhost_poll_queue(&vq->poll);
> - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> - vhost_disable_notify(&net->dev, vq);
> - vhost_poll_queue(&vq->poll);
> + if (!vhost_vq_avail_empty(&net->dev, tvq)) {
> + vhost_poll_queue(&tvq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
> + vhost_disable_notify(&net->dev, tvq);
> + vhost_poll_queue(&tvq->poll);
> }
>
> - mutex_unlock(&vq->mutex);
> + mutex_unlock(&tvq->mutex);
>
> - len = peek_head_len(rvq, sk);
> + len = peek_head_len(rnvq, sk);
> }
>
> return len;
Acked-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH v2 net-next 4/4] vhost_net: Avoid rx vring kicks during busyloop
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
We may run out of avail rx ring descriptor under heavy load but busypoll
did not detect it so busypoll may have exited prematurely. Avoid this by
checking rx ring full during busypoll.
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
drivers/vhost/net.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 791bc8b..b224036 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -658,6 +658,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
unsigned long uninitialized_var(endtime);
int len = peek_head_len(rnvq, sk);
@@ -677,7 +678,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
*busyloop_intr = true;
break;
}
- if (sk_has_rx_data(sk) ||
+ if ((sk_has_rx_data(sk) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
!vhost_vq_avail_empty(&net->dev, tvq))
break;
cpu_relax();
@@ -827,7 +829,6 @@ static void handle_rx(struct vhost_net *net)
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
&busyloop_intr))) {
- busyloop_intr = false;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -838,7 +839,9 @@ static void handle_rx(struct vhost_net *net)
goto out;
/* OK, now we need to know about added descriptors. */
if (!headcount) {
- if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
/* They have slipped one in as we were
* doing that: check again. */
vhost_disable_notify(&net->dev, vq);
@@ -848,6 +851,7 @@ static void handle_rx(struct vhost_net *net)
* they refilled. */
goto out;
}
+ busyloop_intr = false;
if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2 net-next 3/4] vhost_net: Avoid rx queue wake-ups during busypoll
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
We may run handle_rx() while rx work is queued. For example a packet can
push the rx work during the window before handle_rx calls
vhost_net_disable_vq().
In that case busypoll immediately exits due to vhost_has_work()
condition and enables vq again. This can lead to another unnecessary rx
wake-ups, so poll rx work instead of enabling the vq.
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
drivers/vhost/net.c | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 811c0e5..791bc8b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -653,7 +653,8 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
nvq->done_idx = 0;
}
-static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
+static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
+ bool *busyloop_intr)
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -671,11 +672,16 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
preempt_disable();
endtime = busy_clock() + tvq->busyloop_timeout;
- while (vhost_can_busy_poll(endtime) &&
- !vhost_has_work(&net->dev) &&
- !sk_has_rx_data(sk) &&
- vhost_vq_avail_empty(&net->dev, tvq))
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+ if (sk_has_rx_data(sk) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
cpu_relax();
+ }
preempt_enable();
@@ -795,6 +801,7 @@ static void handle_rx(struct vhost_net *net)
s16 headcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
+ bool busyloop_intr = false;
struct socket *sock;
struct iov_iter fixup;
__virtio16 num_buffers;
@@ -818,7 +825,9 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+ while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr))) {
+ busyloop_intr = false;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -905,7 +914,10 @@ static void handle_rx(struct vhost_net *net)
goto out;
}
}
- vhost_net_enable_vq(net, vq);
+ if (unlikely(busyloop_intr))
+ vhost_poll_queue(&vq->poll);
+ else
+ vhost_net_enable_vq(net, vq);
out:
vhost_rx_signal_used(nvq);
mutex_unlock(&vq->mutex);
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2 net-next 2/4] vhost_net: Avoid tx vring kicks during busyloop
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
Under heavy load vhost busypoll may run without suppressing
notification. For example tx zerocopy callback can push tx work while
handle_tx() is running, then busyloop exits due to vhost_has_work()
condition and enables notification but immediately reenters handle_tx()
because the pushed work was tx. In this case handle_tx() tries to
disable notification again, but when using event_idx it by design
cannot. Then busyloop will run without suppressing notification.
Another example is the case where handle_tx() tries to enable
notification but avail idx is advanced so disables it again. This case
also leads to the same situation with event_idx.
The problem is that once we enter this situation busyloop does not work
under heavy load for considerable amount of time, because notification
is likely to happen during busyloop and handle_tx() immediately enables
notification after notification happens. Specifically busyloop detects
notification by vhost_has_work() and then handle_tx() calls
vhost_enable_notify(). Because the detected work was the tx work, it
enters handle_tx(), and enters busyloop without suppression again.
This is likely to be repeated, so with event_idx we are almost not able
to suppress notification in this case.
To fix this, poll the work instead of enabling notification when
busypoll is interrupted by something. IMHO vhost_has_work() is kind of
interruption rather than a signal to completely cancel the busypoll, so
let's run busypoll after the necessary work is done.
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
drivers/vhost/net.c | 35 ++++++++++++++++++++++-------------
1 file changed, 22 insertions(+), 13 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 3939c50..811c0e5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -396,13 +396,10 @@ static inline unsigned long busy_clock(void)
return local_clock() >> 10;
}
-static bool vhost_can_busy_poll(struct vhost_dev *dev,
- unsigned long endtime)
+static bool vhost_can_busy_poll(unsigned long endtime)
{
- return likely(!need_resched()) &&
- likely(!time_after(busy_clock(), endtime)) &&
- likely(!signal_pending(current)) &&
- !vhost_has_work(dev);
+ return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
+ !signal_pending(current));
}
static void vhost_net_disable_vq(struct vhost_net *n,
@@ -434,7 +431,8 @@ static int vhost_net_enable_vq(struct vhost_net *n,
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num)
+ unsigned int *out_num, unsigned int *in_num,
+ bool *busyloop_intr)
{
unsigned long uninitialized_var(endtime);
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
@@ -443,9 +441,15 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
if (r == vq->num && vq->busyloop_timeout) {
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(vq->dev, endtime) &&
- vhost_vq_avail_empty(vq->dev, vq))
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(vq->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+ if (!vhost_vq_avail_empty(vq->dev, vq))
+ break;
cpu_relax();
+ }
preempt_enable();
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
@@ -501,20 +505,24 @@ static void handle_tx(struct vhost_net *net)
zcopy = nvq->ubufs;
for (;;) {
+ bool busyloop_intr;
+
/* Release DMAs done buffers first */
if (zcopy)
vhost_zerocopy_signal_used(net, vq);
-
+ busyloop_intr = false;
head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
ARRAY_SIZE(vq->iov),
- &out, &in);
+ &out, &in, &busyloop_intr);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
if (head == vq->num) {
- if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
continue;
}
@@ -663,7 +671,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
preempt_disable();
endtime = busy_clock() + tvq->busyloop_timeout;
- while (vhost_can_busy_poll(&net->dev, endtime) &&
+ while (vhost_can_busy_poll(endtime) &&
+ !vhost_has_work(&net->dev) &&
!sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, tvq))
cpu_relax();
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2 net-next 1/4] vhost_net: Rename local variables in vhost_net_rx_peek_head_len
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
In-Reply-To: <1530603094-2476-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
So we can easily see which variable is for which, tx or rx.
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
drivers/vhost/net.c | 34 +++++++++++++++++-----------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 29756d8..3939c50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -647,39 +647,39 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
- struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *tvq = &tnvq->vq;
unsigned long uninitialized_var(endtime);
- int len = peek_head_len(rvq, sk);
+ int len = peek_head_len(rnvq, sk);
- if (!len && vq->busyloop_timeout) {
+ if (!len && tvq->busyloop_timeout) {
/* Flush batched heads first */
- vhost_rx_signal_used(rvq);
+ vhost_rx_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&vq->mutex, 1);
- vhost_disable_notify(&net->dev, vq);
+ mutex_lock_nested(&tvq->mutex, 1);
+ vhost_disable_notify(&net->dev, tvq);
preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
+ endtime = busy_clock() + tvq->busyloop_timeout;
while (vhost_can_busy_poll(&net->dev, endtime) &&
!sk_has_rx_data(sk) &&
- vhost_vq_avail_empty(&net->dev, vq))
+ vhost_vq_avail_empty(&net->dev, tvq))
cpu_relax();
preempt_enable();
- if (!vhost_vq_avail_empty(&net->dev, vq))
- vhost_poll_queue(&vq->poll);
- else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- vhost_disable_notify(&net->dev, vq);
- vhost_poll_queue(&vq->poll);
+ if (!vhost_vq_avail_empty(&net->dev, tvq)) {
+ vhost_poll_queue(&tvq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
+ vhost_disable_notify(&net->dev, tvq);
+ vhost_poll_queue(&tvq->poll);
}
- mutex_unlock(&vq->mutex);
+ mutex_unlock(&tvq->mutex);
- len = peek_head_len(rvq, sk);
+ len = peek_head_len(rnvq, sk);
}
return len;
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2 net-next 0/4] vhost_net: Avoid vq kicks during busyloop
From: Toshiaki Makita @ 2018-07-03 7:31 UTC (permalink / raw)
To: David S. Miller, Michael S. Tsirkin, Jason Wang
Cc: netdev, virtualization, kvm
Under heavy load vhost tx busypoll tend not to suppress vq kicks, which
causes poor guest tx performance. The detailed scenario is described in
commitlog of patch 2.
Rx seems not to have that serious problem, but for consistency I made a
similar change on rx to avoid rx wakeups (patch 3).
Additionary patch 4 is to avoid rx kicks under heavy load during
busypoll.
Tx performance is greatly improved by this change. I don't see notable
performance change on rx with this series though.
Performance numbers (tx):
- Bulk transfer from guest to external physical server.
[Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
- Set 10us busypoll.
- Guest disables checksum and TSO because of host XDP.
- Measured single flow Mbps by netperf, and kicks by perf kvm stat
(EPT_MISCONFIG event).
Before After
Mbps kicks/s Mbps kicks/s
UDP_STREAM 1472byte 247758 27
Send 3645.37 6958.10
Recv 3588.56 6958.10
1byte 9865 37
Send 4.34 5.43
Recv 4.17 5.26
TCP_STREAM 8801.03 45794 9592.77 2884
v2:
- Split patches into 3 parts (renaming variables, tx-kick fix, rx-wakeup
fix).
- Avoid rx-kicks too (patch 4).
- Don't memorize endtime as it is not needed for now.
Toshiaki Makita (4):
vhost_net: Rename local variables in vhost_net_rx_peek_head_len
vhost_net: Avoid tx vring kicks during busyloop
vhost_net: Avoid rx queue wake-ups during busypoll
vhost_net: Avoid rx vring kicks during busyloop
drivers/vhost/net.c | 95 +++++++++++++++++++++++++++++++++--------------------
1 file changed, 60 insertions(+), 35 deletions(-)
--
1.8.3.1
^ permalink raw reply
* Re: [PATCH v6 3/3] x86: paravirt: make native_save_fl extern inline
From: Juergen Gross @ 2018-07-03 7:21 UTC (permalink / raw)
To: Nick Desaulniers, Ingo Molnar
Cc: Kate Stewart, andrea.parri, linux-efi, brijesh.singh, J. Kiszka,
Josh Poimboeuf, Will Deacon, jarkko.sakkinen, virtualization,
Masahiro Yamada, Manoj Gupta, hpa, boris.ostrovsky,
Thiebaud Weksteen, mawilcox, x86, akataria, Greg Hackmann, mingo,
Alistair Strachan, David Rientjes, geert, thomas.lendacky,
Arnd Bergmann, Linux Kbuild mailing list, Philippe Ombredanne,
rostedt
In-Reply-To: <CAKwvOdnW9r=pgmodEL0VXCv=PZVt_WMCA+B5x4D5oKRM1y-0Ow@mail.gmail.com>
On 26/06/18 18:22, Nick Desaulniers wrote:
> On Tue, Jun 26, 2018 at 3:13 AM Ingo Molnar <mingo@kernel.org> wrote:
>> Ok!
>>
>> Acked-by: Ingo Molnar <mingo@kernel.org>
>>
>> What's the planned upstreaming route for these patches/fixes?
>
> While the fix is mainly for paravirt, 2/3 of the patches exclusively
> touch arch/x86, so I think they should go up in the x86 tree (as
> opposed to paravirt's), if you would be so kind. hpa mentioned not
> handling merges in https://lkml.org/lkml/2018/6/14/903, so I reached
> out to you and tglx.
As there is no paravirt tree the x86 one seems to be appropriate. :-)
Juergen
^ permalink raw reply
* Re: [PATCH] gpu: drm: virito: code cleanup
From: Gerd Hoffmann @ 2018-07-03 6:28 UTC (permalink / raw)
To: Souptick Joarder
Cc: ajitn.linux, svptas.linux, sabyasachi.linux, airlied,
linux-kernel, dri-devel, virtualization, brajeswar.linux
In-Reply-To: <20180702182728.GA6015@jordon-HP-15-Notebook-PC>
On Mon, Jul 02, 2018 at 11:57:28PM +0530, Souptick Joarder wrote:
> The fault handler code is commented since v4.2.
> If there is no plan to enable the fault handler
> code in future, we can remove this dead code.
Indeed, but please without tyops in the $subject line.
cheers,
Gerd
^ permalink raw reply
* Re: [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Jason Wang @ 2018-07-03 5:55 UTC (permalink / raw)
To: Tonghao Zhang
Cc: Linux Kernel Network Developers, virtualization, Tonghao Zhang,
mst
In-Reply-To: <CAMDZJNU6bvACS9=RS1MA8T1z8+1fVkGZw=Xx8DWDjwQYWy0k8g@mail.gmail.com>
On 2018年07月03日 13:48, Tonghao Zhang wrote:
> On Tue, Jul 3, 2018 at 10:12 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>>
>> On 2018年07月02日 20:57, xiangxia.m.yue@gmail.com wrote:
>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>
>>> Factor out generic busy polling logic and will be
>>> used for in tx path in the next patch. And with the patch,
>>> qemu can set differently the busyloop_timeout for rx queue.
>>>
>>> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
>>> ---
>>> drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
>>> 1 file changed, 55 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>>> index 62bb8e8..2790959 100644
>>> --- a/drivers/vhost/net.c
>>> +++ b/drivers/vhost/net.c
>>> @@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>>> return vhost_poll_start(poll, sock->file);
>>> }
>>>
>>> +static int sk_has_rx_data(struct sock *sk)
>>> +{
>>> + struct socket *sock = sk->sk_socket;
>>> +
>>> + if (sock->ops->peek_len)
>>> + return sock->ops->peek_len(sock);
>>> +
>>> + return skb_queue_empty(&sk->sk_receive_queue);
>>> +}
>>> +
>>> +static void vhost_net_busy_poll(struct vhost_net *net,
>>> + struct vhost_virtqueue *rvq,
>>> + struct vhost_virtqueue *tvq,
>>> + bool rx)
>>> +{
>>> + unsigned long uninitialized_var(endtime);
>>> + unsigned long busyloop_timeout;
>>> + struct socket *sock;
>>> + struct vhost_virtqueue *vq = rx ? tvq : rvq;
>>> +
>>> + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>>> +
>>> + vhost_disable_notify(&net->dev, vq);
>>> + sock = rvq->private_data;
>>> + busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
>>> +
>>> + preempt_disable();
>>> + endtime = busy_clock() + busyloop_timeout;
>>> + while (vhost_can_busy_poll(tvq->dev, endtime) &&
>>> + !(sock && sk_has_rx_data(sock->sk)) &&
>>> + vhost_vq_avail_empty(tvq->dev, tvq))
>>> + cpu_relax();
>>> + preempt_enable();
>>> +
>>> + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
>>> + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
>>> + vhost_poll_queue(&vq->poll);
>>> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>> One last question, do we need this for rx? This check will be always
>> true under light or medium load.
> will remove the 'unlikely'
Not only the unlikely. We only want rx kick when packet is pending at
socket but we're out of available buffers. This is not the case here
(you have polled the vq above).
So we probably want to do this only when rx == true.
Thanks
>
>> Thanks
>>
>>> + vhost_disable_notify(&net->dev, vq);
>>> + vhost_poll_queue(&vq->poll);
>>> + }
>>> +
>>> + mutex_unlock(&vq->mutex);
>>> +}
>>> +
>>> +
>>> static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>>> struct vhost_virtqueue *vq,
>>> struct iovec iov[], unsigned int iov_size,
>>> @@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>>> return len;
>>> }
>>>
>>> -static int sk_has_rx_data(struct sock *sk)
>>> -{
>>> - struct socket *sock = sk->sk_socket;
>>> -
>>> - if (sock->ops->peek_len)
>>> - return sock->ops->peek_len(sock);
>>> -
>>> - return skb_queue_empty(&sk->sk_receive_queue);
>>> -}
>>> -
>>> static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>>> {
>>> struct vhost_virtqueue *vq = &nvq->vq;
>>> @@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>>>
>>> static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>>> {
>>> - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
>>> - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>>> - struct vhost_virtqueue *vq = &nvq->vq;
>>> - unsigned long uninitialized_var(endtime);
>>> - int len = peek_head_len(rvq, sk);
>>> + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>>> + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>>>
>>> - if (!len && vq->busyloop_timeout) {
>>> - /* Flush batched heads first */
>>> - vhost_rx_signal_used(rvq);
>>> - /* Both tx vq and rx socket were polled here */
>>> - mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>>> - vhost_disable_notify(&net->dev, vq);
>>> + int len = peek_head_len(rnvq, sk);
>>>
>>> - preempt_disable();
>>> - endtime = busy_clock() + vq->busyloop_timeout;
>>> -
>>> - while (vhost_can_busy_poll(&net->dev, endtime) &&
>>> - !sk_has_rx_data(sk) &&
>>> - vhost_vq_avail_empty(&net->dev, vq))
>>> - cpu_relax();
>>> -
>>> - preempt_enable();
>>> -
>>> - if (!vhost_vq_avail_empty(&net->dev, vq))
>>> - vhost_poll_queue(&vq->poll);
>>> - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>>> - vhost_disable_notify(&net->dev, vq);
>>> - vhost_poll_queue(&vq->poll);
>>> - }
>>> + if (!len && rnvq->vq.busyloop_timeout) {
>>> + /* Flush batched heads first */
>>> + vhost_rx_signal_used(rnvq);
>>>
>>> - mutex_unlock(&vq->mutex);
>>> + /* Both tx vq and rx socket were polled here */
>>> + vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
>>>
>>> - len = peek_head_len(rvq, sk);
>>> + len = peek_head_len(rnvq, sk);
>>> }
>>>
>>> return len;
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Tonghao Zhang @ 2018-07-03 5:48 UTC (permalink / raw)
To: jasowang
Cc: Linux Kernel Network Developers, virtualization, Tonghao Zhang,
mst
In-Reply-To: <3eca0b98-52fc-fe34-c4dc-c022c2232cf5@redhat.com>
On Tue, Jul 3, 2018 at 10:12 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年07月02日 20:57, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > Factor out generic busy polling logic and will be
> > used for in tx path in the next patch. And with the patch,
> > qemu can set differently the busyloop_timeout for rx queue.
> >
> > Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> > ---
> > drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
> > 1 file changed, 55 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 62bb8e8..2790959 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> > return vhost_poll_start(poll, sock->file);
> > }
> >
> > +static int sk_has_rx_data(struct sock *sk)
> > +{
> > + struct socket *sock = sk->sk_socket;
> > +
> > + if (sock->ops->peek_len)
> > + return sock->ops->peek_len(sock);
> > +
> > + return skb_queue_empty(&sk->sk_receive_queue);
> > +}
> > +
> > +static void vhost_net_busy_poll(struct vhost_net *net,
> > + struct vhost_virtqueue *rvq,
> > + struct vhost_virtqueue *tvq,
> > + bool rx)
> > +{
> > + unsigned long uninitialized_var(endtime);
> > + unsigned long busyloop_timeout;
> > + struct socket *sock;
> > + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> > +
> > + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> > +
> > + vhost_disable_notify(&net->dev, vq);
> > + sock = rvq->private_data;
> > + busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
> > +
> > + preempt_disable();
> > + endtime = busy_clock() + busyloop_timeout;
> > + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> > + !(sock && sk_has_rx_data(sock->sk)) &&
> > + vhost_vq_avail_empty(tvq->dev, tvq))
> > + cpu_relax();
> > + preempt_enable();
> > +
> > + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> > + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> > + vhost_poll_queue(&vq->poll);
> > + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>
> One last question, do we need this for rx? This check will be always
> true under light or medium load.
will remove the 'unlikely'
> Thanks
>
> > + vhost_disable_notify(&net->dev, vq);
> > + vhost_poll_queue(&vq->poll);
> > + }
> > +
> > + mutex_unlock(&vq->mutex);
> > +}
> > +
> > +
> > static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > struct vhost_virtqueue *vq,
> > struct iovec iov[], unsigned int iov_size,
> > @@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> > return len;
> > }
> >
> > -static int sk_has_rx_data(struct sock *sk)
> > -{
> > - struct socket *sock = sk->sk_socket;
> > -
> > - if (sock->ops->peek_len)
> > - return sock->ops->peek_len(sock);
> > -
> > - return skb_queue_empty(&sk->sk_receive_queue);
> > -}
> > -
> > static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> > {
> > struct vhost_virtqueue *vq = &nvq->vq;
> > @@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >
> > static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> > {
> > - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> > - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > - struct vhost_virtqueue *vq = &nvq->vq;
> > - unsigned long uninitialized_var(endtime);
> > - int len = peek_head_len(rvq, sk);
> > + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> > + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> >
> > - if (!len && vq->busyloop_timeout) {
> > - /* Flush batched heads first */
> > - vhost_rx_signal_used(rvq);
> > - /* Both tx vq and rx socket were polled here */
> > - mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> > - vhost_disable_notify(&net->dev, vq);
> > + int len = peek_head_len(rnvq, sk);
> >
> > - preempt_disable();
> > - endtime = busy_clock() + vq->busyloop_timeout;
> > -
> > - while (vhost_can_busy_poll(&net->dev, endtime) &&
> > - !sk_has_rx_data(sk) &&
> > - vhost_vq_avail_empty(&net->dev, vq))
> > - cpu_relax();
> > -
> > - preempt_enable();
> > -
> > - if (!vhost_vq_avail_empty(&net->dev, vq))
> > - vhost_poll_queue(&vq->poll);
> > - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > - vhost_disable_notify(&net->dev, vq);
> > - vhost_poll_queue(&vq->poll);
> > - }
> > + if (!len && rnvq->vq.busyloop_timeout) {
> > + /* Flush batched heads first */
> > + vhost_rx_signal_used(rnvq);
> >
> > - mutex_unlock(&vq->mutex);
> > + /* Both tx vq and rx socket were polled here */
> > + vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
> >
> > - len = peek_head_len(rvq, sk);
> > + len = peek_head_len(rnvq, sk);
> > }
> >
> > return len;
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net-next 8/8] vhost: event suppression for packed ring
From: Jason Wang @ 2018-07-03 5:38 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
This patch introduces support for event suppression. This is done by
have a two areas: device area and driver area. One side could then try
to disable or enable (delayed) notification from other side by using a
boolean hint or event index interface in the areas.
For more information, please refer Virtio spec.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++----
drivers/vhost/vhost.h | 10 ++-
2 files changed, 185 insertions(+), 16 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0f3f07c..cccbc82 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1115,10 +1115,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
struct vring_used __user *used)
{
struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+ struct vring_packed_desc_event *driver_event =
+ (struct vring_packed_desc_event *)avail;
+ struct vring_packed_desc_event *device_event =
+ (struct vring_packed_desc_event *)used;
- /* TODO: check device area and driver area */
return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
- access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+ access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
+ access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
+ access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
}
static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1193,14 +1198,27 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
return true;
}
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
+{
+ int num = vq->num;
+
+ return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+ num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+ num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_RO,
+ (u64)(uintptr_t)vq->driver_event,
+ sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_WO,
+ (u64)(uintptr_t)vq->device_event,
+ sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
- if (!vq->iotlb)
- return 1;
-
return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
@@ -1212,6 +1230,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
num * sizeof(*vq->used->ring) + s,
VHOST_ADDR_USED);
}
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+ if (!vq->iotlb)
+ return 1;
+
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vq_iotlb_prefetch_packed(vq);
+ else
+ return vq_iotlb_prefetch_split(vq);
+}
EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
/* Can we log writes? */
@@ -1771,6 +1800,50 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
return 0;
}
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+ __virtio16 device_flags)
+{
+ void __user *flags;
+
+ if (vhost_put_user(vq, device_flags, &vq->device_event->flags,
+ VHOST_ADDR_USED) < 0)
+ return -EFAULT;
+ if (unlikely(vq->log_used)) {
+ /* Make sure the flag is seen before log. */
+ smp_wmb();
+ /* Log used flag write. */
+ flags = &vq->device_event->flags;
+ log_write(vq->log_base, vq->log_addr +
+ (flags - (void __user *)vq->device_event),
+ sizeof(vq->device_event->flags));
+ if (vq->log_ctx)
+ eventfd_signal(vq->log_ctx, 1);
+ }
+ return 0;
+}
+
+static int vhost_update_device_off_wrap(struct vhost_virtqueue *vq,
+ __virtio16 device_off_wrap)
+{
+ void __user *off_wrap;
+
+ if (vhost_put_user(vq, device_off_wrap, &vq->device_event->off_wrap,
+ VHOST_ADDR_USED) < 0)
+ return -EFAULT;
+ if (unlikely(vq->log_used)) {
+ /* Make sure the flag is seen before log. */
+ smp_wmb();
+ /* Log used flag write. */
+ off_wrap = &vq->device_event->off_wrap;
+ log_write(vq->log_base, vq->log_addr +
+ (off_wrap - (void __user *)vq->device_event),
+ sizeof(vq->device_event->off_wrap));
+ if (vq->log_ctx)
+ eventfd_signal(vq->log_ctx, 1);
+ }
+ return 0;
+}
+
static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
@@ -2756,16 +2829,13 @@ int vhost_add_used_n(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_add_used_n);
-static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_notify_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
__u16 old, new;
__virtio16 event;
bool v;
- /* TODO: check driver area */
- if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
- return true;
-
/* Flush out used index updates. This is paired
* with the barrier that the Guest executes when enabling
* interrupts. */
@@ -2798,6 +2868,64 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
return vring_need_event(vhost16_to_cpu(vq, event), new, old);
}
+static bool vhost_notify_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ __virtio16 event_off_wrap, event_flags;
+ __u16 old, new, off_wrap;
+ bool v;
+
+ /* Flush out used descriptors updates. This is paired
+ * with the barrier that the Guest executes when enabling
+ * interrupts.
+ */
+ smp_mb();
+
+ if (vhost_get_avail(vq, event_flags,
+ &vq->driver_event->flags) < 0) {
+ vq_err(vq, "Failed to get driver desc_event_flags");
+ return true;
+ }
+
+ if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
+ return event_flags !=
+ cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+
+ old = vq->signalled_used;
+ v = vq->signalled_used_valid;
+ new = vq->signalled_used = vq->last_used_idx;
+ vq->signalled_used_valid = true;
+
+ if (event_flags != cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC))
+ return event_flags !=
+ cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+
+ /* Read desc event flags before event_off and event_wrap */
+ smp_rmb();
+
+ if (vhost_get_avail(vq, event_off_wrap,
+ &vq->driver_event->off_wrap) < 0) {
+ vq_err(vq, "Failed to get driver desc_event_off/wrap");
+ return true;
+ }
+
+ off_wrap = vhost16_to_cpu(vq, event_off_wrap);
+
+ if (unlikely(!v))
+ return true;
+
+ return vhost_vring_packed_need_event(vq, vq->last_used_wrap_counter,
+ off_wrap, new, old);
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_notify_packed(dev, vq);
+ else
+ return vhost_notify_split(dev, vq);
+}
+
/* This actually signals the guest, using eventfd. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
@@ -2875,10 +3003,34 @@ static bool vhost_enable_notify_packed(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
{
struct vring_desc_packed *d = vq->desc_packed + vq->avail_idx;
- __virtio16 flags;
+ __virtio16 flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
int ret;
- /* TODO: enable notification through device area */
+ if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+ return false;
+ vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
+ __virtio16 off_wrap = cpu_to_vhost16(vq, vq->avail_idx |
+ vq->avail_wrap_counter << 15);
+
+ ret = vhost_update_device_off_wrap(vq, off_wrap);
+ if (ret) {
+ vq_err(vq, "Failed to write to off warp at %p: %d\n",
+ &vq->device_event->off_wrap, ret);
+ return false;
+ }
+ /* Make sure off_wrap is wrote before flags */
+ smp_wmb();
+ flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC);
+ }
+
+ ret = vhost_update_device_flags(vq, flags);
+ if (ret) {
+ vq_err(vq, "Failed to enable notification at %p: %d\n",
+ &vq->device_event->flags, ret);
+ return false;
+ }
/* They could have slipped one in as we were doing that: make
* sure it's written, then check again.
@@ -2945,7 +3097,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
static void vhost_disable_notify_packed(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
{
- /* TODO: disable notification through device area */
+ __virtio16 flags;
+ int r;
+
+ if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+ return;
+ vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+ flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+ r = vhost_update_device_flags(vq, flags);
+ if (r)
+ vq_err(vq, "Failed to enable notification at %p: %d\n",
+ &vq->device_event->flags, r);
}
static void vhost_disable_notify_split(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index db09958..d071daf 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,8 +96,14 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_desc_packed __user *desc_packed;
};
- struct vring_avail __user *avail;
- struct vring_used __user *used;
+ union {
+ struct vring_avail __user *avail;
+ struct vring_packed_desc_event __user *driver_event;
+ };
+ union {
+ struct vring_used __user *used;
+ struct vring_packed_desc_event __user *device_event;
+ };
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
struct file *kick;
struct eventfd_ctx *call_ctx;
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 7/8] vhost: packed ring support
From: Jason Wang @ 2018-07-03 5:38 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
This patch introduces basic support for packed ring. The idea behinds
packed ring is to use a single descriptor ring instead of three
different rings (avail, used and descriptor). This could help to
reduce the cache contention and PCI transactions. So it was designed
to help for the performance for both software implementation and
hardware implementation.
The implementation was straightforward, packed version of vhost core
(whose name has a packed suffix) helpers were introduced and previous
helpers were renamed with a split suffix. Then the exported helpers
can just do a switch to go to the correct internal helpers.
The event suppression (device area and driver area) were not
implemented. It will be done on top with another patch.
For more information of packed ring, please refer Virtio spec.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 15 +-
drivers/vhost/vhost.c | 655 ++++++++++++++++++++++++++++++++++++++++++---
drivers/vhost/vhost.h | 13 +-
include/uapi/linux/vhost.h | 7 +
4 files changed, 644 insertions(+), 46 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index eada5a6..90d9efb 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,7 +74,8 @@ enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
- (1ULL << VIRTIO_F_IOMMU_PLATFORM)
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+ (1ULL << VIRTIO_F_RING_PACKED)
};
enum {
@@ -583,7 +584,7 @@ static void handle_tx(struct vhost_net *net)
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV;
}
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_vq_desc(vq, &used, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -736,10 +737,12 @@ static void handle_rx(struct vhost_net *net)
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+ struct vhost_used_elem *used = vq->heads + nvq->done_idx;
+
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
- err = vhost_get_bufs(vq, vq->heads + nvq->done_idx,
- vhost_len, &in, vq_log, &log,
+ err = vhost_get_bufs(vq, used, vhost_len,
+ &in, vq_log, &log,
likely(mergeable) ? UIO_MAXIOV : 1,
&headcount);
/* OK, now we need to know about added descriptors. */
@@ -784,7 +787,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
" len %d, expected %zd\n", err, sock_len);
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_vq_desc(vq, used, 1);
continue;
}
/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -808,7 +811,7 @@ static void handle_rx(struct vhost_net *net)
copy_to_iter(&num_buffers, sizeof num_buffers,
&fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_vq_desc(vq, used, 1);
goto out;
}
nvq->done_idx += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 060a431..0f3f07c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -323,6 +323,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
vq->busyloop_timeout = 0;
+ vq->last_used_wrap_counter = true;
+ vq->last_avail_wrap_counter = true;
+ vq->avail_wrap_counter = true;
vq->umem = NULL;
vq->iotlb = NULL;
__vhost_vq_meta_reset(vq);
@@ -1106,11 +1109,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
return 0;
}
-static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
- struct vring_desc __user *desc,
- struct vring_avail __user *avail,
- struct vring_used __user *used)
+static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
+{
+ struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+
+ /* TODO: check device area and driver area */
+ return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
+ access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+}
+static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
@@ -1121,6 +1135,17 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
sizeof *used + num * sizeof *used->ring + s);
}
+static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vq_access_ok_packed(vq, num, desc, avail, used);
+ else
+ return vq_access_ok_split(vq, num, desc, avail, used);
+}
+
static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
const struct vhost_umem_node *node,
int type)
@@ -1318,6 +1343,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
struct vhost_vring_state s;
struct vhost_vring_file f;
struct vhost_vring_addr a;
+ bool wrap_counter;
u32 idx;
long r;
@@ -1360,6 +1386,10 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
r = -EFAULT;
break;
}
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
+ wrap_counter = s.num >> 31;
+ s.num &= ~(1 << 31);
+ }
if (s.num > 0xffff) {
r = -EINVAL;
break;
@@ -1367,10 +1397,48 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
vq->last_avail_idx = s.num;
/* Forget the cached index value. */
vq->avail_idx = vq->last_avail_idx;
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
+ vq->last_avail_wrap_counter = wrap_counter;
+ vq->avail_wrap_counter = vq->last_avail_wrap_counter;
+ }
break;
case VHOST_GET_VRING_BASE:
s.index = idx;
s.num = vq->last_avail_idx;
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ s.num |= vq->last_avail_wrap_counter << 31;
+ if (copy_to_user(argp, &s, sizeof(s)))
+ r = -EFAULT;
+ break;
+ case VHOST_SET_VRING_USED_BASE:
+ /* Moving base with an active backend?
+ * You don't want to do that.
+ */
+ if (vq->private_data) {
+ r = -EBUSY;
+ break;
+ }
+ if (copy_from_user(&s, argp, sizeof(s))) {
+ r = -EFAULT;
+ break;
+ }
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
+ wrap_counter = s.num >> 31;
+ s.num &= ~(1 << 31);
+ }
+ if (s.num > 0xffff) {
+ r = -EINVAL;
+ break;
+ }
+ vq->last_used_idx = s.num;
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ vq->last_used_wrap_counter = wrap_counter;
+ break;
+ case VHOST_GET_VRING_USED_BASE:
+ s.index = idx;
+ s.num = vq->last_used_idx;
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ s.num |= vq->last_used_wrap_counter << 31;
if (copy_to_user(argp, &s, sizeof s))
r = -EFAULT;
break;
@@ -1734,6 +1802,9 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
vhost_init_is_le(vq);
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return 0;
+
r = vhost_update_used_flags(vq);
if (r)
goto err;
@@ -1807,7 +1878,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
/* Each buffer in the virtqueues is actually a chain of descriptors. This
* function returns the next descriptor in the chain,
* or -1U if we're at the end. */
-static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
+static unsigned next_desc_split(struct vhost_virtqueue *vq,
+ struct vring_desc *desc)
{
unsigned int next;
@@ -1820,11 +1892,17 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
return next;
}
-static int get_indirect(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num,
- struct vring_desc *indirect)
+static unsigned next_desc_packed(struct vhost_virtqueue *vq,
+ struct vring_desc_packed *desc)
+{
+ return desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT);
+}
+
+static int get_indirect_split(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ struct vring_desc *indirect)
{
struct vring_desc desc;
unsigned int i = 0, count, found = 0;
@@ -1914,23 +1992,300 @@ static int get_indirect(struct vhost_virtqueue *vq,
}
*out_num += ret;
}
- } while ((i = next_desc(vq, &desc)) != -1);
+ } while ((i = next_desc_split(vq, &desc)) != -1);
return 0;
}
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct vhost_used_elem *used,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+static int get_indirect_packed(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ struct vring_desc_packed *indirect)
+{
+ struct vring_desc_packed desc;
+ unsigned int i = 0, count, found = 0;
+ u32 len = vhost32_to_cpu(vq, indirect->len);
+ struct iov_iter from;
+ int ret, access;
+
+ /* Sanity check */
+ if (unlikely(len % sizeof(desc))) {
+ vq_err(vq, "Invalid length in indirect descriptor: len 0x%llx not multiple of 0x%zx\n",
+ (unsigned long long)len,
+ sizeof(desc));
+ return -EINVAL;
+ }
+
+ ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr),
+ len, vq->indirect,
+ UIO_MAXIOV, VHOST_ACCESS_RO);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d in indirect.\n",
+ ret);
+ return ret;
+ }
+ iov_iter_init(&from, READ, vq->indirect, ret, len);
+
+ /* We will use the result as an address to read from, so most
+ * architectures only need a compiler barrier here.
+ */
+ read_barrier_depends();
+
+ count = len / sizeof(desc);
+ /* Buffers are chained via a 16 bit next field, so
+ * we can have at most 2^16 of these.
+ */
+ if (unlikely(count > USHRT_MAX + 1)) {
+ vq_err(vq, "Indirect buffer length too big: %d\n",
+ indirect->len);
+ return -E2BIG;
+ }
+
+ do {
+ unsigned int iov_count = *in_num + *out_num;
+
+ if (unlikely(++found > count)) {
+ vq_err(vq, "Loop detected: last one at %u indirect size %u\n",
+ i, count);
+ return -EINVAL;
+ }
+ if (unlikely(!copy_from_iter_full(&desc, sizeof(desc),
+ &from))) {
+ vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
+ i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+ + i * sizeof(desc));
+ return -EINVAL;
+ }
+ if (unlikely(desc.flags &
+ cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
+ vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
+ i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+ + i * sizeof(desc));
+ return -EINVAL;
+ }
+
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+
+ ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+ vhost32_to_cpu(vq, desc.len),
+ iov + iov_count,
+ iov_size - iov_count, access);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d indirect idx %d\n",
+ ret, i);
+ return ret;
+ }
+ /* If this is an input descriptor, increment that count. */
+ if (access == VHOST_ACCESS_WO) {
+ *in_num += ret;
+ if (unlikely(log)) {
+ log[*log_num].addr =
+ vhost64_to_cpu(vq, desc.addr);
+ log[*log_num].len =
+ vhost32_to_cpu(vq, desc.len);
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors.
+ */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Indirect descriptor has out after in: idx %d\n",
+ i);
+ return -EINVAL;
+ }
+ *out_num += ret;
+ }
+ i++;
+ } while (next_desc_packed(vq, &desc));
+ return 0;
+}
+
+#define DESC_AVAIL (1 << VRING_DESC_F_AVAIL)
+#define DESC_USED (1 << VRING_DESC_F_USED)
+static bool desc_is_avail(struct vhost_virtqueue *vq, bool wrap_counter,
+ __virtio16 flags)
+{
+ bool avail = flags & cpu_to_vhost16(vq, DESC_AVAIL);
+
+ return avail == wrap_counter;
+}
+
+static __virtio16 get_desc_flags(struct vhost_virtqueue *vq,
+ bool wrap_counter, bool write)
+{
+ __virtio16 flags = 0;
+
+ if (wrap_counter) {
+ flags |= cpu_to_vhost16(vq, DESC_AVAIL);
+ flags |= cpu_to_vhost16(vq, DESC_USED);
+ } else {
+ flags &= ~cpu_to_vhost16(vq, DESC_AVAIL);
+ flags &= ~cpu_to_vhost16(vq, DESC_USED);
+ }
+
+ if (write)
+ flags |= cpu_to_vhost16(vq, VRING_DESC_F_WRITE);
+
+ return flags;
+}
+
+static bool vhost_vring_packed_need_event(struct vhost_virtqueue *vq,
+ bool wrap, __u16 off_wrap, __u16 new,
+ __u16 old)
+{
+ int off = off_wrap & ~(1 << 15);
+
+ if (wrap != off_wrap >> 15)
+ off -= vq->num;
+
+ return vring_need_event(off, new, old);
+}
+
+static int vhost_get_vq_desc_packed(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log,
+ unsigned int *log_num)
+{
+ struct vring_desc_packed desc;
+ int ret, access, i;
+ u16 last_avail_idx = vq->last_avail_idx;
+ u16 off_wrap = vq->avail_idx | (vq->avail_wrap_counter << 15);
+
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+ if (unlikely(log))
+ *log_num = 0;
+
+ used->count = 0;
+
+ do {
+ struct vring_desc_packed *d = vq->desc_packed +
+ vq->last_avail_idx;
+ unsigned int iov_count = *in_num + *out_num;
+
+ ret = vhost_get_user(vq, desc.flags, &d->flags,
+ VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+ vq->last_avail_idx, &d->flags);
+ return -EFAULT;
+ }
+
+ if (!desc_is_avail(vq, vq->last_avail_wrap_counter,
+ desc.flags)) {
+ /* If there's nothing new since last we looked, return
+ * invalid.
+ */
+ if (!used->count)
+ return -ENOSPC;
+ vq_err(vq, "Unexpected unavail descriptor: idx %d\n",
+ vq->last_avail_idx);
+ return -EFAULT;
+ }
+
+ /* Read desc content after we're sure it was available. */
+ smp_rmb();
+
+ ret = vhost_copy_from_user(vq, &desc, d, sizeof(desc));
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+ vq->last_avail_idx, d);
+ return -EFAULT;
+ }
+
+ used->elem.id = desc.id;
+
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
+ ret = get_indirect_packed(vq, iov, iov_size,
+ out_num, in_num, log,
+ log_num, &desc);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Failure detected in indirect descriptor at idx %d\n",
+ i);
+ return ret;
+ }
+ goto next;
+ }
+
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+ ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+ vhost32_to_cpu(vq, desc.len),
+ iov + iov_count, iov_size - iov_count,
+ access);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d idx %d\n",
+ ret, i);
+ return ret;
+ }
+
+ if (access == VHOST_ACCESS_WO) {
+ /* If this is an input descriptor,
+ * increment that count.
+ */
+ *in_num += ret;
+ if (unlikely(log)) {
+ log[*log_num].addr =
+ vhost64_to_cpu(vq, desc.addr);
+ log[*log_num].len =
+ vhost32_to_cpu(vq, desc.len);
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors.
+ */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Desc out after in: idx %d\n", i);
+ return -EINVAL;
+ }
+ *out_num += ret;
+ }
+
+next:
+ if (unlikely(++used->count > vq->num)) {
+ vq_err(vq, "Loop detected: last one at %u vq size %u head %u\n",
+ i, vq->num, used->elem.id);
+ return -EINVAL;
+ }
+ if (++vq->last_avail_idx >= vq->num) {
+ vq->last_avail_idx = 0;
+ vq->last_avail_wrap_counter ^= 1;
+ }
+ /* If this descriptor says it doesn't chain, we're done. */
+ } while (next_desc_packed(vq, &desc));
+
+ /* Packed ring does not have avail idx which means we need to
+ * track it by our own. The check here is to make sure it
+ * grows monotonically.
+ */
+ if (vhost_vring_packed_need_event(vq, vq->last_avail_wrap_counter,
+ off_wrap, vq->last_avail_idx,
+ last_avail_idx)) {
+ vq->avail_idx = vq->last_avail_idx;
+ vq->avail_wrap_counter = vq->last_avail_wrap_counter;
+ }
+
+ return 0;
+}
+
+static int vhost_get_vq_desc_split(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
{
struct vring_desc desc;
unsigned int i, head, found = 0;
@@ -2015,9 +2370,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return -EFAULT;
}
if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
- ret = get_indirect(vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
+ ret = get_indirect_split(vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, &desc);
if (unlikely(ret < 0)) {
if (ret != -EAGAIN)
vq_err(vq, "Failure detected "
@@ -2059,7 +2414,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
*out_num += ret;
}
- } while ((i = next_desc(vq, &desc)) != -1);
+ } while ((i = next_desc_split(vq, &desc)) != -1);
/* On success, increment avail index. */
vq->last_avail_idx++;
@@ -2069,6 +2424,31 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
return 0;
}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_get_vq_desc_packed(vq, used, iov, iov_size,
+ out_num, in_num,
+ log, log_num);
+ else
+ return vhost_get_vq_desc_split(vq, used, iov, iov_size,
+ out_num, in_num,
+ log, log_num);
+}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
void vhost_set_used_len(struct vhost_virtqueue *vq,
@@ -2155,15 +2535,30 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
*count = headcount;
return 0;
err:
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_vq_desc(vq, heads, headcount);
return r;
}
EXPORT_SYMBOL_GPL(vhost_get_bufs);
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ int headcount)
{
- vq->last_avail_idx -= n;
+ int i;
+
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
+ for (i = 0; i < headcount; i++) {
+ vq->last_avail_idx -= heads[i].count;
+ if (vq->last_avail_idx >= vq->num) {
+ vq->last_avail_wrap_counter ^= 1;
+ vq->last_avail_idx += vq->num;
+ }
+ }
+ } else {
+ vq->last_avail_idx -= headcount;
+ }
+
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
@@ -2219,10 +2614,102 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
return 0;
}
+static int vhost_add_used_packed(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ int idx, bool wrap_counter)
+{
+ struct vring_desc_packed __user *desc = vq->desc_packed + idx;
+ int ret;
+
+ ret = vhost_put_user(vq, used->elem.id, &desc->id, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to update id: idx %d addr %p\n",
+ vq->last_used_idx, desc);
+ return -EFAULT;
+ }
+ ret = vhost_put_user(vq, used->elem.len, &desc->len, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to update len: idx %d addr %p\n",
+ vq->last_used_idx, desc);
+ return -EFAULT;
+ }
+
+ if (idx == vq->last_used_idx) {
+ /* Make sure descriptor id and len is written before
+ * flags for the first used buffer.
+ */
+ smp_wmb();
+ }
+
+ ret = vhost_put_user(vq,
+ get_desc_flags(vq, wrap_counter, used->elem.len),
+ &desc->flags, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to update flags: idx %d addr %p\n",
+ vq->last_used_idx, desc);
+ return -EFAULT;
+ }
+
+ if (unlikely(vq->log_used)) {
+ /* Make sure desc is written before update log. */
+ smp_wmb();
+ log_write(vq->log_base, vq->log_addr +
+ vq->last_used_idx * sizeof(*desc),
+ sizeof(*desc));
+ if (vq->log_ctx)
+ eventfd_signal(vq->log_ctx, 1);
+ }
+
+ return 0;
+}
+
+static int vhost_add_used_n_packed(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ unsigned int count)
+{
+ u16 last_used_idx = vq->last_used_idx + heads[0].count;
+ u16 wrap_counter = vq->last_used_wrap_counter;
+ int i, ret;
+
+ /* Update used elems other than first to save unnecessary
+ * memory barriers.
+ */
+ for (i = 1; i < count; i++) {
+ if (last_used_idx >= vq->num) {
+ last_used_idx -= vq->num;
+ wrap_counter ^= 1;
+ }
+
+ ret = vhost_add_used_packed(vq, &heads[i], last_used_idx,
+ wrap_counter);
+ if (unlikely(ret))
+ return ret;
+
+ last_used_idx += heads[i].count;
+ }
+
+ ret = vhost_add_used_packed(vq, &heads[0], vq->last_used_idx,
+ vq->last_used_wrap_counter);
+ if (unlikely(ret))
+ return ret;
+
+ if (last_used_idx >= vq->num) {
+ last_used_idx -= vq->num;
+ wrap_counter ^= 1;
+ }
+
+ vq->last_used_idx = last_used_idx;
+ vq->last_used_wrap_counter = wrap_counter;
+
+ return 0;
+}
+
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
- unsigned count)
+static int vhost_add_used_n_split(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ unsigned count)
+
{
int start, n, r;
@@ -2254,6 +2741,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
}
return r;
}
+
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using eventfd.
+ */
+int vhost_add_used_n(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ unsigned int count)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_add_used_n_packed(vq, heads, count);
+ else
+ return vhost_add_used_n_split(vq, heads, count);
+}
EXPORT_SYMBOL_GPL(vhost_add_used_n);
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
@@ -2261,6 +2761,11 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
__u16 old, new;
__virtio16 event;
bool v;
+
+ /* TODO: check driver area */
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return true;
+
/* Flush out used index updates. This is paired
* with the barrier that the Guest executes when enabling
* interrupts. */
@@ -2323,7 +2828,8 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
/* return true if we're sure that avaiable ring is empty */
-bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_vq_avail_empty_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
__virtio16 avail_idx;
int r;
@@ -2338,10 +2844,59 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
return vq->avail_idx == vq->last_avail_idx;
}
+
+static bool vhost_vq_avail_empty_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ struct vring_desc_packed *d = vq->desc_packed + vq->avail_idx;
+ __virtio16 flags;
+ int ret;
+
+ ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+ vq->last_avail_idx, d);
+ return -EFAULT;
+ }
+
+ return !desc_is_avail(vq, vq->avail_wrap_counter, flags);
+}
+
+bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_vq_avail_empty_packed(dev, vq);
+ else
+ return vhost_vq_avail_empty_split(dev, vq);
+}
EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
-/* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_enable_notify_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ struct vring_desc_packed *d = vq->desc_packed + vq->avail_idx;
+ __virtio16 flags;
+ int ret;
+
+ /* TODO: enable notification through device area */
+
+ /* They could have slipped one in as we were doing that: make
+ * sure it's written, then check again.
+ */
+ smp_mb();
+
+ ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+ vq->last_avail_idx, &d->flags);
+ return -EFAULT;
+ }
+
+ return desc_is_avail(vq, vq->avail_wrap_counter, flags);
+}
+
+static bool vhost_enable_notify_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
__virtio16 avail_idx;
int r;
@@ -2376,10 +2931,25 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
}
+
+/* OK, now we need to know about added descriptors. */
+bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_enable_notify_packed(dev, vq);
+ else
+ return vhost_enable_notify_split(dev, vq);
+}
EXPORT_SYMBOL_GPL(vhost_enable_notify);
-/* We don't need to be notified again. */
-void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static void vhost_disable_notify_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ /* TODO: disable notification through device area */
+}
+
+static void vhost_disable_notify_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
int r;
@@ -2393,6 +2963,15 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
&vq->used->flags, r);
}
}
+
+/* We don't need to be notified again. */
+void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_disable_notify_packed(dev, vq);
+ else
+ return vhost_disable_notify_split(dev, vq);
+}
EXPORT_SYMBOL_GPL(vhost_disable_notify);
/* Create a new message. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 604821b..db09958 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -36,6 +36,7 @@ struct vhost_poll {
struct vhost_used_elem {
struct vring_used_elem elem;
+ int count;
};
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
@@ -91,7 +92,10 @@ struct vhost_virtqueue {
/* The actual ring of buffers. */
struct mutex mutex;
unsigned int num;
- struct vring_desc __user *desc;
+ union {
+ struct vring_desc __user *desc;
+ struct vring_desc_packed __user *desc_packed;
+ };
struct vring_avail __user *avail;
struct vring_used __user *used;
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
@@ -148,6 +152,9 @@ struct vhost_virtqueue {
bool user_be;
#endif
u32 busyloop_timeout;
+ bool last_used_wrap_counter;
+ bool avail_wrap_counter;
+ bool last_avail_wrap_counter;
};
struct vhost_msg_node {
@@ -203,7 +210,9 @@ void vhost_set_used_len(struct vhost_virtqueue *vq,
int len);
int vhost_get_used_len(struct vhost_virtqueue *vq,
struct vhost_used_elem *used);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *elem,
+ int n);
int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *vq,
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c51f8e5..839ae7e 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -160,6 +160,13 @@ struct vhost_memory {
#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \
struct vhost_vring_state)
+/* Base value where queue looks for used descriptors */
+#define VHOST_SET_VRING_USED_BASE _IOW(VHOST_VIRTIO, 0x25, \
+ struct vhost_vring_state)
+/* Get accessor: reads index, writes value in num */
+#define VHOST_GET_VRING_USED_BASE _IOWR(VHOST_VIRTIO, 0x26, \
+ struct vhost_vring_state)
+
/* VHOST_NET specific defines */
/* Attach virtio net ring to a raw socket, or tap device.
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 6/8] virtio: introduce packed ring defines
From: Jason Wang @ 2018-07-03 5:38 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/uapi/linux/virtio_config.h | 2 ++
include/uapi/linux/virtio_ring.h | 32 ++++++++++++++++++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 449132c..947f6a3 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -75,6 +75,8 @@
*/
#define VIRTIO_F_IOMMU_PLATFORM 33
+#define VIRTIO_F_RING_PACKED 34
+
/*
* Does the device support Single Root I/O Virtualization?
*/
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..71c7a46 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
#define VRING_DESC_F_WRITE 2
/* This means the buffer contains a list of buffer descriptors. */
#define VRING_DESC_F_INDIRECT 4
+#define VRING_DESC_F_AVAIL 7
+#define VRING_DESC_F_USED 15
/* The Host uses this in used->flags to advise the Guest: don't kick me when
* you add a buffer. It's unreliable, so it's simply an optimization. Guest
@@ -62,6 +64,36 @@
* at the end of the used ring. Guest should ignore the used->flags field. */
#define VIRTIO_RING_F_EVENT_IDX 29
+struct vring_desc_packed {
+ /* Buffer Address. */
+ __virtio64 addr;
+ /* Buffer Length. */
+ __virtio32 len;
+ /* Buffer ID. */
+ __virtio16 id;
+ /* The flags depending on descriptor type. */
+ __virtio16 flags;
+};
+
+/* Enable events */
+#define RING_EVENT_FLAGS_ENABLE 0x0
+/* Disable events */
+#define RING_EVENT_FLAGS_DISABLE 0x1
+/*
+ * Enable events for a specific descriptor
+ * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
+ * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
+ */
+#define RING_EVENT_FLAGS_DESC 0x2
+/* The value 0x3 is reserved */
+
+struct vring_packed_desc_event {
+ /* Descriptor Ring Change Event Offset and Wrap Counter */
+ __virtio16 off_wrap;
+ /* Descriptor Ring Change Event Flags */
+ __virtio16 flags;
+};
+
/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc {
/* Address (guest-physical). */
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 5/8] vhost: vhost_put_user() can accept metadata type
From: Jason Wang @ 2018-07-03 5:38 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
We assumes used ring update is the only user for vhost_put_user() in
the past. This may not be the case for the incoming packed ring which
may update the descriptor ring for used. So introduce a new type
parameter.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index af15bec..060a431 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -814,7 +814,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
return __vhost_get_user_slow(vq, addr, size, type);
}
-#define vhost_put_user(vq, x, ptr) \
+#define vhost_put_user(vq, x, ptr, type) \
({ \
int ret = -EFAULT; \
if (!vq->iotlb) { \
@@ -822,7 +822,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
} else { \
__typeof__(ptr) to = \
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
- sizeof(*ptr), VHOST_ADDR_USED); \
+ sizeof(*ptr), type); \
if (to != NULL) \
ret = __put_user(x, to); \
else \
@@ -1687,7 +1687,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
void __user *used;
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
- &vq->used->flags) < 0)
+ &vq->used->flags, VHOST_ADDR_USED) < 0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1706,7 +1706,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
- vhost_avail_event(vq)))
+ vhost_avail_event(vq), VHOST_ADDR_USED))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -2189,12 +2189,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
used = vq->used->ring + start;
for (i = 0; i < count; i++) {
if (unlikely(vhost_put_user(vq, heads[i].elem.id,
- &used[i].id))) {
+ &used[i].id, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used id");
return -EFAULT;
}
if (unlikely(vhost_put_user(vq, heads[i].elem.len,
- &used[i].len))) {
+ &used[i].len, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used len");
return -EFAULT;
}
@@ -2240,7 +2240,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
/* Make sure buffer is written before we update index. */
smp_wmb();
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
- &vq->used->idx)) {
+ &vq->used->idx, VHOST_ADDR_USED)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 4/8] vhost_net: do not explicitly manipulate vhost_used_elem
From: Jason Wang @ 2018-07-03 5:38 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
Two helpers of setting/getting used len were introduced to avoid
explicitly manipulating vhost_used_elem in zerocopy code. This will be
used to hide used_elem internals and simplify packed ring
implementation.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 11 +++++------
drivers/vhost/vhost.c | 12 ++++++++++--
drivers/vhost/vhost.h | 5 +++++
3 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d109649..eada5a6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -348,9 +348,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
int j = 0;
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
- if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
+ if (vhost_get_used_len(vq, &vq->heads[i]) ==
+ VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
- if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+ if (VHOST_DMA_IS_DONE(vhost_get_used_len(vq, &vq->heads[i]))) {
vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
++j;
} else
@@ -549,10 +550,8 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
- vq->heads[nvq->upend_idx].elem.id =
- cpu_to_vhost32(vq, used.elem.id);
- vq->heads[nvq->upend_idx].elem.len =
- VHOST_DMA_IN_PROGRESS;
+ vhost_set_used_len(vq, &used, VHOST_DMA_IN_PROGRESS);
+ vq->heads[nvq->upend_idx] = used;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 641f4c6..af15bec 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2071,11 +2071,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
- struct vhost_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used, int len)
{
used->elem.len = cpu_to_vhost32(vq, len);
}
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used)
+{
+ return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8dea44b..604821b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,11 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
unsigned *log_num,
unsigned int quota,
s16 *count);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ int len);
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
int vhost_vq_init_access(struct vhost_virtqueue *);
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 3/8] vhost: do not use vring_used_elem
From: Jason Wang @ 2018-07-03 5:37 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
Instead of depending on the exported vring_used_elem, this patch
switches to use a new internal structure vhost_used_elem which embed
vring_used_elem in itself. This could be used to let vhost to record
extra metadata for the incoming packed ring layout.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 19 +++++++-------
drivers/vhost/scsi.c | 10 ++++----
drivers/vhost/vhost.c | 68 ++++++++++++---------------------------------------
drivers/vhost/vhost.h | 18 ++++++++------
drivers/vhost/vsock.c | 6 ++---
5 files changed, 45 insertions(+), 76 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 449f793..d109649 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -348,10 +348,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
int j = 0;
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
- if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+ if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
- if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
- vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+ if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+ vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
++j;
} else
break;
@@ -374,7 +374,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
rcu_read_lock_bh();
/* set len to mark this desc buffers done DMA */
- vq->heads[ubuf->desc].len = success ?
+ vq->heads[ubuf->desc].elem.len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
@@ -433,7 +433,7 @@ static int vhost_net_enable_vq(struct vhost_net *n,
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
- struct vring_used_elem *used_elem,
+ struct vhost_used_elem *used_elem,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
{
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
- struct vring_used_elem used;
+ struct vhost_used_elem used;
bool zcopy, zcopy_used;
int sent_pkts = 0;
@@ -549,9 +549,10 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
- vq->heads[nvq->upend_idx].id =
- cpu_to_vhost32(vq, used.id);
- vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+ vq->heads[nvq->upend_idx].elem.id =
+ cpu_to_vhost32(vq, used.elem.id);
+ vq->heads[nvq->upend_idx].elem.len =
+ VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 013464c..149c38c 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
struct vhost_scsi_cmd {
/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
- struct vring_used_elem tvc_vq_used;
+ struct vhost_used_elem tvc_vq_used;
/* virtio-scsi initiator task attribute */
int tvc_task_attr;
/* virtio-scsi response incoming iovecs */
@@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
- struct vring_used_elem used;
+ struct vhost_used_elem used;
unsigned out, in;
int ret;
@@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
static void
vhost_scsi_send_bad_target(struct vhost_scsi *vs,
struct vhost_virtqueue *vq,
- struct vring_used_elem *used, unsigned out)
+ struct vhost_used_elem *used, unsigned out)
{
struct virtio_scsi_cmd_resp __user *resp;
struct virtio_scsi_cmd_resp rsp;
@@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
struct virtio_scsi_cmd_req v_req;
struct virtio_scsi_cmd_req_pi v_req_pi;
struct vhost_scsi_cmd *cmd;
- struct vring_used_elem used;
+ struct vhost_used_elem used;
struct iov_iter out_iter, in_iter, prot_iter, data_iter;
u64 tag;
u32 exp_data_len, data_direction;
@@ -837,7 +837,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
ARRAY_SIZE(vq->iov), &out, &in,
NULL, NULL);
pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
- used.id, out, in);
+ used.elem.id, out, in);
/* Nothing new? Wait for eventfd to tell us they refilled. */
if (ret == -ENOSPC) {
if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9572c4f..641f4c6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -731,41 +731,6 @@ static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
struct iovec iov[], int iov_size, int access);
-static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
- const void *from, unsigned size)
-{
- int ret;
-
- if (!vq->iotlb)
- return __copy_to_user(to, from, size);
- else {
- /* This function should be called after iotlb
- * prefetch, which means we're sure that all vq
- * could be access through iotlb. So -EAGAIN should
- * not happen in this case.
- */
- struct iov_iter t;
- void __user *uaddr = vhost_vq_meta_fetch(vq,
- (u64)(uintptr_t)to, size,
- VHOST_ADDR_USED);
-
- if (uaddr)
- return __copy_to_user(uaddr, from, size);
-
- ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
- ARRAY_SIZE(vq->iotlb_iov),
- VHOST_ACCESS_WO);
- if (ret < 0)
- goto out;
- iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
- ret = copy_to_iter(from, size, &t);
- if (ret == size)
- ret = 0;
- }
-out:
- return ret;
-}
-
static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
void __user *from, unsigned size)
{
@@ -1962,7 +1927,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
* never a valid descriptor number) if none was found. A negative code is
* returned on error. */
int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct vring_used_elem *used,
+ struct vhost_used_elem *used,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
@@ -2013,7 +1978,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return -EFAULT;
}
- used->id = ring_head;
+ used->elem.id = ring_head;
head = vhost16_to_cpu(vq, ring_head);
/* If their number is silly, that's an error. */
@@ -2107,9 +2072,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
static void vhost_set_used_len(struct vhost_virtqueue *vq,
- struct vring_used_elem *used, int len)
+ struct vhost_used_elem *used, int len)
{
- used->len = cpu_to_vhost32(vq, len);
+ used->elem.len = cpu_to_vhost32(vq, len);
}
/* This is a multi-buffer version of vhost_get_desc, that works if
@@ -2123,7 +2088,7 @@ static void vhost_set_used_len(struct vhost_virtqueue *vq,
* returns number of buffer heads allocated, negative on error
*/
int vhost_get_bufs(struct vhost_virtqueue *vq,
- struct vring_used_elem *heads,
+ struct vhost_used_elem *heads,
int datalen,
unsigned *iovcount,
struct vhost_log *log,
@@ -2196,7 +2161,7 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+int vhost_add_used(struct vhost_virtqueue *vq, struct vhost_used_elem *used,
int len)
{
vhost_set_used_len(vq, used, len);
@@ -2205,27 +2170,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
EXPORT_SYMBOL_GPL(vhost_add_used);
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
- struct vring_used_elem *heads,
+ struct vhost_used_elem *heads,
unsigned count)
{
struct vring_used_elem __user *used;
u16 old, new;
- int start;
+ int start, i;
start = vq->last_used_idx & (vq->num - 1);
used = vq->used->ring + start;
- if (count == 1) {
- if (vhost_put_user(vq, heads[0].id, &used->id)) {
+ for (i = 0; i < count; i++) {
+ if (unlikely(vhost_put_user(vq, heads[i].elem.id,
+ &used[i].id))) {
vq_err(vq, "Failed to write used id");
return -EFAULT;
}
- if (vhost_put_user(vq, heads[0].len, &used->len)) {
+ if (unlikely(vhost_put_user(vq, heads[i].elem.len,
+ &used[i].len))) {
vq_err(vq, "Failed to write used len");
return -EFAULT;
}
- } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
- vq_err(vq, "Failed to write used");
- return -EFAULT;
}
if (unlikely(vq->log_used)) {
/* Make sure data is seen before log. */
@@ -2249,7 +2213,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
unsigned count)
{
int start, n, r;
@@ -2333,7 +2297,7 @@ EXPORT_SYMBOL_GPL(vhost_signal);
/* And here's the combo meal deal. Supersize me! */
void vhost_add_used_and_signal(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
- struct vring_used_elem *used, int len)
+ struct vhost_used_elem *used, int len)
{
vhost_add_used(vq, used, len);
vhost_signal(dev, vq);
@@ -2343,7 +2307,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
/* multi-buffer version of vhost_add_used_and_signal */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
- struct vring_used_elem *heads, unsigned count)
+ struct vhost_used_elem *heads, unsigned count)
{
vhost_add_used_n(vq, heads, count);
vhost_signal(dev, vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a7cc7e7..8dea44b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -34,6 +34,10 @@ struct vhost_poll {
struct vhost_dev *dev;
};
+struct vhost_used_elem {
+ struct vring_used_elem elem;
+};
+
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
bool vhost_has_work(struct vhost_dev *dev);
@@ -126,7 +130,7 @@ struct vhost_virtqueue {
struct iovec iov[UIO_MAXIOV];
struct iovec iotlb_iov[64];
struct iovec *indirect;
- struct vring_used_elem *heads;
+ struct vhost_used_elem *heads;
/* Protected by virtqueue mutex. */
struct vhost_umem *umem;
struct vhost_umem *iotlb;
@@ -182,12 +186,12 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
int vhost_get_vq_desc(struct vhost_virtqueue *,
- struct vring_used_elem *used_elem,
+ struct vhost_used_elem *used_elem,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
int vhost_get_bufs(struct vhost_virtqueue *vq,
- struct vring_used_elem *heads,
+ struct vhost_used_elem *heads,
int datalen,
unsigned *iovcount,
struct vhost_log *log,
@@ -198,13 +202,13 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *vq,
- struct vring_used_elem *elem, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+ struct vhost_used_elem *elem, int len);
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
unsigned count);
void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
- struct vring_used_elem *, int len);
+ struct vhost_used_elem *, int len);
void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
- struct vring_used_elem *heads, unsigned count);
+ struct vhost_used_elem *heads, unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 59a01cd..695694f 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
for (;;) {
struct virtio_vsock_pkt *pkt;
- struct vring_used_elem used;
+ struct vhost_used_elem used;
struct iov_iter iov_iter;
unsigned out, in;
size_t nbytes;
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break;
}
- len = vhost32_to_cpu(vq, used.len);
+ len = vhost32_to_cpu(vq, used.elem.len);
iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -346,7 +346,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
dev);
struct virtio_vsock_pkt *pkt;
- struct vring_used_elem used;
+ struct vhost_used_elem used;
int ret;
unsigned int out, in;
bool added = false;
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 2/8] vhost: hide used ring layout from device
From: Jason Wang @ 2018-07-03 5:37 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
We used to return descriptor head by vhost_get_vq_desc() to device and
pass it back to vhost_add_used() and its friends. This exposes the
internal used ring layout to device which makes it hard to be extended for
e.g packed ring layout.
So this patch tries to hide the used ring layout by
- letting vhost_get_vq_desc() return pointer to struct vring_used_elem
- accepting pointer to struct vring_used_elem in vhost_add_used() and
vhost_add_used_and_signal()
This could help to hide used ring layout and make it easier to
implement packed ring on top.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 46 +++++++++++++++++++++-----------------
drivers/vhost/scsi.c | 62 +++++++++++++++++++++++++++------------------------
drivers/vhost/vhost.c | 52 +++++++++++++++++++++---------------------
drivers/vhost/vhost.h | 9 +++++---
drivers/vhost/vsock.c | 42 +++++++++++++++++-----------------
5 files changed, 112 insertions(+), 99 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 712b134..449f793 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -433,22 +433,24 @@ static int vhost_net_enable_vq(struct vhost_net *n,
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
+ struct vring_used_elem *used_elem,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
{
unsigned long uninitialized_var(endtime);
- int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
- if (r == vq->num && vq->busyloop_timeout) {
+ if (r == -ENOSPC && vq->busyloop_timeout) {
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(vq->dev, endtime) &&
vhost_vq_avail_empty(vq->dev, vq))
cpu_relax();
preempt_enable();
- r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- out_num, in_num, NULL, NULL);
+ r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+ ARRAY_SIZE(vq->iov), out_num, in_num,
+ NULL, NULL);
}
return r;
@@ -470,7 +472,6 @@ static void handle_tx(struct vhost_net *net)
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in;
- int head;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -483,6 +484,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ struct vring_used_elem used;
bool zcopy, zcopy_used;
int sent_pkts = 0;
@@ -506,20 +508,20 @@ static void handle_tx(struct vhost_net *net)
vhost_zerocopy_signal_used(net, vq);
- head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in);
- /* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
- break;
+ err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in);
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (err == -ENOSPC) {
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
continue;
}
break;
}
+ /* On error, stop handling until the next kick. */
+ if (unlikely(err < 0))
+ break;
if (in) {
vq_err(vq, "Unexpected descriptor format for TX: "
"out %d, int %d\n", out, in);
@@ -547,7 +549,8 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
- vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+ vq->heads[nvq->upend_idx].id =
+ cpu_to_vhost32(vq, used.id);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
@@ -588,7 +591,7 @@ static void handle_tx(struct vhost_net *net)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
if (!zcopy_used)
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ vhost_add_used_and_signal(&net->dev, vq, &used, 0);
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
@@ -735,14 +738,12 @@ static void handle_rx(struct vhost_net *net)
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
- headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
- vhost_len, &in, vq_log, &log,
- likely(mergeable) ? UIO_MAXIOV : 1);
- /* On error, stop handling until the next kick. */
- if (unlikely(headcount < 0))
- goto out;
+ err = vhost_get_bufs(vq, vq->heads + nvq->done_idx,
+ vhost_len, &in, vq_log, &log,
+ likely(mergeable) ? UIO_MAXIOV : 1,
+ &headcount);
/* OK, now we need to know about added descriptors. */
- if (!headcount) {
+ if (err == -ENOSPC) {
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
/* They have slipped one in as we were
* doing that: check again. */
@@ -753,6 +754,9 @@ static void handle_rx(struct vhost_net *net)
* they refilled. */
goto out;
}
+ /* On error, stop handling until the next kick. */
+ if (unlikely(err < 0))
+ goto out;
if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 17fcd3b..013464c 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
struct vhost_scsi_cmd {
/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
- int tvc_vq_desc;
+ struct vring_used_elem tvc_vq_used;
/* virtio-scsi initiator task attribute */
int tvc_task_attr;
/* virtio-scsi response incoming iovecs */
@@ -441,8 +441,9 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
+ struct vring_used_elem used;
unsigned out, in;
- int head, ret;
+ int ret;
if (!vq->private_data) {
vs->vs_events_missed = true;
@@ -451,16 +452,16 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
again:
vhost_disable_notify(&vs->dev, vq);
- head = vhost_get_vq_desc(vq, vq->iov,
+ ret = vhost_get_vq_desc(vq, &used, vq->iov,
ARRAY_SIZE(vq->iov), &out, &in,
NULL, NULL);
- if (head < 0) {
+ if (ret == -ENOSPC) {
+ if (vhost_enable_notify(&vs->dev, vq))
+ goto again;
vs->vs_events_missed = true;
return;
}
- if (head == vq->num) {
- if (vhost_enable_notify(&vs->dev, vq))
- goto again;
+ if (ret < 0) {
vs->vs_events_missed = true;
return;
}
@@ -480,7 +481,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
eventp = vq->iov[out].iov_base;
ret = __copy_to_user(eventp, event, sizeof(*event));
if (!ret)
- vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+ vhost_add_used_and_signal(&vs->dev, vq, &used, 0);
else
vq_err(vq, "Faulted on vhost_scsi_send_event\n");
}
@@ -541,7 +542,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
if (likely(ret == sizeof(v_rsp))) {
struct vhost_scsi_virtqueue *q;
- vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+ vhost_add_used(cmd->tvc_vq, &cmd->tvc_vq_used, 0);
q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
vq = q - vs->vqs;
__set_bit(vq, signal);
@@ -784,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
static void
vhost_scsi_send_bad_target(struct vhost_scsi *vs,
struct vhost_virtqueue *vq,
- int head, unsigned out)
+ struct vring_used_elem *used, unsigned out)
{
struct virtio_scsi_cmd_resp __user *resp;
struct virtio_scsi_cmd_resp rsp;
@@ -795,7 +796,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
resp = vq->iov[out].iov_base;
ret = __copy_to_user(resp, &rsp, sizeof(rsp));
if (!ret)
- vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+ vhost_add_used_and_signal(&vs->dev, vq, used, 0);
else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
}
@@ -807,11 +808,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
struct virtio_scsi_cmd_req v_req;
struct virtio_scsi_cmd_req_pi v_req_pi;
struct vhost_scsi_cmd *cmd;
+ struct vring_used_elem used;
struct iov_iter out_iter, in_iter, prot_iter, data_iter;
u64 tag;
u32 exp_data_len, data_direction;
unsigned int out = 0, in = 0;
- int head, ret, prot_bytes;
+ int ret, prot_bytes;
size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
size_t out_size, in_size;
u16 lun;
@@ -831,22 +833,22 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
vhost_disable_notify(&vs->dev, vq);
for (;;) {
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov), &out, &in,
- NULL, NULL);
+ ret = vhost_get_vq_desc(vq, &used, vq->iov,
+ ARRAY_SIZE(vq->iov), &out, &in,
+ NULL, NULL);
pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
- head, out, in);
- /* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
- break;
+ used.id, out, in);
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (ret == -ENOSPC) {
if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
vhost_disable_notify(&vs->dev, vq);
continue;
}
break;
}
+ /* On error, stop handling until the next kick. */
+ if (unlikely(ret < 0))
+ break;
/*
* Check for a sane response buffer so we can report early
* errors back to the guest.
@@ -891,20 +893,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) {
vq_err(vq, "Faulted on copy_from_iter\n");
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq, &used, out);
continue;
}
/* virtio-scsi spec requires byte 0 of the lun to be 1 */
if (unlikely(*lunp != 1)) {
vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp);
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq, &used, out);
continue;
}
tpg = READ_ONCE(vs_tpg[*target]);
if (unlikely(!tpg)) {
/* Target does not exist, fail the request */
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq, &used, out);
continue;
}
/*
@@ -950,7 +952,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (data_direction != DMA_TO_DEVICE) {
vq_err(vq, "Received non zero pi_bytesout,"
" but wrong data_direction\n");
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq,
+ &used, out);
continue;
}
prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout);
@@ -958,7 +961,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (data_direction != DMA_FROM_DEVICE) {
vq_err(vq, "Received non zero pi_bytesin,"
" but wrong data_direction\n");
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq,
+ &used, out);
continue;
}
prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
@@ -996,7 +1000,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
vq_err(vq, "Received SCSI CDB with command_size: %d that"
" exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n",
scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE);
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq, &used, out);
continue;
}
cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr,
@@ -1005,7 +1009,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (IS_ERR(cmd)) {
vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
PTR_ERR(cmd));
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq, &used, out);
continue;
}
cmd->tvc_vhost = vs;
@@ -1025,7 +1029,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (unlikely(ret)) {
vq_err(vq, "Failed to map iov to sgl\n");
vhost_scsi_release_cmd(&cmd->tvc_se_cmd);
- vhost_scsi_send_bad_target(vs, vq, head, out);
+ vhost_scsi_send_bad_target(vs, vq, &used, out);
continue;
}
}
@@ -1034,7 +1038,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
* complete the virtio-scsi request in TCM callback context via
* vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
*/
- cmd->tvc_vq_desc = head;
+ cmd->tvc_vq_used = used;
/*
* Dispatch cmd descriptor for cmwq execution in process
* context provided by vhost_scsi_workqueue. This also ensures
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8814e5b..9572c4f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1962,6 +1962,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
* never a valid descriptor number) if none was found. A negative code is
* returned on error. */
int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+ struct vring_used_elem *used,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
@@ -1994,7 +1995,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
* invalid.
*/
if (vq->avail_idx == last_avail_idx)
- return vq->num;
+ return -ENOSPC;
/* Only get avail ring entries after they have been
* exposed by guest.
@@ -2012,6 +2013,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return -EFAULT;
}
+ used->id = ring_head;
head = vhost16_to_cpu(vq, ring_head);
/* If their number is silly, that's an error. */
@@ -2100,10 +2102,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
/* Assume notifications from guest are disabled at this point,
* if they aren't we would need to update avail_event index. */
BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
- return head;
+ return 0;
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
+static void vhost_set_used_len(struct vhost_virtqueue *vq,
+ struct vring_used_elem *used, int len)
+{
+ used->len = cpu_to_vhost32(vq, len);
+}
+
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
* @vq - the relevant virtqueue
@@ -2120,13 +2128,13 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
unsigned *iovcount,
struct vhost_log *log,
unsigned *log_num,
- unsigned int quota)
+ unsigned int quota,
+ s16 *count)
{
unsigned int out, in;
int seg = 0;
int headcount = 0;
- unsigned d;
- int r, nlogs = 0;
+ int r = 0, nlogs = 0;
/* len is always initialized before use since we are always called with
* datalen > 0.
*/
@@ -2137,17 +2145,12 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
r = -ENOBUFS;
goto err;
}
- r = vhost_get_vq_desc(vq, vq->iov + seg,
+ r = vhost_get_vq_desc(vq, &heads[headcount], vq->iov + seg,
ARRAY_SIZE(vq->iov) - seg, &out,
&in, log, log_num);
if (unlikely(r < 0))
goto err;
- d = r;
- if (d == vq->num) {
- r = 0;
- goto err;
- }
if (unlikely(out || in <= 0)) {
vq_err(vq, "unexpected descriptor format for RX: "
"out %d, in %d\n", out, in);
@@ -2158,24 +2161,26 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
- heads[headcount].id = cpu_to_vhost32(vq, d);
+
len = iov_length(vq->iov + seg, in);
- heads[headcount].len = cpu_to_vhost32(vq, len);
+ vhost_set_used_len(vq, &heads[headcount], len);
datalen -= len;
++headcount;
seg += in;
}
- heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+ vhost_set_used_len(vq, &heads[headcount - 1], len + datalen);
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
/* Detect overrun */
if (unlikely(datalen > 0)) {
- r = UIO_MAXIOV + 1;
+ headcount = UIO_MAXIOV + 1;
goto err;
}
- return headcount;
+
+ *count = headcount;
+ return 0;
err:
vhost_discard_vq_desc(vq, headcount);
return r;
@@ -2191,14 +2196,11 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+ int len)
{
- struct vring_used_elem heads = {
- cpu_to_vhost32(vq, head),
- cpu_to_vhost32(vq, len)
- };
-
- return vhost_add_used_n(vq, &heads, 1);
+ vhost_set_used_len(vq, used, len);
+ return vhost_add_used_n(vq, used, 1);
}
EXPORT_SYMBOL_GPL(vhost_add_used);
@@ -2331,9 +2333,9 @@ EXPORT_SYMBOL_GPL(vhost_signal);
/* And here's the combo meal deal. Supersize me! */
void vhost_add_used_and_signal(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
- unsigned int head, int len)
+ struct vring_used_elem *used, int len)
{
- vhost_add_used(vq, head, len);
+ vhost_add_used(vq, used, len);
vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 52edd242..a7cc7e7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -182,6 +182,7 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
int vhost_get_vq_desc(struct vhost_virtqueue *,
+ struct vring_used_elem *used_elem,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
@@ -191,15 +192,17 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
unsigned *iovcount,
struct vhost_log *log,
unsigned *log_num,
- unsigned int quota);
+ unsigned int quota,
+ s16 *count);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used(struct vhost_virtqueue *vq,
+ struct vring_used_elem *elem, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
unsigned count);
void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
- unsigned int id, int len);
+ struct vring_used_elem *, int len);
void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
struct vring_used_elem *heads, unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 34bc3ab..59a01cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,11 +98,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
for (;;) {
struct virtio_vsock_pkt *pkt;
+ struct vring_used_elem used;
struct iov_iter iov_iter;
unsigned out, in;
size_t nbytes;
size_t len;
- int head;
+ int ret;
spin_lock_bh(&vsock->send_pkt_list_lock);
if (list_empty(&vsock->send_pkt_list)) {
@@ -116,16 +117,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
list_del_init(&pkt->list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
- head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- &out, &in, NULL, NULL);
- if (head < 0) {
- spin_lock_bh(&vsock->send_pkt_list_lock);
- list_add(&pkt->list, &vsock->send_pkt_list);
- spin_unlock_bh(&vsock->send_pkt_list_lock);
- break;
- }
-
- if (head == vq->num) {
+ ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (ret == -ENOSPC) {
spin_lock_bh(&vsock->send_pkt_list_lock);
list_add(&pkt->list, &vsock->send_pkt_list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -139,6 +133,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
}
break;
}
+ if (ret < 0) {
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
if (out) {
virtio_transport_free_pkt(pkt);
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break;
}
- len = iov_length(&vq->iov[out], in);
+ len = vhost32_to_cpu(vq, used.len);
iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -163,7 +163,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break;
}
- vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+ vhost_add_used(vq, &used, sizeof(pkt->hdr) + pkt->len);
added = true;
if (pkt->reply) {
@@ -346,7 +346,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
dev);
struct virtio_vsock_pkt *pkt;
- int head;
+ struct vring_used_elem used;
+ int ret;
unsigned int out, in;
bool added = false;
@@ -367,18 +368,17 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
goto no_more_replies;
}
- head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- &out, &in, NULL, NULL);
- if (head < 0)
- break;
-
- if (head == vq->num) {
+ ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (ret == -ENOSPC) {
if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
vhost_disable_notify(&vsock->dev, vq);
continue;
}
break;
}
+ if (ret < 0)
+ break;
pkt = vhost_vsock_alloc_pkt(vq, out, in);
if (!pkt) {
@@ -397,7 +397,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
else
virtio_transport_free_pkt(pkt);
- vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+ vhost_add_used(vq, &used, sizeof(pkt->hdr) + len);
added = true;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-07-03 5:37 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
In-Reply-To: <1530596284-4101-1-git-send-email-jasowang@redhat.com>
Move get_rx_bufs() to vhost.c and rename it to
vhost_get_bufs(). This helps to hide vring internal layout from
specific device implementation. Packed ring implementation will
benefit from this.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 77 --------------------------------------------------
drivers/vhost/vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
drivers/vhost/vhost.h | 7 +++++
3 files changed, 85 insertions(+), 77 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 29756d8..712b134 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -685,83 +685,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
return len;
}
-/* This is a multi-buffer version of vhost_get_desc, that works if
- * vq has read descriptors only.
- * @vq - the relevant virtqueue
- * @datalen - data length we'll be reading
- * @iovcount - returned count of io vectors we fill
- * @log - vhost log
- * @log_num - log offset
- * @quota - headcount quota, 1 for big buffer
- * returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
- struct vring_used_elem *heads,
- int datalen,
- unsigned *iovcount,
- struct vhost_log *log,
- unsigned *log_num,
- unsigned int quota)
-{
- unsigned int out, in;
- int seg = 0;
- int headcount = 0;
- unsigned d;
- int r, nlogs = 0;
- /* len is always initialized before use since we are always called with
- * datalen > 0.
- */
- u32 uninitialized_var(len);
-
- while (datalen > 0 && headcount < quota) {
- if (unlikely(seg >= UIO_MAXIOV)) {
- r = -ENOBUFS;
- goto err;
- }
- r = vhost_get_vq_desc(vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
- if (unlikely(r < 0))
- goto err;
-
- d = r;
- if (d == vq->num) {
- r = 0;
- goto err;
- }
- if (unlikely(out || in <= 0)) {
- vq_err(vq, "unexpected descriptor format for RX: "
- "out %d, in %d\n", out, in);
- r = -EINVAL;
- goto err;
- }
- if (unlikely(log)) {
- nlogs += *log_num;
- log += *log_num;
- }
- heads[headcount].id = cpu_to_vhost32(vq, d);
- len = iov_length(vq->iov + seg, in);
- heads[headcount].len = cpu_to_vhost32(vq, len);
- datalen -= len;
- ++headcount;
- seg += in;
- }
- heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
- *iovcount = seg;
- if (unlikely(log))
- *log_num = nlogs;
-
- /* Detect overrun */
- if (unlikely(datalen > 0)) {
- r = UIO_MAXIOV + 1;
- goto err;
- }
- return headcount;
-err:
- vhost_discard_vq_desc(vq, headcount);
- return r;
-}
-
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1a..8814e5b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2104,6 +2104,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ * vq has read descriptors only.
+ * @vq - the relevant virtqueue
+ * @datalen - data length we'll be reading
+ * @iovcount - returned count of io vectors we fill
+ * @log - vhost log
+ * @log_num - log offset
+ * @quota - headcount quota, 1 for big buffer
+ * returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+ struct vring_used_elem *heads,
+ int datalen,
+ unsigned *iovcount,
+ struct vhost_log *log,
+ unsigned *log_num,
+ unsigned int quota)
+{
+ unsigned int out, in;
+ int seg = 0;
+ int headcount = 0;
+ unsigned d;
+ int r, nlogs = 0;
+ /* len is always initialized before use since we are always called with
+ * datalen > 0.
+ */
+ u32 uninitialized_var(len);
+
+ while (datalen > 0 && headcount < quota) {
+ if (unlikely(seg >= UIO_MAXIOV)) {
+ r = -ENOBUFS;
+ goto err;
+ }
+ r = vhost_get_vq_desc(vq, vq->iov + seg,
+ ARRAY_SIZE(vq->iov) - seg, &out,
+ &in, log, log_num);
+ if (unlikely(r < 0))
+ goto err;
+
+ d = r;
+ if (d == vq->num) {
+ r = 0;
+ goto err;
+ }
+ if (unlikely(out || in <= 0)) {
+ vq_err(vq, "unexpected descriptor format for RX: "
+ "out %d, in %d\n", out, in);
+ r = -EINVAL;
+ goto err;
+ }
+ if (unlikely(log)) {
+ nlogs += *log_num;
+ log += *log_num;
+ }
+ heads[headcount].id = cpu_to_vhost32(vq, d);
+ len = iov_length(vq->iov + seg, in);
+ heads[headcount].len = cpu_to_vhost32(vq, len);
+ datalen -= len;
+ ++headcount;
+ seg += in;
+ }
+ heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+ *iovcount = seg;
+ if (unlikely(log))
+ *log_num = nlogs;
+
+ /* Detect overrun */
+ if (unlikely(datalen > 0)) {
+ r = UIO_MAXIOV + 1;
+ goto err;
+ }
+ return headcount;
+err:
+ vhost_discard_vq_desc(vq, headcount);
+ return r;
+}
+EXPORT_SYMBOL_GPL(vhost_get_bufs);
+
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b9..52edd242 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,13 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+ struct vring_used_elem *heads,
+ int datalen,
+ unsigned *iovcount,
+ struct vhost_log *log,
+ unsigned *log_num,
+ unsigned int quota);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
int vhost_vq_init_access(struct vhost_virtqueue *);
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 0/8] Packed virtqueue for vhost
From: Jason Wang @ 2018-07-03 5:37 UTC (permalink / raw)
To: mst, jasowang, kvm, virtualization, netdev, linux-kernel
Cc: maxime.coquelin, wexu
Hi all:
This series implements packed virtqueues. The code were tested with
Tiwei's RFC V6 at https://lkml.org/lkml/2018/6/5/120.
Pktgen test for both RX and TX does not show obvious difference with
split virtqueues. The main bottleneck is the guest Linux driver, since
it can not stress vhost for a 100% CPU utilization. A full TCP
benchmark is ongoing. Will test virtio-net pmd as well when it was
ready.
This version were tested with:
- Zerocopy (Out of Order) support
- vIOMMU support
- mergeable buffer on/off
- busy polling on/off
Changes from RFC V5:
- save unnecessary barriers during vhost_add_used_packed_n()
- more compact math for event idx
- fix failure of SET_VRING_BASE when avail_wrap_counter is true
- fix not copy avail_wrap_counter during GET_VRING_BASE
- introduce SET_VRING_USED_BASE/GET_VRING_USED_BASE for syncing last_used_idx
- rename used_wrap_counter to last_used_wrap_counter
- rebase to net-next
Changes from RFC V4:
- fix signalled_used index recording
- track avail index correctly
- various minor fixes
Changes from RFC V3:
- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure
Changes from RFC V2:
- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_ve_avail_empty is descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool
Changes from RFC V1:
- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs
Jason Wang (8):
vhost: move get_rx_bufs to vhost.c
vhost: hide used ring layout from device
vhost: do not use vring_used_elem
vhost_net: do not explicitly manipulate vhost_used_elem
vhost: vhost_put_user() can accept metadata type
virtio: introduce packed ring defines
vhost: packed ring support
vhost: event suppression for packed ring
drivers/vhost/net.c | 144 ++----
drivers/vhost/scsi.c | 62 +--
drivers/vhost/vhost.c | 996 +++++++++++++++++++++++++++++++++----
drivers/vhost/vhost.h | 52 +-
drivers/vhost/vsock.c | 42 +-
include/uapi/linux/vhost.h | 7 +
include/uapi/linux/virtio_config.h | 2 +
include/uapi/linux/virtio_ring.h | 32 ++
8 files changed, 1070 insertions(+), 267 deletions(-)
--
2.7.4
^ permalink raw reply
* Re: [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Jason Wang @ 2018-07-03 2:12 UTC (permalink / raw)
To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530536228-17462-4-git-send-email-xiangxia.m.yue@gmail.com>
On 2018年07月02日 20:57, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> Factor out generic busy polling logic and will be
> used for in tx path in the next patch. And with the patch,
> qemu can set differently the busyloop_timeout for rx queue.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
> drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
> 1 file changed, 55 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 62bb8e8..2790959 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> return vhost_poll_start(poll, sock->file);
> }
>
> +static int sk_has_rx_data(struct sock *sk)
> +{
> + struct socket *sock = sk->sk_socket;
> +
> + if (sock->ops->peek_len)
> + return sock->ops->peek_len(sock);
> +
> + return skb_queue_empty(&sk->sk_receive_queue);
> +}
> +
> +static void vhost_net_busy_poll(struct vhost_net *net,
> + struct vhost_virtqueue *rvq,
> + struct vhost_virtqueue *tvq,
> + bool rx)
> +{
> + unsigned long uninitialized_var(endtime);
> + unsigned long busyloop_timeout;
> + struct socket *sock;
> + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> +
> + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> +
> + vhost_disable_notify(&net->dev, vq);
> + sock = rvq->private_data;
> + busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
> +
> + preempt_disable();
> + endtime = busy_clock() + busyloop_timeout;
> + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> + !(sock && sk_has_rx_data(sock->sk)) &&
> + vhost_vq_avail_empty(tvq->dev, tvq))
> + cpu_relax();
> + preempt_enable();
> +
> + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> + vhost_poll_queue(&vq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
One last question, do we need this for rx? This check will be always
true under light or medium load.
Thanks
> + vhost_disable_notify(&net->dev, vq);
> + vhost_poll_queue(&vq->poll);
> + }
> +
> + mutex_unlock(&vq->mutex);
> +}
> +
> +
> static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> @@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> return len;
> }
>
> -static int sk_has_rx_data(struct sock *sk)
> -{
> - struct socket *sock = sk->sk_socket;
> -
> - if (sock->ops->peek_len)
> - return sock->ops->peek_len(sock);
> -
> - return skb_queue_empty(&sk->sk_receive_queue);
> -}
> -
> static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> {
> struct vhost_virtqueue *vq = &nvq->vq;
> @@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>
> static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> {
> - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> - struct vhost_virtqueue *vq = &nvq->vq;
> - unsigned long uninitialized_var(endtime);
> - int len = peek_head_len(rvq, sk);
> + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>
> - if (!len && vq->busyloop_timeout) {
> - /* Flush batched heads first */
> - vhost_rx_signal_used(rvq);
> - /* Both tx vq and rx socket were polled here */
> - mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> - vhost_disable_notify(&net->dev, vq);
> + int len = peek_head_len(rnvq, sk);
>
> - preempt_disable();
> - endtime = busy_clock() + vq->busyloop_timeout;
> -
> - while (vhost_can_busy_poll(&net->dev, endtime) &&
> - !sk_has_rx_data(sk) &&
> - vhost_vq_avail_empty(&net->dev, vq))
> - cpu_relax();
> -
> - preempt_enable();
> -
> - if (!vhost_vq_avail_empty(&net->dev, vq))
> - vhost_poll_queue(&vq->poll);
> - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> - vhost_disable_notify(&net->dev, vq);
> - vhost_poll_queue(&vq->poll);
> - }
> + if (!len && rnvq->vq.busyloop_timeout) {
> + /* Flush batched heads first */
> + vhost_rx_signal_used(rnvq);
>
> - mutex_unlock(&vq->mutex);
> + /* Both tx vq and rx socket were polled here */
> + vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
>
> - len = peek_head_len(rvq, sk);
> + len = peek_head_len(rnvq, sk);
> }
>
> return len;
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Call for Papers - DSIC’18 - Montenegro; Deadline: July 14
From: Marlemos @ 2018-07-02 19:00 UTC (permalink / raw)
To: virtualization
[-- Attachment #1.1: Type: text/plain, Size: 5927 bytes --]
* Proceedings by Springer
** Indexed in Scopus, ISI, DBLP, etc.
__________________________________________________
DSIC’18 – The 2018 International Conference on Digital Science
19 – 21 October 2018, Montenegro
https://digscience.org/ <https://digscience.org/>
----------------------------------------------------------------------------------------------------------------
SCOPE
DSIC’18 – The 2018 International Conference on Digital Science, to be held at Montenegro, 19 – 21 October 2018, is an international forum for researchers and practitioners to present and discuss the most recent innovations, trends, results, experiences and concerns in the several perspectives of Digital Science.
The main idea of this Conference is that the world of science is unified and united allowing all scientists/practitioners to be able to think, analyze and generalize their thoughts. The concept of this conference is that all scientists have the ability to think, analyze, and extend knowledge using digital technology, and that technology allows researchers to break the barriers between disciplines.
DSIC aims efficiently disseminate original research results in natural, social, art&humanities sciences. This Conference enables fast dissemination so conference participants can publish their papers in print and electronic format, which is then made available worldwide and accessible by millions of researchers.
DSIC covers publications on many multidisciplinaire levels but each accepted paper might be presented in the appropriate thematic Conference session of narrow specialization. The conference organizers believe that all scientists selected to present their work are important in their own right, therefore additional keynote speakers are unnecessary.
We are pleased to invite you to submit your papers to DSIC’18. Papers must be written in English. All submissions will be reviewed on the basis of relevance, originality, importance and clarity.
CONFERENCE TOPICS
We invite submissions that should be related with one or more of the following topics, but are not limited to:
1. Digital Art & Humanities
2. Digital Agriculture & Food Technology/strong>
3. Digital Economics
4. Digital Engineering
5. Digital Enviromental Sciences
6. Digital Finance, Business & Banking
7. Digital Medicine, Pharma & Public Health
8. Digital Health Care, Hospitals & Rehabilitation
9. Digital Law
10. Digital Public Administration
11. Digital Technology & Applied Sciences
12. Digital Virtual Reality
13. Digital Education
14. Digital Media
SUBMISSION AND DECISION
You may submit your paper online using OpenConf Peer Review & Conference Management System <http://www.aistic.org/ds2018/oc18/openconf.php>.
Submitted papers (until 8-page limit) must comply with the formatting rules for Proceedings in Advances in Intelligent Systems and Computing Series (see Instructions for Authors at Springer Website <https://www.springer.com/us/authors-editors/conference-proceedings/conference-proceedings-guidelines> or the requested format (download a DOC example <http://www.icits.me/springerformat.doc>) and be written in English, must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and bibliographic references should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word and also in PDF format.
All papers will be subjected to a “double-blind review” by at least two Scientific Committee members. The «blind» review means that the authors’ names, affiliations and bibliographic references should not be included in the version for evaluation by the Scientific Committee or/and Reviewers. This information should only be included in the camera-ready version, saved in Word and/or in PDF format. Based on Scientific Committee evaluation, a paper can be rejected or accepted by the Conference Chairs.
Each accepted paper might be presented in the appropriate thematic session of narrow specialization. In the later case, it can be accepted as paper or poster.
The authors of accepted papers will have 15 minutes to present their work in a Conference Work or Internet Session; approximately 5 minutes of discussion will follow each presentation.
PUBLICATION AND INDEXING
To ensure that an accepted paper will be published, at least one of the authors must be fully registered by the 01st of August 2018, and the paper must comply with the suggested layout and page-limit (until 8 pages). Additionally, all recommended changes must be addressed by the authors before they submit the camera-ready version.
No more than one paper per registration will be published. An extra fee must be paid for publication of additional papers, with a maximum of one additional paper per registration. One registration permits only the participation of one author in the conference.
Accepted Papers written in English and registered will be published in Proceedings by Springer in a book of the Advances in Intelligent Systems and Computing <http://www.springer.com/series/11156>series, and then submitted for indexation by Web of Science, SCOPUS, and will be available in the SpringerLink Digital Library <http://link.springer.com/>.
IMPORTANT DATES
Paper Submission Deadline: July 14, 2018
Notification of Acceptance: July 23, 2018
Payment of Registration, to ensure the inclusion of an accepted paper in the conference proceedings: AUGUST 01, 2018.
Camera-ready Submission: AUGUST 01, 2018
DSIC’18 Website: https://digscience.org/ <https://digscience.org/>
---
This email has been checked for viruses by AVG.
https://www.avg.com
[-- Attachment #1.2: Type: text/html, Size: 7976 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH 0/9] Fix references for some missing documentation files
From: Jonathan Corbet @ 2018-07-02 17:27 UTC (permalink / raw)
To: Mauro Carvalho Chehab
Cc: Mark Rutland, James Morris, kvm, Linux Doc Mailing List,
virtualization, linux-leds, linux-scsi, Ingo Molnar,
intel-wired-lan, Jeff Kirsher, linux-input, devicetree,
Steven Rostedt, Mauro Carvalho Chehab, linux-gpio,
Jacek Anaszewski, Andy Whitcroft, netdev, linux-kernel,
Changbin Du, Masami Hiramatsu, linux-kernel, Joe Perches,
Andrew Morton, linuxppc-dev
In-Reply-To: <cover.1530005114.git.mchehab+samsung@kernel.org>
On Tue, 26 Jun 2018 06:49:02 -0300
Mauro Carvalho Chehab <mchehab+samsung@kernel.org> wrote:
> Having nothing to do while waiting for my plane to arrive while
> returning back from Japan, I ended by writing a small series of
> patches meant to reduce the number of bad Documentation/*
> links that are detected by:
> ./scripts/documentation-file-ref-check
I've applied everything except the two networking patches, since I expect
those to go through Dave's tree.
Thanks,
jon
^ permalink raw reply
* Re: [PATCH] x86-64: use RIP-relative calls for paravirt indirect ones
From: Juergen Gross @ 2018-07-02 13:53 UTC (permalink / raw)
To: Jan Beulich, Alok Kataria; +Cc: virtualization
In-Reply-To: <5B30C3F102000078001CD6F1@suse.com>
On 25/06/18 12:29, Jan Beulich wrote:
> This saves one insn byte per instance, summing up to a savings of over
> 4k in my (stripped down) configuration. No variant of to be patched in
> replacement code relies on the one byte larger size.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Juergen
^ permalink raw reply
* [PATCH net-next v4 4/4] net: vhost: add rx busy polling in tx path
From: xiangxia.m.yue @ 2018-07-02 12:57 UTC (permalink / raw)
To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530536228-17462-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch improves the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx do that in the same way.
We set the poll-us=100us and use the iperf3 to test
its bandwidth, use the netperf to test throughput and mean
latency. When running the tests, the vhost-net kthread of
that VM, is alway 100% CPU. The commands are shown as below.
iperf3 -s -D
iperf3 -c IP -i 1 -P 1 -t 20 -M 1400
or
netserver
netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"
host -> guest:
iperf3:
* With the patch: 27.0 Gbits/sec
* Without the patch: 14.4 Gbits/sec
netperf (TCP_RR):
* With the patch: 48039.56 trans/s, 20.64us mean latency
* Without the patch: 46027.07 trans/s, 21.58us mean latency
This patch also improves the guest transmit performance.
guest -> host:
iperf3:
* With the patch: 27.2 Gbits/sec
* Without the patch: 24.4 Gbits/sec
netperf (TCP_RR):
* With the patch: 47963.25 trans/s, 20.71us mean latency
* Without the patch: 45796.70 trans/s, 21.68us mean latency
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
drivers/vhost/net.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2790959..3f26547 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -480,17 +480,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
{
- unsigned long uninitialized_var(endtime);
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
if (r == vq->num && vq->busyloop_timeout) {
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(vq->dev, endtime) &&
- vhost_vq_avail_empty(vq->dev, vq))
- cpu_relax();
- preempt_enable();
+ vhost_net_busy_poll(net, &rnvq->vq, vq, false);
+
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-07-02 12:57 UTC (permalink / raw)
To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530536228-17462-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Factor out generic busy polling logic and will be
used for in tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
drivers/vhost/net.c | 94 +++++++++++++++++++++++++++++++----------------------
1 file changed, 55 insertions(+), 39 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..2790959 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool rx)
+{
+ unsigned long uninitialized_var(endtime);
+ unsigned long busyloop_timeout;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+ mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+
+ vhost_disable_notify(&net->dev, vq);
+ sock = rvq->private_data;
+ busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+ while (vhost_can_busy_poll(tvq->dev, endtime) &&
+ !(sock && sk_has_rx_data(sock->sk)) &&
+ vhost_vq_avail_empty(tvq->dev, tvq))
+ cpu_relax();
+ preempt_enable();
+
+ if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+ (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+
+ mutex_unlock(&vq->mutex);
+}
+
+
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
@@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
return len;
}
-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
{
struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
- struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
- unsigned long uninitialized_var(endtime);
- int len = peek_head_len(rvq, sk);
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
- if (!len && vq->busyloop_timeout) {
- /* Flush batched heads first */
- vhost_rx_signal_used(rvq);
- /* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
- vhost_disable_notify(&net->dev, vq);
+ int len = peek_head_len(rnvq, sk);
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
-
- while (vhost_can_busy_poll(&net->dev, endtime) &&
- !sk_has_rx_data(sk) &&
- vhost_vq_avail_empty(&net->dev, vq))
- cpu_relax();
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, vq))
- vhost_poll_queue(&vq->poll);
- else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- vhost_disable_notify(&net->dev, vq);
- vhost_poll_queue(&vq->poll);
- }
+ if (!len && rnvq->vq.busyloop_timeout) {
+ /* Flush batched heads first */
+ vhost_rx_signal_used(rnvq);
- mutex_unlock(&vq->mutex);
+ /* Both tx vq and rx socket were polled here */
+ vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
- len = peek_head_len(rvq, sk);
+ len = peek_head_len(rnvq, sk);
}
return len;
--
1.8.3.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox