* [PATCH net-next v7 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-08-01 3:00 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1533092454-37196-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Factor out generic busy polling logic and will be
used for in tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.
In the handle_tx, the busypoll will vhost_net_disable/enable_vq
because we have poll the sock. This can improve performance.
[This is suggested by Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>]
And when the sock receive skb, we should queue the poll if necessary.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 131 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 91 insertions(+), 40 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 32c1b52..5b45463 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -440,6 +440,95 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
nvq->done_idx = 0;
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ if (!vhost_vq_avail_empty(&net->dev, vq)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+}
+
+static void vhost_net_busy_poll_check(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool rx)
+{
+ struct socket *sock = rvq->private_data;
+
+ if (rx)
+ vhost_net_busy_poll_try_queue(net, tvq);
+ else if (sock && sk_has_rx_data(sock->sk))
+ vhost_net_busy_poll_try_queue(net, rvq);
+ else {
+ /* On tx here, sock has no rx data, so we
+ * will wait for sock wakeup for rx, and
+ * vhost_enable_notify() is not needed. */
+ }
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool *busyloop_intr,
+ bool rx)
+{
+ unsigned long busyloop_timeout;
+ unsigned long endtime;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+ mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+ vhost_disable_notify(&net->dev, vq);
+ sock = rvq->private_data;
+
+ busyloop_timeout = rx ? rvq->busyloop_timeout:
+ tvq->busyloop_timeout;
+
+
+ /* Busypoll the sock, so don't need rx wakeups during it. */
+ if (!rx)
+ vhost_net_disable_vq(net, vq);
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+
+ if ((sock && sk_has_rx_data(sock->sk) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
+
+ cpu_relax();
+ }
+
+ preempt_enable();
+
+ if (!rx)
+ vhost_net_enable_vq(net, vq);
+
+ vhost_net_busy_poll_check(net, rvq, tvq, rx);
+
+ mutex_unlock(&vq->mutex);
+}
+
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_net_virtqueue *nvq,
unsigned int *out_num, unsigned int *in_num,
@@ -753,16 +842,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
return len;
}
-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
bool *busyloop_intr)
{
@@ -770,41 +849,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
- unsigned long uninitialized_var(endtime);
int len = peek_head_len(rnvq, sk);
- if (!len && tvq->busyloop_timeout) {
+ if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, VHOST_NET_VQ_TX);
- vhost_disable_notify(&net->dev, tvq);
-
- preempt_disable();
- endtime = busy_clock() + tvq->busyloop_timeout;
-
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(&net->dev)) {
- *busyloop_intr = true;
- break;
- }
- if ((sk_has_rx_data(sk) &&
- !vhost_vq_avail_empty(&net->dev, rvq)) ||
- !vhost_vq_avail_empty(&net->dev, tvq))
- break;
- cpu_relax();
- }
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, tvq)) {
- vhost_poll_queue(&tvq->poll);
- } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
- vhost_disable_notify(&net->dev, tvq);
- vhost_poll_queue(&tvq->poll);
- }
-
- mutex_unlock(&tvq->mutex);
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
len = peek_head_len(rnvq, sk);
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v7 2/4] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-08-01 3:00 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1533092454-37196-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 367d802..32c1b52 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -712,7 +712,7 @@ static void handle_tx(struct vhost_net *net)
struct vhost_virtqueue *vq = &nvq->vq;
struct socket *sock;
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
sock = vq->private_data;
if (!sock)
goto out;
@@ -777,7 +777,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
/* Flush batched heads first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, 1);
+ mutex_lock_nested(&tvq->mutex, VHOST_NET_VQ_TX);
vhost_disable_notify(&net->dev, tvq);
preempt_disable();
@@ -919,7 +919,7 @@ static void handle_rx(struct vhost_net *net)
__virtio16 num_buffers;
int recv_pkts = 0;
- mutex_lock_nested(&vq->mutex, 0);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
sock = vq->private_data;
if (!sock)
goto out;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v7 1/4] net: vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-08-01 3:00 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1533092454-37196-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch changes the way that lock all vqs
at the same, to lock them one by one. It will
be used for next patch to avoid the deadlock.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1a..a1c06e7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
{
int i;
- for (i = 0; i < d->nvqs; ++i)
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
__vhost_vq_meta_reset(d->vqs[i]);
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
}
static void vhost_vq_reset(struct vhost_dev *dev,
@@ -890,20 +893,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
#define vhost_get_used(vq, x, ptr) \
vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_unlock(&d->vqs[i]->mutex);
-}
-
static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -953,7 +942,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
if (msg->iova <= vq_msg->iova &&
msg->iova + msg->size - 1 > vq_msg->iova &&
vq_msg->type == VHOST_IOTLB_MISS) {
+ mutex_lock(&node->vq->mutex);
vhost_poll_queue(&node->vq->poll);
+ mutex_unlock(&node->vq->mutex);
+
list_del(&node->node);
kfree(node);
}
@@ -985,7 +977,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
int ret = 0;
mutex_lock(&dev->mutex);
- vhost_dev_lock_vqs(dev);
switch (msg->type) {
case VHOST_IOTLB_UPDATE:
if (!dev->iotlb) {
@@ -1019,7 +1010,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
- vhost_dev_unlock_vqs(dev);
mutex_unlock(&dev->mutex);
return ret;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v7 0/4] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-08-01 3:00 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patches improve the guest receive performance.
On the handle_tx side, we poll the sock receive queue
at the same time. handle_rx do that in the same way.
For more performance report, see patch 4.
v6->v7:
fix issue and rebase codes:
1. on tx, busypoll will vhost_net_disable/enable_vq rx vq.
[This is suggested by Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>]
2. introduce common helper vhost_net_busy_poll_try_queue().
v5->v6:
rebase the codes.
Tonghao Zhang (4):
net: vhost: lock the vqs one by one
net: vhost: replace magic number of lock annotation
net: vhost: factor out busy polling logic to vhost_net_busy_poll()
net: vhost: add rx busy polling in tx path
drivers/vhost/net.c | 168 +++++++++++++++++++++++++++++++-------------------
drivers/vhost/vhost.c | 24 +++-----
2 files changed, 113 insertions(+), 79 deletions(-)
--
1.8.3.1
^ permalink raw reply
* Re: [PATCH net-next 2/2] virtio-net: get rid of unnecessary container of rq stats
From: Toshiaki Makita @ 2018-08-01 1:46 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <d11c65d8-20be-1a85-c687-c6f182264e41@redhat.com>
On 2018/08/01 10:39, Jason Wang wrote:
> On 2018年07月31日 18:02, Toshiaki Makita wrote:
>> On 2018/07/31 18:43, Jason Wang wrote:
>>> We don't maintain tx counters in rx stats any more. There's no need
>>> for an extra container of rq stats.
>>>
>>> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> ---
>>> drivers/net/virtio_net.c | 80
>>> ++++++++++++++++++++++--------------------------
>>> 1 file changed, 36 insertions(+), 44 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 72d3f68..14f661c 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -87,7 +87,8 @@ struct virtnet_sq_stats {
>>> u64 kicks;
>>> };
>>> -struct virtnet_rq_stat_items {
>>> +struct virtnet_rq_stats {
>>> + struct u64_stats_sync syncp;
>>> u64 packets;
>>> u64 bytes;
>>> u64 drops;
>>> @@ -98,17 +99,8 @@ struct virtnet_rq_stat_items {
>>> u64 kicks;
>>> };
>>> -struct virtnet_rq_stats {
>>> - struct u64_stats_sync syncp;
>>> - struct virtnet_rq_stat_items items;
>>> -};
>> I'm not thinking removing sq stat is needed but even if it is I want to
>> keep virtnet_rq_stats to avoid allocating unnecessary u64_stats_syncp on
>> stack in virtnet_receive. I would just remove virtnet_rx_stats if
>> necessary.
>
> It's a nop on 64bit machines. And an unsigned on 32bit. So it's overhead
> could be ignored I think.
It's not a big problem so that's OK. I just felt like you reverted
unnecessarily too much. Anyway it is already applied and I'm not
thinking of changing this any more.
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 1/2] virtio-net: correctly update XDP_TX counters
From: Toshiaki Makita @ 2018-08-01 1:42 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <ffbfa3ba-3019-5324-4ecb-eb7a2699a4e8@redhat.com>
On 2018/08/01 10:31, Jason Wang wrote:
> On 2018年07月31日 17:57, Toshiaki Makita wrote:
>> On 2018/07/31 18:43, Jason Wang wrote:
>>> Commit 5b8f3c8d30a6 ("virtio_net: Add XDP related stats") tries to
>>> count TX XDP stats in virtnet_receive(). This will cause several
>>> issues:
>>>
>>> - virtnet_xdp_sq() was called without checking whether or not XDP is
>>> set. This may cause out of bound access when there's no enough txq
>>> for XDP.
>>> - Stats were updated even if there's no XDP/XDP_TX.>
>>> Fixing this by reusing virtnet_xdp_xmit() for XDP_TX which can counts
>>> TX XDP counter itself and remove the unnecessary tx stats embedded in
>>> rx stats.
>> Thanks for fixing this.
>> I wanted to avoid calling u64_stats_update_begin() (i.e. smp_wmb() in 32
>> bit systems) for every packet. So I'd like to keep sq stats in
>> virtnet_rx_stats.
>>
>
> We can optimize this by adding batching on top. (virtnet_xdp_xmit()
> accepts an array of xdp frames). If you like, please send a patch for this.
Yes, that sounds like a better optimization. will think about it...
Thanks,
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 2/2] virtio-net: get rid of unnecessary container of rq stats
From: Jason Wang @ 2018-08-01 1:39 UTC (permalink / raw)
To: Toshiaki Makita; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <da012c13-e438-f45e-5c41-2f159e791f00@lab.ntt.co.jp>
On 2018年07月31日 18:02, Toshiaki Makita wrote:
> On 2018/07/31 18:43, Jason Wang wrote:
>> We don't maintain tx counters in rx stats any more. There's no need
>> for an extra container of rq stats.
>>
>> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> drivers/net/virtio_net.c | 80 ++++++++++++++++++++++--------------------------
>> 1 file changed, 36 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 72d3f68..14f661c 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -87,7 +87,8 @@ struct virtnet_sq_stats {
>> u64 kicks;
>> };
>>
>> -struct virtnet_rq_stat_items {
>> +struct virtnet_rq_stats {
>> + struct u64_stats_sync syncp;
>> u64 packets;
>> u64 bytes;
>> u64 drops;
>> @@ -98,17 +99,8 @@ struct virtnet_rq_stat_items {
>> u64 kicks;
>> };
>>
>> -struct virtnet_rq_stats {
>> - struct u64_stats_sync syncp;
>> - struct virtnet_rq_stat_items items;
>> -};
> I'm not thinking removing sq stat is needed but even if it is I want to
> keep virtnet_rq_stats to avoid allocating unnecessary u64_stats_syncp on
> stack in virtnet_receive. I would just remove virtnet_rx_stats if necessary.
It's a nop on 64bit machines. And an unsigned on 32bit. So it's overhead
could be ignored I think.
Thanks
>> -
>> -struct virtnet_rx_stats {
>> - struct virtnet_rq_stat_items rx;
>> -};
>> -
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 1/2] virtio-net: correctly update XDP_TX counters
From: Jason Wang @ 2018-08-01 1:31 UTC (permalink / raw)
To: Toshiaki Makita; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <2cdb8081-90a8-3a87-b6a6-4395825594a1@lab.ntt.co.jp>
On 2018年07月31日 17:57, Toshiaki Makita wrote:
> On 2018/07/31 18:43, Jason Wang wrote:
>> Commit 5b8f3c8d30a6 ("virtio_net: Add XDP related stats") tries to
>> count TX XDP stats in virtnet_receive(). This will cause several
>> issues:
>>
>> - virtnet_xdp_sq() was called without checking whether or not XDP is
>> set. This may cause out of bound access when there's no enough txq
>> for XDP.
>> - Stats were updated even if there's no XDP/XDP_TX.>
>> Fixing this by reusing virtnet_xdp_xmit() for XDP_TX which can counts
>> TX XDP counter itself and remove the unnecessary tx stats embedded in
>> rx stats.
> Thanks for fixing this.
> I wanted to avoid calling u64_stats_update_begin() (i.e. smp_wmb() in 32
> bit systems) for every packet. So I'd like to keep sq stats in
> virtnet_rx_stats.
>
We can optimize this by adding batching on top. (virtnet_xdp_xmit()
accepts an array of xdp frames). If you like, please send a patch for this.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Benjamin Herrenschmidt @ 2018-07-31 20:36 UTC (permalink / raw)
To: Christoph Hellwig, Michael S. Tsirkin
Cc: robh, srikar, mpe, Will Deacon, linux-kernel, linuxram,
virtualization, paulus, marc.zyngier, joe, robin.murphy, david,
linuxppc-dev, elfring, haren, Anshuman Khandual
In-Reply-To: <20180731173052.GA17153@infradead.org>
On Tue, 2018-07-31 at 10:30 -0700, Christoph Hellwig wrote:
> > However the question people raise is that DMA API is already full of
> > arch-specific tricks the likes of which are outlined in your post linked
> > above. How is this one much worse?
>
> None of these warts is visible to the driver, they are all handled in
> the architecture (possibly on a per-bus basis).
>
> So for virtio we really need to decide if it has one set of behavior
> as specified in the virtio spec, or if it behaves exactly as if it
> was on a PCI bus, or in fact probably both as you lined up. But no
> magic arch specific behavior inbetween.
The only arch specific behaviour is needed in the case where it doesn't
behave like PCI. In this case, the PCI DMA ops are not suitable, but in
our secure VMs, we still need to make it use swiotlb in order to bounce
through non-secure pages.
It would be nice if "real PCI" was the default but it's not, VMs are
created in "legacy" mode all the times and we don't know at VM creation
time whether it will become a secure VM or not. The way our secure VMs
work is that they start as a normal VM, load a secure "payload" and
call the Ultravisor to "become" secure.
So we're in a bit of a bind here. We need that one-liner optional arch
hook to make virtio use swiotlb in that "IOMMU bypass" case.
Ben.
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Christoph Hellwig @ 2018-07-31 17:30 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: robh, srikar, benh, Will Deacon, linux-kernel, linuxram,
virtualization, Christoph Hellwig, paulus, marc.zyngier, mpe, joe,
robin.murphy, david, linuxppc-dev, elfring, haren,
Anshuman Khandual
In-Reply-To: <20180730155633-mutt-send-email-mst@kernel.org>
On Mon, Jul 30, 2018 at 04:26:32PM +0300, Michael S. Tsirkin wrote:
> Real hardware would reuse parts of the interface but by necessity it
> needs to behave slightly differently on some platforms. However for
> some platforms (such as x86) a PV virtio driver will by luck work with a
> PCI device backend without changes. As these platforms and drivers are
> widely deployed, some people will deploy hardware like that. Should be
> a non issue as by definition it's transparent to guests.
On some x86. As soon as you have an iommu or strange PCI root ports
things are going to start breaking apart.
> > And that very much excludes arch-specific (or
> > Xen-specific) overrides.
>
> We already committed to a xen specific hack but generally I prefer
> devices that describe how they work instead of platforms magically
> guessing, yes.
For legacy reasons I guess we'll have to keep it, but we really need
to avoid adding more junk than this.
> However the question people raise is that DMA API is already full of
> arch-specific tricks the likes of which are outlined in your post linked
> above. How is this one much worse?
None of these warts is visible to the driver, they are all handled in
the architecture (possibly on a per-bus basis).
So for virtio we really need to decide if it has one set of behavior
as specified in the virtio spec, or if it behaves exactly as if it
was on a PCI bus, or in fact probably both as you lined up. But no
magic arch specific behavior inbetween.
^ permalink raw reply
* Re: [PATCH net-next 2/2] virtio-net: get rid of unnecessary container of rq stats
From: David Miller @ 2018-07-31 17:03 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <1533030219-9904-2-git-send-email-jasowang@redhat.com>
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 31 Jul 2018 17:43:39 +0800
> We don't maintain tx counters in rx stats any more. There's no need
> for an extra container of rq stats.
>
> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next 1/2] virtio-net: correctly update XDP_TX counters
From: David Miller @ 2018-07-31 17:03 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <1533030219-9904-1-git-send-email-jasowang@redhat.com>
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 31 Jul 2018 17:43:38 +0800
> Commit 5b8f3c8d30a6 ("virtio_net: Add XDP related stats") tries to
> count TX XDP stats in virtnet_receive(). This will cause several
> issues:
>
> - virtnet_xdp_sq() was called without checking whether or not XDP is
> set. This may cause out of bound access when there's no enough txq
> for XDP.
> - Stats were updated even if there's no XDP/XDP_TX.
>
> Fixing this by reusing virtnet_xdp_xmit() for XDP_TX which can counts
> TX XDP counter itself and remove the unnecessary tx stats embedded in
> rx stats.
>
> Reported-by: syzbot+604f8271211546f5b3c7@syzkaller.appspotmail.com
> Fixes: 5b8f3c8d30a6 ("virtio_net: Add XDP related stats")
> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Applied.
^ permalink raw reply
* Re: KASAN: use-after-free Read in vhost_transport_send_pkt
From: Stefan Hajnoczi @ 2018-07-31 15:43 UTC (permalink / raw)
To: syzbot
Cc: kvm, mst, netdev, syzkaller-bugs, linux-kernel, virtualization,
stefanha
In-Reply-To: <000000000000b4f77905723b70ee@google.com>
[-- Attachment #1.1: Type: text/plain, Size: 7138 bytes --]
On Mon, Jul 30, 2018 at 11:15:03AM -0700, syzbot wrote:
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit: acb1872577b3 Linux 4.18-rc7
> git tree: upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=14eb932c400000
> kernel config: https://syzkaller.appspot.com/x/.config?x=2dc0cd7c2eefb46f
> dashboard link: https://syzkaller.appspot.com/bug?extid=bd391451452fb0b93039
> compiler: gcc (GCC) 8.0.1 20180413 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+bd391451452fb0b93039@syzkaller.appspotmail.com
>
> netlink: 'syz-executor5': attribute type 2 has an invalid length.
> binder: 28577:28588 transaction failed 29189/-22, size 0-0 line 2852
> ==================================================================
> BUG: KASAN: use-after-free in debug_spin_lock_before
> kernel/locking/spinlock_debug.c:83 [inline]
> BUG: KASAN: use-after-free in do_raw_spin_lock+0x1c0/0x200
> kernel/locking/spinlock_debug.c:112
> Read of size 4 at addr ffff880194d0ec6c by task syz-executor4/28583
>
> CPU: 1 PID: 28583 Comm: syz-executor4 Not tainted 4.18.0-rc7+ #169
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
> __dump_stack lib/dump_stack.c:77 [inline]
> dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
> print_address_description+0x6c/0x20b mm/kasan/report.c:256
> kasan_report_error mm/kasan/report.c:354 [inline]
> kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
> __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:432
> debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline]
> do_raw_spin_lock+0x1c0/0x200 kernel/locking/spinlock_debug.c:112
> __raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline]
> _raw_spin_lock_bh+0x39/0x40 kernel/locking/spinlock.c:168
> spin_lock_bh include/linux/spinlock.h:315 [inline]
> vhost_transport_send_pkt+0x12e/0x380 drivers/vhost/vsock.c:223
Thanks for the vsock fuzzing. This is a useful bug report.
It looks like vhost_vsock_get() needs to involve a reference count so
that vhost_vsock instances cannot be freed while something is still
using them.
The reproducer probably involves racing close() with connect().
I am looking into a fix.
Stefan
> virtio_transport_send_pkt_info+0x31d/0x460
> net/vmw_vsock/virtio_transport_common.c:190
> virtio_transport_connect+0x17c/0x220
> net/vmw_vsock/virtio_transport_common.c:588
> vsock_stream_connect+0x4fb/0xfc0 net/vmw_vsock/af_vsock.c:1197
> __sys_connect+0x37d/0x4c0 net/socket.c:1673
> __do_sys_connect net/socket.c:1684 [inline]
> __se_sys_connect net/socket.c:1681 [inline]
> __x64_sys_connect+0x73/0xb0 net/socket.c:1681
> do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x456a09
> Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff
> 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:00007fa4aee5bc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
> RAX: ffffffffffffffda RBX: 00007fa4aee5c6d4 RCX: 0000000000456a09
> RDX: 0000000000000010 RSI: 0000000020000200 RDI: 0000000000000016
> RBP: 00000000009300a0 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
> R13: 00000000004ca838 R14: 00000000004c25fb R15: 0000000000000000
>
> Allocated by task 28583:
> save_stack+0x43/0xd0 mm/kasan/kasan.c:448
> set_track mm/kasan/kasan.c:460 [inline]
> kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
> __do_kmalloc_node mm/slab.c:3682 [inline]
> __kmalloc_node+0x47/0x70 mm/slab.c:3689
> kmalloc_node include/linux/slab.h:555 [inline]
> kvmalloc_node+0xb9/0xf0 mm/util.c:423
> kvmalloc include/linux/mm.h:573 [inline]
> vhost_vsock_dev_open+0xa2/0x5a0 drivers/vhost/vsock.c:511
> misc_open+0x3ca/0x560 drivers/char/misc.c:141
> chrdev_open+0x25a/0x770 fs/char_dev.c:417
> do_dentry_open+0x818/0xe40 fs/open.c:794
> vfs_open+0x139/0x230 fs/open.c:908
> do_last fs/namei.c:3399 [inline]
> path_openat+0x174a/0x4e10 fs/namei.c:3540
> do_filp_open+0x255/0x380 fs/namei.c:3574
> do_sys_open+0x584/0x760 fs/open.c:1101
> __do_sys_openat fs/open.c:1128 [inline]
> __se_sys_openat fs/open.c:1122 [inline]
> __x64_sys_openat+0x9d/0x100 fs/open.c:1122
> do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 28579:
> save_stack+0x43/0xd0 mm/kasan/kasan.c:448
> set_track mm/kasan/kasan.c:460 [inline]
> __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
> kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
> __cache_free mm/slab.c:3498 [inline]
> kfree+0xd9/0x260 mm/slab.c:3813
> kvfree+0x61/0x70 mm/util.c:442
> vhost_vsock_free drivers/vhost/vsock.c:499 [inline]
> vhost_vsock_dev_release+0x4fd/0x750 drivers/vhost/vsock.c:604
> __fput+0x355/0x8b0 fs/file_table.c:209
> ____fput+0x15/0x20 fs/file_table.c:243
> task_work_run+0x1ec/0x2a0 kernel/task_work.c:113
> tracehook_notify_resume include/linux/tracehook.h:192 [inline]
> exit_to_usermode_loop+0x313/0x370 arch/x86/entry/common.c:166
> prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
> syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
> do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> The buggy address belongs to the object at ffff880194d05f80
> which belongs to the cache kmalloc-65536 of size 65536
> The buggy address is located 36076 bytes inside of
> 65536-byte region [ffff880194d05f80, ffff880194d15f80)
> The buggy address belongs to the page:
> page:ffffea0006534000 count:1 mapcount:0 mapping:ffff8801dac02500 index:0x0
> compound_mapcount: 0
> flags: 0x2fffc0000008100(slab|head)
> raw: 02fffc0000008100 ffffea0006599808 ffff8801dac01e48 ffff8801dac02500
> raw: 0000000000000000 ffff880194d05f80 0000000100000001 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
> ffff880194d0eb00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff880194d0eb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ffff880194d0ec00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ^
> ffff880194d0ec80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff880194d0ed00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==================================================================
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
> syzbot.
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PULL] vhost: last-minute fixes
From: Michael S. Tsirkin @ 2018-07-31 12:21 UTC (permalink / raw)
To: Linus Torvalds
Cc: kvm, mst, netdev, linux-kernel, stable, virtualization,
huang.chong, jiang.biao2
The following changes since commit d72e90f33aa4709ebecc5005562f52335e106a60:
Linux 4.18-rc6 (2018-07-22 14:12:20 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
for you to fetch changes up to 89da619bc18d79bca5304724c11d4ba3b67ce2c6:
virtio_balloon: fix another race between migration and ballooning (2018-07-30 16:45:33 +0300)
----------------------------------------------------------------
virtio: last-minute fixes
Some bugfixes that seem important and safe enough to merge at the last
minute.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
----------------------------------------------------------------
Jiang Biao (1):
virtio_balloon: fix another race between migration and ballooning
Michael S. Tsirkin (2):
tools/virtio: add dma barrier stubs
tools/virtio: add kmalloc_array stub
drivers/virtio/virtio_balloon.c | 2 ++
tools/virtio/asm/barrier.h | 4 ++--
tools/virtio/linux/kernel.h | 5 +++++
3 files changed, 9 insertions(+), 2 deletions(-)
^ permalink raw reply
* Re: [PATCH net-next 2/2] virtio-net: get rid of unnecessary container of rq stats
From: Michael S. Tsirkin @ 2018-07-31 11:22 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1533030219-9904-2-git-send-email-jasowang@redhat.com>
On Tue, Jul 31, 2018 at 05:43:39PM +0800, Jason Wang wrote:
> We don't maintain tx counters in rx stats any more. There's no need
> for an extra container of rq stats.
>
> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/net/virtio_net.c | 80 ++++++++++++++++++++++--------------------------
> 1 file changed, 36 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 72d3f68..14f661c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -87,7 +87,8 @@ struct virtnet_sq_stats {
> u64 kicks;
> };
>
> -struct virtnet_rq_stat_items {
> +struct virtnet_rq_stats {
> + struct u64_stats_sync syncp;
> u64 packets;
> u64 bytes;
> u64 drops;
> @@ -98,17 +99,8 @@ struct virtnet_rq_stat_items {
> u64 kicks;
> };
>
> -struct virtnet_rq_stats {
> - struct u64_stats_sync syncp;
> - struct virtnet_rq_stat_items items;
> -};
> -
> -struct virtnet_rx_stats {
> - struct virtnet_rq_stat_items rx;
> -};
> -
> #define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m)
> -#define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stat_items, m)
> +#define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stats, m)
>
> static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
> { "packets", VIRTNET_SQ_STAT(packets) },
> @@ -617,7 +609,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
> void *buf, void *ctx,
> unsigned int len,
> unsigned int *xdp_xmit,
> - struct virtnet_rx_stats *stats)
> + struct virtnet_rq_stats *stats)
> {
> struct sk_buff *skb;
> struct bpf_prog *xdp_prog;
> @@ -632,7 +624,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
> int err;
>
> len -= vi->hdr_len;
> - stats->rx.bytes += len;
> + stats->bytes += len;
>
> rcu_read_lock();
> xdp_prog = rcu_dereference(rq->xdp_prog);
> @@ -674,7 +666,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
> xdp.rxq = &rq->xdp_rxq;
> orig_data = xdp.data;
> act = bpf_prog_run_xdp(xdp_prog, &xdp);
> - stats->rx.xdp_packets++;
> + stats->xdp_packets++;
>
> switch (act) {
> case XDP_PASS:
> @@ -683,7 +675,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
> len = xdp.data_end - xdp.data;
> break;
> case XDP_TX:
> - stats->rx.xdp_tx++;
> + stats->xdp_tx++;
> xdpf = convert_to_xdp_frame(&xdp);
> if (unlikely(!xdpf))
> goto err_xdp;
> @@ -696,7 +688,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
> rcu_read_unlock();
> goto xdp_xmit;
> case XDP_REDIRECT:
> - stats->rx.xdp_redirects++;
> + stats->xdp_redirects++;
> err = xdp_do_redirect(dev, &xdp, xdp_prog);
> if (err)
> goto err_xdp;
> @@ -730,8 +722,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
>
> err_xdp:
> rcu_read_unlock();
> - stats->rx.xdp_drops++;
> - stats->rx.drops++;
> + stats->xdp_drops++;
> + stats->drops++;
> put_page(page);
> xdp_xmit:
> return NULL;
> @@ -742,19 +734,19 @@ static struct sk_buff *receive_big(struct net_device *dev,
> struct receive_queue *rq,
> void *buf,
> unsigned int len,
> - struct virtnet_rx_stats *stats)
> + struct virtnet_rq_stats *stats)
> {
> struct page *page = buf;
> struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
>
> - stats->rx.bytes += len - vi->hdr_len;
> + stats->bytes += len - vi->hdr_len;
> if (unlikely(!skb))
> goto err;
>
> return skb;
>
> err:
> - stats->rx.drops++;
> + stats->drops++;
> give_pages(rq, page);
> return NULL;
> }
> @@ -766,7 +758,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> void *ctx,
> unsigned int len,
> unsigned int *xdp_xmit,
> - struct virtnet_rx_stats *stats)
> + struct virtnet_rq_stats *stats)
> {
> struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
> u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
> @@ -779,7 +771,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> int err;
>
> head_skb = NULL;
> - stats->rx.bytes += len - vi->hdr_len;
> + stats->bytes += len - vi->hdr_len;
>
> rcu_read_lock();
> xdp_prog = rcu_dereference(rq->xdp_prog);
> @@ -828,7 +820,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> xdp.rxq = &rq->xdp_rxq;
>
> act = bpf_prog_run_xdp(xdp_prog, &xdp);
> - stats->rx.xdp_packets++;
> + stats->xdp_packets++;
>
> switch (act) {
> case XDP_PASS:
> @@ -853,7 +845,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> }
> break;
> case XDP_TX:
> - stats->rx.xdp_tx++;
> + stats->xdp_tx++;
> xdpf = convert_to_xdp_frame(&xdp);
> if (unlikely(!xdpf))
> goto err_xdp;
> @@ -870,7 +862,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> rcu_read_unlock();
> goto xdp_xmit;
> case XDP_REDIRECT:
> - stats->rx.xdp_redirects++;
> + stats->xdp_redirects++;
> err = xdp_do_redirect(dev, &xdp, xdp_prog);
> if (err) {
> if (unlikely(xdp_page != page))
> @@ -920,7 +912,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> goto err_buf;
> }
>
> - stats->rx.bytes += len;
> + stats->bytes += len;
> page = virt_to_head_page(buf);
>
> truesize = mergeable_ctx_to_truesize(ctx);
> @@ -966,7 +958,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>
> err_xdp:
> rcu_read_unlock();
> - stats->rx.xdp_drops++;
> + stats->xdp_drops++;
> err_skb:
> put_page(page);
> while (num_buf-- > 1) {
> @@ -977,12 +969,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> dev->stats.rx_length_errors++;
> break;
> }
> - stats->rx.bytes += len;
> + stats->bytes += len;
> page = virt_to_head_page(buf);
> put_page(page);
> }
> err_buf:
> - stats->rx.drops++;
> + stats->drops++;
> dev_kfree_skb(head_skb);
> xdp_xmit:
> return NULL;
> @@ -991,7 +983,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
> void *buf, unsigned int len, void **ctx,
> unsigned int *xdp_xmit,
> - struct virtnet_rx_stats *stats)
> + struct virtnet_rq_stats *stats)
> {
> struct net_device *dev = vi->dev;
> struct sk_buff *skb;
> @@ -1212,7 +1204,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
> } while (rq->vq->num_free);
> if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
> u64_stats_update_begin(&rq->stats.syncp);
> - rq->stats.items.kicks++;
> + rq->stats.kicks++;
> u64_stats_update_end(&rq->stats.syncp);
> }
>
> @@ -1290,7 +1282,7 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> unsigned int *xdp_xmit)
> {
> struct virtnet_info *vi = rq->vq->vdev->priv;
> - struct virtnet_rx_stats stats = {};
> + struct virtnet_rq_stats stats = {};
> unsigned int len;
> void *buf;
> int i;
> @@ -1298,16 +1290,16 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> if (!vi->big_packets || vi->mergeable_rx_bufs) {
> void *ctx;
>
> - while (stats.rx.packets < budget &&
> + while (stats.packets < budget &&
> (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> - stats.rx.packets++;
> + stats.packets++;
> }
> } else {
> - while (stats.rx.packets < budget &&
> + while (stats.packets < budget &&
> (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> - stats.rx.packets++;
> + stats.packets++;
> }
> }
>
> @@ -1321,12 +1313,12 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> size_t offset = virtnet_rq_stats_desc[i].offset;
> u64 *item;
>
> - item = (u64 *)((u8 *)&rq->stats.items + offset);
> - *item += *(u64 *)((u8 *)&stats.rx + offset);
> + item = (u64 *)((u8 *)&rq->stats + offset);
> + *item += *(u64 *)((u8 *)&stats + offset);
> }
> u64_stats_update_end(&rq->stats.syncp);
>
> - return stats.rx.packets;
> + return stats.packets;
> }
>
> static void free_old_xmit_skbs(struct send_queue *sq)
> @@ -1686,9 +1678,9 @@ static void virtnet_stats(struct net_device *dev,
>
> do {
> start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
> - rpackets = rq->stats.items.packets;
> - rbytes = rq->stats.items.bytes;
> - rdrops = rq->stats.items.drops;
> + rpackets = rq->stats.packets;
> + rbytes = rq->stats.bytes;
> + rdrops = rq->stats.drops;
> } while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
>
> tot->rx_packets += rpackets;
> @@ -2078,7 +2070,7 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> for (i = 0; i < vi->curr_queue_pairs; i++) {
> struct receive_queue *rq = &vi->rq[i];
>
> - stats_base = (u8 *)&rq->stats.items;
> + stats_base = (u8 *)&rq->stats;
> do {
> start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
> for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net-next 1/2] virtio-net: correctly update XDP_TX counters
From: Michael S. Tsirkin @ 2018-07-31 11:22 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1533030219-9904-1-git-send-email-jasowang@redhat.com>
On Tue, Jul 31, 2018 at 05:43:38PM +0800, Jason Wang wrote:
> Commit 5b8f3c8d30a6 ("virtio_net: Add XDP related stats") tries to
> count TX XDP stats in virtnet_receive(). This will cause several
> issues:
>
> - virtnet_xdp_sq() was called without checking whether or not XDP is
> set. This may cause out of bound access when there's no enough txq
> for XDP.
> - Stats were updated even if there's no XDP/XDP_TX.
>
> Fixing this by reusing virtnet_xdp_xmit() for XDP_TX which can counts
> TX XDP counter itself and remove the unnecessary tx stats embedded in
> rx stats.
>
> Reported-by: syzbot+604f8271211546f5b3c7@syzkaller.appspotmail.com
> Fixes: 5b8f3c8d30a6 ("virtio_net: Add XDP related stats")
> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/net/virtio_net.c | 39 ++++-----------------------------------
> 1 file changed, 4 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 1880c86..72d3f68 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -105,10 +105,6 @@ struct virtnet_rq_stats {
>
> struct virtnet_rx_stats {
> struct virtnet_rq_stat_items rx;
> - struct {
> - unsigned int xdp_tx;
> - unsigned int xdp_tx_drops;
> - } tx;
> };
>
> #define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m)
> @@ -485,22 +481,6 @@ static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi)
> return &vi->sq[qp];
> }
>
> -static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
> - struct xdp_frame *xdpf)
> -{
> - struct xdp_frame *xdpf_sent;
> - struct send_queue *sq;
> - unsigned int len;
> -
> - sq = virtnet_xdp_sq(vi);
> -
> - /* Free up any pending old buffers before queueing new ones. */
> - while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> - xdp_return_frame(xdpf_sent);
> -
> - return __virtnet_xdp_xmit_one(vi, sq, xdpf);
> -}
> -
> static int virtnet_xdp_xmit(struct net_device *dev,
> int n, struct xdp_frame **frames, u32 flags)
> {
> @@ -707,10 +687,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
> xdpf = convert_to_xdp_frame(&xdp);
> if (unlikely(!xdpf))
> goto err_xdp;
> - stats->tx.xdp_tx++;
> - err = __virtnet_xdp_tx_xmit(vi, xdpf);
> - if (unlikely(err)) {
> - stats->tx.xdp_tx_drops++;
> + err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
> + if (unlikely(err < 0)) {
> trace_xdp_exception(vi->dev, xdp_prog, act);
> goto err_xdp;
> }
> @@ -879,10 +857,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> xdpf = convert_to_xdp_frame(&xdp);
> if (unlikely(!xdpf))
> goto err_xdp;
> - stats->tx.xdp_tx++;
> - err = __virtnet_xdp_tx_xmit(vi, xdpf);
> - if (unlikely(err)) {
> - stats->tx.xdp_tx_drops++;
> + err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
> + if (unlikely(err < 0)) {
> trace_xdp_exception(vi->dev, xdp_prog, act);
> if (unlikely(xdp_page != page))
> put_page(xdp_page);
> @@ -1315,7 +1291,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> {
> struct virtnet_info *vi = rq->vq->vdev->priv;
> struct virtnet_rx_stats stats = {};
> - struct send_queue *sq;
> unsigned int len;
> void *buf;
> int i;
> @@ -1351,12 +1326,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> }
> u64_stats_update_end(&rq->stats.syncp);
>
> - sq = virtnet_xdp_sq(vi);
> - u64_stats_update_begin(&sq->stats.syncp);
> - sq->stats.xdp_tx += stats.tx.xdp_tx;
> - sq->stats.xdp_tx_drops += stats.tx.xdp_tx_drops;
> - u64_stats_update_end(&sq->stats.syncp);
> -
> return stats.rx.packets;
> }
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net-next 2/2] virtio-net: get rid of unnecessary container of rq stats
From: Toshiaki Makita @ 2018-07-31 10:02 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <1533030219-9904-2-git-send-email-jasowang@redhat.com>
On 2018/07/31 18:43, Jason Wang wrote:
> We don't maintain tx counters in rx stats any more. There's no need
> for an extra container of rq stats.
>
> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/virtio_net.c | 80 ++++++++++++++++++++++--------------------------
> 1 file changed, 36 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 72d3f68..14f661c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -87,7 +87,8 @@ struct virtnet_sq_stats {
> u64 kicks;
> };
>
> -struct virtnet_rq_stat_items {
> +struct virtnet_rq_stats {
> + struct u64_stats_sync syncp;
> u64 packets;
> u64 bytes;
> u64 drops;
> @@ -98,17 +99,8 @@ struct virtnet_rq_stat_items {
> u64 kicks;
> };
>
> -struct virtnet_rq_stats {
> - struct u64_stats_sync syncp;
> - struct virtnet_rq_stat_items items;
> -};
I'm not thinking removing sq stat is needed but even if it is I want to
keep virtnet_rq_stats to avoid allocating unnecessary u64_stats_syncp on
stack in virtnet_receive. I would just remove virtnet_rx_stats if necessary.
> -
> -struct virtnet_rx_stats {
> - struct virtnet_rq_stat_items rx;
> -};
> -
--
Toshiaki Makita
^ permalink raw reply
* Re: [PATCH net-next 1/2] virtio-net: correctly update XDP_TX counters
From: Toshiaki Makita @ 2018-07-31 9:57 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <1533030219-9904-1-git-send-email-jasowang@redhat.com>
On 2018/07/31 18:43, Jason Wang wrote:
> Commit 5b8f3c8d30a6 ("virtio_net: Add XDP related stats") tries to
> count TX XDP stats in virtnet_receive(). This will cause several
> issues:
>
> - virtnet_xdp_sq() was called without checking whether or not XDP is
> set. This may cause out of bound access when there's no enough txq
> for XDP.
> - Stats were updated even if there's no XDP/XDP_TX.>
> Fixing this by reusing virtnet_xdp_xmit() for XDP_TX which can counts
> TX XDP counter itself and remove the unnecessary tx stats embedded in
> rx stats.
Thanks for fixing this.
I wanted to avoid calling u64_stats_update_begin() (i.e. smp_wmb() in 32
bit systems) for every packet. So I'd like to keep sq stats in
virtnet_rx_stats.
--
Toshiaki Makita
^ permalink raw reply
* [PATCH net-next 2/2] virtio-net: get rid of unnecessary container of rq stats
From: Jason Wang @ 2018-07-31 9:43 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1533030219-9904-1-git-send-email-jasowang@redhat.com>
We don't maintain tx counters in rx stats any more. There's no need
for an extra container of rq stats.
Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 80 ++++++++++++++++++++++--------------------------
1 file changed, 36 insertions(+), 44 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 72d3f68..14f661c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -87,7 +87,8 @@ struct virtnet_sq_stats {
u64 kicks;
};
-struct virtnet_rq_stat_items {
+struct virtnet_rq_stats {
+ struct u64_stats_sync syncp;
u64 packets;
u64 bytes;
u64 drops;
@@ -98,17 +99,8 @@ struct virtnet_rq_stat_items {
u64 kicks;
};
-struct virtnet_rq_stats {
- struct u64_stats_sync syncp;
- struct virtnet_rq_stat_items items;
-};
-
-struct virtnet_rx_stats {
- struct virtnet_rq_stat_items rx;
-};
-
#define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m)
-#define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stat_items, m)
+#define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stats, m)
static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
{ "packets", VIRTNET_SQ_STAT(packets) },
@@ -617,7 +609,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
void *buf, void *ctx,
unsigned int len,
unsigned int *xdp_xmit,
- struct virtnet_rx_stats *stats)
+ struct virtnet_rq_stats *stats)
{
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
@@ -632,7 +624,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
int err;
len -= vi->hdr_len;
- stats->rx.bytes += len;
+ stats->bytes += len;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -674,7 +666,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp.rxq = &rq->xdp_rxq;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
- stats->rx.xdp_packets++;
+ stats->xdp_packets++;
switch (act) {
case XDP_PASS:
@@ -683,7 +675,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
len = xdp.data_end - xdp.data;
break;
case XDP_TX:
- stats->rx.xdp_tx++;
+ stats->xdp_tx++;
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
@@ -696,7 +688,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
rcu_read_unlock();
goto xdp_xmit;
case XDP_REDIRECT:
- stats->rx.xdp_redirects++;
+ stats->xdp_redirects++;
err = xdp_do_redirect(dev, &xdp, xdp_prog);
if (err)
goto err_xdp;
@@ -730,8 +722,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
err_xdp:
rcu_read_unlock();
- stats->rx.xdp_drops++;
- stats->rx.drops++;
+ stats->xdp_drops++;
+ stats->drops++;
put_page(page);
xdp_xmit:
return NULL;
@@ -742,19 +734,19 @@ static struct sk_buff *receive_big(struct net_device *dev,
struct receive_queue *rq,
void *buf,
unsigned int len,
- struct virtnet_rx_stats *stats)
+ struct virtnet_rq_stats *stats)
{
struct page *page = buf;
struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
- stats->rx.bytes += len - vi->hdr_len;
+ stats->bytes += len - vi->hdr_len;
if (unlikely(!skb))
goto err;
return skb;
err:
- stats->rx.drops++;
+ stats->drops++;
give_pages(rq, page);
return NULL;
}
@@ -766,7 +758,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
void *ctx,
unsigned int len,
unsigned int *xdp_xmit,
- struct virtnet_rx_stats *stats)
+ struct virtnet_rq_stats *stats)
{
struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
@@ -779,7 +771,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
int err;
head_skb = NULL;
- stats->rx.bytes += len - vi->hdr_len;
+ stats->bytes += len - vi->hdr_len;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -828,7 +820,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
- stats->rx.xdp_packets++;
+ stats->xdp_packets++;
switch (act) {
case XDP_PASS:
@@ -853,7 +845,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
}
break;
case XDP_TX:
- stats->rx.xdp_tx++;
+ stats->xdp_tx++;
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
@@ -870,7 +862,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
rcu_read_unlock();
goto xdp_xmit;
case XDP_REDIRECT:
- stats->rx.xdp_redirects++;
+ stats->xdp_redirects++;
err = xdp_do_redirect(dev, &xdp, xdp_prog);
if (err) {
if (unlikely(xdp_page != page))
@@ -920,7 +912,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
goto err_buf;
}
- stats->rx.bytes += len;
+ stats->bytes += len;
page = virt_to_head_page(buf);
truesize = mergeable_ctx_to_truesize(ctx);
@@ -966,7 +958,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
err_xdp:
rcu_read_unlock();
- stats->rx.xdp_drops++;
+ stats->xdp_drops++;
err_skb:
put_page(page);
while (num_buf-- > 1) {
@@ -977,12 +969,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
dev->stats.rx_length_errors++;
break;
}
- stats->rx.bytes += len;
+ stats->bytes += len;
page = virt_to_head_page(buf);
put_page(page);
}
err_buf:
- stats->rx.drops++;
+ stats->drops++;
dev_kfree_skb(head_skb);
xdp_xmit:
return NULL;
@@ -991,7 +983,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
void *buf, unsigned int len, void **ctx,
unsigned int *xdp_xmit,
- struct virtnet_rx_stats *stats)
+ struct virtnet_rq_stats *stats)
{
struct net_device *dev = vi->dev;
struct sk_buff *skb;
@@ -1212,7 +1204,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
} while (rq->vq->num_free);
if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
u64_stats_update_begin(&rq->stats.syncp);
- rq->stats.items.kicks++;
+ rq->stats.kicks++;
u64_stats_update_end(&rq->stats.syncp);
}
@@ -1290,7 +1282,7 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
unsigned int *xdp_xmit)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
- struct virtnet_rx_stats stats = {};
+ struct virtnet_rq_stats stats = {};
unsigned int len;
void *buf;
int i;
@@ -1298,16 +1290,16 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
if (!vi->big_packets || vi->mergeable_rx_bufs) {
void *ctx;
- while (stats.rx.packets < budget &&
+ while (stats.packets < budget &&
(buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
- stats.rx.packets++;
+ stats.packets++;
}
} else {
- while (stats.rx.packets < budget &&
+ while (stats.packets < budget &&
(buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
- stats.rx.packets++;
+ stats.packets++;
}
}
@@ -1321,12 +1313,12 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
size_t offset = virtnet_rq_stats_desc[i].offset;
u64 *item;
- item = (u64 *)((u8 *)&rq->stats.items + offset);
- *item += *(u64 *)((u8 *)&stats.rx + offset);
+ item = (u64 *)((u8 *)&rq->stats + offset);
+ *item += *(u64 *)((u8 *)&stats + offset);
}
u64_stats_update_end(&rq->stats.syncp);
- return stats.rx.packets;
+ return stats.packets;
}
static void free_old_xmit_skbs(struct send_queue *sq)
@@ -1686,9 +1678,9 @@ static void virtnet_stats(struct net_device *dev,
do {
start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
- rpackets = rq->stats.items.packets;
- rbytes = rq->stats.items.bytes;
- rdrops = rq->stats.items.drops;
+ rpackets = rq->stats.packets;
+ rbytes = rq->stats.bytes;
+ rdrops = rq->stats.drops;
} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
tot->rx_packets += rpackets;
@@ -2078,7 +2070,7 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
for (i = 0; i < vi->curr_queue_pairs; i++) {
struct receive_queue *rq = &vi->rq[i];
- stats_base = (u8 *)&rq->stats.items;
+ stats_base = (u8 *)&rq->stats;
do {
start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 1/2] virtio-net: correctly update XDP_TX counters
From: Jason Wang @ 2018-07-31 9:43 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, virtualization
Commit 5b8f3c8d30a6 ("virtio_net: Add XDP related stats") tries to
count TX XDP stats in virtnet_receive(). This will cause several
issues:
- virtnet_xdp_sq() was called without checking whether or not XDP is
set. This may cause out of bound access when there's no enough txq
for XDP.
- Stats were updated even if there's no XDP/XDP_TX.
Fixing this by reusing virtnet_xdp_xmit() for XDP_TX which can counts
TX XDP counter itself and remove the unnecessary tx stats embedded in
rx stats.
Reported-by: syzbot+604f8271211546f5b3c7@syzkaller.appspotmail.com
Fixes: 5b8f3c8d30a6 ("virtio_net: Add XDP related stats")
Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 39 ++++-----------------------------------
1 file changed, 4 insertions(+), 35 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1880c86..72d3f68 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -105,10 +105,6 @@ struct virtnet_rq_stats {
struct virtnet_rx_stats {
struct virtnet_rq_stat_items rx;
- struct {
- unsigned int xdp_tx;
- unsigned int xdp_tx_drops;
- } tx;
};
#define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m)
@@ -485,22 +481,6 @@ static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi)
return &vi->sq[qp];
}
-static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
- struct xdp_frame *xdpf)
-{
- struct xdp_frame *xdpf_sent;
- struct send_queue *sq;
- unsigned int len;
-
- sq = virtnet_xdp_sq(vi);
-
- /* Free up any pending old buffers before queueing new ones. */
- while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
- xdp_return_frame(xdpf_sent);
-
- return __virtnet_xdp_xmit_one(vi, sq, xdpf);
-}
-
static int virtnet_xdp_xmit(struct net_device *dev,
int n, struct xdp_frame **frames, u32 flags)
{
@@ -707,10 +687,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
- stats->tx.xdp_tx++;
- err = __virtnet_xdp_tx_xmit(vi, xdpf);
- if (unlikely(err)) {
- stats->tx.xdp_tx_drops++;
+ err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
+ if (unlikely(err < 0)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
goto err_xdp;
}
@@ -879,10 +857,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
- stats->tx.xdp_tx++;
- err = __virtnet_xdp_tx_xmit(vi, xdpf);
- if (unlikely(err)) {
- stats->tx.xdp_tx_drops++;
+ err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
+ if (unlikely(err < 0)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
if (unlikely(xdp_page != page))
put_page(xdp_page);
@@ -1315,7 +1291,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct virtnet_rx_stats stats = {};
- struct send_queue *sq;
unsigned int len;
void *buf;
int i;
@@ -1351,12 +1326,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
}
u64_stats_update_end(&rq->stats.syncp);
- sq = virtnet_xdp_sq(vi);
- u64_stats_update_begin(&sq->stats.syncp);
- sq->stats.xdp_tx += stats.tx.xdp_tx;
- sq->stats.xdp_tx_drops += stats.tx.xdp_tx_drops;
- u64_stats_update_end(&sq->stats.syncp);
-
return stats.rx.packets;
}
--
2.7.4
^ permalink raw reply related
* Re: [RFC 2/4] virtio: Override device's DMA OPS with virtio_direct_dma_ops selectively
From: Anshuman Khandual @ 2018-07-31 7:00 UTC (permalink / raw)
To: Christoph Hellwig
Cc: robh, srikar, mst, linuxram, linux-kernel, virtualization, paulus,
joe, linuxppc-dev, elfring, haren, david
In-Reply-To: <20180730092551.GB26245@infradead.org>
On 07/30/2018 02:55 PM, Christoph Hellwig wrote:
>> +const struct dma_map_ops virtio_direct_dma_ops;
>
> This belongs into a header if it is non-static. If you only
> use it in this file anyway please mark it static and avoid a forward
> declaration.
Sure, will make it static, move the definition up in the file to avoid
forward declaration.
>
>> +
>> int virtio_finalize_features(struct virtio_device *dev)
>> {
>> int ret = dev->config->finalize_features(dev);
>> @@ -174,6 +176,9 @@ int virtio_finalize_features(struct virtio_device *dev)
>> if (ret)
>> return ret;
>>
>> + if (virtio_has_iommu_quirk(dev))
>> + set_dma_ops(dev->dev.parent, &virtio_direct_dma_ops);
>
> This needs a big fat comment explaining what is going on here.
Sure, will do. Also talk about the XEN domain exception as well once
that goes into this conditional statement.
>
> Also not new, but I find the existance of virtio_has_iommu_quirk and its
> name horribly confusing. It might be better to open code it here once
> only a single caller is left.
Sure will do. There is one definition in the tools directory which can
be removed and then this will be the only one left.
^ permalink raw reply
* Re: [RFC 2/4] virtio: Override device's DMA OPS with virtio_direct_dma_ops selectively
From: Anshuman Khandual @ 2018-07-31 6:39 UTC (permalink / raw)
To: Christoph Hellwig, Michael S. Tsirkin
Cc: robh, srikar, benh, linuxram, linux-kernel, virtualization,
paulus, mpe, joe, linuxppc-dev, elfring, haren, david
In-Reply-To: <20180730093027.GC26245@infradead.org>
On 07/30/2018 03:00 PM, Christoph Hellwig wrote:
>>> +
>>> + if (xen_domain())
>>> + goto skip_override;
>>> +
>>> + if (virtio_has_iommu_quirk(dev))
>>> + set_dma_ops(dev->dev.parent, &virtio_direct_dma_ops);
>>> +
>>> + skip_override:
>>> +
>>
>> I prefer normal if scoping as opposed to goto spaghetti pls.
>> Better yet move vring_use_dma_api here and use it.
>> Less of a chance something will break.
>
> I agree about avoid pointless gotos here, but we can do things
> perfectly well without either gotos or a confusing helper here
> if we structure it right. E.g.:
>
> // suitably detailed comment here
> if (!xen_domain() &&
> !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM))
> set_dma_ops(dev->dev.parent, &virtio_direct_dma_ops);
I had updated this patch calling vring_use_dma_api() as a helper
as suggested by Michael but yes we can have the above condition
with a comment block. I will change this patch accordingly.
>
> and while we're at it - modifying dma ops for the parent looks very
> dangerous. I don't think we can do that, as it could break iommu
> setup interactions. IFF we set a specific dma map ops it has to be
> on the virtio device itself, of which we have full control.
I understand your concern. At present virtio core calls parent's DMA
ops callbacks when device has VIRTIO_F_IOMMU_PLATFORM flag set. Most
likely those DMA OPS are architecture specific ones which can really
configure IOMMU. Most probably all devices and their parents share
the same DMA ops callback. IIUC as long as the entire system has a
single DMA ops structure, it should be okay. But I may be missing
other implications. I tried changing virtio core so that it always
calls device's DMA ops instead of it's parent DMA ops, it hit the
following WARN_ON for devices without IOMMU flag and hit both the
WARN_ON and BUG_ON for devices with the IOMMU flag.
static inline void *dma_alloc_attrs(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flag,
unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
void *cpu_addr;
BUG_ON(!ops);
WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
--------
Seems like virtio device's DMA ops and coherent_dma_mask was never
set correctly assuming that virtio core always called parent's DMA
OPS all the time. We may have to change virtio device init to fix
this. Any thoughts ?
^ permalink raw reply
* IEEE Record # 44854: iCATccT 2018, Alva's Institute Of Engineering & Technology (AIET)-CFP
From: Dr. S K Niranjan Aradhya @ 2018-07-31 5:57 UTC (permalink / raw)
To: virtualization
[-- Attachment #1.1: Type: text/plain, Size: 1653 bytes --]
<< Apologies for cross-postings >>
<<< Please circulate among your friends, peers and researchers >>>
IEEE Conference Record No.: # 44854;
4th International Conference on Applied and Theoretical Computing and
Communication Technology (iCATccT - 2018)
Alva's Institute Of Engineering & Technology (AIET)
Conference Date : 6-8 Sept 2018
Submission Deadline: 10 August 2018
Submission Link: http://itekcmsonline.com/icatcct18/index.php/icatcct18/
icatcct18/login
Review is underway for submitted papers.
IEEE ISBN : 978-1-5386-7706-3
IEEE Part No. : CFP18D66-ART
Selected, accepted and extended paper will be published in Scopus Indexed
International Journal of Forensic Software Engineering published by
InderScience
All accepted and presented papers will be submitted to the IEEE for
possible publication in IEEE Xplore Digital Library. Previous edition
indexed in: SCOPUS, ISI Web of Science, Engineering Index, Google, etc.
If you like to join the TPC or propose a special session or symposiums
please write to: secretariat@icatcct.org
General Chair(s)
iCATccT 2018 Conference
----------------------
Disclaimer: We have clearly mentioned the subject lines and your email
address won't be misleading in any form. We have found your mail address
through our own efforts on the web search and not through any illegal way.
If you wish to remove your information from our mailing list or no longer
receive future announcements, please email with REMOVE in subject. Your
request to opt-out will be effective within a reasonable amount of time.
icatcct-cfp.pdf
<https://drive.google.com/file/d/1OWXPZVS1IRZlNoWTjfVyxl-yIL2CsByg/view?usp=drive_web>
[-- Attachment #1.2: Type: text/html, Size: 3240 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC 1/4] virtio: Define virtio_direct_dma_ops structure
From: Anshuman Khandual @ 2018-07-31 4:01 UTC (permalink / raw)
To: Christoph Hellwig
Cc: robh, srikar, mst, benh, linuxram, linux-kernel, virtualization,
paulus, mpe, joe, linuxppc-dev, elfring, haren, david
In-Reply-To: <20180730092419.GA26245@infradead.org>
On 07/30/2018 02:54 PM, Christoph Hellwig wrote:
>> +/*
>> + * Virtio direct mapping DMA API operations structure
>> + *
>> + * This defines DMA API structure for all virtio devices which would not
>> + * either bring in their own DMA OPS from architecture or they would not
>> + * like to use architecture specific IOMMU based DMA OPS because QEMU
>> + * expects GPA instead of an IOVA in absence of VIRTIO_F_IOMMU_PLATFORM.
>> + */
>> +dma_addr_t virtio_direct_map_page(struct device *dev, struct page *page,
>> + unsigned long offset, size_t size,
>> + enum dma_data_direction dir,
>> + unsigned long attrs)
>
> All these functions should probably be marked static.
Sure.
>
>> +void virtio_direct_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
>> + size_t size, enum dma_data_direction dir,
>> + unsigned long attrs)
>> +{
>> +}
>
> No need to implement no-op callbacks in struct dma_map_ops.
Okay.
>
>> +
>> +int virtio_direct_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
>> +{
>> + return 0;
>> +}
>
> Including this one.
>
>> +void *virtio_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
>> + gfp_t gfp, unsigned long attrs)
>> +{
>> + void *queue = alloc_pages_exact(PAGE_ALIGN(size), gfp);
>> +
>> + if (queue) {
>> + phys_addr_t phys_addr = virt_to_phys(queue);
>> + *dma_handle = (dma_addr_t)phys_addr;
>> +
>> + if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
>> + free_pages_exact(queue, PAGE_ALIGN(size));
>> + return NULL;
>> + }
>> + }
>> + return queue;
>
> queue is a very odd name in a generic memory allocator.
Will change it to addr.
>
>> +void virtio_direct_free(struct device *dev, size_t size, void *vaddr,
>> + dma_addr_t dma_addr, unsigned long attrs)
>> +{
>> + free_pages_exact(vaddr, PAGE_ALIGN(size));
>> +}
>> +
>> +const struct dma_map_ops virtio_direct_dma_ops = {
>> + .alloc = virtio_direct_alloc,
>> + .free = virtio_direct_free,
>> + .map_page = virtio_direct_map_page,
>> + .unmap_page = virtio_direct_unmap_page,
>> + .mapping_error = virtio_direct_mapping_error,
>> +};
>
> This is missing a dma_map_sg implementation. In general this is
> mandatory for dma_ops. So either you implement it or explain in
> a common why you think you can skip it.
Hmm. IIUC virtio core never used dma_map_sg(). Am I missing something
here ? The only reference to dma_map_sg() is inside a comment.
$git grep dma_map_sg drivers/virtio/
drivers/virtio/virtio_ring.c: * We can't use dma_map_sg, because we don't use scatterlists in
>
>> +EXPORT_SYMBOL(virtio_direct_dma_ops);
>
> EXPORT_SYMBOL_GPL like all virtio symbols, please.
I am planning to drop EXPORT_SYMBOL from virtio_direct_dma_ops structure.
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-07-30 13:26 UTC (permalink / raw)
To: Christoph Hellwig
Cc: robh, srikar, benh, Will Deacon, linux-kernel, linuxram,
virtualization, paulus, marc.zyngier, mpe, joe, robin.murphy,
david, linuxppc-dev, elfring, haren, Anshuman Khandual
In-Reply-To: <20180730111802.GA9830@infradead.org>
On Mon, Jul 30, 2018 at 04:18:02AM -0700, Christoph Hellwig wrote:
> On Mon, Jul 30, 2018 at 01:28:03PM +0300, Michael S. Tsirkin wrote:
> > Let me reply to the "crappy" part first:
> > So virtio devices can run on another CPU or on a PCI bus. Configuration
> > can happen over mupltiple transports. There is a discovery protocol to
> > figure out where it is. It has some warts but any real system has warts.
> >
> > So IMHO virtio running on another CPU isn't "legacy virtual crappy
> > virtio". virtio devices that actually sit on a PCI bus aren't "sane"
> > simply because the DMA is more convoluted on some architectures.
>
> All of what you said would be true if virtio didn't claim to be
> a PCI device.
There's nothing virtio claims to be. It's a PV device that uses PCI for
its configuration. Configuration is enumerated on the virtual PCI bus.
That part of the interface is emulated PCI. Data path is through a
PV device enumerated on the virtio bus.
> Once it claims to be a PCI device and we also see
> real hardware written to the interface I stand to all what I said
> above.
Real hardware would reuse parts of the interface but by necessity it
needs to behave slightly differently on some platforms. However for
some platforms (such as x86) a PV virtio driver will by luck work with a
PCI device backend without changes. As these platforms and drivers are
widely deployed, some people will deploy hardware like that. Should be
a non issue as by definition it's transparent to guests.
> > With this out of my system:
> > I agree these approaches are hacky. I think it is generally better to
> > have virtio feature negotiation tell you whether device runs on a CPU or
> > not rather than rely on platform specific ways for this. To this end
> > there was a recent proposal to rename VIRTIO_F_IO_BARRIER to
> > VIRTIO_F_REAL_DEVICE. It got stuck since "real" sounds vague to people,
> > e.g. what if it's a VF - is that real or not? But I can see something
> > like e.g. VIRTIO_F_PLATFORM_DMA gaining support.
> >
> > We would then rename virtio_has_iommu_quirk to virtio_has_dma_quirk
> > and test VIRTIO_F_PLATFORM_DMA in addition to the IOMMU thing.
>
> I don't really care about the exact naming, and indeed a device that
> sets the flag doesn't have to be a 'real' device - it just has to act
> like one. I explained all the issues that this means (at least relating
> to DMA) in one of the previous threads.
I believe you refer to this:
https://lkml.org/lkml/2018/6/7/15
that was a very helpful list outlining the problems we need to solve,
thanks a lot for that!
> The important bit is that we can specify exact behavior for both
> devices that sets the "I'm real!" flag and that ones that don't exactly
> in the spec.
I would very much like that, yes.
> And that very much excludes arch-specific (or
> Xen-specific) overrides.
We already committed to a xen specific hack but generally I prefer
devices that describe how they work instead of platforms magically
guessing, yes.
However the question people raise is that DMA API is already full of
arch-specific tricks the likes of which are outlined in your post linked
above. How is this one much worse?
--
MST
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox