* [PATCH net-next 01/13] virtio_ring: introduce vring_need_unmap_buffer
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect Xuan Zhuo
` (11 subsequent siblings)
12 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
To make the code readable, introduce vring_need_unmap_buffer() to
replace do_unmap.
   use_dma_api   premapped   ->   vring_need_unmap_buffer()
1.    false        false                 false
2.    true         false                 true
3.    true         true                  false
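In other words (an illustrative equivalence, not a literal line from the
old code), the helper computes on demand what do_unmap used to cache at
virtqueue creation time and at the premapped switch-over:

	/* old behaviour, folded into one assignment (illustration only) */
	vq->do_unmap = vq->use_dma_api && !vq->premapped;

	/* new helper, as introduced in the diff below */
	static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring)
	{
		return vring->use_dma_api && !vring->premapped;
	}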
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/virtio/virtio_ring.c | 27 ++++++++++++---------------
1 file changed, 12 insertions(+), 15 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index be7309b1e860..228e9fbcba3f 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -175,11 +175,6 @@ struct vring_virtqueue {
/* Do DMA mapping by driver */
bool premapped;
- /* Do unmap or not for desc. Just when premapped is False and
- * use_dma_api is true, this is true.
- */
- bool do_unmap;
-
/* Head of free buffer list. */
unsigned int free_head;
/* Number we've added since last sync. */
@@ -297,6 +292,11 @@ static bool vring_use_dma_api(const struct virtio_device *vdev)
return false;
}
+static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring)
+{
+ return vring->use_dma_api && !vring->premapped;
+}
+
size_t virtio_max_dma_size(const struct virtio_device *vdev)
{
size_t max_segment_size = SIZE_MAX;
@@ -445,7 +445,7 @@ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
{
u16 flags;
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
return;
flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
@@ -475,7 +475,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
goto out;
dma_unmap_page(vring_dma_dev(vq),
@@ -643,7 +643,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
}
/* Last one doesn't continue. */
desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
- if (!indirect && vq->do_unmap)
+ if (!indirect && vring_need_unmap_buffer(vq))
vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
~VRING_DESC_F_NEXT;
@@ -802,7 +802,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
VRING_DESC_F_INDIRECT));
BUG_ON(len == 0 || len % sizeof(struct vring_desc));
- if (vq->do_unmap) {
+ if (vring_need_unmap_buffer(vq)) {
for (j = 0; j < len / sizeof(struct vring_desc); j++)
vring_unmap_one_split_indirect(vq, &indir_desc[j]);
}
@@ -1236,7 +1236,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
return;
dma_unmap_page(vring_dma_dev(vq),
@@ -1251,7 +1251,7 @@ static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
{
u16 flags;
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
return;
flags = le16_to_cpu(desc->flags);
@@ -1632,7 +1632,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
if (!desc)
return;
- if (vq->do_unmap) {
+ if (vring_need_unmap_buffer(vq)) {
len = vq->packed.desc_extra[id].len;
for (i = 0; i < len / sizeof(struct vring_packed_desc);
i++)
@@ -2091,7 +2091,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->dma_dev = dma_dev;
vq->use_dma_api = vring_use_dma_api(vdev);
vq->premapped = false;
- vq->do_unmap = vq->use_dma_api;
vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
!context;
@@ -2636,7 +2635,6 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
vq->dma_dev = dma_dev;
vq->use_dma_api = vring_use_dma_api(vdev);
vq->premapped = false;
- vq->do_unmap = vq->use_dma_api;
vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
!context;
@@ -2799,7 +2797,6 @@ int virtqueue_set_dma_premapped(struct virtqueue *_vq)
}
vq->premapped = true;
- vq->do_unmap = false;
END_USE(vq);
--
2.32.0.3.g01195cf9f
* [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 01/13] virtio_ring: introduce vring_need_unmap_buffer Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 3:46 ` Jason Wang
2024-08-20 7:33 ` [PATCH net-next 03/13] virtio_ring: packed: " Xuan Zhuo
` (10 subsequent siblings)
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
1. this commit hardens dma unmap for indirect
2. the subsequent commit uses the struct extra to record whether the
buffers need to be unmapped or not. So we need a struct extra for
every desc, whether it is indirect or not.
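As a reference point for the allocation change (a sketch mirroring
alloc_indirect_split() in the diff below, not new code), the per-descriptor
extra array is carved out of the same allocation as the indirect table, so
no additional allocation is introduced:

	extra = kmalloc_array(total_sg, sizeof(*desc) + sizeof(*extra), gfp);
	desc  = (struct vring_desc *)&extra[total_sg];

	/*
	 * One allocation, two regions:
	 *
	 * | extra[0] .. extra[total_sg - 1] | desc[0] .. desc[total_sg - 1] |
	 */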
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/virtio/virtio_ring.c | 122 ++++++++++++++++-------------------
1 file changed, 57 insertions(+), 65 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 228e9fbcba3f..582d2c05498a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -67,9 +67,16 @@
#define LAST_ADD_TIME_INVALID(vq)
#endif
+struct vring_desc_extra {
+ dma_addr_t addr; /* Descriptor DMA addr. */
+ u32 len; /* Descriptor length. */
+ u16 flags; /* Descriptor flags. */
+ u16 next; /* The next desc state in a list. */
+};
+
struct vring_desc_state_split {
void *data; /* Data for callback. */
- struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
+ struct vring_desc_extra *indir; /* Indirect descriptor, if any. */
};
struct vring_desc_state_packed {
@@ -79,13 +86,6 @@ struct vring_desc_state_packed {
u16 last; /* The last desc state in a list. */
};
-struct vring_desc_extra {
- dma_addr_t addr; /* Descriptor DMA addr. */
- u32 len; /* Descriptor length. */
- u16 flags; /* Descriptor flags. */
- u16 next; /* The next desc state in a list. */
-};
-
struct vring_virtqueue_split {
/* Actual memory layout for this queue. */
struct vring vring;
@@ -440,38 +440,20 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num)
* Split ring specific functions - *_split().
*/
-static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
- const struct vring_desc *desc)
-{
- u16 flags;
-
- if (!vring_need_unmap_buffer(vq))
- return;
-
- flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
-
- dma_unmap_page(vring_dma_dev(vq),
- virtio64_to_cpu(vq->vq.vdev, desc->addr),
- virtio32_to_cpu(vq->vq.vdev, desc->len),
- (flags & VRING_DESC_F_WRITE) ?
- DMA_FROM_DEVICE : DMA_TO_DEVICE);
-}
-
static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
- unsigned int i)
+ struct vring_desc_extra *extra)
{
- struct vring_desc_extra *extra = vq->split.desc_extra;
u16 flags;
- flags = extra[i].flags;
+ flags = extra->flags;
if (flags & VRING_DESC_F_INDIRECT) {
if (!vq->use_dma_api)
goto out;
dma_unmap_single(vring_dma_dev(vq),
- extra[i].addr,
- extra[i].len,
+ extra->addr,
+ extra->len,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
@@ -479,20 +461,22 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
goto out;
dma_unmap_page(vring_dma_dev(vq),
- extra[i].addr,
- extra[i].len,
+ extra->addr,
+ extra->len,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
}
out:
- return extra[i].next;
+ return extra->next;
}
static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
unsigned int total_sg,
+ struct vring_desc_extra **pextra,
gfp_t gfp)
{
+ struct vring_desc_extra *extra;
struct vring_desc *desc;
unsigned int i;
@@ -503,40 +487,45 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
*/
gfp &= ~__GFP_HIGHMEM;
- desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
- if (!desc)
+ extra = kmalloc_array(total_sg, sizeof(*desc) + sizeof(*extra), gfp);
+ if (!extra)
return NULL;
- for (i = 0; i < total_sg; i++)
+ desc = (struct vring_desc *)&extra[total_sg];
+
+ for (i = 0; i < total_sg; i++) {
desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
+ extra[i].next = i + 1;
+ }
+
+ *pextra = extra;
+
return desc;
}
static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
struct vring_desc *desc,
+ struct vring_desc_extra *extra,
unsigned int i,
dma_addr_t addr,
unsigned int len,
u16 flags,
bool indirect)
{
- struct vring_virtqueue *vring = to_vvq(vq);
- struct vring_desc_extra *extra = vring->split.desc_extra;
u16 next;
desc[i].flags = cpu_to_virtio16(vq->vdev, flags);
desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
desc[i].len = cpu_to_virtio32(vq->vdev, len);
- if (!indirect) {
- next = extra[i].next;
- desc[i].next = cpu_to_virtio16(vq->vdev, next);
+ extra[i].addr = addr;
+ extra[i].len = len;
+ extra[i].flags = flags;
- extra[i].addr = addr;
- extra[i].len = len;
- extra[i].flags = flags;
- } else
- next = virtio16_to_cpu(vq->vdev, desc[i].next);
+ next = extra[i].next;
+
+ if (!indirect)
+ desc[i].next = cpu_to_virtio16(vq->vdev, next);
return next;
}
@@ -551,6 +540,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
gfp_t gfp)
{
struct vring_virtqueue *vq = to_vvq(_vq);
+ struct vring_desc_extra *extra;
struct scatterlist *sg;
struct vring_desc *desc;
unsigned int i, n, avail, descs_used, prev, err_idx;
@@ -574,7 +564,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
head = vq->free_head;
if (virtqueue_use_indirect(vq, total_sg))
- desc = alloc_indirect_split(_vq, total_sg, gfp);
+ desc = alloc_indirect_split(_vq, total_sg, &extra, gfp);
else {
desc = NULL;
WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
@@ -589,6 +579,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
} else {
indirect = false;
desc = vq->split.vring.desc;
+ extra = vq->split.desc_extra;
i = head;
descs_used = total_sg;
}
@@ -618,7 +609,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
/* Note that we trust indirect descriptor
* table since it use stream DMA mapping.
*/
- i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length,
+ i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length,
VRING_DESC_F_NEXT,
indirect);
}
@@ -634,7 +625,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
/* Note that we trust indirect descriptor
* table since it use stream DMA mapping.
*/
- i = virtqueue_add_desc_split(_vq, desc, i, addr,
+ i = virtqueue_add_desc_split(_vq, desc, extra, i, addr,
sg->length,
VRING_DESC_F_NEXT |
VRING_DESC_F_WRITE,
@@ -660,6 +651,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
}
virtqueue_add_desc_split(_vq, vq->split.vring.desc,
+ vq->split.desc_extra,
head, addr,
total_sg * sizeof(struct vring_desc),
VRING_DESC_F_INDIRECT,
@@ -678,9 +670,9 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
/* Store token and indirect buffer state. */
vq->split.desc_state[head].data = data;
if (indirect)
- vq->split.desc_state[head].indir_desc = desc;
+ vq->split.desc_state[head].indir = extra;
else
- vq->split.desc_state[head].indir_desc = ctx;
+ vq->split.desc_state[head].indir = ctx;
/* Put entry in available array (but don't update avail->idx until they
* do sync). */
@@ -716,11 +708,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
for (n = 0; n < total_sg; n++) {
if (i == err_idx)
break;
- if (indirect) {
- vring_unmap_one_split_indirect(vq, &desc[i]);
- i = virtio16_to_cpu(_vq->vdev, desc[i].next);
- } else
- i = vring_unmap_one_split(vq, i);
+
+ i = vring_unmap_one_split(vq, &extra[i]);
}
free_indirect:
@@ -765,22 +754,25 @@ static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
void **ctx)
{
+ struct vring_desc_extra *extra;
unsigned int i, j;
__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
/* Clear data ptr. */
vq->split.desc_state[head].data = NULL;
+ extra = vq->split.desc_extra;
+
/* Put back on free list: unmap first-level descriptors and find end */
i = head;
while (vq->split.vring.desc[i].flags & nextflag) {
- vring_unmap_one_split(vq, i);
+ vring_unmap_one_split(vq, &extra[i]);
i = vq->split.desc_extra[i].next;
vq->vq.num_free++;
}
- vring_unmap_one_split(vq, i);
+ vring_unmap_one_split(vq, &extra[i]);
vq->split.desc_extra[i].next = vq->free_head;
vq->free_head = head;
@@ -788,12 +780,12 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
vq->vq.num_free++;
if (vq->indirect) {
- struct vring_desc *indir_desc =
- vq->split.desc_state[head].indir_desc;
u32 len;
+ extra = vq->split.desc_state[head].indir;
+
/* Free the indirect table, if any, now that it's unmapped. */
- if (!indir_desc)
+ if (!extra)
return;
len = vq->split.desc_extra[head].len;
@@ -804,13 +796,13 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
if (vring_need_unmap_buffer(vq)) {
for (j = 0; j < len / sizeof(struct vring_desc); j++)
- vring_unmap_one_split_indirect(vq, &indir_desc[j]);
+ vring_unmap_one_split(vq, &extra[j]);
}
- kfree(indir_desc);
- vq->split.desc_state[head].indir_desc = NULL;
+ kfree(extra);
+ vq->split.desc_state[head].indir = NULL;
} else if (ctx) {
- *ctx = vq->split.desc_state[head].indir_desc;
+ *ctx = vq->split.desc_state[head].indir;
}
}
--
2.32.0.3.g01195cf9f
* Re: [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect
2024-08-20 7:33 ` [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect Xuan Zhuo
@ 2024-09-11 3:46 ` Jason Wang
2024-09-11 10:30 ` Michael S. Tsirkin
2024-09-12 7:30 ` Xuan Zhuo
0 siblings, 2 replies; 36+ messages in thread
From: Jason Wang @ 2024-09-11 3:46 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> 1. this commit hardens dma unmap for indirect
I think we need to explain why we need such hardening. For example
indirect uses stream mapping, which is read-only from the device. So it
looks to me like it doesn't require hardening by itself.
> 2. the subsequent commit uses the struct extra to record whether the
> buffers need to be unmapped or not.
It's better to explain why such a decision could not be implied with
the existing metadata.
> So we need a struct extra for
> every desc, whether it is indirect or not.
If this is the real reason, we need to tweak the title.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> drivers/virtio/virtio_ring.c | 122 ++++++++++++++++-------------------
> 1 file changed, 57 insertions(+), 65 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 228e9fbcba3f..582d2c05498a 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -67,9 +67,16 @@
> #define LAST_ADD_TIME_INVALID(vq)
> #endif
>
> +struct vring_desc_extra {
> + dma_addr_t addr; /* Descriptor DMA addr. */
> + u32 len; /* Descriptor length. */
> + u16 flags; /* Descriptor flags. */
> + u16 next; /* The next desc state in a list. */
> +};
> +
> struct vring_desc_state_split {
> void *data; /* Data for callback. */
> - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
> + struct vring_desc_extra *indir; /* Indirect descriptor, if any. */
Btw, it might be worth explaining that this will be allocated with an
indirect descriptor table, so we don't put more stress on the memory
allocator.
Thanks
* Re: [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect
2024-09-11 3:46 ` Jason Wang
@ 2024-09-11 10:30 ` Michael S. Tsirkin
2024-09-12 7:30 ` Xuan Zhuo
1 sibling, 0 replies; 36+ messages in thread
From: Michael S. Tsirkin @ 2024-09-11 10:30 UTC (permalink / raw)
To: Jason Wang
Cc: Xuan Zhuo, netdev, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, Sep 11, 2024 at 11:46:30AM +0800, Jason Wang wrote:
> On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > 1. this commit hardens dma unmap for indirect
>
> I think we need to explain why we need such hardening.
yes pls be more specific. Recording the same state in two
places is just a source of bugs, not hardening.
* Re: [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect
2024-09-11 3:46 ` Jason Wang
2024-09-11 10:30 ` Michael S. Tsirkin
@ 2024-09-12 7:30 ` Xuan Zhuo
1 sibling, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 7:30 UTC (permalink / raw)
To: Jason Wang
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 11:46:30 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > 1. this commit hardens dma unmap for indirect
>
> I think we need to explain why we need such hardening. For example
> indirect uses stream mapping, which is read-only from the device. So it
> looks to me like it doesn't require hardening by itself.
>
> > 2. the subsequent commit uses the struct extra to record whether the
> > buffers need to be unmapped or not.
>
> It's better to explain why such a decision could not be implied with
> the existing metadata.
>
> > So we need a struct extra for
> > every desc, whether it is indirect or not.
>
> If this is the real reason, we need to tweak the title.
YES. It is.
I will tweak the title in the next version.
>
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> > drivers/virtio/virtio_ring.c | 122 ++++++++++++++++-------------------
> > 1 file changed, 57 insertions(+), 65 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 228e9fbcba3f..582d2c05498a 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -67,9 +67,16 @@
> > #define LAST_ADD_TIME_INVALID(vq)
> > #endif
> >
> > +struct vring_desc_extra {
> > + dma_addr_t addr; /* Descriptor DMA addr. */
> > + u32 len; /* Descriptor length. */
> > + u16 flags; /* Descriptor flags. */
> > + u16 next; /* The next desc state in a list. */
> > +};
> > +
> > struct vring_desc_state_split {
> > void *data; /* Data for callback. */
> > - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
> > + struct vring_desc_extra *indir; /* Indirect descriptor, if any. */
>
> Btw, it might be worth explaining that this will be allocated with an
> indirect descriptor table, so we don't put more stress on the memory
> allocator.
Will do.
Thanks.
>
> Thanks
>
>
* [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 01/13] virtio_ring: introduce vring_need_unmap_buffer Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 02/13] virtio_ring: split: harden dma unmap for indirect Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-08-21 8:54 ` Dan Carpenter
2024-09-11 11:28 ` Michael S. Tsirkin
2024-08-20 7:33 ` [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer Xuan Zhuo
` (9 subsequent siblings)
12 siblings, 2 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
1. this commit hardens dma unmap for indirect
2. the subsequent commit uses the struct extra to record whether the
buffers need to be unmapped or not. So we need a struct extra for
every desc, whether it is indirect or not.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/virtio/virtio_ring.c | 57 ++++++++++++++++++------------------
1 file changed, 29 insertions(+), 28 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 582d2c05498a..b43eca93015c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -81,7 +81,7 @@ struct vring_desc_state_split {
struct vring_desc_state_packed {
void *data; /* Data for callback. */
- struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
+ struct vring_desc_extra *indir; /* Indirect descriptor, if any. */
u16 num; /* Descriptor list length. */
u16 last; /* The last desc state in a list. */
};
@@ -1238,27 +1238,13 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
}
}
-static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
- const struct vring_packed_desc *desc)
-{
- u16 flags;
-
- if (!vring_need_unmap_buffer(vq))
- return;
-
- flags = le16_to_cpu(desc->flags);
-
- dma_unmap_page(vring_dma_dev(vq),
- le64_to_cpu(desc->addr),
- le32_to_cpu(desc->len),
- (flags & VRING_DESC_F_WRITE) ?
- DMA_FROM_DEVICE : DMA_TO_DEVICE);
-}
-
static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg,
+ struct vring_desc_extra **pextra,
gfp_t gfp)
{
+ struct vring_desc_extra *extra;
struct vring_packed_desc *desc;
+ int i;
/*
* We require lowmem mappings for the descriptors because
@@ -1267,7 +1253,14 @@ static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg,
*/
gfp &= ~__GFP_HIGHMEM;
- desc = kmalloc_array(total_sg, sizeof(struct vring_packed_desc), gfp);
+ extra = kmalloc_array(total_sg, sizeof(*desc) + sizeof(*extra), gfp);
+
+ desc = (struct vring_packed_desc *)&extra[total_sg];
+
+ for (i = 0; i < total_sg; i++)
+ extra[i].next = i + 1;
+
+ *pextra = extra;
return desc;
}
@@ -1280,6 +1273,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
void *data,
gfp_t gfp)
{
+ struct vring_desc_extra *extra;
struct vring_packed_desc *desc;
struct scatterlist *sg;
unsigned int i, n, err_idx;
@@ -1287,7 +1281,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
dma_addr_t addr;
head = vq->packed.next_avail_idx;
- desc = alloc_indirect_packed(total_sg, gfp);
+ desc = alloc_indirect_packed(total_sg, &extra, gfp);
if (!desc)
return -ENOMEM;
@@ -1313,6 +1307,12 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
desc[i].addr = cpu_to_le64(addr);
desc[i].len = cpu_to_le32(sg->length);
i++;
+
+ if (unlikely(vq->use_dma_api)) {
+ extra[i].addr = addr;
+ extra[i].len = sg->length;
+ extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE;
+ }
}
}
@@ -1367,7 +1367,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
/* Store token and indirect buffer state. */
vq->packed.desc_state[id].num = 1;
vq->packed.desc_state[id].data = data;
- vq->packed.desc_state[id].indir_desc = desc;
+ vq->packed.desc_state[id].indir = extra;
vq->packed.desc_state[id].last = id;
vq->num_added += 1;
@@ -1381,7 +1381,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
err_idx = i;
for (i = 0; i < err_idx; i++)
- vring_unmap_desc_packed(vq, &desc[i]);
+ vring_unmap_extra_packed(vq, &extra[i]);
free_desc:
kfree(desc);
@@ -1504,7 +1504,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
/* Store token. */
vq->packed.desc_state[id].num = descs_used;
vq->packed.desc_state[id].data = data;
- vq->packed.desc_state[id].indir_desc = ctx;
+ vq->packed.desc_state[id].indir = ctx;
vq->packed.desc_state[id].last = prev;
/*
@@ -1617,23 +1617,24 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
}
if (vq->indirect) {
+ struct vring_desc_extra *extra;
u32 len;
/* Free the indirect table, if any, now that it's unmapped. */
- desc = state->indir_desc;
- if (!desc)
+ extra = state->indir;
+ if (!extra)
return;
if (vring_need_unmap_buffer(vq)) {
len = vq->packed.desc_extra[id].len;
for (i = 0; i < len / sizeof(struct vring_packed_desc);
i++)
- vring_unmap_desc_packed(vq, &desc[i]);
+ vring_unmap_extra_packed(vq, &extra[i]);
}
kfree(desc);
- state->indir_desc = NULL;
+ state->indir = NULL;
} else if (ctx) {
- *ctx = state->indir_desc;
+ *ctx = state->indir;
}
}
--
2.32.0.3.g01195cf9f
* Re: [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
2024-08-20 7:33 ` [PATCH net-next 03/13] virtio_ring: packed: " Xuan Zhuo
@ 2024-08-21 8:54 ` Dan Carpenter
2024-09-11 11:28 ` Michael S. Tsirkin
1 sibling, 0 replies; 36+ messages in thread
From: Dan Carpenter @ 2024-08-21 8:54 UTC (permalink / raw)
To: oe-kbuild, Xuan Zhuo, netdev
Cc: lkp, oe-kbuild-all, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
Eugenio Pérez, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Hi Xuan,
kernel test robot noticed the following build warnings:
url: https://github.com/intel-lab-lkp/linux/commits/Xuan-Zhuo/virtio_ring-introduce-vring_need_unmap_buffer/20240820-153644
base: net-next/main
patch link: https://lore.kernel.org/r/20240820073330.9161-4-xuanzhuo%40linux.alibaba.com
patch subject: [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
config: x86_64-randconfig-161-20240820 (https://download.01.org/0day-ci/archive/20240821/202408210655.dx8v5uRW-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202408210655.dx8v5uRW-lkp@intel.com/
New smatch warnings:
drivers/virtio/virtio_ring.c:1634 detach_buf_packed() error: uninitialized symbol 'desc'.
vim +/desc +1634 drivers/virtio/virtio_ring.c
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1594 static void detach_buf_packed(struct vring_virtqueue *vq,
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1595 unsigned int id, void **ctx)
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1596 {
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1597 struct vring_desc_state_packed *state = NULL;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1598 struct vring_packed_desc *desc;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1599 unsigned int i, curr;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1600
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1601 state = &vq->packed.desc_state[id];
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1602
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1603 /* Clear data ptr. */
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1604 state->data = NULL;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1605
aeef9b4733c5c2 Jason Wang 2021-06-04 1606 vq->packed.desc_extra[state->last].next = vq->free_head;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1607 vq->free_head = id;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1608 vq->vq.num_free += state->num;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1609
d5c0ed17fea60c Xuan Zhuo 2024-02-23 1610 if (unlikely(vq->use_dma_api)) {
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1611 curr = id;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1612 for (i = 0; i < state->num; i++) {
d80dc15bb6e76a Xuan Zhuo 2022-02-24 1613 vring_unmap_extra_packed(vq,
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1614 &vq->packed.desc_extra[curr]);
aeef9b4733c5c2 Jason Wang 2021-06-04 1615 curr = vq->packed.desc_extra[curr].next;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1616 }
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1617 }
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1618
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1619 if (vq->indirect) {
dfcc54f92ab71c Xuan Zhuo 2024-08-20 1620 struct vring_desc_extra *extra;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1621 u32 len;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1622
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1623 /* Free the indirect table, if any, now that it's unmapped. */
dfcc54f92ab71c Xuan Zhuo 2024-08-20 1624 extra = state->indir;
dfcc54f92ab71c Xuan Zhuo 2024-08-20 1625 if (!extra)
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1626 return;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1627
de6a29c4b4c442 Xuan Zhuo 2024-08-20 1628 if (vring_need_unmap_buffer(vq)) {
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1629 len = vq->packed.desc_extra[id].len;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1630 for (i = 0; i < len / sizeof(struct vring_packed_desc);
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1631 i++)
dfcc54f92ab71c Xuan Zhuo 2024-08-20 1632 vring_unmap_extra_packed(vq, &extra[i]);
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1633 }
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 @1634 kfree(desc);
^^^^
desc is never initialized/used.
dfcc54f92ab71c Xuan Zhuo 2024-08-20 1635 state->indir = NULL;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1636 } else if (ctx) {
dfcc54f92ab71c Xuan Zhuo 2024-08-20 1637 *ctx = state->indir;
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1638 }
1ce9e6055fa0a9 Tiwei Bie 2018-11-21 1639 }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
2024-08-20 7:33 ` [PATCH net-next 03/13] virtio_ring: packed: " Xuan Zhuo
2024-08-21 8:54 ` Dan Carpenter
@ 2024-09-11 11:28 ` Michael S. Tsirkin
2024-09-12 6:55 ` Xuan Zhuo
1 sibling, 1 reply; 36+ messages in thread
From: Michael S. Tsirkin @ 2024-09-11 11:28 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
As gcc luckily noted:
On Tue, Aug 20, 2024 at 03:33:20PM +0800, Xuan Zhuo wrote:
> @@ -1617,23 +1617,24 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> }
>
> if (vq->indirect) {
> + struct vring_desc_extra *extra;
> u32 len;
>
> /* Free the indirect table, if any, now that it's unmapped. */
> - desc = state->indir_desc;
> - if (!desc)
desc is no longer initialized here
> + extra = state->indir;
> + if (!extra)
> return;
>
> if (vring_need_unmap_buffer(vq)) {
> len = vq->packed.desc_extra[id].len;
> for (i = 0; i < len / sizeof(struct vring_packed_desc);
> i++)
> - vring_unmap_desc_packed(vq, &desc[i]);
> + vring_unmap_extra_packed(vq, &extra[i]);
> }
> kfree(desc);
but freed here
> - state->indir_desc = NULL;
> + state->indir = NULL;
> } else if (ctx) {
> - *ctx = state->indir_desc;
> + *ctx = state->indir;
> }
> }
It seems unlikely this was always 0 on all paths with even
a small amount of stress, so now I question how this was tested.
Besides, do not ignore compiler warnings, and do not tweak code
to just make compiler shut up - they are your friend.
>
> --
> 2.32.0.3.g01195cf9f
* Re: [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
2024-09-11 11:28 ` Michael S. Tsirkin
@ 2024-09-12 6:55 ` Xuan Zhuo
2024-09-12 7:38 ` Michael S. Tsirkin
0 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 6:55 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 07:28:36 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> As gcc luckily noted:
>
> On Tue, Aug 20, 2024 at 03:33:20PM +0800, Xuan Zhuo wrote:
> > @@ -1617,23 +1617,24 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> > }
> >
> > if (vq->indirect) {
> > + struct vring_desc_extra *extra;
> > u32 len;
> >
> > /* Free the indirect table, if any, now that it's unmapped. */
> > - desc = state->indir_desc;
> > - if (!desc)
>
> desc is no longer initialized here
Will fix.
>
> > + extra = state->indir;
> > + if (!extra)
> > return;
> >
> > if (vring_need_unmap_buffer(vq)) {
> > len = vq->packed.desc_extra[id].len;
> > for (i = 0; i < len / sizeof(struct vring_packed_desc);
> > i++)
> > - vring_unmap_desc_packed(vq, &desc[i]);
> > + vring_unmap_extra_packed(vq, &extra[i]);
> > }
> > kfree(desc);
>
>
> but freed here
>
> > - state->indir_desc = NULL;
> > + state->indir = NULL;
> > } else if (ctx) {
> > - *ctx = state->indir_desc;
> > + *ctx = state->indir;
> > }
> > }
>
>
> It seems unlikely this was always 0 on all paths with even
> a small amount of stress, so now I question how this was tested.
> Besides, do not ignore compiler warnings, and do not tweak code
> to just make compiler shut up - they are your friend.
I agree.
Normally I do this by make W=12, but we have too many messages,
so I missed this.
make W=12 drivers/net/virtio_net.o drivers/virtio/virtio_ring.o
If not W=12, then I did not get any warning message.
How do you get the message quickly?
Thanks.
>
> >
> > --
> > 2.32.0.3.g01195cf9f
>
* Re: [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
2024-09-12 6:55 ` Xuan Zhuo
@ 2024-09-12 7:38 ` Michael S. Tsirkin
2024-09-12 7:43 ` Xuan Zhuo
0 siblings, 1 reply; 36+ messages in thread
From: Michael S. Tsirkin @ 2024-09-12 7:38 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Thu, Sep 12, 2024 at 02:55:38PM +0800, Xuan Zhuo wrote:
> On Wed, 11 Sep 2024 07:28:36 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > As gcc luckily noted:
> >
> > On Tue, Aug 20, 2024 at 03:33:20PM +0800, Xuan Zhuo wrote:
> > > @@ -1617,23 +1617,24 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> > > }
> > >
> > > if (vq->indirect) {
> > > + struct vring_desc_extra *extra;
> > > u32 len;
> > >
> > > /* Free the indirect table, if any, now that it's unmapped. */
> > > - desc = state->indir_desc;
> > > - if (!desc)
> >
> > desc is no longer initialized here
>
>
> Will fix.
>
>
> >
> > > + extra = state->indir;
> > > + if (!extra)
> > > return;
> > >
> > > if (vring_need_unmap_buffer(vq)) {
> > > len = vq->packed.desc_extra[id].len;
> > > for (i = 0; i < len / sizeof(struct vring_packed_desc);
> > > i++)
> > > - vring_unmap_desc_packed(vq, &desc[i]);
> > > + vring_unmap_extra_packed(vq, &extra[i]);
> > > }
> > > kfree(desc);
> >
> >
> > but freed here
> >
> > > - state->indir_desc = NULL;
> > > + state->indir = NULL;
> > > } else if (ctx) {
> > > - *ctx = state->indir_desc;
> > > + *ctx = state->indir;
> > > }
> > > }
> >
> >
> > It seems unlikely this was always 0 on all paths with even
> > a small amount of stress, so now I question how this was tested.
> > Besides, do not ignore compiler warnings, and do not tweak code
> > to just make compiler shut up - they are your friend.
>
> I agree.
>
> Normally I do this by make W=12, but we have too many messages,
> so I missed this.
>
> make W=12 drivers/net/virtio_net.o drivers/virtio/virtio_ring.o
>
> If not W=12, then I did not get any warning message.
> How do you get the message quickly?
>
> Thanks.
If you stress test this for a long enough time, and with
debug enabled, you will see a crash.
> >
> > >
> > > --
> > > 2.32.0.3.g01195cf9f
> >
* Re: [PATCH net-next 03/13] virtio_ring: packed: harden dma unmap for indirect
2024-09-12 7:38 ` Michael S. Tsirkin
@ 2024-09-12 7:43 ` Xuan Zhuo
0 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 7:43 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Thu, 12 Sep 2024 03:38:12 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Sep 12, 2024 at 02:55:38PM +0800, Xuan Zhuo wrote:
> > On Wed, 11 Sep 2024 07:28:36 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > As gcc luckily noted:
> > >
> > > On Tue, Aug 20, 2024 at 03:33:20PM +0800, Xuan Zhuo wrote:
> > > > @@ -1617,23 +1617,24 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> > > > }
> > > >
> > > > if (vq->indirect) {
> > > > + struct vring_desc_extra *extra;
> > > > u32 len;
> > > >
> > > > /* Free the indirect table, if any, now that it's unmapped. */
> > > > - desc = state->indir_desc;
> > > > - if (!desc)
> > >
> > > desc is no longer initialized here
> >
> >
> > Will fix.
> >
> >
> > >
> > > > + extra = state->indir;
> > > > + if (!extra)
> > > > return;
> > > >
> > > > if (vring_need_unmap_buffer(vq)) {
> > > > len = vq->packed.desc_extra[id].len;
> > > > for (i = 0; i < len / sizeof(struct vring_packed_desc);
> > > > i++)
> > > > - vring_unmap_desc_packed(vq, &desc[i]);
> > > > + vring_unmap_extra_packed(vq, &extra[i]);
> > > > }
> > > > kfree(desc);
> > >
> > >
> > > but freed here
> > >
> > > > - state->indir_desc = NULL;
> > > > + state->indir = NULL;
> > > > } else if (ctx) {
> > > > - *ctx = state->indir_desc;
> > > > + *ctx = state->indir;
> > > > }
> > > > }
> > >
> > >
> > > It seems unlikely this was always 0 on all paths with even
> > > a small amount of stress, so now I question how this was tested.
> > > Besides, do not ignore compiler warnings, and do not tweak code
> > > to just make compiler shut up - they are your friend.
> >
> > I agree.
> >
> > Normally I do this by make W=12, but we have too many messages,
> > so I missed this.
> >
> > make W=12 drivers/net/virtio_net.o drivers/virtio/virtio_ring.o
> >
> > If not W=12, then I did not get any warning message.
> > How do you get the message quickly?
> >
> > Thanks.
>
>
> If you stress test this for a long enough time, and with
> debug enabled, you will see a crash.
I only stress tested the split ring. For the packed ring, I
just performed a simple verification.
I will stress test both modes in the next version.
Thanks.
>
>
> > >
> > > >
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > >
>
* [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (2 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 03/13] virtio_ring: packed: " Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 3:54 ` Jason Wang
2024-08-20 7:33 ` [PATCH net-next 05/13] virtio-net: rq submits premapped buffer per buffer Xuan Zhuo
` (8 subsequent siblings)
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The current configuration sets the virtqueue (vq) to premapped mode,
implying that all buffers submitted to this queue must be mapped ahead
of time. This presents a challenge for the virtnet send queue (sq): the
virtnet driver would be required to keep track of dma information for vq
size * 17, which can be substantial. However, if the premapped mode were
applied on a per-buffer basis, the complexity would be greatly reduced.
With AF_XDP enabled, AF_XDP buffers would become premapped, while kernel
skb buffers could remain unmapped.
We can distinguish them by sg_page(sg): when sg_page(sg) is NULL, this
indicates that the driver has performed DMA mapping in advance, allowing
the Virtio core to directly utilize sg_dma_address(sg) without
conducting any internal DMA mapping. Additionally, DMA unmap operations
for this buffer will be bypassed.
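For illustration only (dma_addr, len, token and vq are placeholders, and
the buffer is assumed to have been DMA mapped by the driver beforehand),
submitting a premapped buffer then looks roughly like this:

	struct scatterlist sg;
	int err;

	sg_init_table(&sg, 1);		/* sg_page(&sg) stays NULL */
	sg.dma_address = dma_addr;	/* mapping already done by the driver */
	sg.length = len;

	/* The virtio core sees sg_page(&sg) == NULL, so it uses
	 * sg.dma_address as-is and skips the DMA unmap for this buffer. */
	err = virtqueue_add_outbuf(vq, &sg, 1, token, GFP_ATOMIC);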
Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/virtio/virtio_ring.c | 70 +++++++++++++++++++++---------------
1 file changed, 41 insertions(+), 29 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index b43eca93015c..7efddc71af67 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -235,6 +235,7 @@ static void vring_free(struct virtqueue *_vq);
*/
#define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
+#define sg_is_premapped(sg) (!sg_page(sg))
static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
unsigned int total_sg)
@@ -292,9 +293,10 @@ static bool vring_use_dma_api(const struct virtio_device *vdev)
return false;
}
-static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring)
+static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring,
+ const struct vring_desc_extra *extra)
{
- return vring->use_dma_api && !vring->premapped;
+ return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR);
}
size_t virtio_max_dma_size(const struct virtio_device *vdev)
@@ -366,7 +368,7 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
enum dma_data_direction direction, dma_addr_t *addr)
{
- if (vq->premapped) {
+ if (sg_is_premapped(sg)) {
*addr = sg_dma_address(sg);
return 0;
}
@@ -457,7 +459,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
- if (!vring_need_unmap_buffer(vq))
+ if (!vring_need_unmap_buffer(vq, extra))
goto out;
dma_unmap_page(vring_dma_dev(vq),
@@ -510,7 +512,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
dma_addr_t addr,
unsigned int len,
u16 flags,
- bool indirect)
+ bool indirect, bool premapped)
{
u16 next;
@@ -518,7 +520,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
desc[i].len = cpu_to_virtio32(vq->vdev, len);
- extra[i].addr = addr;
+ extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr;
extra[i].len = len;
extra[i].flags = flags;
@@ -611,7 +613,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
*/
i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length,
VRING_DESC_F_NEXT,
- indirect);
+ indirect, sg_is_premapped(sg));
}
}
for (; n < (out_sgs + in_sgs); n++) {
@@ -629,12 +631,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
sg->length,
VRING_DESC_F_NEXT |
VRING_DESC_F_WRITE,
- indirect);
+ indirect, sg_is_premapped(sg));
}
}
/* Last one doesn't continue. */
desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
- if (!indirect && vring_need_unmap_buffer(vq))
+ if (!indirect && vring_need_unmap_buffer(vq, &extra[prev]))
vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
~VRING_DESC_F_NEXT;
@@ -643,19 +645,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
dma_addr_t addr = vring_map_single(
vq, desc, total_sg * sizeof(struct vring_desc),
DMA_TO_DEVICE);
- if (vring_mapping_error(vq, addr)) {
- if (vq->premapped)
- goto free_indirect;
-
+ if (vring_mapping_error(vq, addr))
goto unmap_release;
- }
virtqueue_add_desc_split(_vq, vq->split.vring.desc,
vq->split.desc_extra,
head, addr,
total_sg * sizeof(struct vring_desc),
VRING_DESC_F_INDIRECT,
- false);
+ false, false);
}
/* We're using some buffers from the free list. */
@@ -712,7 +710,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
i = vring_unmap_one_split(vq, &extra[i]);
}
-free_indirect:
if (indirect)
kfree(desc);
@@ -794,7 +791,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
VRING_DESC_F_INDIRECT));
BUG_ON(len == 0 || len % sizeof(struct vring_desc));
- if (vring_need_unmap_buffer(vq)) {
+ if (vq->use_dma_api) {
for (j = 0; j < len / sizeof(struct vring_desc); j++)
vring_unmap_one_split(vq, &extra[j]);
}
@@ -1228,7 +1225,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
- if (!vring_need_unmap_buffer(vq))
+ if (!vring_need_unmap_buffer(vq, extra))
return;
dma_unmap_page(vring_dma_dev(vq),
@@ -1309,7 +1306,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
i++;
if (unlikely(vq->use_dma_api)) {
- extra[i].addr = addr;
+ extra[i].addr = sg_is_premapped(sg) ? DMA_MAPPING_ERROR : addr;
extra[i].len = sg->length;
extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE;
}
@@ -1320,12 +1317,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
addr = vring_map_single(vq, desc,
total_sg * sizeof(struct vring_packed_desc),
DMA_TO_DEVICE);
- if (vring_mapping_error(vq, addr)) {
- if (vq->premapped)
- goto free_desc;
-
+ if (vring_mapping_error(vq, addr))
goto unmap_release;
- }
vq->packed.vring.desc[head].addr = cpu_to_le64(addr);
vq->packed.vring.desc[head].len = cpu_to_le32(total_sg *
@@ -1383,7 +1376,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
for (i = 0; i < err_idx; i++)
vring_unmap_extra_packed(vq, &extra[i]);
-free_desc:
kfree(desc);
END_USE(vq);
@@ -1474,7 +1466,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
desc[i].id = cpu_to_le16(id);
if (unlikely(vq->use_dma_api)) {
- vq->packed.desc_extra[curr].addr = addr;
+ vq->packed.desc_extra[curr].addr = sg_is_premapped(sg) ?
+ DMA_MAPPING_ERROR : addr;
vq->packed.desc_extra[curr].len = sg->length;
vq->packed.desc_extra[curr].flags =
le16_to_cpu(flags);
@@ -1625,10 +1618,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
if (!extra)
return;
- if (vring_need_unmap_buffer(vq)) {
+ if (vq->use_dma_api) {
len = vq->packed.desc_extra[id].len;
- for (i = 0; i < len / sizeof(struct vring_packed_desc);
- i++)
+ for (i = 0; i < len / sizeof(struct vring_packed_desc); i++)
vring_unmap_extra_packed(vq, &extra[i]);
}
kfree(desc);
@@ -2212,6 +2204,11 @@ static inline int virtqueue_add(struct virtqueue *_vq,
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
+ * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
+ * mapping in advance, allowing the virtio core to directly utilize
+ * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
+ * DMA unmap operations for this buffer will be bypassed.
+ *
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
@@ -2246,6 +2243,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
+ * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
+ * mapping in advance, allowing the virtio core to directly utilize
+ * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
+ * DMA unmap operations for this buffer will be bypassed.
+ *
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
@@ -2268,6 +2270,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
+ * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
+ * mapping in advance, allowing the virtio core to directly utilize
+ * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
+ * DMA unmap operations for this buffer will be bypassed.
+ *
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
@@ -2291,6 +2298,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
* @ctx: extra context for the token
* @gfp: how to do memory allocations (if necessary).
*
+ * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
+ * mapping in advance, allowing the virtio core to directly utilize
+ * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
+ * DMA unmap operations for this buffer will be bypassed.
+ *
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
--
2.32.0.3.g01195cf9f
* Re: [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer
2024-08-20 7:33 ` [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer Xuan Zhuo
@ 2024-09-11 3:54 ` Jason Wang
2024-09-12 7:36 ` Xuan Zhuo
0 siblings, 1 reply; 36+ messages in thread
From: Jason Wang @ 2024-09-11 3:54 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> The current configuration sets the virtqueue (vq) to premapped mode,
> implying that all buffers submitted to this queue must be mapped ahead
> of time. This presents a challenge for the virtnet send queue (sq): the
> virtnet driver would be required to keep track of dma information for vq
> size * 17, which can be substantial. However, if the premapped mode were
> applied on a per-buffer basis, the complexity would be greatly reduced.
> With AF_XDP enabled, AF_XDP buffers would become premapped, while kernel
> skb buffers could remain unmapped.
Is this applied only to TX, or to both TX and RX?
>
> We can distinguish them by sg_page(sg): when sg_page(sg) is NULL, this
> indicates that the driver has performed DMA mapping in advance, allowing
> the Virtio core to directly utilize sg_dma_address(sg) without
> conducting any internal DMA mapping.
This seems to conflict with the code below?
#define sg_is_premapped(sg) (!sg_page(sg))
And rethinking the design, a question: is there a chance that a
VM_PFNMAP area could be used for virtio-net? If so, the trick
with sg_page() cannot work?
A quick glance told me AF_XDP seems to be safe as it uses
pin_user_pages(), but we need to check other possibilities.
Or we need to fall back to our previous idea, having new APIs.
> Additionally, DMA unmap operations
> for this buffer will be bypassed.
>
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> drivers/virtio/virtio_ring.c | 70 +++++++++++++++++++++---------------
> 1 file changed, 41 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index b43eca93015c..7efddc71af67 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -235,6 +235,7 @@ static void vring_free(struct virtqueue *_vq);
> */
>
> #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
> +#define sg_is_premapped(sg) (!sg_page(sg))
>
> static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> unsigned int total_sg)
> @@ -292,9 +293,10 @@ static bool vring_use_dma_api(const struct virtio_device *vdev)
> return false;
> }
>
> -static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring)
> +static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring,
> + const struct vring_desc_extra *extra)
> {
> - return vring->use_dma_api && !vring->premapped;
> + return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR);
> }
>
> size_t virtio_max_dma_size(const struct virtio_device *vdev)
> @@ -366,7 +368,7 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
> static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
> enum dma_data_direction direction, dma_addr_t *addr)
> {
> - if (vq->premapped) {
> + if (sg_is_premapped(sg)) {
> *addr = sg_dma_address(sg);
> return 0;
> }
> @@ -457,7 +459,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> (flags & VRING_DESC_F_WRITE) ?
> DMA_FROM_DEVICE : DMA_TO_DEVICE);
> } else {
> - if (!vring_need_unmap_buffer(vq))
> + if (!vring_need_unmap_buffer(vq, extra))
> goto out;
>
> dma_unmap_page(vring_dma_dev(vq),
> @@ -510,7 +512,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
> dma_addr_t addr,
> unsigned int len,
> u16 flags,
> - bool indirect)
> + bool indirect, bool premapped)
> {
> u16 next;
>
> @@ -518,7 +520,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
> desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
> desc[i].len = cpu_to_virtio32(vq->vdev, len);
>
> - extra[i].addr = addr;
> + extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr;
> extra[i].len = len;
> extra[i].flags = flags;
>
> @@ -611,7 +613,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> */
> i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length,
> VRING_DESC_F_NEXT,
> - indirect);
> + indirect, sg_is_premapped(sg));
> }
> }
> for (; n < (out_sgs + in_sgs); n++) {
> @@ -629,12 +631,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> sg->length,
> VRING_DESC_F_NEXT |
> VRING_DESC_F_WRITE,
> - indirect);
> + indirect, sg_is_premapped(sg));
> }
> }
> /* Last one doesn't continue. */
> desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> - if (!indirect && vring_need_unmap_buffer(vq))
> + if (!indirect && vring_need_unmap_buffer(vq, &extra[prev]))
> vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
> ~VRING_DESC_F_NEXT;
>
> @@ -643,19 +645,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> dma_addr_t addr = vring_map_single(
> vq, desc, total_sg * sizeof(struct vring_desc),
> DMA_TO_DEVICE);
> - if (vring_mapping_error(vq, addr)) {
> - if (vq->premapped)
> - goto free_indirect;
> -
> + if (vring_mapping_error(vq, addr))
> goto unmap_release;
> - }
>
> virtqueue_add_desc_split(_vq, vq->split.vring.desc,
> vq->split.desc_extra,
> head, addr,
> total_sg * sizeof(struct vring_desc),
> VRING_DESC_F_INDIRECT,
> - false);
> + false, false);
> }
>
> /* We're using some buffers from the free list. */
> @@ -712,7 +710,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> i = vring_unmap_one_split(vq, &extra[i]);
> }
>
> -free_indirect:
> if (indirect)
> kfree(desc);
>
> @@ -794,7 +791,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> VRING_DESC_F_INDIRECT));
> BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>
> - if (vring_need_unmap_buffer(vq)) {
> + if (vq->use_dma_api) {
> for (j = 0; j < len / sizeof(struct vring_desc); j++)
> vring_unmap_one_split(vq, &extra[j]);
> }
> @@ -1228,7 +1225,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> (flags & VRING_DESC_F_WRITE) ?
> DMA_FROM_DEVICE : DMA_TO_DEVICE);
> } else {
> - if (!vring_need_unmap_buffer(vq))
> + if (!vring_need_unmap_buffer(vq, extra))
> return;
>
> dma_unmap_page(vring_dma_dev(vq),
> @@ -1309,7 +1306,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> i++;
>
> if (unlikely(vq->use_dma_api)) {
> - extra[i].addr = addr;
> + extra[i].addr = sg_is_premapped(sg) ? DMA_MAPPING_ERROR : addr;
> extra[i].len = sg->length;
> extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE;
> }
> @@ -1320,12 +1317,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> addr = vring_map_single(vq, desc,
> total_sg * sizeof(struct vring_packed_desc),
> DMA_TO_DEVICE);
> - if (vring_mapping_error(vq, addr)) {
> - if (vq->premapped)
> - goto free_desc;
> -
> + if (vring_mapping_error(vq, addr))
> goto unmap_release;
> - }
>
> vq->packed.vring.desc[head].addr = cpu_to_le64(addr);
> vq->packed.vring.desc[head].len = cpu_to_le32(total_sg *
> @@ -1383,7 +1376,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> for (i = 0; i < err_idx; i++)
> vring_unmap_extra_packed(vq, &extra[i]);
>
> -free_desc:
> kfree(desc);
>
> END_USE(vq);
> @@ -1474,7 +1466,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> desc[i].id = cpu_to_le16(id);
>
> if (unlikely(vq->use_dma_api)) {
> - vq->packed.desc_extra[curr].addr = addr;
> + vq->packed.desc_extra[curr].addr = sg_is_premapped(sg) ?
> + DMA_MAPPING_ERROR : addr;
> vq->packed.desc_extra[curr].len = sg->length;
> vq->packed.desc_extra[curr].flags =
> le16_to_cpu(flags);
> @@ -1625,10 +1618,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> if (!extra)
> return;
>
> - if (vring_need_unmap_buffer(vq)) {
> + if (vq->use_dma_api) {
> len = vq->packed.desc_extra[id].len;
> - for (i = 0; i < len / sizeof(struct vring_packed_desc);
> - i++)
> + for (i = 0; i < len / sizeof(struct vring_packed_desc); i++)
Unnecessary changes?
> vring_unmap_extra_packed(vq, &extra[i]);
> }
> kfree(desc);
> @@ -2212,6 +2204,11 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> * @data: the token identifying the buffer.
> * @gfp: how to do memory allocations (if necessary).
> *
> + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> + * mapping in advance, allowing the virtio core to directly utilize
> + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> + * DMA unmap operations for this buffer will be bypassed.
> + *
> * Caller must ensure we don't call this with other virtqueue operations
> * at the same time (except where noted).
> *
> @@ -2246,6 +2243,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
> * @data: the token identifying the buffer.
> * @gfp: how to do memory allocations (if necessary).
> *
> + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> + * mapping in advance, allowing the virtio core to directly utilize
> + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> + * DMA unmap operations for this buffer will be bypassed.
> + *
> * Caller must ensure we don't call this with other virtqueue operations
> * at the same time (except where noted).
> *
> @@ -2268,6 +2270,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
> * @data: the token identifying the buffer.
> * @gfp: how to do memory allocations (if necessary).
> *
> + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> + * mapping in advance, allowing the virtio core to directly utilize
> + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> + * DMA unmap operations for this buffer will be bypassed.
> + *
> * Caller must ensure we don't call this with other virtqueue operations
> * at the same time (except where noted).
> *
> @@ -2291,6 +2298,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
> * @ctx: extra context for the token
> * @gfp: how to do memory allocations (if necessary).
> *
> + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> + * mapping in advance, allowing the virtio core to directly utilize
> + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> + * DMA unmap operations for this buffer will be bypassed.
> + *
> * Caller must ensure we don't call this with other virtqueue operations
> * at the same time (except where noted).
> *
> --
> 2.32.0.3.g01195cf9f
>
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer
2024-09-11 3:54 ` Jason Wang
@ 2024-09-12 7:36 ` Xuan Zhuo
2024-09-13 3:36 ` Jason Wang
0 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 7:36 UTC (permalink / raw)
To: Jason Wang
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 11:54:25 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > The current configuration sets the virtqueue (vq) to premapped mode,
> > implying that all buffers submitted to this queue must be mapped ahead
> > of time. This presents a challenge for the virtnet send queue (sq): the
> > virtnet driver would be required to keep track of dma information for vq
> > size * 17, which can be substantial. However, if the premapped mode were
> > applied on a per-buffer basis, the complexity would be greatly reduced.
> > With AF_XDP enabled, AF_XDP buffers would become premapped, while kernel
> > skb buffers could remain unmapped.
>
> Is this applied only to TX, or to both TX and RX?
For rx, if you mean the per-buffer premapped dma buffer, then yes,
rx can reuse this. If you mean whether we should use premapped mode
for the normal rx buffers, I think we should: that can reduce the
number of dma map operations.
>
> >
> > We can distinguish them by sg_page(sg). When sg_page(sg) is NULL, this
> > indicates that the driver has performed DMA mapping in advance, allowing
> > the Virtio core to directly utilize sg_dma_address(sg) without
> > conducting any internal DMA mapping.
>
> This seems to conflict with the code below?
>
> #define sg_is_premapped(sg) (!sg_page(sg))
Sorry, I do not follow you.
The key point is that sg->page is set by the driver.
So if the driver sets sg->page = NULL, the virtio core can skip dma
mapping for this sg. If the driver sets sg->page to the page of the
buffer, then the virtio core should do the dma mapping for this sg.
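To make the convention concrete, here is a minimal sketch (sg_fill_premapped()
mirrors the sg_fill_dma() helper added in patch 05/13; sg_fill_unmapped() is
only an illustrative name for the normal case):
/* Premapped: the driver already did the DMA mapping itself, so it clears
 * the page and hands over the dma address. The virtio core then sees
 * sg_page(sg) == NULL and skips its own map/unmap for this sg.
 */
static void sg_fill_premapped(struct scatterlist *sg, dma_addr_t addr, u32 len)
{
	sg_assign_page(sg, NULL);
	sg->dma_address = addr;
	sg->length = len;
}
/* Normal case: the page is set, so the virtio core maps/unmaps it itself. */
static void sg_fill_unmapped(struct scatterlist *sg, void *buf, u32 len)
{
	sg_init_one(sg, buf, len);
}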
>
> And rethinking the design, a question: is there a chance that a
> VM_PFNMAP area could be used for virtio-net? If so, the sg_page()
> trick cannot work?
>
> A quick glance told me AF_XDP seems to be safe as it uses
> pin_user_pages(), but we need to check other possibilities.
>
> Or we need to fall back to our previous idea, having new APIs.
>
> > Additionally, DMA unmap operations
> > for this buffer will be bypassed.
> >
> > Suggested-by: Jason Wang <jasowang@redhat.com>
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> > drivers/virtio/virtio_ring.c | 70 +++++++++++++++++++++---------------
> > 1 file changed, 41 insertions(+), 29 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index b43eca93015c..7efddc71af67 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -235,6 +235,7 @@ static void vring_free(struct virtqueue *_vq);
> > */
> >
> > #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
> > +#define sg_is_premapped(sg) (!sg_page(sg))
> >
> > static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> > unsigned int total_sg)
> > @@ -292,9 +293,10 @@ static bool vring_use_dma_api(const struct virtio_device *vdev)
> > return false;
> > }
> >
> > -static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring)
> > +static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring,
> > + const struct vring_desc_extra *extra)
> > {
> > - return vring->use_dma_api && !vring->premapped;
> > + return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR);
> > }
> >
> > size_t virtio_max_dma_size(const struct virtio_device *vdev)
> > @@ -366,7 +368,7 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
> > static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
> > enum dma_data_direction direction, dma_addr_t *addr)
> > {
> > - if (vq->premapped) {
> > + if (sg_is_premapped(sg)) {
> > *addr = sg_dma_address(sg);
> > return 0;
> > }
> > @@ -457,7 +459,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> > (flags & VRING_DESC_F_WRITE) ?
> > DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > } else {
> > - if (!vring_need_unmap_buffer(vq))
> > + if (!vring_need_unmap_buffer(vq, extra))
> > goto out;
> >
> > dma_unmap_page(vring_dma_dev(vq),
> > @@ -510,7 +512,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
> > dma_addr_t addr,
> > unsigned int len,
> > u16 flags,
> > - bool indirect)
> > + bool indirect, bool premapped)
> > {
> > u16 next;
> >
> > @@ -518,7 +520,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
> > desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
> > desc[i].len = cpu_to_virtio32(vq->vdev, len);
> >
> > - extra[i].addr = addr;
> > + extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr;
> > extra[i].len = len;
> > extra[i].flags = flags;
> >
> > @@ -611,7 +613,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > */
> > i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length,
> > VRING_DESC_F_NEXT,
> > - indirect);
> > + indirect, sg_is_premapped(sg));
> > }
> > }
> > for (; n < (out_sgs + in_sgs); n++) {
> > @@ -629,12 +631,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > sg->length,
> > VRING_DESC_F_NEXT |
> > VRING_DESC_F_WRITE,
> > - indirect);
> > + indirect, sg_is_premapped(sg));
> > }
> > }
> > /* Last one doesn't continue. */
> > desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > - if (!indirect && vring_need_unmap_buffer(vq))
> > + if (!indirect && vring_need_unmap_buffer(vq, &extra[prev]))
> > vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
> > ~VRING_DESC_F_NEXT;
> >
> > @@ -643,19 +645,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > dma_addr_t addr = vring_map_single(
> > vq, desc, total_sg * sizeof(struct vring_desc),
> > DMA_TO_DEVICE);
> > - if (vring_mapping_error(vq, addr)) {
> > - if (vq->premapped)
> > - goto free_indirect;
> > -
> > + if (vring_mapping_error(vq, addr))
> > goto unmap_release;
> > - }
> >
> > virtqueue_add_desc_split(_vq, vq->split.vring.desc,
> > vq->split.desc_extra,
> > head, addr,
> > total_sg * sizeof(struct vring_desc),
> > VRING_DESC_F_INDIRECT,
> > - false);
> > + false, false);
> > }
> >
> > /* We're using some buffers from the free list. */
> > @@ -712,7 +710,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > i = vring_unmap_one_split(vq, &extra[i]);
> > }
> >
> > -free_indirect:
> > if (indirect)
> > kfree(desc);
> >
> > @@ -794,7 +791,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> > VRING_DESC_F_INDIRECT));
> > BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> >
> > - if (vring_need_unmap_buffer(vq)) {
> > + if (vq->use_dma_api) {
> > for (j = 0; j < len / sizeof(struct vring_desc); j++)
> > vring_unmap_one_split(vq, &extra[j]);
> > }
> > @@ -1228,7 +1225,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> > (flags & VRING_DESC_F_WRITE) ?
> > DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > } else {
> > - if (!vring_need_unmap_buffer(vq))
> > + if (!vring_need_unmap_buffer(vq, extra))
> > return;
> >
> > dma_unmap_page(vring_dma_dev(vq),
> > @@ -1309,7 +1306,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > i++;
> >
> > if (unlikely(vq->use_dma_api)) {
> > - extra[i].addr = addr;
> > + extra[i].addr = sg_is_premapped(sg) ? DMA_MAPPING_ERROR : addr;
> > extra[i].len = sg->length;
> > extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE;
> > }
> > @@ -1320,12 +1317,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > addr = vring_map_single(vq, desc,
> > total_sg * sizeof(struct vring_packed_desc),
> > DMA_TO_DEVICE);
> > - if (vring_mapping_error(vq, addr)) {
> > - if (vq->premapped)
> > - goto free_desc;
> > -
> > + if (vring_mapping_error(vq, addr))
> > goto unmap_release;
> > - }
> >
> > vq->packed.vring.desc[head].addr = cpu_to_le64(addr);
> > vq->packed.vring.desc[head].len = cpu_to_le32(total_sg *
> > @@ -1383,7 +1376,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > for (i = 0; i < err_idx; i++)
> > vring_unmap_extra_packed(vq, &extra[i]);
> >
> > -free_desc:
> > kfree(desc);
> >
> > END_USE(vq);
> > @@ -1474,7 +1466,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > desc[i].id = cpu_to_le16(id);
> >
> > if (unlikely(vq->use_dma_api)) {
> > - vq->packed.desc_extra[curr].addr = addr;
> > + vq->packed.desc_extra[curr].addr = sg_is_premapped(sg) ?
> > + DMA_MAPPING_ERROR : addr;
> > vq->packed.desc_extra[curr].len = sg->length;
> > vq->packed.desc_extra[curr].flags =
> > le16_to_cpu(flags);
> > @@ -1625,10 +1618,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> > if (!extra)
> > return;
> >
> > - if (vring_need_unmap_buffer(vq)) {
> > + if (vq->use_dma_api) {
> > len = vq->packed.desc_extra[id].len;
> > - for (i = 0; i < len / sizeof(struct vring_packed_desc);
> > - i++)
> > + for (i = 0; i < len / sizeof(struct vring_packed_desc); i++)
>
> Unnecessary changes?
Will fix.
Thanks.
>
>
> > vring_unmap_extra_packed(vq, &extra[i]);
> > }
> > kfree(desc);
> > @@ -2212,6 +2204,11 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> > * @data: the token identifying the buffer.
> > * @gfp: how to do memory allocations (if necessary).
> > *
> > + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> > + * mapping in advance, allowing the virtio core to directly utilize
> > + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> > + * DMA unmap operations for this buffer will be bypassed.
> > + *
> > * Caller must ensure we don't call this with other virtqueue operations
> > * at the same time (except where noted).
> > *
> > @@ -2246,6 +2243,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
> > * @data: the token identifying the buffer.
> > * @gfp: how to do memory allocations (if necessary).
> > *
> > + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> > + * mapping in advance, allowing the virtio core to directly utilize
> > + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> > + * DMA unmap operations for this buffer will be bypassed.
> > + *
> > * Caller must ensure we don't call this with other virtqueue operations
> > * at the same time (except where noted).
> > *
> > @@ -2268,6 +2270,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
> > * @data: the token identifying the buffer.
> > * @gfp: how to do memory allocations (if necessary).
> > *
> > + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> > + * mapping in advance, allowing the virtio core to directly utilize
> > + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> > + * DMA unmap operations for this buffer will be bypassed.
> > + *
> > * Caller must ensure we don't call this with other virtqueue operations
> > * at the same time (except where noted).
> > *
> > @@ -2291,6 +2298,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
> > * @ctx: extra context for the token
> > * @gfp: how to do memory allocations (if necessary).
> > *
> > + * When sg_page(sg) is NULL, this indicates that the driver has performed DMA
> > + * mapping in advance, allowing the virtio core to directly utilize
> > + * sg_dma_address(sg) without conducting any internal DMA mapping. Additionally,
> > + * DMA unmap operations for this buffer will be bypassed.
> > + *
> > * Caller must ensure we don't call this with other virtqueue operations
> > * at the same time (except where noted).
> > *
> > --
> > 2.32.0.3.g01195cf9f
> >
>
> Thanks
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer
2024-09-12 7:36 ` Xuan Zhuo
@ 2024-09-13 3:36 ` Jason Wang
0 siblings, 0 replies; 36+ messages in thread
From: Jason Wang @ 2024-09-13 3:36 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Thu, Sep 12, 2024 at 3:43 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 11 Sep 2024 11:54:25 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > The current configuration sets the virtqueue (vq) to premapped mode,
> > > implying that all buffers submitted to this queue must be mapped ahead
> > > of time. This presents a challenge for the virtnet send queue (sq): the
> > > virtnet driver would be required to keep track of dma information for vq
> > > size * 17, which can be substantial. However, if the premapped mode were
> > > applied on a per-buffer basis, the complexity would be greatly reduced.
> > > With AF_XDP enabled, AF_XDP buffers would become premapped, while kernel
> > > skb buffers could remain unmapped.
> >
> > Is this applied only to TX, or to both TX and RX?
>
>
> For rx, if you mean the per-buffer premapped dma buffer, then yes,
> rx can reuse this. If you mean whether we should use premapped mode
> for the normal rx buffers, I think we should: that can reduce the
> number of dma map operations.
>
>
> >
> > >
> > > We can distinguish them by sg_page(sg). When sg_page(sg) is NULL, this
> > > indicates that the driver has performed DMA mapping in advance, allowing
> > > the Virtio core to directly utilize sg_dma_address(sg) without
> > > conducting any internal DMA mapping.
> >
> > This seems to conflict with the code below?
> >
> > #define sg_is_premapped(sg) (!sg_page(sg))
>
> Sorry, I do not follow you.
>
> The key point is that sg->page is set by the driver.
Ok, I forgot that, but let's document this assumption in the changelog.
>
> So if the driver sets sg->page = NULL, the virtio core can skip dma
> mapping for this sg. If the driver sets sg->page to the page of the
> buffer, then the virtio core should do the dma mapping for this sg.
>
Ok, let's describe this in the changelog.
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 05/13] virtio-net: rq submits premapped buffer per buffer
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (3 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 04/13] virtio_ring: perform premapped operations based on per-buffer Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 06/13] virtio_ring: remove API virtqueue_set_dma_premapped Xuan Zhuo
` (7 subsequent siblings)
12 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
virtio-net rq now submits premapped buffers on a per-buffer basis.
This also removes the call of virtnet_rq_set_premapped().
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 31 ++++++++-----------------------
1 file changed, 8 insertions(+), 23 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6c79fc43fa31..41aaea3b90fd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -539,6 +539,13 @@ static struct sk_buff *ptr_to_skb(void *ptr)
return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG);
}
+static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
+{
+ sg_assign_page(sg, NULL);
+ sg->dma_address = addr;
+ sg->length = len;
+}
+
static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
bool in_napi, struct virtnet_sq_free_stats *stats)
{
@@ -907,8 +914,7 @@ static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len)
addr = dma->addr - sizeof(*dma) + offset;
sg_init_table(rq->sg, 1);
- rq->sg[0].dma_address = addr;
- rq->sg[0].length = len;
+ sg_fill_dma(&rq->sg[0], addr, len);
}
static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp)
@@ -967,19 +973,6 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp)
return buf;
}
-static void virtnet_rq_set_premapped(struct virtnet_info *vi)
-{
- int i;
-
- /* disable for big mode */
- if (!vi->mergeable_rx_bufs && vi->big_packets)
- return;
-
- for (i = 0; i < vi->max_queue_pairs; i++)
- /* error should never happen */
- BUG_ON(virtqueue_set_dma_premapped(vi->rq[i].vq));
-}
-
static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
{
struct virtnet_info *vi = vq->vdev->priv;
@@ -1071,12 +1064,6 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
}
}
-static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
-{
- sg->dma_address = addr;
- sg->length = len;
-}
-
static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi,
struct receive_queue *rq, void *buf, u32 len)
{
@@ -6109,8 +6096,6 @@ static int init_vqs(struct virtnet_info *vi)
if (ret)
goto err_free;
- virtnet_rq_set_premapped(vi);
-
cpus_read_lock();
virtnet_set_affinity(vi);
cpus_read_unlock();
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 06/13] virtio_ring: remove API virtqueue_set_dma_premapped
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (4 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 05/13] virtio-net: rq submits premapped buffer per buffer Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 07/13] virtio_net: refactor the xmit type Xuan Zhuo
` (6 subsequent siblings)
12 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Now this API is unused, so remove it.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/virtio/virtio_ring.c | 48 ------------------------------------
include/linux/virtio.h | 2 --
2 files changed, 50 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 7efddc71af67..b920cdf1d513 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -172,9 +172,6 @@ struct vring_virtqueue {
/* Host publishes avail event idx */
bool event;
- /* Do DMA mapping by driver */
- bool premapped;
-
/* Head of free buffer list. */
unsigned int free_head;
/* Number we've added since last sync. */
@@ -2075,7 +2072,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->packed_ring = true;
vq->dma_dev = dma_dev;
vq->use_dma_api = vring_use_dma_api(vdev);
- vq->premapped = false;
vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
!context;
@@ -2639,7 +2635,6 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
#endif
vq->dma_dev = dma_dev;
vq->use_dma_api = vring_use_dma_api(vdev);
- vq->premapped = false;
vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
!context;
@@ -2766,49 +2761,6 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
}
EXPORT_SYMBOL_GPL(virtqueue_resize);
-/**
- * virtqueue_set_dma_premapped - set the vring premapped mode
- * @_vq: the struct virtqueue we're talking about.
- *
- * Enable the premapped mode of the vq.
- *
- * The vring in premapped mode does not do dma internally, so the driver must
- * do dma mapping in advance. The driver must pass the dma_address through
- * dma_address of scatterlist. When the driver got a used buffer from
- * the vring, it has to unmap the dma address.
- *
- * This function must be called immediately after creating the vq, or after vq
- * reset, and before adding any buffers to it.
- *
- * Caller must ensure we don't call this with other virtqueue operations
- * at the same time (except where noted).
- *
- * Returns zero or a negative error.
- * 0: success.
- * -EINVAL: too late to enable premapped mode, the vq already contains buffers.
- */
-int virtqueue_set_dma_premapped(struct virtqueue *_vq)
-{
- struct vring_virtqueue *vq = to_vvq(_vq);
- u32 num;
-
- START_USE(vq);
-
- num = vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num;
-
- if (num != vq->vq.num_free) {
- END_USE(vq);
- return -EINVAL;
- }
-
- vq->premapped = true;
-
- END_USE(vq);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped);
-
/**
* virtqueue_reset - detach and recycle all unused buffers
* @_vq: the struct virtqueue we're talking about.
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 306137a15d07..238728ed62fa 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -82,8 +82,6 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
-int virtqueue_set_dma_premapped(struct virtqueue *_vq);
-
bool virtqueue_poll(struct virtqueue *vq, unsigned);
bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 07/13] virtio_net: refactor the xmit type
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (5 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 06/13] virtio_ring: remove API virtqueue_set_dma_premapped Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 4:04 ` Jason Wang
2024-08-20 7:33 ` [PATCH net-next 08/13] virtio_net: xsk: bind/unbind xsk for tx Xuan Zhuo
` (5 subsequent siblings)
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Because AF_XDP will introduce a new xmit type, refactor the xmit type
mechanism first.
We use the last two bits of the pointer to encode the xmit type, so we
can distinguish four xmit types. Now we have three types: skb, orphan
and xdp.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 90 +++++++++++++++++++++++-----------------
1 file changed, 51 insertions(+), 39 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 41aaea3b90fd..96abee36738b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -45,9 +45,6 @@ module_param(napi_tx, bool, 0644);
#define VIRTIO_XDP_TX BIT(0)
#define VIRTIO_XDP_REDIR BIT(1)
-#define VIRTIO_XDP_FLAG BIT(0)
-#define VIRTIO_ORPHAN_FLAG BIT(1)
-
/* RX packet size EWMA. The average packet size is used to determine the packet
* buffer size when refilling RX rings. As the entire RX ring may be refilled
* at once, the weight is chosen so that the EWMA will be insensitive to short-
@@ -509,34 +506,35 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
struct page *page, void *buf,
int len, int truesize);
-static bool is_xdp_frame(void *ptr)
-{
- return (unsigned long)ptr & VIRTIO_XDP_FLAG;
-}
+enum virtnet_xmit_type {
+ VIRTNET_XMIT_TYPE_SKB,
+ VIRTNET_XMIT_TYPE_ORPHAN,
+ VIRTNET_XMIT_TYPE_XDP,
+};
-static void *xdp_to_ptr(struct xdp_frame *ptr)
-{
- return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
-}
+#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
+ | VIRTNET_XMIT_TYPE_XDP)
-static struct xdp_frame *ptr_to_xdp(void *ptr)
+static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
{
- return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
-}
+ unsigned long p = (unsigned long)*ptr;
-static bool is_orphan_skb(void *ptr)
-{
- return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG;
+ *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK);
+
+ return p & VIRTNET_XMIT_TYPE_MASK;
}
-static void *skb_to_ptr(struct sk_buff *skb, bool orphan)
+static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
{
- return (void *)((unsigned long)skb | (orphan ? VIRTIO_ORPHAN_FLAG : 0));
+ return (void *)((unsigned long)ptr | type);
}
-static struct sk_buff *ptr_to_skb(void *ptr)
+static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
+ enum virtnet_xmit_type type)
{
- return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG);
+ return virtqueue_add_outbuf(sq->vq, sq->sg, num,
+ virtnet_xmit_ptr_mix(data, type),
+ GFP_ATOMIC);
}
static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
@@ -549,29 +547,37 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
bool in_napi, struct virtnet_sq_free_stats *stats)
{
+ struct xdp_frame *frame;
+ struct sk_buff *skb;
unsigned int len;
void *ptr;
while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
- if (!is_xdp_frame(ptr)) {
- struct sk_buff *skb = ptr_to_skb(ptr);
+ switch (virtnet_xmit_ptr_strip(&ptr)) {
+ case VIRTNET_XMIT_TYPE_SKB:
+ skb = ptr;
pr_debug("Sent skb %p\n", skb);
+ stats->napi_packets++;
+ stats->napi_bytes += skb->len;
+ napi_consume_skb(skb, in_napi);
+ break;
- if (is_orphan_skb(ptr)) {
- stats->packets++;
- stats->bytes += skb->len;
- } else {
- stats->napi_packets++;
- stats->napi_bytes += skb->len;
- }
+ case VIRTNET_XMIT_TYPE_ORPHAN:
+ skb = ptr;
+
+ stats->packets++;
+ stats->bytes += skb->len;
napi_consume_skb(skb, in_napi);
- } else {
- struct xdp_frame *frame = ptr_to_xdp(ptr);
+ break;
+
+ case VIRTNET_XMIT_TYPE_XDP:
+ frame = ptr;
stats->packets++;
stats->bytes += xdp_get_frame_len(frame);
xdp_return_frame(frame);
+ break;
}
}
netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
@@ -1421,8 +1427,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
skb_frag_size(frag), skb_frag_off(frag));
}
- err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
- xdp_to_ptr(xdpf), GFP_ATOMIC);
+ err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP);
if (unlikely(err))
return -ENOSPC; /* Caller handle free/refcnt */
@@ -3028,8 +3033,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan)
return num_sg;
num_sg++;
}
- return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
- skb_to_ptr(skb, orphan), GFP_ATOMIC);
+
+ return virtnet_add_outbuf(sq, num_sg, skb,
+ orphan ? VIRTNET_XMIT_TYPE_ORPHAN : VIRTNET_XMIT_TYPE_SKB);
}
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -5906,10 +5912,16 @@ static void free_receive_page_frags(struct virtnet_info *vi)
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
{
- if (!is_xdp_frame(buf))
+ switch (virtnet_xmit_ptr_strip(&buf)) {
+ case VIRTNET_XMIT_TYPE_SKB:
+ case VIRTNET_XMIT_TYPE_ORPHAN:
dev_kfree_skb(buf);
- else
- xdp_return_frame(ptr_to_xdp(buf));
+ break;
+
+ case VIRTNET_XMIT_TYPE_XDP:
+ xdp_return_frame(buf);
+ break;
+ }
}
static void free_unused_bufs(struct virtnet_info *vi)
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 07/13] virtio_net: refactor the xmit type
2024-08-20 7:33 ` [PATCH net-next 07/13] virtio_net: refactor the xmit type Xuan Zhuo
@ 2024-09-11 4:04 ` Jason Wang
2024-09-12 7:50 ` Xuan Zhuo
0 siblings, 1 reply; 36+ messages in thread
From: Jason Wang @ 2024-09-11 4:04 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> Because AF_XDP will introduce a new xmit type, refactor the xmit type
> mechanism first.
>
> We use the last two bits of the pointer to encode the xmit type, so we
> can distinguish four xmit types. Now we have three types: skb, orphan
> and xdp.
And if I am not wrong, we no longer use bitmasks. If so, let's
explain the reason here.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 90 +++++++++++++++++++++++-----------------
> 1 file changed, 51 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 41aaea3b90fd..96abee36738b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -45,9 +45,6 @@ module_param(napi_tx, bool, 0644);
> #define VIRTIO_XDP_TX BIT(0)
> #define VIRTIO_XDP_REDIR BIT(1)
>
> -#define VIRTIO_XDP_FLAG BIT(0)
> -#define VIRTIO_ORPHAN_FLAG BIT(1)
> -
> /* RX packet size EWMA. The average packet size is used to determine the packet
> * buffer size when refilling RX rings. As the entire RX ring may be refilled
> * at once, the weight is chosen so that the EWMA will be insensitive to short-
> @@ -509,34 +506,35 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> struct page *page, void *buf,
> int len, int truesize);
>
> -static bool is_xdp_frame(void *ptr)
> -{
> - return (unsigned long)ptr & VIRTIO_XDP_FLAG;
> -}
> +enum virtnet_xmit_type {
> + VIRTNET_XMIT_TYPE_SKB,
> + VIRTNET_XMIT_TYPE_ORPHAN,
Let's rename this to SKB_ORPHAN?
> + VIRTNET_XMIT_TYPE_XDP,
> +};
>
> -static void *xdp_to_ptr(struct xdp_frame *ptr)
> -{
> - return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
> -}
> +#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
> + | VIRTNET_XMIT_TYPE_XDP)
I may be missing something, but this does not seem to be a correct
bitmask definition, as each member is not actually a single bit?
>
> -static struct xdp_frame *ptr_to_xdp(void *ptr)
> +static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> {
> - return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> -}
> + unsigned long p = (unsigned long)*ptr;
>
> -static bool is_orphan_skb(void *ptr)
> -{
> - return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG;
> + *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK);
> +
> + return p & VIRTNET_XMIT_TYPE_MASK;
> }
>
> -static void *skb_to_ptr(struct sk_buff *skb, bool orphan)
> +static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
> {
> - return (void *)((unsigned long)skb | (orphan ? VIRTIO_ORPHAN_FLAG : 0));
> + return (void *)((unsigned long)ptr | type);
> }
>
> -static struct sk_buff *ptr_to_skb(void *ptr)
> +static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> + enum virtnet_xmit_type type)
> {
> - return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG);
> + return virtqueue_add_outbuf(sq->vq, sq->sg, num,
> + virtnet_xmit_ptr_mix(data, type),
> + GFP_ATOMIC);
> }
>
> static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> @@ -549,29 +547,37 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> bool in_napi, struct virtnet_sq_free_stats *stats)
> {
> + struct xdp_frame *frame;
> + struct sk_buff *skb;
> unsigned int len;
> void *ptr;
>
> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> - if (!is_xdp_frame(ptr)) {
> - struct sk_buff *skb = ptr_to_skb(ptr);
> + switch (virtnet_xmit_ptr_strip(&ptr)) {
> + case VIRTNET_XMIT_TYPE_SKB:
> + skb = ptr;
>
> pr_debug("Sent skb %p\n", skb);
> + stats->napi_packets++;
> + stats->napi_bytes += skb->len;
> + napi_consume_skb(skb, in_napi);
> + break;
>
> - if (is_orphan_skb(ptr)) {
> - stats->packets++;
> - stats->bytes += skb->len;
> - } else {
> - stats->napi_packets++;
> - stats->napi_bytes += skb->len;
> - }
> + case VIRTNET_XMIT_TYPE_ORPHAN:
> + skb = ptr;
> +
> + stats->packets++;
> + stats->bytes += skb->len;
> napi_consume_skb(skb, in_napi);
> - } else {
> - struct xdp_frame *frame = ptr_to_xdp(ptr);
> + break;
> +
> + case VIRTNET_XMIT_TYPE_XDP:
> + frame = ptr;
>
> stats->packets++;
> stats->bytes += xdp_get_frame_len(frame);
> xdp_return_frame(frame);
> + break;
> }
> }
> netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> @@ -1421,8 +1427,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> skb_frag_size(frag), skb_frag_off(frag));
> }
>
> - err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> - xdp_to_ptr(xdpf), GFP_ATOMIC);
> + err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP);
> if (unlikely(err))
> return -ENOSPC; /* Caller handle free/refcnt */
>
> @@ -3028,8 +3033,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan)
> return num_sg;
> num_sg++;
> }
> - return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> - skb_to_ptr(skb, orphan), GFP_ATOMIC);
> +
> + return virtnet_add_outbuf(sq, num_sg, skb,
> + orphan ? VIRTNET_XMIT_TYPE_ORPHAN : VIRTNET_XMIT_TYPE_SKB);
> }
>
> static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> @@ -5906,10 +5912,16 @@ static void free_receive_page_frags(struct virtnet_info *vi)
>
> static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> {
> - if (!is_xdp_frame(buf))
> + switch (virtnet_xmit_ptr_strip(&buf)) {
> + case VIRTNET_XMIT_TYPE_SKB:
> + case VIRTNET_XMIT_TYPE_ORPHAN:
> dev_kfree_skb(buf);
> - else
> - xdp_return_frame(ptr_to_xdp(buf));
> + break;
> +
> + case VIRTNET_XMIT_TYPE_XDP:
> + xdp_return_frame(buf);
> + break;
> + }
> }
Others look fine.
Thanks
>
> static void free_unused_bufs(struct virtnet_info *vi)
> --
> 2.32.0.3.g01195cf9f
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 07/13] virtio_net: refactor the xmit type
2024-09-11 4:04 ` Jason Wang
@ 2024-09-12 7:50 ` Xuan Zhuo
2024-09-13 3:22 ` Jason Wang
0 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 7:50 UTC (permalink / raw)
To: Jason Wang
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 12:04:16 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > Because AF_XDP will introduce a new xmit type, refactor the xmit type
> > mechanism first.
> >
> > We use the last two bits of the pointer to encode the xmit type, so we
> > can distinguish four xmit types. Now we have three types: skb, orphan
> > and xdp.
>
> And if I am not wrong, we no longer use bitmasks. If so, let's
> explain the reason here.
In general, pointers are aligned to 4 or 8 bytes. If a pointer is aligned
to 4 bytes, only its two low bits are free, so we can only use two bits.
But there are 4 types here, so we can't use one bit flag per type.
b00 for skb
b01 for SKB_ORPHAN
b10 for XDP
b11 for af-xdp tx
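A rough sketch of the encode/decode around these two bits (the helper names
follow this patch, VIRTNET_XMIT_TYPE_SKB_ORPHAN follows your rename
suggestion, and the b11/XSK name is only an assumption for the af-xdp patch):
enum virtnet_xmit_type {
	VIRTNET_XMIT_TYPE_SKB,		/* b00 */
	VIRTNET_XMIT_TYPE_SKB_ORPHAN,	/* b01 */
	VIRTNET_XMIT_TYPE_XDP,		/* b10 */
	VIRTNET_XMIT_TYPE_XSK,		/* b11, assumed name for af-xdp tx */
};
/* Pointers stored in the vq are at least 4-byte aligned, so the two low
 * bits are free to carry the enum virtnet_xmit_type value.
 */
#define VIRTNET_XMIT_TYPE_MASK 0x3UL
static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
{
	return (void *)((unsigned long)ptr | type);
}
static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
{
	unsigned long p = (unsigned long)*ptr;
	*ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK);
	return p & VIRTNET_XMIT_TYPE_MASK;
}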
>
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> > drivers/net/virtio_net.c | 90 +++++++++++++++++++++++-----------------
> > 1 file changed, 51 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 41aaea3b90fd..96abee36738b 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -45,9 +45,6 @@ module_param(napi_tx, bool, 0644);
> > #define VIRTIO_XDP_TX BIT(0)
> > #define VIRTIO_XDP_REDIR BIT(1)
> >
> > -#define VIRTIO_XDP_FLAG BIT(0)
> > -#define VIRTIO_ORPHAN_FLAG BIT(1)
> > -
> > /* RX packet size EWMA. The average packet size is used to determine the packet
> > * buffer size when refilling RX rings. As the entire RX ring may be refilled
> > * at once, the weight is chosen so that the EWMA will be insensitive to short-
> > @@ -509,34 +506,35 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > struct page *page, void *buf,
> > int len, int truesize);
> >
> > -static bool is_xdp_frame(void *ptr)
> > -{
> > - return (unsigned long)ptr & VIRTIO_XDP_FLAG;
> > -}
> > +enum virtnet_xmit_type {
> > + VIRTNET_XMIT_TYPE_SKB,
> > + VIRTNET_XMIT_TYPE_ORPHAN,
>
> Let's rename this to SKB_ORPHAN?
>
> > + VIRTNET_XMIT_TYPE_XDP,
> > +};
> >
> > -static void *xdp_to_ptr(struct xdp_frame *ptr)
> > -{
> > - return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
> > -}
> > +#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
> > + | VIRTNET_XMIT_TYPE_XDP)
>
Maybe I should define VIRTNET_XMIT_TYPE_MASK as 0x3 directly, with a comment.
Thanks.
> I may be missing something, but this does not seem to be a correct
> bitmask definition, as each member is not actually a single bit?
>
> >
> > -static struct xdp_frame *ptr_to_xdp(void *ptr)
> > +static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> > {
> > - return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > -}
> > + unsigned long p = (unsigned long)*ptr;
> >
> > -static bool is_orphan_skb(void *ptr)
> > -{
> > - return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG;
> > + *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK);
> > +
> > + return p & VIRTNET_XMIT_TYPE_MASK;
> > }
> >
> > -static void *skb_to_ptr(struct sk_buff *skb, bool orphan)
> > +static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
> > {
> > - return (void *)((unsigned long)skb | (orphan ? VIRTIO_ORPHAN_FLAG : 0));
> > + return (void *)((unsigned long)ptr | type);
> > }
> >
> > -static struct sk_buff *ptr_to_skb(void *ptr)
> > +static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> > + enum virtnet_xmit_type type)
> > {
> > - return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG);
> > + return virtqueue_add_outbuf(sq->vq, sq->sg, num,
> > + virtnet_xmit_ptr_mix(data, type),
> > + GFP_ATOMIC);
> > }
> >
> > static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > @@ -549,29 +547,37 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > bool in_napi, struct virtnet_sq_free_stats *stats)
> > {
> > + struct xdp_frame *frame;
> > + struct sk_buff *skb;
> > unsigned int len;
> > void *ptr;
> >
> > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > - if (!is_xdp_frame(ptr)) {
> > - struct sk_buff *skb = ptr_to_skb(ptr);
> > + switch (virtnet_xmit_ptr_strip(&ptr)) {
> > + case VIRTNET_XMIT_TYPE_SKB:
> > + skb = ptr;
> >
> > pr_debug("Sent skb %p\n", skb);
> > + stats->napi_packets++;
> > + stats->napi_bytes += skb->len;
> > + napi_consume_skb(skb, in_napi);
> > + break;
> >
> > - if (is_orphan_skb(ptr)) {
> > - stats->packets++;
> > - stats->bytes += skb->len;
> > - } else {
> > - stats->napi_packets++;
> > - stats->napi_bytes += skb->len;
> > - }
> > + case VIRTNET_XMIT_TYPE_ORPHAN:
> > + skb = ptr;
> > +
> > + stats->packets++;
> > + stats->bytes += skb->len;
> > napi_consume_skb(skb, in_napi);
> > - } else {
> > - struct xdp_frame *frame = ptr_to_xdp(ptr);
> > + break;
> > +
> > + case VIRTNET_XMIT_TYPE_XDP:
> > + frame = ptr;
> >
> > stats->packets++;
> > stats->bytes += xdp_get_frame_len(frame);
> > xdp_return_frame(frame);
> > + break;
> > }
> > }
> > netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > @@ -1421,8 +1427,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > skb_frag_size(frag), skb_frag_off(frag));
> > }
> >
> > - err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > - xdp_to_ptr(xdpf), GFP_ATOMIC);
> > + err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP);
> > if (unlikely(err))
> > return -ENOSPC; /* Caller handle free/refcnt */
> >
> > @@ -3028,8 +3033,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan)
> > return num_sg;
> > num_sg++;
> > }
> > - return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> > - skb_to_ptr(skb, orphan), GFP_ATOMIC);
> > +
> > + return virtnet_add_outbuf(sq, num_sg, skb,
> > + orphan ? VIRTNET_XMIT_TYPE_ORPHAN : VIRTNET_XMIT_TYPE_SKB);
> > }
> >
> > static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > @@ -5906,10 +5912,16 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> >
> > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > {
> > - if (!is_xdp_frame(buf))
> > + switch (virtnet_xmit_ptr_strip(&buf)) {
> > + case VIRTNET_XMIT_TYPE_SKB:
> > + case VIRTNET_XMIT_TYPE_ORPHAN:
> > dev_kfree_skb(buf);
> > - else
> > - xdp_return_frame(ptr_to_xdp(buf));
> > + break;
> > +
> > + case VIRTNET_XMIT_TYPE_XDP:
> > + xdp_return_frame(buf);
> > + break;
> > + }
> > }
>
> Others look fine.
>
> Thanks
>
> >
> > static void free_unused_bufs(struct virtnet_info *vi)
> > --
> > 2.32.0.3.g01195cf9f
> >
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 07/13] virtio_net: refactor the xmit type
2024-09-12 7:50 ` Xuan Zhuo
@ 2024-09-13 3:22 ` Jason Wang
0 siblings, 0 replies; 36+ messages in thread
From: Jason Wang @ 2024-09-13 3:22 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Thu, Sep 12, 2024 at 3:54 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 11 Sep 2024 12:04:16 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > Because AF_XDP will introduce a new xmit type, refactor the xmit type
> > > mechanism first.
> > >
> > > We use the last two bits of the pointer to encode the xmit type, so we
> > > can distinguish four xmit types. Now we have three types: skb, orphan
> > > and xdp.
> >
> > And if I am not wrong, we no longer use bitmasks. If so, let's
> > explain the reason here.
>
> In general, pointers are aligned to 4 or 8 bytes. If a pointer is aligned
> to 4 bytes, only its two low bits are free, so we can only use two bits.
>
> But there are 4 types here, so we can't use one bit flag per type.
>
> b00 for skb
> b01 for SKB_ORPHAN
> b10 for XDP
> b11 for af-xdp tx
Let's add these in the changelog.
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 08/13] virtio_net: xsk: bind/unbind xsk for tx
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (6 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 07/13] virtio_net: refactor the xmit type Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 4:08 ` Jason Wang
2024-08-20 7:33 ` [PATCH net-next 09/13] virtio_net: xsk: prevent disable tx napi Xuan Zhuo
` (4 subsequent siblings)
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
This patch implements the logic of binding/unbinding the xsk pool to the sq and rq.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 54 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 96abee36738b..6a36a204e967 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -295,6 +295,10 @@ struct send_queue {
/* Record whether sq is in reset state. */
bool reset;
+
+ struct xsk_buff_pool *xsk_pool;
+
+ dma_addr_t xsk_hdr_dma_addr;
};
/* Internal representation of a receive virtqueue */
@@ -494,6 +498,8 @@ struct virtio_net_common_hdr {
};
};
+static struct virtio_net_common_hdr xsk_hdr;
+
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
struct net_device *dev,
@@ -5476,6 +5482,29 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queu
return err;
}
+static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
+ struct send_queue *sq,
+ struct xsk_buff_pool *pool)
+{
+ int err, qindex;
+
+ qindex = sq - vi->sq;
+
+ virtnet_tx_pause(vi, sq);
+
+ err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
+ if (err) {
+ netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
+ pool = NULL;
+ }
+
+ sq->xsk_pool = pool;
+
+ virtnet_tx_resume(vi, sq);
+
+ return err;
+}
+
static int virtnet_xsk_pool_enable(struct net_device *dev,
struct xsk_buff_pool *pool,
u16 qid)
@@ -5484,6 +5513,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
struct receive_queue *rq;
struct device *dma_dev;
struct send_queue *sq;
+ dma_addr_t hdr_dma;
int err, size;
if (vi->hdr_len > xsk_pool_get_headroom(pool))
@@ -5521,6 +5551,10 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
if (!rq->xsk_buffs)
return -ENOMEM;
+ hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, hdr_dma))
+ return -ENOMEM;
+
err = xsk_pool_dma_map(pool, dma_dev, 0);
if (err)
goto err_xsk_map;
@@ -5529,11 +5563,23 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
if (err)
goto err_rq;
+ err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
+ if (err)
+ goto err_sq;
+
+ /* Now, we do not support tx offset, so all the tx virtnet hdr is zero.
+ * So all the tx packets can share a single hdr.
+ */
+ sq->xsk_hdr_dma_addr = hdr_dma;
+
return 0;
+err_sq:
+ virtnet_rq_bind_xsk_pool(vi, rq, NULL);
err_rq:
xsk_pool_dma_unmap(pool, 0);
err_xsk_map:
+ dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE);
return err;
}
@@ -5542,19 +5588,27 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
struct virtnet_info *vi = netdev_priv(dev);
struct xsk_buff_pool *pool;
struct receive_queue *rq;
+ struct device *dma_dev;
+ struct send_queue *sq;
int err;
if (qid >= vi->curr_queue_pairs)
return -EINVAL;
+ sq = &vi->sq[qid];
rq = &vi->rq[qid];
pool = rq->xsk_pool;
err = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
+ err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL);
xsk_pool_dma_unmap(pool, 0);
+ dma_dev = virtqueue_dma_dev(sq->vq);
+
+ dma_unmap_single(dma_dev, sq->xsk_hdr_dma_addr, vi->hdr_len, DMA_TO_DEVICE);
+
kvfree(rq->xsk_buffs);
return err;
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 08/13] virtio_net: xsk: bind/unbind xsk for tx
2024-08-20 7:33 ` [PATCH net-next 08/13] virtio_net: xsk: bind/unbind xsk for tx Xuan Zhuo
@ 2024-09-11 4:08 ` Jason Wang
2024-09-12 7:54 ` Xuan Zhuo
0 siblings, 1 reply; 36+ messages in thread
From: Jason Wang @ 2024-09-11 4:08 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> This patch implements the logic of binding/unbinding the xsk pool to the sq and rq.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 54 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 54 insertions(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 96abee36738b..6a36a204e967 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -295,6 +295,10 @@ struct send_queue {
>
> /* Record whether sq is in reset state. */
> bool reset;
> +
> + struct xsk_buff_pool *xsk_pool;
> +
> + dma_addr_t xsk_hdr_dma_addr;
> };
>
> /* Internal representation of a receive virtqueue */
> @@ -494,6 +498,8 @@ struct virtio_net_common_hdr {
> };
> };
>
> +static struct virtio_net_common_hdr xsk_hdr;
> +
> static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
> static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
> struct net_device *dev,
> @@ -5476,6 +5482,29 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queu
> return err;
> }
>
> +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
> + struct send_queue *sq,
> + struct xsk_buff_pool *pool)
> +{
> + int err, qindex;
> +
> + qindex = sq - vi->sq;
> +
> + virtnet_tx_pause(vi, sq);
> +
> + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
> + if (err) {
> + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
> + pool = NULL;
> + }
> +
> + sq->xsk_pool = pool;
> +
> + virtnet_tx_resume(vi, sq);
> +
> + return err;
> +}
> +
> static int virtnet_xsk_pool_enable(struct net_device *dev,
> struct xsk_buff_pool *pool,
> u16 qid)
> @@ -5484,6 +5513,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> struct receive_queue *rq;
> struct device *dma_dev;
> struct send_queue *sq;
> + dma_addr_t hdr_dma;
> int err, size;
>
> if (vi->hdr_len > xsk_pool_get_headroom(pool))
> @@ -5521,6 +5551,10 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> if (!rq->xsk_buffs)
> return -ENOMEM;
>
> + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
Let's use the virtqueue_dma_xxx() wrappers here.
> + if (dma_mapping_error(dma_dev, hdr_dma))
> + return -ENOMEM;
> +
> err = xsk_pool_dma_map(pool, dma_dev, 0);
> if (err)
> goto err_xsk_map;
> @@ -5529,11 +5563,23 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> if (err)
> goto err_rq;
>
> + err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
> + if (err)
> + goto err_sq;
> +
> + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero.
> + * So all the tx packets can share a single hdr.
> + */
> + sq->xsk_hdr_dma_addr = hdr_dma;
> +
> return 0;
>
> +err_sq:
> + virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> err_rq:
> xsk_pool_dma_unmap(pool, 0);
> err_xsk_map:
> + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE);
> return err;
> }
>
> @@ -5542,19 +5588,27 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
> struct virtnet_info *vi = netdev_priv(dev);
> struct xsk_buff_pool *pool;
> struct receive_queue *rq;
> + struct device *dma_dev;
> + struct send_queue *sq;
> int err;
>
> if (qid >= vi->curr_queue_pairs)
> return -EINVAL;
>
> + sq = &vi->sq[qid];
> rq = &vi->rq[qid];
>
> pool = rq->xsk_pool;
>
> err = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> + err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL);
>
> xsk_pool_dma_unmap(pool, 0);
>
> + dma_dev = virtqueue_dma_dev(sq->vq);
> +
> + dma_unmap_single(dma_dev, sq->xsk_hdr_dma_addr, vi->hdr_len, DMA_TO_DEVICE);
And here.
Thanks
> +
> kvfree(rq->xsk_buffs);
>
> return err;
> --
> 2.32.0.3.g01195cf9f
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 08/13] virtio_net: xsk: bind/unbind xsk for tx
2024-09-11 4:08 ` Jason Wang
@ 2024-09-12 7:54 ` Xuan Zhuo
0 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 7:54 UTC (permalink / raw)
To: Jason Wang
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 12:08:06 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > This patch implements the logic of binding/unbinding the xsk pool to the sq and rq.
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> > drivers/net/virtio_net.c | 54 ++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 54 insertions(+)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 96abee36738b..6a36a204e967 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -295,6 +295,10 @@ struct send_queue {
> >
> > /* Record whether sq is in reset state. */
> > bool reset;
> > +
> > + struct xsk_buff_pool *xsk_pool;
> > +
> > + dma_addr_t xsk_hdr_dma_addr;
> > };
> >
> > /* Internal representation of a receive virtqueue */
> > @@ -494,6 +498,8 @@ struct virtio_net_common_hdr {
> > };
> > };
> >
> > +static struct virtio_net_common_hdr xsk_hdr;
> > +
> > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
> > static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
> > struct net_device *dev,
> > @@ -5476,6 +5482,29 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queu
> > return err;
> > }
> >
> > +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
> > + struct send_queue *sq,
> > + struct xsk_buff_pool *pool)
> > +{
> > + int err, qindex;
> > +
> > + qindex = sq - vi->sq;
> > +
> > + virtnet_tx_pause(vi, sq);
> > +
> > + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
> > + if (err) {
> > + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
> > + pool = NULL;
> > + }
> > +
> > + sq->xsk_pool = pool;
> > +
> > + virtnet_tx_resume(vi, sq);
> > +
> > + return err;
> > +}
> > +
> > static int virtnet_xsk_pool_enable(struct net_device *dev,
> > struct xsk_buff_pool *pool,
> > u16 qid)
> > @@ -5484,6 +5513,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> > struct receive_queue *rq;
> > struct device *dma_dev;
> > struct send_queue *sq;
> > + dma_addr_t hdr_dma;
> > int err, size;
> >
> > if (vi->hdr_len > xsk_pool_get_headroom(pool))
> > @@ -5521,6 +5551,10 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> > if (!rq->xsk_buffs)
> > return -ENOMEM;
> >
> > + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
>
> Let's use the virtqueue_dma_xxx() wrappers here.
Will fix.
Thanks.
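For reference, a rough sketch of what that could look like, assuming the
virtqueue_dma_*() wrappers in drivers/virtio/virtio_ring.c keep their
current signatures (illustration only, not the actual respin):

	hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len,
						  DMA_TO_DEVICE, 0);
	if (virtqueue_dma_mapping_error(sq->vq, hdr_dma))
		return -ENOMEM;

and the matching unmap on the error/disable paths:

	virtqueue_dma_unmap_single_attrs(sq->vq, hdr_dma, vi->hdr_len,
					 DMA_TO_DEVICE, 0);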
>
> > + if (dma_mapping_error(dma_dev, hdr_dma))
> > + return -ENOMEM;
> > +
> > err = xsk_pool_dma_map(pool, dma_dev, 0);
> > if (err)
> > goto err_xsk_map;
> > @@ -5529,11 +5563,23 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> > if (err)
> > goto err_rq;
> >
> > + err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
> > + if (err)
> > + goto err_sq;
> > +
> > + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero.
> > + * So all the tx packets can share a single hdr.
> > + */
> > + sq->xsk_hdr_dma_addr = hdr_dma;
> > +
> > return 0;
> >
> > +err_sq:
> > + virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> > err_rq:
> > xsk_pool_dma_unmap(pool, 0);
> > err_xsk_map:
> > + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE);
> > return err;
> > }
> >
> > @@ -5542,19 +5588,27 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
> > struct virtnet_info *vi = netdev_priv(dev);
> > struct xsk_buff_pool *pool;
> > struct receive_queue *rq;
> > + struct device *dma_dev;
> > + struct send_queue *sq;
> > int err;
> >
> > if (qid >= vi->curr_queue_pairs)
> > return -EINVAL;
> >
> > + sq = &vi->sq[qid];
> > rq = &vi->rq[qid];
> >
> > pool = rq->xsk_pool;
> >
> > err = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> > + err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL);
> >
> > xsk_pool_dma_unmap(pool, 0);
> >
> > + dma_dev = virtqueue_dma_dev(sq->vq);
> > +
> > + dma_unmap_single(dma_dev, sq->xsk_hdr_dma_addr, vi->hdr_len, DMA_TO_DEVICE);
>
> And here.
>
> Thanks
>
> > +
> > kvfree(rq->xsk_buffs);
> >
> > return err;
> > --
> > 2.32.0.3.g01195cf9f
> >
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 09/13] virtio_net: xsk: prevent disable tx napi
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (7 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 08/13] virtio_net: xsk: bind/unbind xsk for tx Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer Xuan Zhuo
` (3 subsequent siblings)
12 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Since the xsk TX queue is consumed by TX NAPI, if an sq is bound to an xsk
pool, we must prevent tx napi from being disabled.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
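As a rough usage sketch (assuming, as in the current driver, that tx napi is
toggled through the tx-frames coalesce parameter), disabling tx napi while an
AF_XDP socket is bound would now be rejected:

	# ethtool -C eth0 tx-frames 0
	(fails with -EBUSY while an xsk pool is bound to any tx queue)

The device name and the exact error reporting above are illustrative.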
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6a36a204e967..221681926d23 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -5005,7 +5005,7 @@ static int virtnet_set_coalesce(struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct virtnet_info *vi = netdev_priv(dev);
- int ret, queue_number, napi_weight;
+ int ret, queue_number, napi_weight, i;
bool update_napi = false;
/* Can't change NAPI weight if the link is up */
@@ -5034,6 +5034,14 @@ static int virtnet_set_coalesce(struct net_device *dev,
return ret;
if (update_napi) {
+ /* xsk xmit depends on the tx napi. So if xsk is active,
+ * prevent modifications to tx napi.
+ */
+ for (i = queue_number; i < vi->max_queue_pairs; i++) {
+ if (vi->sq[i].xsk_pool)
+ return -EBUSY;
+ }
+
for (; queue_number < vi->max_queue_pairs; queue_number++)
vi->sq[queue_number].napi.weight = napi_weight;
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (8 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 09/13] virtio_net: xsk: prevent disable tx napi Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 4:31 ` Jason Wang
2024-08-20 7:33 ` [PATCH net-next 11/13] virtio_net: xsk: tx: handle the transmitted " Xuan Zhuo
` (2 subsequent siblings)
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The driver's tx napi is essential for XSK: it is responsible for fetching
descriptors from the XSK tx queue and transmitting them.
To start transmission, we need to trigger the tx napi first.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 127 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 125 insertions(+), 2 deletions(-)
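To make the new VIRTNET_XMIT_TYPE_XSK handling easier to follow, here is a
small self-contained userspace sketch of the tagging scheme this patch relies
on: the low bits of the ring token carry the buffer type, and for XSK buffers
the frame length is stored above VIRTIO_XSK_FLAG_OFFSET instead of a pointer.
The names below only mirror the driver; they are illustrative, not the kernel
code itself.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum xmit_type {
	XMIT_TYPE_SKB,
	XMIT_TYPE_ORPHAN,
	XMIT_TYPE_XDP,
	XMIT_TYPE_XSK,
};

#define XMIT_TYPE_MASK	0x3UL	/* low two bits hold the type */
#define XSK_FLAG_OFFSET	4	/* XSK: length lives above the tag bits */

/* mirrors virtnet_xsk_to_ptr(): encode a length as a tagged token */
static void *xsk_len_to_token(uint32_t len)
{
	return (void *)(((unsigned long)len << XSK_FLAG_OFFSET) | XMIT_TYPE_XSK);
}

/* mirrors virtnet_xmit_ptr_strip(): pop the type, clear the tag bits */
static enum xmit_type token_strip(void **token)
{
	unsigned long p = (unsigned long)*token;

	*token = (void *)(p & ~XMIT_TYPE_MASK);
	return p & XMIT_TYPE_MASK;
}

/* mirrors virtnet_ptr_to_xsk(): recover the length from the token */
static uint32_t token_to_xsk_len(void *token)
{
	return (unsigned long)token >> XSK_FLAG_OFFSET;
}

int main(void)
{
	void *token = xsk_len_to_token(1500);

	assert(token_strip(&token) == XMIT_TYPE_XSK);
	assert(token_to_xsk_len(token) == 1500);
	printf("xsk token round-trip ok, len = %u\n", token_to_xsk_len(token));
	return 0;
}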
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 221681926d23..3743694d3c3b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -516,10 +516,13 @@ enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
VIRTNET_XMIT_TYPE_ORPHAN,
VIRTNET_XMIT_TYPE_XDP,
+ VIRTNET_XMIT_TYPE_XSK,
};
#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
- | VIRTNET_XMIT_TYPE_XDP)
+ | VIRTNET_XMIT_TYPE_XDP | VIRTNET_XMIT_TYPE_XSK)
+
+#define VIRTIO_XSK_FLAG_OFFSET 4
static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
{
@@ -543,6 +546,11 @@ static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
GFP_ATOMIC);
}
+static u32 virtnet_ptr_to_xsk(void *ptr)
+{
+ return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
+}
+
static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
{
sg_assign_page(sg, NULL);
@@ -584,6 +592,10 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
stats->bytes += xdp_get_frame_len(frame);
xdp_return_frame(frame);
break;
+
+ case VIRTNET_XMIT_TYPE_XSK:
+ stats->bytes += virtnet_ptr_to_xsk(ptr);
+ break;
}
}
netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
@@ -1393,6 +1405,97 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
return 0;
}
+static void *virtnet_xsk_to_ptr(u32 len)
+{
+ unsigned long p;
+
+ p = len << VIRTIO_XSK_FLAG_OFFSET;
+
+ return virtnet_xmit_ptr_mix((void *)p, VIRTNET_XMIT_TYPE_XSK);
+}
+
+static int virtnet_xsk_xmit_one(struct send_queue *sq,
+ struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ struct virtnet_info *vi;
+ dma_addr_t addr;
+
+ vi = sq->vq->vdev->priv;
+
+ addr = xsk_buff_raw_get_dma(pool, desc->addr);
+ xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
+
+ sg_init_table(sq->sg, 2);
+
+ sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len);
+ sg_fill_dma(sq->sg + 1, addr, desc->len);
+
+ return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
+ virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC);
+}
+
+static int virtnet_xsk_xmit_batch(struct send_queue *sq,
+ struct xsk_buff_pool *pool,
+ unsigned int budget,
+ u64 *kicks)
+{
+ struct xdp_desc *descs = pool->tx_descs;
+ bool kick = false;
+ u32 nb_pkts, i;
+ int err;
+
+ budget = min_t(u32, budget, sq->vq->num_free);
+
+ nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
+ if (!nb_pkts)
+ return 0;
+
+ for (i = 0; i < nb_pkts; i++) {
+ err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
+ if (unlikely(err)) {
+ xsk_tx_completed(sq->xsk_pool, nb_pkts - i);
+ break;
+ }
+
+ kick = true;
+ }
+
+ if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
+ (*kicks)++;
+
+ return i;
+}
+
+static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
+ int budget)
+{
+ struct virtnet_info *vi = sq->vq->vdev->priv;
+ struct virtnet_sq_free_stats stats = {};
+ struct net_device *dev = vi->dev;
+ u64 kicks = 0;
+ int sent;
+
+ __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
+
+ sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
+
+ if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
+ check_sq_full_and_disable(vi, vi->dev, sq);
+
+ u64_stats_update_begin(&sq->stats.syncp);
+ u64_stats_add(&sq->stats.packets, stats.packets);
+ u64_stats_add(&sq->stats.bytes, stats.bytes);
+ u64_stats_add(&sq->stats.kicks, kicks);
+ u64_stats_add(&sq->stats.xdp_tx, sent);
+ u64_stats_update_end(&sq->stats.syncp);
+
+ if (xsk_uses_need_wakeup(pool))
+ xsk_set_tx_need_wakeup(pool);
+
+ return sent == budget;
+}
+
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct send_queue *sq,
struct xdp_frame *xdpf)
@@ -2949,6 +3052,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
struct virtnet_info *vi = sq->vq->vdev->priv;
unsigned int index = vq2txq(sq->vq);
struct netdev_queue *txq;
+ bool xsk_busy = false;
int opaque;
bool done;
@@ -2961,7 +3065,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
txq = netdev_get_tx_queue(vi->dev, index);
__netif_tx_lock(txq, raw_smp_processor_id());
virtqueue_disable_cb(sq->vq);
- free_old_xmit(sq, txq, !!budget);
+
+ if (sq->xsk_pool)
+ xsk_busy = virtnet_xsk_xmit(sq, sq->xsk_pool, budget);
+ else
+ free_old_xmit(sq, txq, !!budget);
if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
if (netif_tx_queue_stopped(txq)) {
@@ -2972,6 +3080,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
netif_tx_wake_queue(txq);
}
+ if (xsk_busy) {
+ __netif_tx_unlock(txq);
+ return budget;
+ }
+
opaque = virtqueue_enable_cb_prepare(sq->vq);
done = napi_complete_done(napi, 0);
@@ -5974,6 +6087,12 @@ static void free_receive_page_frags(struct virtnet_info *vi)
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
{
+ struct virtnet_info *vi = vq->vdev->priv;
+ struct send_queue *sq;
+ int i = vq2rxq(vq);
+
+ sq = &vi->sq[i];
+
switch (virtnet_xmit_ptr_strip(&buf)) {
case VIRTNET_XMIT_TYPE_SKB:
case VIRTNET_XMIT_TYPE_ORPHAN:
@@ -5983,6 +6102,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
case VIRTNET_XMIT_TYPE_XDP:
xdp_return_frame(buf);
break;
+
+ case VIRTNET_XMIT_TYPE_XSK:
+ xsk_tx_completed(sq->xsk_pool, 1);
+ break;
}
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer
2024-08-20 7:33 ` [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer Xuan Zhuo
@ 2024-09-11 4:31 ` Jason Wang
2024-09-12 8:48 ` Xuan Zhuo
0 siblings, 1 reply; 36+ messages in thread
From: Jason Wang @ 2024-09-11 4:31 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> The driver's tx napi is very important for XSK. It is responsible for
> obtaining data from the XSK queue and sending it out.
>
> At the beginning, we need to trigger tx napi.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 127 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 221681926d23..3743694d3c3b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -516,10 +516,13 @@ enum virtnet_xmit_type {
> VIRTNET_XMIT_TYPE_SKB,
> VIRTNET_XMIT_TYPE_ORPHAN,
> VIRTNET_XMIT_TYPE_XDP,
> + VIRTNET_XMIT_TYPE_XSK,
> };
>
> #define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
> - | VIRTNET_XMIT_TYPE_XDP)
> + | VIRTNET_XMIT_TYPE_XDP | VIRTNET_XMIT_TYPE_XSK)
> +
> +#define VIRTIO_XSK_FLAG_OFFSET 4
>
> static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> {
> @@ -543,6 +546,11 @@ static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> GFP_ATOMIC);
> }
>
> +static u32 virtnet_ptr_to_xsk(void *ptr)
> +{
> + return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
> +}
> +
This needs a better name, otherwise readers might be confused.
E.g. something like virtnet_ptr_to_xsk_buff_len()?
> static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> {
> sg_assign_page(sg, NULL);
> @@ -584,6 +592,10 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> stats->bytes += xdp_get_frame_len(frame);
> xdp_return_frame(frame);
> break;
> +
> + case VIRTNET_XMIT_TYPE_XSK:
> + stats->bytes += virtnet_ptr_to_xsk(ptr);
> + break;
Do we miss xsk_tx_completed() here?
> }
> }
> netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> @@ -1393,6 +1405,97 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> return 0;
> }
>
> +static void *virtnet_xsk_to_ptr(u32 len)
> +{
> + unsigned long p;
> +
> + p = len << VIRTIO_XSK_FLAG_OFFSET;
> +
> + return virtnet_xmit_ptr_mix((void *)p, VIRTNET_XMIT_TYPE_XSK);
> +}
> +
> +static int virtnet_xsk_xmit_one(struct send_queue *sq,
> + struct xsk_buff_pool *pool,
> + struct xdp_desc *desc)
> +{
> + struct virtnet_info *vi;
> + dma_addr_t addr;
> +
> + vi = sq->vq->vdev->priv;
> +
> + addr = xsk_buff_raw_get_dma(pool, desc->addr);
> + xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
> +
> + sg_init_table(sq->sg, 2);
> +
> + sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len);
> + sg_fill_dma(sq->sg + 1, addr, desc->len);
> +
> + return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
> + virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC);
> +}
> +
> +static int virtnet_xsk_xmit_batch(struct send_queue *sq,
> + struct xsk_buff_pool *pool,
> + unsigned int budget,
> + u64 *kicks)
> +{
> + struct xdp_desc *descs = pool->tx_descs;
> + bool kick = false;
> + u32 nb_pkts, i;
> + int err;
> +
> + budget = min_t(u32, budget, sq->vq->num_free);
> +
> + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> + if (!nb_pkts)
> + return 0;
> +
> + for (i = 0; i < nb_pkts; i++) {
> + err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
> + if (unlikely(err)) {
> + xsk_tx_completed(sq->xsk_pool, nb_pkts - i);
Should we kick in this condition?
> + break;
> + }
> +
> + kick = true;
> + }
> +
> + if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
Can we simply use virtqueue_kick() here?
> + (*kicks)++;
> +
> + return i;
> +}
> +
> +static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> + int budget)
> +{
> + struct virtnet_info *vi = sq->vq->vdev->priv;
> + struct virtnet_sq_free_stats stats = {};
> + struct net_device *dev = vi->dev;
> + u64 kicks = 0;
> + int sent;
> +
> + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
I haven't checked in depth, but I wonder if we have some side effects
when using non NAPI tx mode:
if (napi->weight)
virtqueue_napi_schedule(napi, vq);
else
/* We were probably waiting for more output buffers. */
netif_wake_subqueue(vi->dev, vq2txq(vq));
Does this mean xsk will suffer the same issue: when there's no xsk
xmit request, we could end up with no way to reclaim transmitted xsk
buffers? (Or should we disallow AF_XDP to be used for non TX-NAPI
mode?)
> +
> + sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
> +
> + if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
> + check_sq_full_and_disable(vi, vi->dev, sq);
> +
> + u64_stats_update_begin(&sq->stats.syncp);
> + u64_stats_add(&sq->stats.packets, stats.packets);
> + u64_stats_add(&sq->stats.bytes, stats.bytes);
> + u64_stats_add(&sq->stats.kicks, kicks);
> + u64_stats_add(&sq->stats.xdp_tx, sent);
> + u64_stats_update_end(&sq->stats.syncp);
> +
> + if (xsk_uses_need_wakeup(pool))
> + xsk_set_tx_need_wakeup(pool);
> +
> + return sent == budget;
> +}
> +
> static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> struct send_queue *sq,
> struct xdp_frame *xdpf)
> @@ -2949,6 +3052,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> struct virtnet_info *vi = sq->vq->vdev->priv;
> unsigned int index = vq2txq(sq->vq);
> struct netdev_queue *txq;
> + bool xsk_busy = false;
> int opaque;
> bool done;
>
> @@ -2961,7 +3065,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> txq = netdev_get_tx_queue(vi->dev, index);
> __netif_tx_lock(txq, raw_smp_processor_id());
> virtqueue_disable_cb(sq->vq);
> - free_old_xmit(sq, txq, !!budget);
> +
> + if (sq->xsk_pool)
> + xsk_busy = virtnet_xsk_xmit(sq, sq->xsk_pool, budget);
> + else
> + free_old_xmit(sq, txq, !!budget);
>
> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> if (netif_tx_queue_stopped(txq)) {
> @@ -2972,6 +3080,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> netif_tx_wake_queue(txq);
> }
>
> + if (xsk_busy) {
> + __netif_tx_unlock(txq);
> + return budget;
> + }
> +
> opaque = virtqueue_enable_cb_prepare(sq->vq);
>
> done = napi_complete_done(napi, 0);
> @@ -5974,6 +6087,12 @@ static void free_receive_page_frags(struct virtnet_info *vi)
>
> static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> {
> + struct virtnet_info *vi = vq->vdev->priv;
> + struct send_queue *sq;
> + int i = vq2rxq(vq);
> +
> + sq = &vi->sq[i];
> +
> switch (virtnet_xmit_ptr_strip(&buf)) {
> case VIRTNET_XMIT_TYPE_SKB:
> case VIRTNET_XMIT_TYPE_ORPHAN:
> @@ -5983,6 +6102,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> case VIRTNET_XMIT_TYPE_XDP:
> xdp_return_frame(buf);
> break;
> +
> + case VIRTNET_XMIT_TYPE_XSK:
> + xsk_tx_completed(sq->xsk_pool, 1);
> + break;
> }
> }
>
> --
> 2.32.0.3.g01195cf9f
>
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer
2024-09-11 4:31 ` Jason Wang
@ 2024-09-12 8:48 ` Xuan Zhuo
2024-09-13 3:21 ` Jason Wang
0 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 8:48 UTC (permalink / raw)
To: Jason Wang
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 12:31:32 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > The driver's tx napi is very important for XSK. It is responsible for
> > obtaining data from the XSK queue and sending it out.
> >
> > At the beginning, we need to trigger tx napi.
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> > drivers/net/virtio_net.c | 127 ++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 125 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 221681926d23..3743694d3c3b 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -516,10 +516,13 @@ enum virtnet_xmit_type {
> > VIRTNET_XMIT_TYPE_SKB,
> > VIRTNET_XMIT_TYPE_ORPHAN,
> > VIRTNET_XMIT_TYPE_XDP,
> > + VIRTNET_XMIT_TYPE_XSK,
> > };
> >
> > #define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
> > - | VIRTNET_XMIT_TYPE_XDP)
> > + | VIRTNET_XMIT_TYPE_XDP | VIRTNET_XMIT_TYPE_XSK)
> > +
> > +#define VIRTIO_XSK_FLAG_OFFSET 4
> >
> > static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> > {
> > @@ -543,6 +546,11 @@ static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> > GFP_ATOMIC);
> > }
> >
> > +static u32 virtnet_ptr_to_xsk(void *ptr)
> > +{
> > + return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
> > +}
> > +
>
> This needs a better name, otherwise readers might be confused.
>
> E.g something like virtnet_ptr_to_xsk_buff_len()?
>
> > static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > {
> > sg_assign_page(sg, NULL);
> > @@ -584,6 +592,10 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > stats->bytes += xdp_get_frame_len(frame);
> > xdp_return_frame(frame);
> > break;
> > +
> > + case VIRTNET_XMIT_TYPE_XSK:
> > + stats->bytes += virtnet_ptr_to_xsk(ptr);
> > + break;
>
> Do we miss xsk_tx_completed() here?
>
> > }
> > }
> > netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > @@ -1393,6 +1405,97 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > return 0;
> > }
> >
> > +static void *virtnet_xsk_to_ptr(u32 len)
> > +{
> > + unsigned long p;
> > +
> > + p = len << VIRTIO_XSK_FLAG_OFFSET;
> > +
> > + return virtnet_xmit_ptr_mix((void *)p, VIRTNET_XMIT_TYPE_XSK);
> > +}
> > +
> > +static int virtnet_xsk_xmit_one(struct send_queue *sq,
> > + struct xsk_buff_pool *pool,
> > + struct xdp_desc *desc)
> > +{
> > + struct virtnet_info *vi;
> > + dma_addr_t addr;
> > +
> > + vi = sq->vq->vdev->priv;
> > +
> > + addr = xsk_buff_raw_get_dma(pool, desc->addr);
> > + xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
> > +
> > + sg_init_table(sq->sg, 2);
> > +
> > + sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len);
> > + sg_fill_dma(sq->sg + 1, addr, desc->len);
> > +
> > + return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
> > + virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC);
> > +}
> > +
> > +static int virtnet_xsk_xmit_batch(struct send_queue *sq,
> > + struct xsk_buff_pool *pool,
> > + unsigned int budget,
> > + u64 *kicks)
> > +{
> > + struct xdp_desc *descs = pool->tx_descs;
> > + bool kick = false;
> > + u32 nb_pkts, i;
> > + int err;
> > +
> > + budget = min_t(u32, budget, sq->vq->num_free);
> > +
> > + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> > + if (!nb_pkts)
> > + return 0;
> > +
> > + for (i = 0; i < nb_pkts; i++) {
> > + err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
> > + if (unlikely(err)) {
> > + xsk_tx_completed(sq->xsk_pool, nb_pkts - i);
>
> Should we kick in this condition?
>
> > + break;
> > + }
> > +
> > + kick = true;
> > + }
> > +
> > + if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
>
> Can we simply use virtqueue_kick() here?
>
> > + (*kicks)++;
> > +
> > + return i;
> > +}
> > +
> > +static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > + int budget)
> > +{
> > + struct virtnet_info *vi = sq->vq->vdev->priv;
> > + struct virtnet_sq_free_stats stats = {};
> > + struct net_device *dev = vi->dev;
> > + u64 kicks = 0;
> > + int sent;
> > +
> > + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
>
> I haven't checked in depth, but I wonder if we have some side effects
> when using non NAPI tx mode:
>
> if (napi->weight)
> virtqueue_napi_schedule(napi, vq);
> else
> /* We were probably waiting for more output buffers. */
> netif_wake_subqueue(vi->dev, vq2txq(vq));
>
> Does this mean xsk will suffer the same issue like when there's no xsk
> xmit request, we could end up with no way to reclaim xmitted xsk
> buffers? (Or should we disallow AF_XDP to be used for non TX-NAPI
> mode?)
We disallow AF_XDP from being used in non TX-NAPI mode.
Patch #9 does this:
#9 [PATCH net-next 09/13] virtio_net: xsk: prevent disable tx napi
Thanks.
>
> > +
> > + sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
> > +
> > + if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
> > + check_sq_full_and_disable(vi, vi->dev, sq);
> > +
> > + u64_stats_update_begin(&sq->stats.syncp);
> > + u64_stats_add(&sq->stats.packets, stats.packets);
> > + u64_stats_add(&sq->stats.bytes, stats.bytes);
> > + u64_stats_add(&sq->stats.kicks, kicks);
> > + u64_stats_add(&sq->stats.xdp_tx, sent);
> > + u64_stats_update_end(&sq->stats.syncp);
> > +
> > + if (xsk_uses_need_wakeup(pool))
> > + xsk_set_tx_need_wakeup(pool);
> > +
> > + return sent == budget;
> > +}
> > +
> > static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > struct send_queue *sq,
> > struct xdp_frame *xdpf)
> > @@ -2949,6 +3052,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > struct virtnet_info *vi = sq->vq->vdev->priv;
> > unsigned int index = vq2txq(sq->vq);
> > struct netdev_queue *txq;
> > + bool xsk_busy = false;
> > int opaque;
> > bool done;
> >
> > @@ -2961,7 +3065,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > txq = netdev_get_tx_queue(vi->dev, index);
> > __netif_tx_lock(txq, raw_smp_processor_id());
> > virtqueue_disable_cb(sq->vq);
> > - free_old_xmit(sq, txq, !!budget);
> > +
> > + if (sq->xsk_pool)
> > + xsk_busy = virtnet_xsk_xmit(sq, sq->xsk_pool, budget);
> > + else
> > + free_old_xmit(sq, txq, !!budget);
> >
> > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> > if (netif_tx_queue_stopped(txq)) {
> > @@ -2972,6 +3080,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > netif_tx_wake_queue(txq);
> > }
> >
> > + if (xsk_busy) {
> > + __netif_tx_unlock(txq);
> > + return budget;
> > + }
> > +
> > opaque = virtqueue_enable_cb_prepare(sq->vq);
> >
> > done = napi_complete_done(napi, 0);
> > @@ -5974,6 +6087,12 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> >
> > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > {
> > + struct virtnet_info *vi = vq->vdev->priv;
> > + struct send_queue *sq;
> > + int i = vq2rxq(vq);
> > +
> > + sq = &vi->sq[i];
> > +
> > switch (virtnet_xmit_ptr_strip(&buf)) {
> > case VIRTNET_XMIT_TYPE_SKB:
> > case VIRTNET_XMIT_TYPE_ORPHAN:
> > @@ -5983,6 +6102,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > case VIRTNET_XMIT_TYPE_XDP:
> > xdp_return_frame(buf);
> > break;
> > +
> > + case VIRTNET_XMIT_TYPE_XSK:
> > + xsk_tx_completed(sq->xsk_pool, 1);
> > + break;
> > }
> > }
> >
> > --
> > 2.32.0.3.g01195cf9f
> >
>
> Thanks
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer
2024-09-12 8:48 ` Xuan Zhuo
@ 2024-09-13 3:21 ` Jason Wang
0 siblings, 0 replies; 36+ messages in thread
From: Jason Wang @ 2024-09-13 3:21 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Thu, Sep 12, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 11 Sep 2024 12:31:32 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > The driver's tx napi is very important for XSK. It is responsible for
> > > obtaining data from the XSK queue and sending it out.
> > >
> > > At the beginning, we need to trigger tx napi.
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > > drivers/net/virtio_net.c | 127 ++++++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 125 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 221681926d23..3743694d3c3b 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -516,10 +516,13 @@ enum virtnet_xmit_type {
> > > VIRTNET_XMIT_TYPE_SKB,
> > > VIRTNET_XMIT_TYPE_ORPHAN,
> > > VIRTNET_XMIT_TYPE_XDP,
> > > + VIRTNET_XMIT_TYPE_XSK,
> > > };
> > >
> > > #define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_ORPHAN \
> > > - | VIRTNET_XMIT_TYPE_XDP)
> > > + | VIRTNET_XMIT_TYPE_XDP | VIRTNET_XMIT_TYPE_XSK)
> > > +
> > > +#define VIRTIO_XSK_FLAG_OFFSET 4
> > >
> > > static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> > > {
> > > @@ -543,6 +546,11 @@ static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> > > GFP_ATOMIC);
> > > }
> > >
> > > +static u32 virtnet_ptr_to_xsk(void *ptr)
> > > +{
> > > + return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
> > > +}
> > > +
> >
> > This needs a better name, otherwise readers might be confused.
> >
> > E.g something like virtnet_ptr_to_xsk_buff_len()?
> >
> > > static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > {
> > > sg_assign_page(sg, NULL);
> > > @@ -584,6 +592,10 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > stats->bytes += xdp_get_frame_len(frame);
> > > xdp_return_frame(frame);
> > > break;
> > > +
> > > + case VIRTNET_XMIT_TYPE_XSK:
> > > + stats->bytes += virtnet_ptr_to_xsk(ptr);
> > > + break;
> >
> > Do we miss xsk_tx_completed() here?
> >
> > > }
> > > }
> > > netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > @@ -1393,6 +1405,97 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > > return 0;
> > > }
> > >
> > > +static void *virtnet_xsk_to_ptr(u32 len)
> > > +{
> > > + unsigned long p;
> > > +
> > > + p = len << VIRTIO_XSK_FLAG_OFFSET;
> > > +
> > > + return virtnet_xmit_ptr_mix((void *)p, VIRTNET_XMIT_TYPE_XSK);
> > > +}
> > > +
> > > +static int virtnet_xsk_xmit_one(struct send_queue *sq,
> > > + struct xsk_buff_pool *pool,
> > > + struct xdp_desc *desc)
> > > +{
> > > + struct virtnet_info *vi;
> > > + dma_addr_t addr;
> > > +
> > > + vi = sq->vq->vdev->priv;
> > > +
> > > + addr = xsk_buff_raw_get_dma(pool, desc->addr);
> > > + xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
> > > +
> > > + sg_init_table(sq->sg, 2);
> > > +
> > > + sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len);
> > > + sg_fill_dma(sq->sg + 1, addr, desc->len);
> > > +
> > > + return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
> > > + virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC);
> > > +}
> > > +
> > > +static int virtnet_xsk_xmit_batch(struct send_queue *sq,
> > > + struct xsk_buff_pool *pool,
> > > + unsigned int budget,
> > > + u64 *kicks)
> > > +{
> > > + struct xdp_desc *descs = pool->tx_descs;
> > > + bool kick = false;
> > > + u32 nb_pkts, i;
> > > + int err;
> > > +
> > > + budget = min_t(u32, budget, sq->vq->num_free);
> > > +
> > > + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> > > + if (!nb_pkts)
> > > + return 0;
> > > +
> > > + for (i = 0; i < nb_pkts; i++) {
> > > + err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
> > > + if (unlikely(err)) {
> > > + xsk_tx_completed(sq->xsk_pool, nb_pkts - i);
> >
> > Should we kick in this condition?
> >
> > > + break;
> > > + }
> > > +
> > > + kick = true;
> > > + }
> > > +
> > > + if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
> >
> > Can we simply use virtqueue_kick() here?
> >
> > > + (*kicks)++;
> > > +
> > > + return i;
> > > +}
> > > +
> > > +static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > > + int budget)
> > > +{
> > > + struct virtnet_info *vi = sq->vq->vdev->priv;
> > > + struct virtnet_sq_free_stats stats = {};
> > > + struct net_device *dev = vi->dev;
> > > + u64 kicks = 0;
> > > + int sent;
> > > +
> > > + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> >
> > I haven't checked in depth, but I wonder if we have some side effects
> > when using non NAPI tx mode:
> >
> > if (napi->weight)
> > virtqueue_napi_schedule(napi, vq);
> > else
> > /* We were probably waiting for more output buffers. */
> > netif_wake_subqueue(vi->dev, vq2txq(vq));
> >
> > Does this mean xsk will suffer the same issue like when there's no xsk
> > xmit request, we could end up with no way to reclaim xmitted xsk
> > buffers? (Or should we disallow AF_XDP to be used for non TX-NAPI
> > mode?)
>
> Disallow AF_XDP to be used for non TX-NAPI mode.
>
> The last patch #9 does this.
>
> #9 [PATCH net-next 09/13] virtio_net: xsk: prevent disable tx napi
Great, for some reason I missed that.
Thanks
>
> Thanks.
> >
> > > +
> > > + sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
> > > +
> > > + if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
> > > + check_sq_full_and_disable(vi, vi->dev, sq);
> > > +
> > > + u64_stats_update_begin(&sq->stats.syncp);
> > > + u64_stats_add(&sq->stats.packets, stats.packets);
> > > + u64_stats_add(&sq->stats.bytes, stats.bytes);
> > > + u64_stats_add(&sq->stats.kicks, kicks);
> > > + u64_stats_add(&sq->stats.xdp_tx, sent);
> > > + u64_stats_update_end(&sq->stats.syncp);
> > > +
> > > + if (xsk_uses_need_wakeup(pool))
> > > + xsk_set_tx_need_wakeup(pool);
> > > +
> > > + return sent == budget;
> > > +}
> > > +
> > > static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > > struct send_queue *sq,
> > > struct xdp_frame *xdpf)
> > > @@ -2949,6 +3052,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > > struct virtnet_info *vi = sq->vq->vdev->priv;
> > > unsigned int index = vq2txq(sq->vq);
> > > struct netdev_queue *txq;
> > > + bool xsk_busy = false;
> > > int opaque;
> > > bool done;
> > >
> > > @@ -2961,7 +3065,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > > txq = netdev_get_tx_queue(vi->dev, index);
> > > __netif_tx_lock(txq, raw_smp_processor_id());
> > > virtqueue_disable_cb(sq->vq);
> > > - free_old_xmit(sq, txq, !!budget);
> > > +
> > > + if (sq->xsk_pool)
> > > + xsk_busy = virtnet_xsk_xmit(sq, sq->xsk_pool, budget);
> > > + else
> > > + free_old_xmit(sq, txq, !!budget);
> > >
> > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> > > if (netif_tx_queue_stopped(txq)) {
> > > @@ -2972,6 +3080,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > > netif_tx_wake_queue(txq);
> > > }
> > >
> > > + if (xsk_busy) {
> > > + __netif_tx_unlock(txq);
> > > + return budget;
> > > + }
> > > +
> > > opaque = virtqueue_enable_cb_prepare(sq->vq);
> > >
> > > done = napi_complete_done(napi, 0);
> > > @@ -5974,6 +6087,12 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > >
> > > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > {
> > > + struct virtnet_info *vi = vq->vdev->priv;
> > > + struct send_queue *sq;
> > > + int i = vq2rxq(vq);
> > > +
> > > + sq = &vi->sq[i];
> > > +
> > > switch (virtnet_xmit_ptr_strip(&buf)) {
> > > case VIRTNET_XMIT_TYPE_SKB:
> > > case VIRTNET_XMIT_TYPE_ORPHAN:
> > > @@ -5983,6 +6102,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > case VIRTNET_XMIT_TYPE_XDP:
> > > xdp_return_frame(buf);
> > > break;
> > > +
> > > + case VIRTNET_XMIT_TYPE_XSK:
> > > + xsk_tx_completed(sq->xsk_pool, 1);
> > > + break;
> > > }
> > > }
> > >
> > > --
> > > 2.32.0.3.g01195cf9f
> > >
> >
> > Thanks
> >
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 11/13] virtio_net: xsk: tx: handle the transmitted xsk buffer
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (9 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 10/13] virtio_net: xsk: tx: support xmit xsk buffer Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 4:32 ` Jason Wang
2024-08-20 7:33 ` [PATCH net-next 12/13] virtio_net: update tx timeout record Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 13/13] virtio_net: xdp_features add NETDEV_XDP_ACT_XSK_ZEROCOPY Xuan Zhuo
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
virtnet_free_old_xmit() distinguishes the three pointer types (skb, xdp
frame, xsk buffer) by the low bits of the pointer.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 85 ++++++++++++++++++++++++++++------------
1 file changed, 59 insertions(+), 26 deletions(-)
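A minimal userspace sketch of the completion flow this patch adds: the free
pass only counts completed XSK descriptors in the stats, the count is
reported back to the pool once per pass, and the tx worker is woken so the
queue keeps draining even when no tx interrupt fires. All names below are
illustrative stand-ins for the driver and xsk APIs, not the kernel code.

#include <stdio.h>

struct free_stats {
	unsigned long packets;
	unsigned long xsk;	/* completed AF_XDP descriptors */
};

/* stand-in for xsk_tx_completed(pool, n) */
static void pool_tx_completed(unsigned long n)
{
	printf("report %lu completed xsk descriptors to the pool\n", n);
}

/* stand-in for the tx-napi kick done by virtnet_xsk_completed() */
static void wake_tx_worker(void)
{
	printf("wake tx worker to keep draining the xsk tx queue\n");
}

enum buf_type { BUF_SKB, BUF_XDP, BUF_XSK };

/* stand-in for __free_old_xmit(): count per-type results only */
static void free_old_xmit(const enum buf_type *bufs, int n,
			  struct free_stats *stats)
{
	for (int i = 0; i < n; i++) {
		stats->packets++;
		if (bufs[i] == BUF_XSK)
			stats->xsk++;
		/* skb/xdp frames would be returned to their owners here */
	}
}

int main(void)
{
	enum buf_type ring[] = { BUF_XSK, BUF_SKB, BUF_XSK, BUF_XSK };
	struct free_stats stats = {0};

	free_old_xmit(ring, 4, &stats);
	if (stats.xsk) {
		pool_tx_completed(stats.xsk);	/* batched, once per pass */
		wake_tx_worker();		/* tx irq may not fire again */
	}
	return 0;
}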
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3743694d3c3b..a898568bed5c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -83,6 +83,7 @@ struct virtnet_sq_free_stats {
u64 bytes;
u64 napi_packets;
u64 napi_bytes;
+ u64 xsk;
};
struct virtnet_sq_stats {
@@ -511,6 +512,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
struct sk_buff *curr_skb,
struct page *page, void *buf,
int len, int truesize);
+static void virtnet_xsk_completed(struct send_queue *sq, int num);
enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
@@ -595,12 +597,24 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
case VIRTNET_XMIT_TYPE_XSK:
stats->bytes += virtnet_ptr_to_xsk(ptr);
+ stats->xsk++;
break;
}
}
netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
}
+static void virtnet_free_old_xmit(struct send_queue *sq,
+ struct netdev_queue *txq,
+ bool in_napi,
+ struct virtnet_sq_free_stats *stats)
+{
+ __free_old_xmit(sq, txq, in_napi, stats);
+
+ if (stats->xsk)
+ virtnet_xsk_completed(sq, stats->xsk);
+}
+
/* Converting between virtqueue no. and kernel tx/rx queue no.
* 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
*/
@@ -1021,7 +1035,7 @@ static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
{
struct virtnet_sq_free_stats stats = {0};
- __free_old_xmit(sq, txq, in_napi, &stats);
+ virtnet_free_old_xmit(sq, txq, in_napi, &stats);
/* Avoid overhead when no packets have been processed
* happens when called speculatively from start_xmit.
@@ -1382,29 +1396,6 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
return err;
}
-static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
-{
- struct virtnet_info *vi = netdev_priv(dev);
- struct send_queue *sq;
-
- if (!netif_running(dev))
- return -ENETDOWN;
-
- if (qid >= vi->curr_queue_pairs)
- return -EINVAL;
-
- sq = &vi->sq[qid];
-
- if (napi_if_scheduled_mark_missed(&sq->napi))
- return 0;
-
- local_bh_disable();
- virtqueue_napi_schedule(&sq->napi, sq->vq);
- local_bh_enable();
-
- return 0;
-}
-
static void *virtnet_xsk_to_ptr(u32 len)
{
unsigned long p;
@@ -1476,8 +1467,12 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
u64 kicks = 0;
int sent;
+ /* Avoid waking up the napi needlessly, so call __free_old_xmit() directly. */
__free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
+ if (stats.xsk)
+ xsk_tx_completed(sq->xsk_pool, stats.xsk);
+
sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
@@ -1496,6 +1491,44 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
return sent == budget;
}
+static void xsk_wakeup(struct send_queue *sq)
+{
+ if (napi_if_scheduled_mark_missed(&sq->napi))
+ return;
+
+ local_bh_disable();
+ virtqueue_napi_schedule(&sq->napi, sq->vq);
+ local_bh_enable();
+}
+
+static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct send_queue *sq;
+
+ if (!netif_running(dev))
+ return -ENETDOWN;
+
+ if (qid >= vi->curr_queue_pairs)
+ return -EINVAL;
+
+ sq = &vi->sq[qid];
+
+ xsk_wakeup(sq);
+ return 0;
+}
+
+static void virtnet_xsk_completed(struct send_queue *sq, int num)
+{
+ xsk_tx_completed(sq->xsk_pool, num);
+
+ /* If this is called from rx poll, start_xmit or xdp xmit, we should
+ * wake up the tx napi to consume the xsk tx queue, because the tx
+ * interrupt may not be triggered.
+ */
+ xsk_wakeup(sq);
+}
+
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct send_queue *sq,
struct xdp_frame *xdpf)
@@ -1609,8 +1642,8 @@ static int virtnet_xdp_xmit(struct net_device *dev,
}
/* Free up any pending old buffers before queueing new ones. */
- __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq),
- false, &stats);
+ virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq),
+ false, &stats);
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 11/13] virtio_net: xsk: tx: handle the transmitted xsk buffer
2024-08-20 7:33 ` [PATCH net-next 11/13] virtio_net: xsk: tx: handle the transmitted " Xuan Zhuo
@ 2024-09-11 4:32 ` Jason Wang
2024-09-12 7:55 ` Xuan Zhuo
0 siblings, 1 reply; 36+ messages in thread
From: Jason Wang @ 2024-09-11 4:32 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:34 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> virtnet_free_old_xmit distinguishes three type ptr(skb, xdp frame, xsk
> buffer) by the last bits of the pointer.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
I suggest squashing this into the previous patch which looks more
logical and complete.
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 11/13] virtio_net: xsk: tx: handle the transmitted xsk buffer
2024-09-11 4:32 ` Jason Wang
@ 2024-09-12 7:55 ` Xuan Zhuo
0 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-09-12 7:55 UTC (permalink / raw)
To: Jason Wang
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 11 Sep 2024 12:32:54 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Aug 20, 2024 at 3:34 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > virtnet_free_old_xmit distinguishes three type ptr(skb, xdp frame, xsk
> > buffer) by the last bits of the pointer.
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>
> I suggest squashing this into the previous patch which looks more
> logical and complete.
OK.
Thanks.
>
> Thanks
>
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 12/13] virtio_net: update tx timeout record
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (10 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 11/13] virtio_net: xsk: tx: handle the transmitted " Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-08-20 7:33 ` [PATCH net-next 13/13] virtio_net: xdp_features add NETDEV_XDP_ACT_XSK_ZEROCOPY Xuan Zhuo
12 siblings, 0 replies; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
If the send queue transmitted some packets, update the tx timeout
record to prevent spurious tx timeouts.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a898568bed5c..28c5f9e77fa3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1478,6 +1478,13 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
check_sq_full_and_disable(vi, vi->dev, sq);
+ if (stats.packets) {
+ struct netdev_queue *txq;
+
+ txq = netdev_get_tx_queue(vi->dev, sq - vi->sq);
+ txq_trans_cond_update(txq);
+ }
+
u64_stats_update_begin(&sq->stats.syncp);
u64_stats_add(&sq->stats.packets, stats.packets);
u64_stats_add(&sq->stats.bytes, stats.bytes);
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 13/13] virtio_net: xdp_features add NETDEV_XDP_ACT_XSK_ZEROCOPY
2024-08-20 7:33 [PATCH net-next 00/13] virtio-net: support AF_XDP zero copy (tx) Xuan Zhuo
` (11 preceding siblings ...)
2024-08-20 7:33 ` [PATCH net-next 12/13] virtio_net: update tx timeout record Xuan Zhuo
@ 2024-08-20 7:33 ` Xuan Zhuo
2024-09-11 4:33 ` Jason Wang
12 siblings, 1 reply; 36+ messages in thread
From: Xuan Zhuo @ 2024-08-20 7:33 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Now, we supported AF_XDP(xsk). Add NETDEV_XDP_ACT_XSK_ZEROCOPY to
xdp_features.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 28c5f9e77fa3..8e4578ec1937 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -6594,7 +6594,8 @@ static int virtnet_probe(struct virtio_device *vdev)
dev->hw_features |= NETIF_F_GRO_HW;
dev->vlan_features = dev->features;
- dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;
+ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
+ NETDEV_XDP_ACT_XSK_ZEROCOPY;
/* MTU range: 68 - 65535 */
dev->min_mtu = MIN_MTU;
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 13/13] virtio_net: xdp_features add NETDEV_XDP_ACT_XSK_ZEROCOPY
2024-08-20 7:33 ` [PATCH net-next 13/13] virtio_net: xdp_features add NETDEV_XDP_ACT_XSK_ZEROCOPY Xuan Zhuo
@ 2024-09-11 4:33 ` Jason Wang
0 siblings, 0 replies; 36+ messages in thread
From: Jason Wang @ 2024-09-11 4:33 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Aug 20, 2024 at 3:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> Now, we supported AF_XDP(xsk). Add NETDEV_XDP_ACT_XSK_ZEROCOPY to
Should be "support".
> xdp_features.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Other than this.
Acked-by: Jason Wang <jasowang@redhat.com>
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread