* [PATCH net-next v4 01/15] virtio_ring: introduce dma map api for page
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-12 23:20 ` Jakub Kicinski
2024-06-11 11:41 ` [PATCH net-next v4 02/15] virtio_ring: introduce vring_need_unmap_buffer Xuan Zhuo
` (13 subsequent siblings)
14 siblings, 1 reply; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The virtio-net sq will use these APIs to map the scatterlist.
For a scatterlist, the page DMA APIs are more appropriate.
dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page,
size_t offset, size_t size,
enum dma_data_direction dir,
unsigned long attrs);
void virtqueue_dma_unmap_page_attrs(struct virtqueue *_vq, dma_addr_t addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs);
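For reference, a minimal usage sketch (illustrative only, not part of
the patch; assumes the caller owns a suitable page and a vq that is in
premapped mode):

    dma_addr_t addr;

    addr = virtqueue_dma_map_page_attrs(vq, page, 0, PAGE_SIZE,
                                        DMA_TO_DEVICE, 0);
    if (virtqueue_dma_mapping_error(vq, addr))
        return -ENOMEM;

    /* ... hand addr to the premapped vq ... */

    virtqueue_dma_unmap_page_attrs(vq, addr, PAGE_SIZE,
                                   DMA_TO_DEVICE, 0);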
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/virtio/virtio_ring.c | 54 ++++++++++++++++++++++++++++++++++++
include/linux/virtio.h | 7 +++++
2 files changed, 61 insertions(+)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 2a972752ff1b..3d28008d6cd4 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -3152,6 +3152,60 @@ void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr,
}
EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs);
+/**
+ * virtqueue_dma_map_page_attrs - map DMA for _vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @page: the page to do dma
+ * @offset: the offset inside the page
+ * @size: the size of the page to do dma
+ * @dir: DMA direction
+ * @attrs: DMA Attrs
+ *
+ * The caller calls this to do dma mapping in advance. The DMA address can be
+ * passed to this _vq when it is in pre-mapped mode.
+ *
+ * return DMA address. Caller should check that by virtqueue_dma_mapping_error().
+ */
+dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page,
+ size_t offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (!vq->use_dma_api) {
+ kmsan_handle_dma(page, offset, size, dir);
+ return page_to_phys(page) + offset;
+ }
+
+ return dma_map_page_attrs(vring_dma_dev(vq), page, offset, size, dir, attrs);
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_map_page_attrs);
+
+/**
+ * virtqueue_dma_unmap_page_attrs - unmap DMA for _vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @addr: the dma address to unmap
+ * @size: the size of the buffer
+ * @dir: DMA direction
+ * @attrs: DMA Attrs
+ *
+ * Unmap the address that is mapped by the virtqueue_dma_map_* APIs.
+ *
+ */
+void virtqueue_dma_unmap_page_attrs(struct virtqueue *_vq, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (!vq->use_dma_api)
+ return;
+
+ dma_unmap_page_attrs(vring_dma_dev(vq), addr, size, dir, attrs);
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_page_attrs);
+
/**
* virtqueue_dma_mapping_error - check dma address
* @_vq: the struct virtqueue we're talking about.
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 96fea920873b..ca318a66a7e1 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -234,6 +234,13 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size
void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs);
+dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page,
+ size_t offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs);
+void virtqueue_dma_unmap_page_attrs(struct virtqueue *_vq, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs);
int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr);
bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr);
--
2.32.0.3.g01195cf9f
* Re: [PATCH net-next v4 01/15] virtio_ring: introduce dma map api for page
2024-06-11 11:41 ` [PATCH net-next v4 01/15] virtio_ring: introduce dma map api for page Xuan Zhuo
@ 2024-06-12 23:20 ` Jakub Kicinski
2024-06-13 2:38 ` Xuan Zhuo
0 siblings, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2024-06-12 23:20 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, 11 Jun 2024 19:41:33 +0800 Xuan Zhuo wrote:
> +/**
> + * virtqueue_dma_map_page_attrs - map DMA for _vq
> + * @_vq: the struct virtqueue we're talking about.
> + * @page: the page to do dma
> + * @offset: the offset inside the page
> + * @size: the size of the page to do dma
> + * @dir: DMA direction
> + * @attrs: DMA Attrs
> + *
> + * The caller calls this to do dma mapping in advance. The DMA address can be
> + * passed to this _vq when it is in pre-mapped mode.
> + *
> + * return DMA address. Caller should check that by virtqueue_dma_mapping_error().
You gotta format the return value doc in a kdoc-sanctioned way.
please run ./scripts/kernel-doc -none -Wall to find such issues
> + */
> +dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page,
> + size_t offset, size_t size,
> + enum dma_data_direction dir,
> + unsigned long attrs)
* Re: [PATCH net-next v4 01/15] virtio_ring: introduce dma map api for page
2024-06-12 23:20 ` Jakub Kicinski
@ 2024-06-13 2:38 ` Xuan Zhuo
0 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-13 2:38 UTC (permalink / raw)
To: Jakub Kicinski
Cc: netdev, Michael S. Tsirkin, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 12 Jun 2024 16:20:11 -0700, Jakub Kicinski <kuba@kernel.org> wrote:
> On Tue, 11 Jun 2024 19:41:33 +0800 Xuan Zhuo wrote:
> > +/**
> > + * virtqueue_dma_map_page_attrs - map DMA for _vq
> > + * @_vq: the struct virtqueue we're talking about.
> > + * @page: the page to do dma
> > + * @offset: the offset inside the page
> > + * @size: the size of the page to do dma
> > + * @dir: DMA direction
> > + * @attrs: DMA Attrs
> > + *
> > + * The caller calls this to do dma mapping in advance. The DMA address can be
> > + * passed to this _vq when it is in pre-mapped mode.
> > + *
> > + * return DMA address. Caller should check that by virtqueue_dma_mapping_error().
>
> You gotta format the return value doc in a kdoc-sanctioned way.
> please run ./scripts/kernel-doc -none -Wall to find such issues
Will fix.
Thanks.
>
> > + */
> > +dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page,
> > + size_t offset, size_t size,
> > + enum dma_data_direction dir,
> > + unsigned long attrs)
* [PATCH net-next v4 02/15] virtio_ring: introduce vring_need_unmap_buffer
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 01/15] virtio_ring: introduce dma map api for page Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 03/15] virtio_ring: virtqueue_set_dma_premapped() support to disable Xuan Zhuo
` (12 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
To make the code readable, introduce vring_need_unmap_buffer() to
replace do_unmap.
   use_dma_api  premapped  ->  vring_need_unmap_buffer()
1. false        false          false
2. true         false          true
3. true         true           false
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/virtio/virtio_ring.c | 27 ++++++++++++---------------
1 file changed, 12 insertions(+), 15 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 3d28008d6cd4..79e313de1566 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -175,11 +175,6 @@ struct vring_virtqueue {
/* Do DMA mapping by driver */
bool premapped;
- /* Do unmap or not for desc. Just when premapped is False and
- * use_dma_api is true, this is true.
- */
- bool do_unmap;
-
/* Head of free buffer list. */
unsigned int free_head;
/* Number we've added since last sync. */
@@ -297,6 +292,11 @@ static bool vring_use_dma_api(const struct virtio_device *vdev)
return false;
}
+static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring)
+{
+ return vring->use_dma_api && !vring->premapped;
+}
+
size_t virtio_max_dma_size(const struct virtio_device *vdev)
{
size_t max_segment_size = SIZE_MAX;
@@ -445,7 +445,7 @@ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
{
u16 flags;
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
return;
flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
@@ -475,7 +475,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
goto out;
dma_unmap_page(vring_dma_dev(vq),
@@ -643,7 +643,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
}
/* Last one doesn't continue. */
desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
- if (!indirect && vq->do_unmap)
+ if (!indirect && vring_need_unmap_buffer(vq))
vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
~VRING_DESC_F_NEXT;
@@ -802,7 +802,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
VRING_DESC_F_INDIRECT));
BUG_ON(len == 0 || len % sizeof(struct vring_desc));
- if (vq->do_unmap) {
+ if (vring_need_unmap_buffer(vq)) {
for (j = 0; j < len / sizeof(struct vring_desc); j++)
vring_unmap_one_split_indirect(vq, &indir_desc[j]);
}
@@ -1236,7 +1236,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
(flags & VRING_DESC_F_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
return;
dma_unmap_page(vring_dma_dev(vq),
@@ -1251,7 +1251,7 @@ static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
{
u16 flags;
- if (!vq->do_unmap)
+ if (!vring_need_unmap_buffer(vq))
return;
flags = le16_to_cpu(desc->flags);
@@ -1632,7 +1632,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
if (!desc)
return;
- if (vq->do_unmap) {
+ if (vring_need_unmap_buffer(vq)) {
len = vq->packed.desc_extra[id].len;
for (i = 0; i < len / sizeof(struct vring_packed_desc);
i++)
@@ -2091,7 +2091,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->dma_dev = dma_dev;
vq->use_dma_api = vring_use_dma_api(vdev);
vq->premapped = false;
- vq->do_unmap = vq->use_dma_api;
vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
!context;
@@ -2636,7 +2635,6 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
vq->dma_dev = dma_dev;
vq->use_dma_api = vring_use_dma_api(vdev);
vq->premapped = false;
- vq->do_unmap = vq->use_dma_api;
vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
!context;
@@ -2799,7 +2797,6 @@ int virtqueue_set_dma_premapped(struct virtqueue *_vq)
}
vq->premapped = true;
- vq->do_unmap = false;
END_USE(vq);
--
2.32.0.3.g01195cf9f
* [PATCH net-next v4 03/15] virtio_ring: virtqueue_set_dma_premapped() support to disable
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 01/15] virtio_ring: introduce dma map api for page Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 02/15] virtio_ring: introduce vring_need_unmap_buffer Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 04/15] virtio_net: separate virtnet_rx_resize() Xuan Zhuo
` (11 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The virtio-net sq will enable premapped mode only when the sq is bound
to af-xdp.
So we need the helper virtqueue_set_dma_premapped() to enable
premapped mode when af-xdp binds to the sq, and to disable it when
af-xdp unbinds from the sq.
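A minimal sketch of the intended call pattern (illustrative only; the
actual virtio-net wiring lands in the later sq premapped / af-xdp
patches of this series):

    int err;

    /* AF_XDP bind: switch the sq's vq into premapped mode. */
    err = virtqueue_set_dma_premapped(sq->vq, true);
    if (err)
        return err;

    /* ... tx runs with driver-managed (pre-mapped) DMA ... */

    /* AF_XDP unbind: return the vq to the default mode. */
    err = virtqueue_set_dma_premapped(sq->vq, false);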
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 2 +-
drivers/virtio/virtio_ring.c | 7 ++++---
include/linux/virtio.h | 2 +-
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 61a57d134544..838b450d9591 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -938,7 +938,7 @@ static void virtnet_rq_set_premapped(struct virtnet_info *vi)
for (i = 0; i < vi->max_queue_pairs; i++)
/* error should never happen */
- BUG_ON(virtqueue_set_dma_premapped(vi->rq[i].vq));
+ BUG_ON(virtqueue_set_dma_premapped(vi->rq[i].vq, true));
}
static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 79e313de1566..20705311f59a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2764,8 +2764,9 @@ EXPORT_SYMBOL_GPL(virtqueue_resize);
/**
* virtqueue_set_dma_premapped - set the vring premapped mode
* @_vq: the struct virtqueue we're talking about.
+ * @premapped: bool enable/disable the premapped mode
*
- * Enable the premapped mode of the vq.
+ * Enable/disable the premapped mode of the vq.
*
* The vring in premapped mode does not do dma internally, so the driver must
* do dma mapping in advance. The driver must pass the dma_address through
@@ -2782,7 +2783,7 @@ EXPORT_SYMBOL_GPL(virtqueue_resize);
* 0: success.
* -EINVAL: too late to enable premapped mode, the vq already contains buffers.
*/
-int virtqueue_set_dma_premapped(struct virtqueue *_vq)
+int virtqueue_set_dma_premapped(struct virtqueue *_vq, bool premapped)
{
struct vring_virtqueue *vq = to_vvq(_vq);
u32 num;
@@ -2796,7 +2797,7 @@ int virtqueue_set_dma_premapped(struct virtqueue *_vq)
return -EINVAL;
}
- vq->premapped = true;
+ vq->premapped = premapped;
END_USE(vq);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index ca318a66a7e1..69677d02cee9 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -81,7 +81,7 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
-int virtqueue_set_dma_premapped(struct virtqueue *_vq);
+int virtqueue_set_dma_premapped(struct virtqueue *_vq, bool premapped);
bool virtqueue_poll(struct virtqueue *vq, unsigned);
--
2.32.0.3.g01195cf9f
* [PATCH net-next v4 04/15] virtio_net: separate virtnet_rx_resize()
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (2 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 03/15] virtio_ring: virtqueue_set_dma_premapped() support to disable Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 05/15] virtio_net: separate virtnet_tx_resize() Xuan Zhuo
` (10 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
This patch separates two sub-functions from virtnet_rx_resize():
* virtnet_rx_pause
* virtnet_rx_resume
Then the subsequent rx reset for xsk can share these two functions.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 838b450d9591..597d2c5fccc0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2609,28 +2609,41 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
-static int virtnet_rx_resize(struct virtnet_info *vi,
- struct receive_queue *rq, u32 ring_num)
+static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
{
bool running = netif_running(vi->dev);
- int err, qindex;
-
- qindex = rq - vi->rq;
if (running) {
napi_disable(&rq->napi);
cancel_work_sync(&rq->dim.work);
}
+}
- err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf);
- if (err)
- netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
+static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
+{
+ bool running = netif_running(vi->dev);
if (!try_fill_recv(vi, rq, GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
if (running)
virtnet_napi_enable(rq->vq, &rq->napi);
+}
+
+static int virtnet_rx_resize(struct virtnet_info *vi,
+ struct receive_queue *rq, u32 ring_num)
+{
+ int err, qindex;
+
+ qindex = rq - vi->rq;
+
+ virtnet_rx_pause(vi, rq);
+
+ err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf);
+ if (err)
+ netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
+
+ virtnet_rx_resume(vi, rq);
return err;
}
--
2.32.0.3.g01195cf9f
* [PATCH net-next v4 05/15] virtio_net: separate virtnet_tx_resize()
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (3 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 04/15] virtio_net: separate virtnet_rx_resize() Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 06/15] virtio_net: separate receive_buf Xuan Zhuo
` (9 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
This patch separates two sub-functions from virtnet_tx_resize():
* virtnet_tx_pause
* virtnet_tx_resume
Then the subsequent virtnet_tx_reset() can share these two functions.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 35 +++++++++++++++++++++++++++++------
1 file changed, 29 insertions(+), 6 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 597d2c5fccc0..8c51947abce9 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2647,12 +2647,11 @@ static int virtnet_rx_resize(struct virtnet_info *vi,
return err;
}
-static int virtnet_tx_resize(struct virtnet_info *vi,
- struct send_queue *sq, u32 ring_num)
+static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq)
{
bool running = netif_running(vi->dev);
struct netdev_queue *txq;
- int err, qindex;
+ int qindex;
qindex = sq - vi->sq;
@@ -2673,10 +2672,17 @@ static int virtnet_tx_resize(struct virtnet_info *vi,
netif_stop_subqueue(vi->dev, qindex);
__netif_tx_unlock_bh(txq);
+}
- err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
- if (err)
- netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
+static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq)
+{
+ bool running = netif_running(vi->dev);
+ struct netdev_queue *txq;
+ int qindex;
+
+ qindex = sq - vi->sq;
+
+ txq = netdev_get_tx_queue(vi->dev, qindex);
__netif_tx_lock_bh(txq);
sq->reset = false;
@@ -2685,6 +2691,23 @@ static int virtnet_tx_resize(struct virtnet_info *vi,
if (running)
virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
+}
+
+static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq,
+ u32 ring_num)
+{
+ int qindex, err;
+
+ qindex = sq - vi->sq;
+
+ virtnet_tx_pause(vi, sq);
+
+ err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
+ if (err)
+ netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
+
+ virtnet_tx_resume(vi, sq);
+
return err;
}
--
2.32.0.3.g01195cf9f
* [PATCH net-next v4 06/15] virtio_net: separate receive_buf
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (4 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 05/15] virtio_net: separate virtnet_tx_resize() Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 07/15] virtio_net: refactor the xmit type Xuan Zhuo
` (8 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
This commit splits receive_buf() and wraps the logic that handles the
skb into an independent function, virtnet_receive_done().
The subsequent commit will reuse it.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 56 +++++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8c51947abce9..161694957065 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1935,32 +1935,11 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash,
skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type);
}
-static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
- void *buf, unsigned int len, void **ctx,
- unsigned int *xdp_xmit,
- struct virtnet_rq_stats *stats)
+static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq,
+ struct sk_buff *skb)
{
- struct net_device *dev = vi->dev;
- struct sk_buff *skb;
struct virtio_net_common_hdr *hdr;
-
- if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
- pr_debug("%s: short packet %i\n", dev->name, len);
- DEV_STATS_INC(dev, rx_length_errors);
- virtnet_rq_free_buf(vi, rq, buf);
- return;
- }
-
- if (vi->mergeable_rx_bufs)
- skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
- stats);
- else if (vi->big_packets)
- skb = receive_big(dev, vi, rq, buf, len, stats);
- else
- skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
-
- if (unlikely(!skb))
- return;
+ struct net_device *dev = vi->dev;
hdr = skb_vnet_common_hdr(skb);
if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
@@ -1990,6 +1969,35 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
dev_kfree_skb(skb);
}
+static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
+ void *buf, unsigned int len, void **ctx,
+ unsigned int *xdp_xmit,
+ struct virtnet_rq_stats *stats)
+{
+ struct net_device *dev = vi->dev;
+ struct sk_buff *skb;
+
+ if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
+ pr_debug("%s: short packet %i\n", dev->name, len);
+ DEV_STATS_INC(dev, rx_length_errors);
+ virtnet_rq_free_buf(vi, rq, buf);
+ return;
+ }
+
+ if (vi->mergeable_rx_bufs)
+ skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
+ stats);
+ else if (vi->big_packets)
+ skb = receive_big(dev, vi, rq, buf, len, stats);
+ else
+ skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
+
+ if (unlikely(!skb))
+ return;
+
+ virtnet_receive_done(vi, rq, skb);
+}
+
/* Unlike mergeable buffers, all buffers are allocated to the
* same size, except for the headroom. For this reason we do
* not need to use mergeable_len_to_ctx here - it is enough
--
2.32.0.3.g01195cf9f
* [PATCH net-next v4 07/15] virtio_net: refactor the xmit type
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (5 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 06/15] virtio_net: separate receive_buf Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 08/15] virtio_net: sq support premapped mode Xuan Zhuo
` (7 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The af-xdp and sq premapped modes will introduce two new xmit types,
so refactor the xmit type mechanism first.
We use the low two bits of the buffer pointer to encode the xmit type;
since the pointers stored here are at least 4-byte aligned, those bits
are always zero, leaving room for four xmit types. Now we have two
xmit types: SKB and XDP.
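For illustration, a round trip with the helpers added below (a sketch
only, not part of the patch; assumes a struct sk_buff *skb):

    /* Tag an skb pointer, then recover the type and the pointer. */
    void *ptr = virtnet_xmit_ptr_mix(skb, VIRTNET_XMIT_TYPE_SKB);

    switch (virtnet_xmit_ptr_strip(&ptr)) {
    case VIRTNET_XMIT_TYPE_SKB:
        /* ptr is the original skb pointer again */
        break;
    case VIRTNET_XMIT_TYPE_XDP:
        break;
    }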
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 58 +++++++++++++++++++++++++++-------------
1 file changed, 40 insertions(+), 18 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 161694957065..e84a4624549b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -47,8 +47,6 @@ module_param(napi_tx, bool, 0644);
#define VIRTIO_XDP_TX BIT(0)
#define VIRTIO_XDP_REDIR BIT(1)
-#define VIRTIO_XDP_FLAG BIT(0)
-
/* RX packet size EWMA. The average packet size is used to determine the packet
* buffer size when refilling RX rings. As the entire RX ring may be refilled
* at once, the weight is chosen so that the EWMA will be insensitive to short-
@@ -491,42 +489,62 @@ struct virtio_net_common_hdr {
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
-static bool is_xdp_frame(void *ptr)
+enum virtnet_xmit_type {
+ VIRTNET_XMIT_TYPE_SKB,
+ VIRTNET_XMIT_TYPE_XDP,
+};
+
+#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP)
+
+static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
{
- return (unsigned long)ptr & VIRTIO_XDP_FLAG;
+ unsigned long p = (unsigned long)*ptr;
+
+ *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK);
+
+ return p & VIRTNET_XMIT_TYPE_MASK;
}
-static void *xdp_to_ptr(struct xdp_frame *ptr)
+static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
{
- return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
+ return (void *)((unsigned long)ptr | type);
}
-static struct xdp_frame *ptr_to_xdp(void *ptr)
+static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
+ enum virtnet_xmit_type type)
{
- return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
+ return virtqueue_add_outbuf(sq->vq, sq->sg, num,
+ virtnet_xmit_ptr_mix(data, type),
+ GFP_ATOMIC);
}
static void __free_old_xmit(struct send_queue *sq, bool in_napi,
struct virtnet_sq_free_stats *stats)
{
+ struct xdp_frame *frame;
+ struct sk_buff *skb;
unsigned int len;
void *ptr;
while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
++stats->packets;
- if (!is_xdp_frame(ptr)) {
- struct sk_buff *skb = ptr;
+ switch (virtnet_xmit_ptr_strip(&ptr)) {
+ case VIRTNET_XMIT_TYPE_SKB:
+ skb = ptr;
pr_debug("Sent skb %p\n", skb);
stats->bytes += skb->len;
napi_consume_skb(skb, in_napi);
- } else {
- struct xdp_frame *frame = ptr_to_xdp(ptr);
+ break;
+
+ case VIRTNET_XMIT_TYPE_XDP:
+ frame = ptr;
stats->bytes += xdp_get_frame_len(frame);
xdp_return_frame(frame);
+ break;
}
}
}
@@ -1064,8 +1082,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
skb_frag_size(frag), skb_frag_off(frag));
}
- err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
- xdp_to_ptr(xdpf), GFP_ATOMIC);
+ err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP);
if (unlikely(err))
return -ENOSPC; /* Caller handle free/refcnt */
@@ -2557,7 +2574,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
return num_sg;
num_sg++;
}
- return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
+ return virtnet_add_outbuf(sq, num_sg, skb, VIRTNET_XMIT_TYPE_SKB);
}
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -5263,10 +5280,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
{
- if (!is_xdp_frame(buf))
+ switch (virtnet_xmit_ptr_strip(&buf)) {
+ case VIRTNET_XMIT_TYPE_SKB:
dev_kfree_skb(buf);
- else
- xdp_return_frame(ptr_to_xdp(buf));
+ break;
+
+ case VIRTNET_XMIT_TYPE_XDP:
+ xdp_return_frame(buf);
+ break;
+ }
}
static void free_unused_bufs(struct virtnet_info *vi)
--
2.32.0.3.g01195cf9f
* [PATCH net-next v4 08/15] virtio_net: sq support premapped mode
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (6 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 07/15] virtio_net: refactor the xmit type Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-12 23:23 ` Jakub Kicinski
2024-06-12 23:41 ` Michael S. Tsirkin
2024-06-11 11:41 ` [PATCH net-next v4 09/15] virtio_net: xsk: bind/unbind xsk Xuan Zhuo
` (6 subsequent siblings)
14 siblings, 2 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
If xsk is enabled, the xsk tx will share the send queue.
But xsk requires that the send queue use premapped mode.
So the send queue must support premapped mode when it is bound to
af-xdp.
* virtnet_sq_set_premapped(sq, true) is used to enable premapped mode.
In this mode, the driver will record the dma info when skb or xdp
frame is sent.
Currently, the SQ premapped mode is operational only with af-xdp. In
this mode, af-xdp, the kernel stack, and xdp tx/redirect will share
the same SQ. Af-xdp independently manages its DMA. The kernel stack
and xdp tx/redirect utilize this DMA metadata to manage the DMA
info.
If the indirect descriptor feature is supported, the volume of DMA
details we need to maintain becomes quite substantial. Here, we put
a cap on the amount of DMA info we manage.
If the kernel stack and xdp tx/redirect attempt to use more
descriptors, virtnet_add_outbuf() will return an -ENOMEM error. But
af-xdp can keep working.
* virtnet_sq_set_premapped(sq, false) is used to disable premapped mode.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 219 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 215 insertions(+), 4 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e84a4624549b..4968ab7eb5a4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -25,6 +25,7 @@
#include <net/net_failover.h>
#include <net/netdev_rx_queue.h>
#include <net/netdev_queues.h>
+#include <uapi/linux/virtio_ring.h>
static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);
@@ -276,6 +277,25 @@ struct virtnet_rq_dma {
u16 need_sync;
};
+struct virtnet_sq_dma {
+ union {
+ struct virtnet_sq_dma *next;
+ void *data;
+ };
+ dma_addr_t addr;
+ u32 len;
+ u8 num;
+};
+
+struct virtnet_sq_dma_info {
+ /* record for kfree */
+ void *p;
+
+ u32 free_num;
+
+ struct virtnet_sq_dma *free;
+};
+
/* Internal representation of a send virtqueue */
struct send_queue {
/* Virtqueue associated with this send _queue */
@@ -295,6 +315,11 @@ struct send_queue {
/* Record whether sq is in reset state. */
bool reset;
+
+ /* SQ is premapped mode or not. */
+ bool premapped;
+
+ struct virtnet_sq_dma_info dmainfo;
};
/* Internal representation of a receive virtqueue */
@@ -492,9 +517,11 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
VIRTNET_XMIT_TYPE_XDP,
+ VIRTNET_XMIT_TYPE_DMA,
};
-#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP)
+#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP \
+ | VIRTNET_XMIT_TYPE_DMA)
static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
{
@@ -510,12 +537,178 @@ static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
return (void *)((unsigned long)ptr | type);
}
+static void virtnet_sq_unmap(struct send_queue *sq, void **data)
+{
+ struct virtnet_sq_dma *head, *tail, *p;
+ int i;
+
+ head = *data;
+
+ p = head;
+
+ for (i = 0; i < head->num; ++i) {
+ virtqueue_dma_unmap_page_attrs(sq->vq, p->addr, p->len,
+ DMA_TO_DEVICE, 0);
+ tail = p;
+ p = p->next;
+ }
+
+ *data = tail->data;
+
+ tail->next = sq->dmainfo.free;
+ sq->dmainfo.free = head;
+ sq->dmainfo.free_num += head->num;
+}
+
+static void *virtnet_dma_chain_update(struct send_queue *sq,
+ struct virtnet_sq_dma *head,
+ struct virtnet_sq_dma *tail,
+ u8 num, void *data)
+{
+ sq->dmainfo.free = tail->next;
+ sq->dmainfo.free_num -= num;
+ head->num = num;
+
+ tail->data = data;
+
+ return virtnet_xmit_ptr_mix(head, VIRTNET_XMIT_TYPE_DMA);
+}
+
+static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq, int num, void *data)
+{
+ struct virtnet_sq_dma *head, *tail, *p;
+ struct scatterlist *sg;
+ dma_addr_t addr;
+ int i, err;
+
+ if (num > sq->dmainfo.free_num)
+ return NULL;
+
+ head = sq->dmainfo.free;
+ p = head;
+
+ tail = NULL;
+
+ for (i = 0; i < num; ++i) {
+ sg = &sq->sg[i];
+
+ addr = virtqueue_dma_map_page_attrs(sq->vq, sg_page(sg),
+ sg->offset,
+ sg->length, DMA_TO_DEVICE,
+ 0);
+ err = virtqueue_dma_mapping_error(sq->vq, addr);
+ if (err)
+ goto err;
+
+ sg->dma_address = addr;
+
+ tail = p;
+ tail->addr = sg->dma_address;
+ tail->len = sg->length;
+
+ p = p->next;
+ }
+
+ return virtnet_dma_chain_update(sq, head, tail, num, data);
+
+err:
+ if (tail) {
+ virtnet_dma_chain_update(sq, head, tail, i, data);
+ virtnet_sq_unmap(sq, (void **)&head);
+ }
+
+ return NULL;
+}
+
static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
enum virtnet_xmit_type type)
{
- return virtqueue_add_outbuf(sq->vq, sq->sg, num,
- virtnet_xmit_ptr_mix(data, type),
- GFP_ATOMIC);
+ int ret;
+
+ data = virtnet_xmit_ptr_mix(data, type);
+
+ if (sq->premapped) {
+ data = virtnet_sq_map_sg(sq, num, data);
+ if (!data)
+ return -ENOMEM;
+ }
+
+ ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
+ if (ret && sq->premapped) {
+ virtnet_xmit_ptr_strip(&data);
+ virtnet_sq_unmap(sq, &data);
+ }
+
+ return ret;
+}
+
+static int virtnet_sq_alloc_dma_meta(struct send_queue *sq)
+{
+ struct virtnet_sq_dma *d;
+ int num, i;
+
+ num = virtqueue_get_vring_size(sq->vq);
+
+ /* Currently, the SQ premapped mode is operational only with af-xdp. In
+ * this mode, af-xdp, the kernel stack, and xdp tx/redirect will share
+ * the same SQ. Af-xdp independently manages its DMA. The kernel stack
+ * and xdp tx/redirect utilize this DMA metadata to manage the DMA info.
+ *
+ * If the indirect descriptor feature be supported, the volume of DMA
+ * details we need to maintain becomes quite substantial. Here, we have
+ * a cap on the amount of DMA info we manage, effectively limiting it to
+ * twice the size of the ring buffer.
+ *
+ * If the kernel stack and xdp tx/redirect attempt to use more
+ * descriptors than allowed by this double ring buffer size,
+ * virtnet_add_outbuf() will return an -ENOMEM error. But the af-xdp can
+ * work continually.
+ */
+ if (virtio_has_feature(sq->vq->vdev, VIRTIO_RING_F_INDIRECT_DESC))
+ num = num * 2;
+
+ sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
+ if (!sq->dmainfo.free)
+ return -ENOMEM;
+
+ sq->dmainfo.p = sq->dmainfo.free;
+ sq->dmainfo.free_num = num;
+
+ for (i = 0; i < num; ++i) {
+ d = &sq->dmainfo.free[i];
+ d->next = d + 1;
+ }
+
+ d->next = NULL;
+
+ return 0;
+}
+
+static void virtnet_sq_free_dma_meta(struct send_queue *sq)
+{
+ kfree(sq->dmainfo.p);
+
+ sq->dmainfo.p = NULL;
+ sq->dmainfo.free = NULL;
+ sq->dmainfo.free_num = 0;
+}
+
+/* This function must be called immediately after creating the vq, or after vq
+ * reset, and before adding any buffers to it.
+ */
+static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped)
+{
+ if (premapped) {
+ if (virtnet_sq_alloc_dma_meta(sq))
+ return -ENOMEM;
+ } else {
+ virtnet_sq_free_dma_meta(sq);
+ }
+
+ BUG_ON(virtqueue_set_dma_premapped(sq->vq, premapped));
+
+ sq->premapped = premapped;
+ return 0;
}
static void __free_old_xmit(struct send_queue *sq, bool in_napi,
@@ -529,6 +722,7 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
++stats->packets;
+retry:
switch (virtnet_xmit_ptr_strip(&ptr)) {
case VIRTNET_XMIT_TYPE_SKB:
skb = ptr;
@@ -545,6 +739,10 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
stats->bytes += xdp_get_frame_len(frame);
xdp_return_frame(frame);
break;
+
+ case VIRTNET_XMIT_TYPE_DMA:
+ virtnet_sq_unmap(sq, &ptr);
+ goto retry;
}
}
}
@@ -5232,6 +5430,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
for (i = 0; i < vi->max_queue_pairs; i++) {
__netif_napi_del(&vi->rq[i].napi);
__netif_napi_del(&vi->sq[i].napi);
+
+ virtnet_sq_free_dma_meta(&vi->sq[i]);
}
/* We called __netif_napi_del(),
@@ -5280,6 +5480,13 @@ static void free_receive_page_frags(struct virtnet_info *vi)
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
{
+ struct virtnet_info *vi = vq->vdev->priv;
+ struct send_queue *sq;
+ int i = vq2rxq(vq);
+
+ sq = &vi->sq[i];
+
+retry:
switch (virtnet_xmit_ptr_strip(&buf)) {
case VIRTNET_XMIT_TYPE_SKB:
dev_kfree_skb(buf);
@@ -5288,6 +5495,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
case VIRTNET_XMIT_TYPE_XDP:
xdp_return_frame(buf);
break;
+
+ case VIRTNET_XMIT_TYPE_DMA:
+ virtnet_sq_unmap(sq, &buf);
+ goto retry;
}
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [PATCH net-next v4 08/15] virtio_net: sq support premapped mode
2024-06-11 11:41 ` [PATCH net-next v4 08/15] virtio_net: sq support premapped mode Xuan Zhuo
@ 2024-06-12 23:23 ` Jakub Kicinski
2024-06-13 2:39 ` Xuan Zhuo
2024-06-12 23:41 ` Michael S. Tsirkin
1 sibling, 1 reply; 27+ messages in thread
From: Jakub Kicinski @ 2024-06-12 23:23 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, 11 Jun 2024 19:41:40 +0800 Xuan Zhuo wrote:
> +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped)
Could you try to add __maybe_unused or some such and then remove it
in the patch which calls this function? Having warnings during
bisection is not great.
* Re: [PATCH net-next v4 08/15] virtio_net: sq support premapped mode
2024-06-12 23:23 ` Jakub Kicinski
@ 2024-06-13 2:39 ` Xuan Zhuo
0 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-13 2:39 UTC (permalink / raw)
To: Jakub Kicinski
Cc: netdev, Michael S. Tsirkin, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 12 Jun 2024 16:23:37 -0700, Jakub Kicinski <kuba@kernel.org> wrote:
> On Tue, 11 Jun 2024 19:41:40 +0800 Xuan Zhuo wrote:
> > +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped)
>
> Could you try to add __maybe_unused or some such and then remove it
> in the patch which calls this function? Having warnings during
> bisection is not great.
Will do.
Thanks.
* Re: [PATCH net-next v4 08/15] virtio_net: sq support premapped mode
2024-06-11 11:41 ` [PATCH net-next v4 08/15] virtio_net: sq support premapped mode Xuan Zhuo
2024-06-12 23:23 ` Jakub Kicinski
@ 2024-06-12 23:41 ` Michael S. Tsirkin
2024-06-13 2:47 ` Xuan Zhuo
1 sibling, 1 reply; 27+ messages in thread
From: Michael S. Tsirkin @ 2024-06-12 23:41 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Jun 11, 2024 at 07:41:40PM +0800, Xuan Zhuo wrote:
> If the xsk is enabling, the xsk tx will share the send queue.
> But the xsk requires that the send queue use the premapped mode.
> So the send queue must support premapped mode when it is bound to
> af-xdp.
>
> * virtnet_sq_set_premapped(sq, true) is used to enable premapped mode.
>
> In this mode, the driver will record the dma info when skb or xdp
> frame is sent.
>
> Currently, the SQ premapped mode is operational only with af-xdp. In
> this mode, af-xdp, the kernel stack, and xdp tx/redirect will share
> the same SQ. Af-xdp independently manages its DMA. The kernel stack
> and xdp tx/redirect utilize this DMA metadata to manage the DMA
> info.
>
> If the indirect descriptor feature be supported, the volume of DMA
> details we need to maintain becomes quite substantial. Here, we have
> a cap on the amount of DMA info we manage.
>
> If the kernel stack and xdp tx/redirect attempt to use more
> descriptors, virtnet_add_outbuf() will return an -ENOMEM error. But
> the af-xdp can work continually.
>
> * virtnet_sq_set_premapped(sq, false) is used to disable premapped mode.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 219 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 215 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index e84a4624549b..4968ab7eb5a4 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -25,6 +25,7 @@
> #include <net/net_failover.h>
> #include <net/netdev_rx_queue.h>
> #include <net/netdev_queues.h>
> +#include <uapi/linux/virtio_ring.h>
>
> static int napi_weight = NAPI_POLL_WEIGHT;
> module_param(napi_weight, int, 0444);
> @@ -276,6 +277,25 @@ struct virtnet_rq_dma {
> u16 need_sync;
> };
>
> +struct virtnet_sq_dma {
> + union {
> + struct virtnet_sq_dma *next;
Maybe:
struct llist_node node;
I'd like to avoid growing our own single linked list
implementation if at all possible.
> + void *data;
> + };
> + dma_addr_t addr;
> + u32 len;
> + u8 num;
> +};
> +
> +struct virtnet_sq_dma_info {
> + /* record for kfree */
> + void *p;
> +
> + u32 free_num;
> +
> + struct virtnet_sq_dma *free;
> +};
> +
> /* Internal representation of a send virtqueue */
> struct send_queue {
> /* Virtqueue associated with this send _queue */
> @@ -295,6 +315,11 @@ struct send_queue {
>
> /* Record whether sq is in reset state. */
> bool reset;
> +
> + /* SQ is premapped mode or not. */
> + bool premapped;
> +
> + struct virtnet_sq_dma_info dmainfo;
> };
>
> /* Internal representation of a receive virtqueue */
> @@ -492,9 +517,11 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
> enum virtnet_xmit_type {
> VIRTNET_XMIT_TYPE_SKB,
> VIRTNET_XMIT_TYPE_XDP,
> + VIRTNET_XMIT_TYPE_DMA,
> };
>
> -#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP)
> +#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP \
> + | VIRTNET_XMIT_TYPE_DMA)
>
> static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> {
> @@ -510,12 +537,178 @@ static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
> return (void *)((unsigned long)ptr | type);
> }
>
> +static void virtnet_sq_unmap(struct send_queue *sq, void **data)
> +{
> + struct virtnet_sq_dma *head, *tail, *p;
> + int i;
> +
> + head = *data;
> +
> + p = head;
> +
> + for (i = 0; i < head->num; ++i) {
> + virtqueue_dma_unmap_page_attrs(sq->vq, p->addr, p->len,
> + DMA_TO_DEVICE, 0);
> + tail = p;
> + p = p->next;
> + }
> +
> + *data = tail->data;
> +
> + tail->next = sq->dmainfo.free;
> + sq->dmainfo.free = head;
> + sq->dmainfo.free_num += head->num;
> +}
> +
> +static void *virtnet_dma_chain_update(struct send_queue *sq,
> + struct virtnet_sq_dma *head,
> + struct virtnet_sq_dma *tail,
> + u8 num, void *data)
> +{
> + sq->dmainfo.free = tail->next;
> + sq->dmainfo.free_num -= num;
> + head->num = num;
> +
> + tail->data = data;
> +
> + return virtnet_xmit_ptr_mix(head, VIRTNET_XMIT_TYPE_DMA);
> +}
> +
> +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq, int num, void *data)
> +{
> + struct virtnet_sq_dma *head, *tail, *p;
> + struct scatterlist *sg;
> + dma_addr_t addr;
> + int i, err;
> +
> + if (num > sq->dmainfo.free_num)
> + return NULL;
> +
> + head = sq->dmainfo.free;
> + p = head;
> +
> + tail = NULL;
> +
> + for (i = 0; i < num; ++i) {
> + sg = &sq->sg[i];
> +
> + addr = virtqueue_dma_map_page_attrs(sq->vq, sg_page(sg),
> + sg->offset,
> + sg->length, DMA_TO_DEVICE,
> + 0);
> + err = virtqueue_dma_mapping_error(sq->vq, addr);
> + if (err)
> + goto err;
> +
> + sg->dma_address = addr;
> +
> + tail = p;
> + tail->addr = sg->dma_address;
> + tail->len = sg->length;
> +
> + p = p->next;
> + }
> +
> + return virtnet_dma_chain_update(sq, head, tail, num, data);
> +
> +err:
> + if (tail) {
> + virtnet_dma_chain_update(sq, head, tail, i, data);
> + virtnet_sq_unmap(sq, (void **)&head);
> + }
> +
> + return NULL;
> +}
> +
> static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> enum virtnet_xmit_type type)
> {
> - return virtqueue_add_outbuf(sq->vq, sq->sg, num,
> - virtnet_xmit_ptr_mix(data, type),
> - GFP_ATOMIC);
> + int ret;
> +
> + data = virtnet_xmit_ptr_mix(data, type);
> +
> + if (sq->premapped) {
> + data = virtnet_sq_map_sg(sq, num, data);
> + if (!data)
> + return -ENOMEM;
> + }
> +
> + ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> + if (ret && sq->premapped) {
> + virtnet_xmit_ptr_strip(&data);
> + virtnet_sq_unmap(sq, &data);
> + }
> +
> + return ret;
> +}
> +
> +static int virtnet_sq_alloc_dma_meta(struct send_queue *sq)
> +{
> + struct virtnet_sq_dma *d;
> + int num, i;
> +
> + num = virtqueue_get_vring_size(sq->vq);
> +
> + /* Currently, the SQ premapped mode is operational only with af-xdp. In
> + * this mode, af-xdp, the kernel stack, and xdp tx/redirect will share
> + * the same SQ. Af-xdp independently manages its DMA. The kernel stack
> + * and xdp tx/redirect utilize this DMA metadata to manage the DMA info.
> + *
> + * If the indirect descriptor feature be supported, the volume of DMA
> + * details we need to maintain becomes quite substantial. Here, we have
> + * a cap on the amount of DMA info we manage, effectively limiting it to
> + * twice the size of the ring buffer.
> + *
> + * If the kernel stack and xdp tx/redirect attempt to use more
> + * descriptors than allowed by this double ring buffer size,
> + * virtnet_add_outbuf() will return an -ENOMEM error. But the af-xdp can
> + * work continually.
> + */
> + if (virtio_has_feature(sq->vq->vdev, VIRTIO_RING_F_INDIRECT_DESC))
> + num = num * 2;
> +
> + sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
Can be large. kvmalloc seems more appropriate.
> + if (!sq->dmainfo.free)
> + return -ENOMEM;
> +
> + sq->dmainfo.p = sq->dmainfo.free;
> + sq->dmainfo.free_num = num;
> +
> + for (i = 0; i < num; ++i) {
> + d = &sq->dmainfo.free[i];
> + d->next = d + 1;
> + }
> +
> + d->next = NULL;
> +
> + return 0;
> +}
> +
> +static void virtnet_sq_free_dma_meta(struct send_queue *sq)
> +{
> + kfree(sq->dmainfo.p);
> +
> + sq->dmainfo.p = NULL;
> + sq->dmainfo.free = NULL;
> + sq->dmainfo.free_num = 0;
> +}
> +
> +/* This function must be called immediately after creating the vq, or after vq
> + * reset, and before adding any buffers to it.
> + */
> +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped)
> +{
> + if (premapped) {
> + if (virtnet_sq_alloc_dma_meta(sq))
> + return -ENOMEM;
I would just return what virtnet_sq_alloc_dma_meta returns.
int r = virtnet_sq_alloc_dma_meta();
if (r)
return r;
> + } else {
> + virtnet_sq_free_dma_meta(sq);
> + }
> +
> + BUG_ON(virtqueue_set_dma_premapped(sq->vq, premapped));
> +
> + sq->premapped = premapped;
> + return 0;
> }
>
> static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> @@ -529,6 +722,7 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> ++stats->packets;
>
> +retry:
> switch (virtnet_xmit_ptr_strip(&ptr)) {
> case VIRTNET_XMIT_TYPE_SKB:
> skb = ptr;
> @@ -545,6 +739,10 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> stats->bytes += xdp_get_frame_len(frame);
> xdp_return_frame(frame);
> break;
> +
> + case VIRTNET_XMIT_TYPE_DMA:
> + virtnet_sq_unmap(sq, &ptr);
> + goto retry;
> }
> }
> }
> @@ -5232,6 +5430,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> for (i = 0; i < vi->max_queue_pairs; i++) {
> __netif_napi_del(&vi->rq[i].napi);
> __netif_napi_del(&vi->sq[i].napi);
> +
> + virtnet_sq_free_dma_meta(&vi->sq[i]);
> }
>
> /* We called __netif_napi_del(),
> @@ -5280,6 +5480,13 @@ static void free_receive_page_frags(struct virtnet_info *vi)
>
> static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> {
> + struct virtnet_info *vi = vq->vdev->priv;
> + struct send_queue *sq;
> + int i = vq2rxq(vq);
> +
> + sq = &vi->sq[i];
> +
> +retry:
> switch (virtnet_xmit_ptr_strip(&buf)) {
> case VIRTNET_XMIT_TYPE_SKB:
> dev_kfree_skb(buf);
> @@ -5288,6 +5495,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> case VIRTNET_XMIT_TYPE_XDP:
> xdp_return_frame(buf);
> break;
> +
> + case VIRTNET_XMIT_TYPE_DMA:
> + virtnet_sq_unmap(sq, &buf);
> + goto retry;
document what's going on here? why are we retrying?
> }
> }
>
> --
> 2.32.0.3.g01195cf9f
* Re: [PATCH net-next v4 08/15] virtio_net: sq support premapped mode
2024-06-12 23:41 ` Michael S. Tsirkin
@ 2024-06-13 2:47 ` Xuan Zhuo
0 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-13 2:47 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 12 Jun 2024 19:41:51 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Tue, Jun 11, 2024 at 07:41:40PM +0800, Xuan Zhuo wrote:
> > If the xsk is enabling, the xsk tx will share the send queue.
> > But the xsk requires that the send queue use the premapped mode.
> > So the send queue must support premapped mode when it is bound to
> > af-xdp.
> >
> > * virtnet_sq_set_premapped(sq, true) is used to enable premapped mode.
> >
> > In this mode, the driver will record the dma info when skb or xdp
> > frame is sent.
> >
> > Currently, the SQ premapped mode is operational only with af-xdp. In
> > this mode, af-xdp, the kernel stack, and xdp tx/redirect will share
> > the same SQ. Af-xdp independently manages its DMA. The kernel stack
> > and xdp tx/redirect utilize this DMA metadata to manage the DMA
> > info.
> >
> > If the indirect descriptor feature be supported, the volume of DMA
> > details we need to maintain becomes quite substantial. Here, we have
> > a cap on the amount of DMA info we manage.
> >
> > If the kernel stack and xdp tx/redirect attempt to use more
> > descriptors, virtnet_add_outbuf() will return an -ENOMEM error. But
> > the af-xdp can work continually.
> >
> > * virtnet_sq_set_premapped(sq, false) is used to disable premapped mode.
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> > drivers/net/virtio_net.c | 219 ++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 215 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index e84a4624549b..4968ab7eb5a4 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -25,6 +25,7 @@
> > #include <net/net_failover.h>
> > #include <net/netdev_rx_queue.h>
> > #include <net/netdev_queues.h>
> > +#include <uapi/linux/virtio_ring.h>
> >
> > static int napi_weight = NAPI_POLL_WEIGHT;
> > module_param(napi_weight, int, 0444);
> > @@ -276,6 +277,25 @@ struct virtnet_rq_dma {
> > u16 need_sync;
> > };
> >
> > +struct virtnet_sq_dma {
> > + union {
> > + struct virtnet_sq_dma *next;
>
> Maybe:
>
> struct llist_node node;
>
> I'd like to avoid growing our own single linked list
> implementation if at all possible.
I will fix all the comments in next version.
Thanks.
>
>
>
> > + void *data;
> > + };
> > + dma_addr_t addr;
> > + u32 len;
> > + u8 num;
> > +};
> > +
> > +struct virtnet_sq_dma_info {
> > + /* record for kfree */
> > + void *p;
> > +
> > + u32 free_num;
> > +
> > + struct virtnet_sq_dma *free;
> > +};
> > +
> > /* Internal representation of a send virtqueue */
> > struct send_queue {
> > /* Virtqueue associated with this send _queue */
> > @@ -295,6 +315,11 @@ struct send_queue {
> >
> > /* Record whether sq is in reset state. */
> > bool reset;
> > +
> > + /* SQ is premapped mode or not. */
> > + bool premapped;
> > +
> > + struct virtnet_sq_dma_info dmainfo;
> > };
> >
> > /* Internal representation of a receive virtqueue */
> > @@ -492,9 +517,11 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
> > enum virtnet_xmit_type {
> > VIRTNET_XMIT_TYPE_SKB,
> > VIRTNET_XMIT_TYPE_XDP,
> > + VIRTNET_XMIT_TYPE_DMA,
> > };
> >
> > -#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP)
> > +#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP \
> > + | VIRTNET_XMIT_TYPE_DMA)
> >
> > static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
> > {
> > @@ -510,12 +537,178 @@ static void *virtnet_xmit_ptr_mix(void *ptr, enum virtnet_xmit_type type)
> > return (void *)((unsigned long)ptr | type);
> > }
> >
> > +static void virtnet_sq_unmap(struct send_queue *sq, void **data)
> > +{
> > + struct virtnet_sq_dma *head, *tail, *p;
> > + int i;
> > +
> > + head = *data;
> > +
> > + p = head;
> > +
> > + for (i = 0; i < head->num; ++i) {
> > + virtqueue_dma_unmap_page_attrs(sq->vq, p->addr, p->len,
> > + DMA_TO_DEVICE, 0);
> > + tail = p;
> > + p = p->next;
> > + }
> > +
> > + *data = tail->data;
> > +
> > + tail->next = sq->dmainfo.free;
> > + sq->dmainfo.free = head;
> > + sq->dmainfo.free_num += head->num;
> > +}
> > +
> > +static void *virtnet_dma_chain_update(struct send_queue *sq,
> > + struct virtnet_sq_dma *head,
> > + struct virtnet_sq_dma *tail,
> > + u8 num, void *data)
> > +{
> > + sq->dmainfo.free = tail->next;
> > + sq->dmainfo.free_num -= num;
> > + head->num = num;
> > +
> > + tail->data = data;
> > +
> > + return virtnet_xmit_ptr_mix(head, VIRTNET_XMIT_TYPE_DMA);
> > +}
> > +
> > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq, int num, void *data)
> > +{
> > + struct virtnet_sq_dma *head, *tail, *p;
> > + struct scatterlist *sg;
> > + dma_addr_t addr;
> > + int i, err;
> > +
> > + if (num > sq->dmainfo.free_num)
> > + return NULL;
> > +
> > + head = sq->dmainfo.free;
> > + p = head;
> > +
> > + tail = NULL;
> > +
> > + for (i = 0; i < num; ++i) {
> > + sg = &sq->sg[i];
> > +
> > + addr = virtqueue_dma_map_page_attrs(sq->vq, sg_page(sg),
> > + sg->offset,
> > + sg->length, DMA_TO_DEVICE,
> > + 0);
> > + err = virtqueue_dma_mapping_error(sq->vq, addr);
> > + if (err)
> > + goto err;
> > +
> > + sg->dma_address = addr;
> > +
> > + tail = p;
> > + tail->addr = sg->dma_address;
> > + tail->len = sg->length;
> > +
> > + p = p->next;
> > + }
> > +
> > + return virtnet_dma_chain_update(sq, head, tail, num, data);
> > +
> > +err:
> > + if (tail) {
> > + virtnet_dma_chain_update(sq, head, tail, i, data);
> > + virtnet_sq_unmap(sq, (void **)&head);
> > + }
> > +
> > + return NULL;
> > +}
> > +
> > static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
> > enum virtnet_xmit_type type)
> > {
> > - return virtqueue_add_outbuf(sq->vq, sq->sg, num,
> > - virtnet_xmit_ptr_mix(data, type),
> > - GFP_ATOMIC);
> > + int ret;
> > +
> > + data = virtnet_xmit_ptr_mix(data, type);
> > +
> > + if (sq->premapped) {
> > + data = virtnet_sq_map_sg(sq, num, data);
> > + if (!data)
> > + return -ENOMEM;
> > + }
> > +
> > + ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > + if (ret && sq->premapped) {
> > + virtnet_xmit_ptr_strip(&data);
> > + virtnet_sq_unmap(sq, &data);
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static int virtnet_sq_alloc_dma_meta(struct send_queue *sq)
> > +{
> > + struct virtnet_sq_dma *d;
> > + int num, i;
> > +
> > + num = virtqueue_get_vring_size(sq->vq);
> > +
> > + /* Currently, the SQ premapped mode is operational only with af-xdp. In
> > + * this mode, af-xdp, the kernel stack, and xdp tx/redirect will share
> > + * the same SQ. Af-xdp independently manages its DMA. The kernel stack
> > + * and xdp tx/redirect utilize this DMA metadata to manage the DMA info.
> > + *
> > + * If the indirect descriptor feature be supported, the volume of DMA
> > + * details we need to maintain becomes quite substantial. Here, we have
> > + * a cap on the amount of DMA info we manage, effectively limiting it to
> > + * twice the size of the ring buffer.
> > + *
> > + * If the kernel stack and xdp tx/redirect attempt to use more
> > + * descriptors than allowed by this double ring buffer size,
> > + * virtnet_add_outbuf() will return an -ENOMEM error. But the af-xdp can
> > + * work continually.
> > + */
> > + if (virtio_has_feature(sq->vq->vdev, VIRTIO_RING_F_INDIRECT_DESC))
> > + num = num * 2;
> > +
> > + sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
>
> Can be large. kvmalloc seems more appropriate.
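Right. Roughly, the swap would look like this (a sketch against this patch's
structures, not the final code; kvcalloc()/kvfree() come from <linux/slab.h>):

#include <linux/slab.h>

static int sq_alloc_dma_meta(struct send_queue *sq, u32 num)
{
	/* num can be 2 * ring size, so allow a vmalloc fallback */
	sq->dmainfo.free = kvcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
	if (!sq->dmainfo.free)
		return -ENOMEM;

	sq->dmainfo.p = sq->dmainfo.free;
	sq->dmainfo.free_num = num;
	return 0;
}

static void sq_free_dma_meta(struct send_queue *sq)
{
	/* matching free path */
	kvfree(sq->dmainfo.p);

	sq->dmainfo.p = NULL;
	sq->dmainfo.free = NULL;
	sq->dmainfo.free_num = 0;
}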
>
>
> > + if (!sq->dmainfo.free)
> > + return -ENOMEM;
> > +
> > + sq->dmainfo.p = sq->dmainfo.free;
> > + sq->dmainfo.free_num = num;
> > +
> > + for (i = 0; i < num; ++i) {
> > + d = &sq->dmainfo.free[i];
> > + d->next = d + 1;
> > + }
> > +
> > + d->next = NULL;
> > +
> > + return 0;
> > +}
> > +
> > +static void virtnet_sq_free_dma_meta(struct send_queue *sq)
> > +{
> > + kfree(sq->dmainfo.p);
> > +
> > + sq->dmainfo.p = NULL;
> > + sq->dmainfo.free = NULL;
> > + sq->dmainfo.free_num = 0;
> > +}
> > +
> > +/* This function must be called immediately after creating the vq, or after vq
> > + * reset, and before adding any buffers to it.
> > + */
> > +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped)
> > +{
> > + if (premapped) {
> > + if (virtnet_sq_alloc_dma_meta(sq))
> > + return -ENOMEM;
>
> I would just return what virtnet_sq_alloc_dma_meta returns.
> int r = virtnet_sq_alloc_dma_meta();
>
> if (r)
> return r;
>
> > + } else {
> > + virtnet_sq_free_dma_meta(sq);
> > + }
> > +
> > + BUG_ON(virtqueue_set_dma_premapped(sq->vq, premapped));
> > +
> > + sq->premapped = premapped;
> > + return 0;
> > }
> >
> > static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > @@ -529,6 +722,7 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > ++stats->packets;
> >
> > +retry:
> > switch (virtnet_xmit_ptr_strip(&ptr)) {
> > case VIRTNET_XMIT_TYPE_SKB:
> > skb = ptr;
> > @@ -545,6 +739,10 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > stats->bytes += xdp_get_frame_len(frame);
> > xdp_return_frame(frame);
> > break;
> > +
> > + case VIRTNET_XMIT_TYPE_DMA:
> > + virtnet_sq_unmap(sq, &ptr);
> > + goto retry;
> > }
> > }
> > }
> > @@ -5232,6 +5430,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > for (i = 0; i < vi->max_queue_pairs; i++) {
> > __netif_napi_del(&vi->rq[i].napi);
> > __netif_napi_del(&vi->sq[i].napi);
> > +
> > + virtnet_sq_free_dma_meta(&vi->sq[i]);
> > }
> >
> > /* We called __netif_napi_del(),
> > @@ -5280,6 +5480,13 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> >
> > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > {
> > + struct virtnet_info *vi = vq->vdev->priv;
> > + struct send_queue *sq;
> > + int i = vq2rxq(vq);
> > +
> > + sq = &vi->sq[i];
> > +
> > +retry:
> > switch (virtnet_xmit_ptr_strip(&buf)) {
> > case VIRTNET_XMIT_TYPE_SKB:
> > dev_kfree_skb(buf);
> > @@ -5288,6 +5495,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > case VIRTNET_XMIT_TYPE_XDP:
> > xdp_return_frame(buf);
> > break;
> > +
> > + case VIRTNET_XMIT_TYPE_DMA:
> > + virtnet_sq_unmap(sq, &buf);
> > + goto retry;
>
> document what's going on here? why are we retrying?
>
>
> > }
> > }
> >
> > --
> > 2.32.0.3.g01195cf9f
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH net-next v4 09/15] virtio_net: xsk: bind/unbind xsk
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (7 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 08/15] virtio_net: sq support premapped mode Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-12 23:43 ` Michael S. Tsirkin
2024-06-11 11:41 ` [PATCH net-next v4 10/15] virtio_net: xsk: prevent disable tx napi Xuan Zhuo
` (5 subsequent siblings)
14 siblings, 1 reply; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
This patch implements the logic of binding/unbinding an xsk pool to the sq and rq.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 199 +++++++++++++++++++++++++++++++++++++++
1 file changed, 199 insertions(+)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4968ab7eb5a4..c82a0691632c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -26,6 +26,7 @@
#include <net/netdev_rx_queue.h>
#include <net/netdev_queues.h>
#include <uapi/linux/virtio_ring.h>
+#include <net/xdp_sock_drv.h>
static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);
@@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
#define VIRTNET_DRIVER_VERSION "1.0.0"
+static struct virtio_net_hdr_mrg_rxbuf xsk_hdr;
+
static const unsigned long guest_offloads[] = {
VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
@@ -320,6 +323,12 @@ struct send_queue {
bool premapped;
struct virtnet_sq_dma_info dmainfo;
+
+ struct {
+ struct xsk_buff_pool *pool;
+
+ dma_addr_t hdr_dma_address;
+ } xsk;
};
/* Internal representation of a receive virtqueue */
@@ -371,6 +380,13 @@ struct receive_queue {
/* Record the last dma info to free after new pages is allocated. */
struct virtnet_rq_dma *last_dma;
+
+ struct {
+ struct xsk_buff_pool *pool;
+
+ /* xdp rxq used by xsk */
+ struct xdp_rxq_info xdp_rxq;
+ } xsk;
};
/* This structure can contain rss message with maximum settings for indirection table and keysize
@@ -5168,6 +5184,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
return virtnet_set_guest_offloads(vi, offloads);
}
+static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq,
+ struct xsk_buff_pool *pool)
+{
+ int err, qindex;
+
+ qindex = rq - vi->rq;
+
+ if (pool) {
+ err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id);
+ if (err < 0)
+ return err;
+
+ err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL, NULL);
+ if (err < 0) {
+ xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
+ return err;
+ }
+
+ xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq);
+ }
+
+ virtnet_rx_pause(vi, rq);
+
+ err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf);
+ if (err) {
+ netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err);
+
+ pool = NULL;
+ }
+
+ if (!pool)
+ xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
+
+ rq->xsk.pool = pool;
+
+ virtnet_rx_resume(vi, rq);
+
+ return err;
+}
+
+static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
+ struct send_queue *sq,
+ struct xsk_buff_pool *pool)
+{
+ int err, qindex;
+
+ qindex = sq - vi->sq;
+
+ virtnet_tx_pause(vi, sq);
+
+ err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
+ if (err)
+ netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
+ else
+ err = virtnet_sq_set_premapped(sq, !!pool);
+
+ if (err)
+ pool = NULL;
+
+ sq->xsk.pool = pool;
+
+ virtnet_tx_resume(vi, sq);
+
+ return err;
+}
+
+static int virtnet_xsk_pool_enable(struct net_device *dev,
+ struct xsk_buff_pool *pool,
+ u16 qid)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct receive_queue *rq;
+ struct send_queue *sq;
+ struct device *dma_dev;
+ dma_addr_t hdr_dma;
+ int err;
+
+ /* In big_packets mode, xdp cannot work, so there is no need to
+ * initialize xsk of rq.
+ *
+ * Support for small mode firstly.
+ */
+ if (vi->big_packets)
+ return -ENOENT;
+
+ if (qid >= vi->curr_queue_pairs)
+ return -EINVAL;
+
+ sq = &vi->sq[qid];
+ rq = &vi->rq[qid];
+
+ /* xsk tx zerocopy depend on the tx napi.
+ *
+ * All xsk packets are actually consumed and sent out from the xsk tx
+ * queue under the tx napi mechanism.
+ */
+ if (!sq->napi.weight)
+ return -EPERM;
+
+ /* For the xsk, the tx and rx should have the same device. But
+ * vq->dma_dev allows every vq has the respective dma dev. So I check
+ * the dma dev of vq and sq is the same dev.
+ */
+ if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq))
+ return -EPERM;
+
+ dma_dev = virtqueue_dma_dev(rq->vq);
+ if (!dma_dev)
+ return -EPERM;
+
+ hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, hdr_dma))
+ return -ENOMEM;
+
+ err = xsk_pool_dma_map(pool, dma_dev, 0);
+ if (err)
+ goto err_xsk_map;
+
+ err = virtnet_rq_bind_xsk_pool(vi, rq, pool);
+ if (err)
+ goto err_rq;
+
+ err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
+ if (err)
+ goto err_sq;
+
+ /* Now, we do not support tx offset, so all the tx virtnet hdr is zero.
+ * So all the tx packets can share a single hdr.
+ */
+ sq->xsk.hdr_dma_address = hdr_dma;
+
+ return 0;
+
+err_sq:
+ virtnet_rq_bind_xsk_pool(vi, rq, NULL);
+err_rq:
+ xsk_pool_dma_unmap(pool, 0);
+err_xsk_map:
+ dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE);
+ return err;
+}
+
+static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct xsk_buff_pool *pool;
+ struct device *dma_dev;
+ struct receive_queue *rq;
+ struct send_queue *sq;
+ int err1, err2;
+
+ if (qid >= vi->curr_queue_pairs)
+ return -EINVAL;
+
+ sq = &vi->sq[qid];
+ rq = &vi->rq[qid];
+
+ pool = sq->xsk.pool;
+
+ err1 = virtnet_sq_bind_xsk_pool(vi, sq, NULL);
+ err2 = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
+
+ xsk_pool_dma_unmap(pool, 0);
+
+ dma_dev = virtqueue_dma_dev(rq->vq);
+
+ dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE);
+
+ return err1 | err2;
+}
+
+static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp)
+{
+ if (xdp->xsk.pool)
+ return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
+ xdp->xsk.queue_id);
+ else
+ return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
+}
+
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
@@ -5293,6 +5490,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
switch (xdp->command) {
case XDP_SETUP_PROG:
return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_SETUP_XSK_POOL:
+ return virtnet_xsk_pool_setup(dev, xdp);
default:
return -EINVAL;
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH net-next v4 09/15] virtio_net: xsk: bind/unbind xsk
2024-06-11 11:41 ` [PATCH net-next v4 09/15] virtio_net: xsk: bind/unbind xsk Xuan Zhuo
@ 2024-06-12 23:43 ` Michael S. Tsirkin
2024-06-13 2:43 ` Xuan Zhuo
0 siblings, 1 reply; 27+ messages in thread
From: Michael S. Tsirkin @ 2024-06-12 23:43 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, Jun 11, 2024 at 07:41:41PM +0800, Xuan Zhuo wrote:
> This patch implements the logic of binding/unbinding an xsk pool to the sq and rq.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
I'd just squash with previous patch. This one is hard to review in
isolation.
> ---
> drivers/net/virtio_net.c | 199 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 199 insertions(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4968ab7eb5a4..c82a0691632c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -26,6 +26,7 @@
> #include <net/netdev_rx_queue.h>
> #include <net/netdev_queues.h>
> #include <uapi/linux/virtio_ring.h>
> +#include <net/xdp_sock_drv.h>
>
> static int napi_weight = NAPI_POLL_WEIGHT;
> module_param(napi_weight, int, 0444);
> @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>
> #define VIRTNET_DRIVER_VERSION "1.0.0"
>
> +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr;
> +
> static const unsigned long guest_offloads[] = {
> VIRTIO_NET_F_GUEST_TSO4,
> VIRTIO_NET_F_GUEST_TSO6,
> @@ -320,6 +323,12 @@ struct send_queue {
> bool premapped;
>
> struct virtnet_sq_dma_info dmainfo;
> +
> + struct {
> + struct xsk_buff_pool *pool;
> +
> + dma_addr_t hdr_dma_address;
> + } xsk;
> };
>
> /* Internal representation of a receive virtqueue */
> @@ -371,6 +380,13 @@ struct receive_queue {
>
> /* Record the last dma info to free after new pages is allocated. */
> struct virtnet_rq_dma *last_dma;
> +
> + struct {
> + struct xsk_buff_pool *pool;
> +
> + /* xdp rxq used by xsk */
> + struct xdp_rxq_info xdp_rxq;
> + } xsk;
> };
>
> /* This structure can contain rss message with maximum settings for indirection table and keysize
> @@ -5168,6 +5184,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
> return virtnet_set_guest_offloads(vi, offloads);
> }
>
> +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq,
> + struct xsk_buff_pool *pool)
> +{
> + int err, qindex;
> +
> + qindex = rq - vi->rq;
> +
> + if (pool) {
> + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id);
> + if (err < 0)
> + return err;
> +
> + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq,
> + MEM_TYPE_XSK_BUFF_POOL, NULL);
> + if (err < 0) {
> + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
> + return err;
> + }
> +
> + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq);
> + }
> +
> + virtnet_rx_pause(vi, rq);
> +
> + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf);
> + if (err) {
> + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err);
> +
> + pool = NULL;
> + }
> +
> + if (!pool)
> + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
> +
> + rq->xsk.pool = pool;
> +
> + virtnet_rx_resume(vi, rq);
> +
> + return err;
> +}
> +
> +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
> + struct send_queue *sq,
> + struct xsk_buff_pool *pool)
> +{
> + int err, qindex;
> +
> + qindex = sq - vi->sq;
> +
> + virtnet_tx_pause(vi, sq);
> +
> + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
> + if (err)
> + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
> + else
> + err = virtnet_sq_set_premapped(sq, !!pool);
> +
> + if (err)
> + pool = NULL;
> +
> + sq->xsk.pool = pool;
> +
> + virtnet_tx_resume(vi, sq);
> +
> + return err;
> +}
> +
> +static int virtnet_xsk_pool_enable(struct net_device *dev,
> + struct xsk_buff_pool *pool,
> + u16 qid)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + struct receive_queue *rq;
> + struct send_queue *sq;
> + struct device *dma_dev;
> + dma_addr_t hdr_dma;
> + int err;
> +
> + /* In big_packets mode, xdp cannot work, so there is no need to
> + * initialize xsk of rq.
> + *
> + * Support for small mode firstly.
> + */
> + if (vi->big_packets)
> + return -ENOENT;
> +
> + if (qid >= vi->curr_queue_pairs)
> + return -EINVAL;
> +
> + sq = &vi->sq[qid];
> + rq = &vi->rq[qid];
> +
> + /* xsk tx zerocopy depend on the tx napi.
> + *
> + * All xsk packets are actually consumed and sent out from the xsk tx
> + * queue under the tx napi mechanism.
> + */
> + if (!sq->napi.weight)
> + return -EPERM;
> +
> + /* For the xsk, the tx and rx should have the same device. But
> + * vq->dma_dev allows every vq has the respective dma dev. So I check
> + * the dma dev of vq and sq is the same dev.
> + */
> + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq))
> + return -EPERM;
> +
> + dma_dev = virtqueue_dma_dev(rq->vq);
> + if (!dma_dev)
> + return -EPERM;
> +
> + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
> + if (dma_mapping_error(dma_dev, hdr_dma))
> + return -ENOMEM;
> +
> + err = xsk_pool_dma_map(pool, dma_dev, 0);
> + if (err)
> + goto err_xsk_map;
> +
> + err = virtnet_rq_bind_xsk_pool(vi, rq, pool);
> + if (err)
> + goto err_rq;
> +
> + err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
> + if (err)
> + goto err_sq;
> +
> + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero.
> + * So all the tx packets can share a single hdr.
> + */
> + sq->xsk.hdr_dma_address = hdr_dma;
> +
> + return 0;
> +
> +err_sq:
> + virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> +err_rq:
> + xsk_pool_dma_unmap(pool, 0);
> +err_xsk_map:
> + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE);
> + return err;
> +}
> +
> +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + struct xsk_buff_pool *pool;
> + struct device *dma_dev;
> + struct receive_queue *rq;
> + struct send_queue *sq;
> + int err1, err2;
> +
> + if (qid >= vi->curr_queue_pairs)
> + return -EINVAL;
> +
> + sq = &vi->sq[qid];
> + rq = &vi->rq[qid];
> +
> + pool = sq->xsk.pool;
> +
> + err1 = virtnet_sq_bind_xsk_pool(vi, sq, NULL);
> + err2 = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> +
> + xsk_pool_dma_unmap(pool, 0);
> +
> + dma_dev = virtqueue_dma_dev(rq->vq);
> +
> + dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE);
> +
> + return err1 | err2;
> +}
> +
> +static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp)
> +{
> + if (xdp->xsk.pool)
> + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
> + xdp->xsk.queue_id);
> + else
> + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
> +}
> +
> static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> struct netlink_ext_ack *extack)
> {
> @@ -5293,6 +5490,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> switch (xdp->command) {
> case XDP_SETUP_PROG:
> return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
> + case XDP_SETUP_XSK_POOL:
> + return virtnet_xsk_pool_setup(dev, xdp);
> default:
> return -EINVAL;
> }
> --
> 2.32.0.3.g01195cf9f
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH net-next v4 09/15] virtio_net: xsk: bind/unbind xsk
2024-06-12 23:43 ` Michael S. Tsirkin
@ 2024-06-13 2:43 ` Xuan Zhuo
0 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-13 2:43 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: netdev, Jason Wang, Eugenio Pérez, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 12 Jun 2024 19:43:01 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Tue, Jun 11, 2024 at 07:41:41PM +0800, Xuan Zhuo wrote:
> > This patch implements the logic of binding/unbinding an xsk pool to the sq and rq.
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>
> I'd just squash with previous patch. This one is hard to review in
> isolation.
Why?
The previous patch supplies an API to switch the sq to premapped mode.
This one introduces the xsk support and just uses that API.
I do not think we should merge these two patches.
Thanks.
>
> > ---
> > drivers/net/virtio_net.c | 199 +++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 199 insertions(+)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 4968ab7eb5a4..c82a0691632c 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -26,6 +26,7 @@
> > #include <net/netdev_rx_queue.h>
> > #include <net/netdev_queues.h>
> > #include <uapi/linux/virtio_ring.h>
> > +#include <net/xdp_sock_drv.h>
> >
> > static int napi_weight = NAPI_POLL_WEIGHT;
> > module_param(napi_weight, int, 0444);
> > @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
> >
> > #define VIRTNET_DRIVER_VERSION "1.0.0"
> >
> > +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr;
> > +
> > static const unsigned long guest_offloads[] = {
> > VIRTIO_NET_F_GUEST_TSO4,
> > VIRTIO_NET_F_GUEST_TSO6,
> > @@ -320,6 +323,12 @@ struct send_queue {
> > bool premapped;
> >
> > struct virtnet_sq_dma_info dmainfo;
> > +
> > + struct {
> > + struct xsk_buff_pool *pool;
> > +
> > + dma_addr_t hdr_dma_address;
> > + } xsk;
> > };
> >
> > /* Internal representation of a receive virtqueue */
> > @@ -371,6 +380,13 @@ struct receive_queue {
> >
> > /* Record the last dma info to free after new pages is allocated. */
> > struct virtnet_rq_dma *last_dma;
> > +
> > + struct {
> > + struct xsk_buff_pool *pool;
> > +
> > + /* xdp rxq used by xsk */
> > + struct xdp_rxq_info xdp_rxq;
> > + } xsk;
> > };
> >
> > /* This structure can contain rss message with maximum settings for indirection table and keysize
> > @@ -5168,6 +5184,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
> > return virtnet_set_guest_offloads(vi, offloads);
> > }
> >
> > +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq,
> > + struct xsk_buff_pool *pool)
> > +{
> > + int err, qindex;
> > +
> > + qindex = rq - vi->rq;
> > +
> > + if (pool) {
> > + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id);
> > + if (err < 0)
> > + return err;
> > +
> > + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq,
> > + MEM_TYPE_XSK_BUFF_POOL, NULL);
> > + if (err < 0) {
> > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
> > + return err;
> > + }
> > +
> > + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq);
> > + }
> > +
> > + virtnet_rx_pause(vi, rq);
> > +
> > + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf);
> > + if (err) {
> > + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err);
> > +
> > + pool = NULL;
> > + }
> > +
> > + if (!pool)
> > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
> > +
> > + rq->xsk.pool = pool;
> > +
> > + virtnet_rx_resume(vi, rq);
> > +
> > + return err;
> > +}
> > +
> > +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
> > + struct send_queue *sq,
> > + struct xsk_buff_pool *pool)
> > +{
> > + int err, qindex;
> > +
> > + qindex = sq - vi->sq;
> > +
> > + virtnet_tx_pause(vi, sq);
> > +
> > + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
> > + if (err)
> > + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
> > + else
> > + err = virtnet_sq_set_premapped(sq, !!pool);
> > +
> > + if (err)
> > + pool = NULL;
> > +
> > + sq->xsk.pool = pool;
> > +
> > + virtnet_tx_resume(vi, sq);
> > +
> > + return err;
> > +}
> > +
> > +static int virtnet_xsk_pool_enable(struct net_device *dev,
> > + struct xsk_buff_pool *pool,
> > + u16 qid)
> > +{
> > + struct virtnet_info *vi = netdev_priv(dev);
> > + struct receive_queue *rq;
> > + struct send_queue *sq;
> > + struct device *dma_dev;
> > + dma_addr_t hdr_dma;
> > + int err;
> > +
> > + /* In big_packets mode, xdp cannot work, so there is no need to
> > + * initialize xsk of rq.
> > + *
> > + * Support for small mode firstly.
> > + */
> > + if (vi->big_packets)
> > + return -ENOENT;
> > +
> > + if (qid >= vi->curr_queue_pairs)
> > + return -EINVAL;
> > +
> > + sq = &vi->sq[qid];
> > + rq = &vi->rq[qid];
> > +
> > + /* xsk tx zerocopy depend on the tx napi.
> > + *
> > + * All xsk packets are actually consumed and sent out from the xsk tx
> > + * queue under the tx napi mechanism.
> > + */
> > + if (!sq->napi.weight)
> > + return -EPERM;
> > +
> > + /* For the xsk, the tx and rx should have the same device. But
> > + * vq->dma_dev allows every vq has the respective dma dev. So I check
> > + * the dma dev of vq and sq is the same dev.
> > + */
> > + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq))
> > + return -EPERM;
> > +
> > + dma_dev = virtqueue_dma_dev(rq->vq);
> > + if (!dma_dev)
> > + return -EPERM;
> > +
> > + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
> > + if (dma_mapping_error(dma_dev, hdr_dma))
> > + return -ENOMEM;
> > +
> > + err = xsk_pool_dma_map(pool, dma_dev, 0);
> > + if (err)
> > + goto err_xsk_map;
> > +
> > + err = virtnet_rq_bind_xsk_pool(vi, rq, pool);
> > + if (err)
> > + goto err_rq;
> > +
> > + err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
> > + if (err)
> > + goto err_sq;
> > +
> > + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero.
> > + * So all the tx packets can share a single hdr.
> > + */
> > + sq->xsk.hdr_dma_address = hdr_dma;
> > +
> > + return 0;
> > +
> > +err_sq:
> > + virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> > +err_rq:
> > + xsk_pool_dma_unmap(pool, 0);
> > +err_xsk_map:
> > + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE);
> > + return err;
> > +}
> > +
> > +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
> > +{
> > + struct virtnet_info *vi = netdev_priv(dev);
> > + struct xsk_buff_pool *pool;
> > + struct device *dma_dev;
> > + struct receive_queue *rq;
> > + struct send_queue *sq;
> > + int err1, err2;
> > +
> > + if (qid >= vi->curr_queue_pairs)
> > + return -EINVAL;
> > +
> > + sq = &vi->sq[qid];
> > + rq = &vi->rq[qid];
> > +
> > + pool = sq->xsk.pool;
> > +
> > + err1 = virtnet_sq_bind_xsk_pool(vi, sq, NULL);
> > + err2 = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
> > +
> > + xsk_pool_dma_unmap(pool, 0);
> > +
> > + dma_dev = virtqueue_dma_dev(rq->vq);
> > +
> > + dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE);
> > +
> > + return err1 | err2;
> > +}
> > +
> > +static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp)
> > +{
> > + if (xdp->xsk.pool)
> > + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
> > + xdp->xsk.queue_id);
> > + else
> > + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
> > +}
> > +
> > static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> > struct netlink_ext_ack *extack)
> > {
> > @@ -5293,6 +5490,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> > switch (xdp->command) {
> > case XDP_SETUP_PROG:
> > return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
> > + case XDP_SETUP_XSK_POOL:
> > + return virtnet_xsk_pool_setup(dev, xdp);
> > default:
> > return -EINVAL;
> > }
> > --
> > 2.32.0.3.g01195cf9f
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH net-next v4 10/15] virtio_net: xsk: prevent disable tx napi
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (8 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 09/15] virtio_net: xsk: bind/unbind xsk Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer Xuan Zhuo
` (4 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Since xsk's TX queue is consumed by TX NAPI, we must prevent tx napi from
being disabled while the sq is bound to an xsk pool.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c82a0691632c..17cbb6a94373 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -4729,7 +4729,7 @@ static int virtnet_set_coalesce(struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct virtnet_info *vi = netdev_priv(dev);
- int ret, queue_number, napi_weight;
+ int ret, queue_number, napi_weight, i;
bool update_napi = false;
/* Can't change NAPI weight if the link is up */
@@ -4758,6 +4758,14 @@ static int virtnet_set_coalesce(struct net_device *dev,
return ret;
if (update_napi) {
+ /* xsk xmit depends on the tx napi. So if xsk is active,
+ * prevent modifications to tx napi.
+ */
+ for (i = queue_number; i < vi->max_queue_pairs; i++) {
+ if (vi->sq[i].xsk.pool)
+ return -EBUSY;
+ }
+
for (; queue_number < vi->max_queue_pairs; queue_number++)
vi->sq[queue_number].napi.weight = napi_weight;
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (9 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 10/15] virtio_net: xsk: prevent disable tx napi Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-12 23:25 ` Jakub Kicinski
2024-06-11 11:41 ` [PATCH net-next v4 12/15] virtio_net: xsk: tx: support wakeup Xuan Zhuo
` (3 subsequent siblings)
14 siblings, 1 reply; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The driver's tx napi is essential for XSK: it is responsible for obtaining
descriptors from the XSK tx queue and sending them out.
To start transmission, the tx napi needs to be triggered.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 113 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 111 insertions(+), 2 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 17cbb6a94373..4624a9fde89a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -534,10 +534,13 @@ enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
VIRTNET_XMIT_TYPE_XDP,
VIRTNET_XMIT_TYPE_DMA,
+ VIRTNET_XMIT_TYPE_XSK,
};
#define VIRTNET_XMIT_TYPE_MASK (VIRTNET_XMIT_TYPE_SKB | VIRTNET_XMIT_TYPE_XDP \
- | VIRTNET_XMIT_TYPE_DMA)
+ | VIRTNET_XMIT_TYPE_DMA | VIRTNET_XMIT_TYPE_XSK)
+
+#define VIRTIO_XSK_FLAG_OFFSET 4
static enum virtnet_xmit_type virtnet_xmit_ptr_strip(void **ptr)
{
@@ -1256,6 +1259,102 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
}
}
+static void *virtnet_xsk_to_ptr(u32 len)
+{
+ unsigned long p;
+
+ p = len << VIRTIO_XSK_FLAG_OFFSET;
+
+ return virtnet_xmit_ptr_mix((void *)p, VIRTNET_XMIT_TYPE_XSK);
+}
+
+static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
+{
+ sg->dma_address = addr;
+ sg->length = len;
+}
+
+static int virtnet_xsk_xmit_one(struct send_queue *sq,
+ struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ struct virtnet_info *vi;
+ dma_addr_t addr;
+
+ vi = sq->vq->vdev->priv;
+
+ addr = xsk_buff_raw_get_dma(pool, desc->addr);
+ xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
+
+ sg_init_table(sq->sg, 2);
+
+ sg_fill_dma(sq->sg, sq->xsk.hdr_dma_address, vi->hdr_len);
+ sg_fill_dma(sq->sg + 1, addr, desc->len);
+
+ return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
+ virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC);
+}
+
+static int virtnet_xsk_xmit_batch(struct send_queue *sq,
+ struct xsk_buff_pool *pool,
+ unsigned int budget,
+ u64 *kicks)
+{
+ struct xdp_desc *descs = pool->tx_descs;
+ bool kick = false;
+ u32 nb_pkts, i;
+ int err;
+
+ budget = min_t(u32, budget, sq->vq->num_free);
+
+ nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
+ if (!nb_pkts)
+ return 0;
+
+ for (i = 0; i < nb_pkts; i++) {
+ err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
+ if (unlikely(err)) {
+ xsk_tx_completed(sq->xsk.pool, nb_pkts - i);
+ break;
+ }
+
+ kick = true;
+ }
+
+ if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
+ (*kicks)++;
+
+ return i;
+}
+
+static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
+ int budget)
+{
+ struct virtnet_info *vi = sq->vq->vdev->priv;
+ struct virtnet_sq_free_stats stats = {};
+ u64 kicks = 0;
+ int sent;
+
+ __free_old_xmit(sq, true, &stats);
+
+ sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
+
+ if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
+ check_sq_full_and_disable(vi, vi->dev, sq);
+
+ u64_stats_update_begin(&sq->stats.syncp);
+ u64_stats_add(&sq->stats.packets, stats.packets);
+ u64_stats_add(&sq->stats.bytes, stats.bytes);
+ u64_stats_add(&sq->stats.kicks, kicks);
+ u64_stats_add(&sq->stats.xdp_tx, sent);
+ u64_stats_update_end(&sq->stats.syncp);
+
+ if (xsk_uses_need_wakeup(pool))
+ xsk_set_tx_need_wakeup(pool);
+
+ return sent == budget;
+}
+
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct send_queue *sq,
struct xdp_frame *xdpf)
@@ -2698,6 +2797,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
struct virtnet_info *vi = sq->vq->vdev->priv;
unsigned int index = vq2txq(sq->vq);
struct netdev_queue *txq;
+ bool xsk_busy = false;
int opaque;
bool done;
@@ -2710,7 +2810,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
txq = netdev_get_tx_queue(vi->dev, index);
__netif_tx_lock(txq, raw_smp_processor_id());
virtqueue_disable_cb(sq->vq);
- free_old_xmit(sq, true);
+
+ if (sq->xsk.pool)
+ xsk_busy = virtnet_xsk_xmit(sq, sq->xsk.pool, budget);
+ else
+ free_old_xmit(sq, true);
if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
if (netif_tx_queue_stopped(txq)) {
@@ -2721,6 +2825,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
netif_tx_wake_queue(txq);
}
+ if (xsk_busy) {
+ __netif_tx_unlock(txq);
+ return budget;
+ }
+
opaque = virtqueue_enable_cb_prepare(sq->vq);
done = napi_complete_done(napi, 0);
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer
2024-06-11 11:41 ` [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer Xuan Zhuo
@ 2024-06-12 23:25 ` Jakub Kicinski
2024-06-12 23:44 ` Michael S. Tsirkin
2024-06-13 2:43 ` Xuan Zhuo
0 siblings, 2 replies; 27+ messages in thread
From: Jakub Kicinski @ 2024-06-12 23:25 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Michael S. Tsirkin, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Tue, 11 Jun 2024 19:41:43 +0800 Xuan Zhuo wrote:
> @@ -534,10 +534,13 @@ enum virtnet_xmit_type {
> VIRTNET_XMIT_TYPE_SKB,
> VIRTNET_XMIT_TYPE_XDP,
> VIRTNET_XMIT_TYPE_DMA,
> + VIRTNET_XMIT_TYPE_XSK,
Again, would be great to avoid the transient warning (if it can be done
cleanly):
drivers/net/virtio_net.c:5806:9: warning: enumeration value ‘VIRTNET_XMIT_TYPE_XSK’ not handled in switch [-Wswitch]
5806 | switch (virtnet_xmit_ptr_strip(&buf)) {
| ^~~~~~
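To illustrate what I mean (simplified, not the driver's actual code): the
warning shows up on the intermediate commits because the enum gains a value
before the switch that consumes it learns about it; handling the new value in
the same patch keeps every step warning-free.

/* illustrative names only */
enum xmit_type {
	XMIT_TYPE_SKB,
	XMIT_TYPE_XDP,
	XMIT_TYPE_XSK,		/* value introduced by this patch */
};

static void free_unused(enum xmit_type type, void *buf)
{
	switch (type) {		/* -Wswitch warns until every enum value has a case */
	case XMIT_TYPE_SKB:
		/* dev_kfree_skb(buf); */
		break;
	case XMIT_TYPE_XDP:
		/* xdp_return_frame(buf); */
		break;
	case XMIT_TYPE_XSK:	/* adding this case in the same patch keeps the build clean */
		/* complete the xsk tx descriptor */
		break;
	}
}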
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer
2024-06-12 23:25 ` Jakub Kicinski
@ 2024-06-12 23:44 ` Michael S. Tsirkin
2024-06-13 2:43 ` Xuan Zhuo
1 sibling, 0 replies; 27+ messages in thread
From: Michael S. Tsirkin @ 2024-06-12 23:44 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Xuan Zhuo, netdev, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, Jun 12, 2024 at 04:25:05PM -0700, Jakub Kicinski wrote:
> On Tue, 11 Jun 2024 19:41:43 +0800 Xuan Zhuo wrote:
> > @@ -534,10 +534,13 @@ enum virtnet_xmit_type {
> > VIRTNET_XMIT_TYPE_SKB,
> > VIRTNET_XMIT_TYPE_XDP,
> > VIRTNET_XMIT_TYPE_DMA,
> > + VIRTNET_XMIT_TYPE_XSK,
>
> Again, would be great to avoid the transient warning (if it can be done
> cleanly):
>
> drivers/net/virtio_net.c:5806:9: warning: enumeration value ‘VIRTNET_XMIT_TYPE_XSK’ not handled in switch [-Wswitch]
> 5806 | switch (virtnet_xmit_ptr_strip(&buf)) {
> | ^~~~~~
yea just squashing is usually enough.
--
MST
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer
2024-06-12 23:25 ` Jakub Kicinski
2024-06-12 23:44 ` Michael S. Tsirkin
@ 2024-06-13 2:43 ` Xuan Zhuo
1 sibling, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-13 2:43 UTC (permalink / raw)
To: Jakub Kicinski
Cc: netdev, Michael S. Tsirkin, Jason Wang, Eugenio Pérez,
David S. Miller, Eric Dumazet, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
virtualization, bpf
On Wed, 12 Jun 2024 16:25:05 -0700, Jakub Kicinski <kuba@kernel.org> wrote:
> On Tue, 11 Jun 2024 19:41:43 +0800 Xuan Zhuo wrote:
> > @@ -534,10 +534,13 @@ enum virtnet_xmit_type {
> > VIRTNET_XMIT_TYPE_SKB,
> > VIRTNET_XMIT_TYPE_XDP,
> > VIRTNET_XMIT_TYPE_DMA,
> > + VIRTNET_XMIT_TYPE_XSK,
>
> Again, would be great to avoid the transient warning (if it can be done
> cleanly):
>
> drivers/net/virtio_net.c:5806:9: warning: enumeration value ‘VIRTNET_XMIT_TYPE_XSK’ not handled in switch [-Wswitch]
> 5806 | switch (virtnet_xmit_ptr_strip(&buf)) {
> | ^~~~~~
Will fix.
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH net-next v4 12/15] virtio_net: xsk: tx: support wakeup
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (10 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 11/15] virtio_net: xsk: tx: support xmit xsk buffer Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 13/15] virtio_net: xsk: tx: handle the transmitted xsk buffer Xuan Zhuo
` (2 subsequent siblings)
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The xsk wakeup hook is used by the xsk framework or by user space to trigger
the xsk xmit logic.
Virtio-net cannot actively generate an interrupt on demand, so it schedules
the tx NAPI on the local cpu instead.
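For context, the typical user-space trigger for this hook is an empty sendto()
on the AF_XDP socket, which ends up in the ndo_xsk_wakeup callback added
below. A minimal sketch of that kick (standard AF_XDP usage, not part of this
patch):

#include <sys/socket.h>
#include <errno.h>

/* xsk_fd: the AF_XDP socket bound to this queue */
static int kick_tx(int xsk_fd)
{
	if (sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) >= 0)
		return 0;

	/* these errno values are commonly treated as benign for a TX kick */
	if (errno == EAGAIN || errno == EBUSY || errno == ENOBUFS || errno == ENETDOWN)
		return 0;

	return -errno;
}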
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4624a9fde89a..7f8ef93a194a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1355,6 +1355,29 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
return sent == budget;
}
+static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct send_queue *sq;
+
+ if (!netif_running(dev))
+ return -ENETDOWN;
+
+ if (qid >= vi->curr_queue_pairs)
+ return -EINVAL;
+
+ sq = &vi->sq[qid];
+
+ if (napi_if_scheduled_mark_missed(&sq->napi))
+ return 0;
+
+ local_bh_disable();
+ virtqueue_napi_schedule(&sq->napi, sq->vq);
+ local_bh_enable();
+
+ return 0;
+}
+
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct send_queue *sq,
struct xdp_frame *xdpf)
@@ -5693,6 +5716,7 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
.ndo_bpf = virtnet_xdp,
.ndo_xdp_xmit = virtnet_xdp_xmit,
+ .ndo_xsk_wakeup = virtnet_xsk_wakeup,
.ndo_features_check = passthru_features_check,
.ndo_get_phys_port_name = virtnet_get_phys_port_name,
.ndo_set_features = virtnet_set_features,
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH net-next v4 13/15] virtio_net: xsk: tx: handle the transmitted xsk buffer
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (11 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 12/15] virtio_net: xsk: tx: support wakeup Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 14/15] virtio_net: xsk: rx: support fill with " Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 15/15] virtio_net: xsk: rx: support recv small mode Xuan Zhuo
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
virtnet_free_old_xmit distinguishes three pointer types (skb, xdp frame,
xsk buffer) by the low bits of the pointer.
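A rough sketch of the tagging scheme, for context (simplified; the helper
names below are illustrative stand-ins for virtnet_xmit_ptr_mix()/
virtnet_xmit_ptr_strip() and virtnet_xsk_to_ptr()/virtnet_ptr_to_xsk() in
this series):

#include <linux/types.h>

/* the low bits of the pointer carry the xmit type; an xsk completion has no
 * pointer at all, only the frame length stored above the tag bits
 */
#define XMIT_TYPE_MASK		0x3UL	/* covers SKB/XDP/DMA/XSK = 0..3 */
#define XSK_FLAG_OFFSET		4

static void *xmit_ptr_mix(void *ptr, unsigned long type)
{
	return (void *)((unsigned long)ptr | type);
}

static unsigned long xmit_ptr_strip(void **ptr)
{
	unsigned long p = (unsigned long)*ptr;

	*ptr = (void *)(p & ~XMIT_TYPE_MASK);
	return p & XMIT_TYPE_MASK;	/* the xmit type */
}

/* for the XSK type: encode the tx length instead of a pointer */
static void *xsk_len_to_ptr(u32 len, unsigned long xsk_type)
{
	return xmit_ptr_mix((void *)((unsigned long)len << XSK_FLAG_OFFSET), xsk_type);
}

static u32 xsk_ptr_to_len(void *ptr)
{
	return (unsigned long)ptr >> XSK_FLAG_OFFSET;
}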
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 60 ++++++++++++++++++++++++++++++++++------
1 file changed, 52 insertions(+), 8 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7f8ef93a194a..d38b4ef80f6f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -87,6 +87,7 @@ struct virtnet_stat_desc {
struct virtnet_sq_free_stats {
u64 packets;
u64 bytes;
+ u64 xsk;
};
struct virtnet_sq_stats {
@@ -529,6 +530,7 @@ struct virtio_net_common_hdr {
};
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
+static void virtnet_xsk_completed(struct send_queue *sq, int num);
enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
@@ -730,6 +732,11 @@ static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped)
return 0;
}
+static u32 virtnet_ptr_to_xsk(void *ptr)
+{
+ return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
+}
+
static void __free_old_xmit(struct send_queue *sq, bool in_napi,
struct virtnet_sq_free_stats *stats)
{
@@ -762,10 +769,24 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
case VIRTNET_XMIT_TYPE_DMA:
virtnet_sq_unmap(sq, &ptr);
goto retry;
+
+ case VIRTNET_XMIT_TYPE_XSK:
+ stats->bytes += virtnet_ptr_to_xsk(ptr);
+ stats->xsk++;
+ break;
}
}
}
+static void virtnet_free_old_xmit(struct send_queue *sq, bool in_napi,
+ struct virtnet_sq_free_stats *stats)
+{
+ __free_old_xmit(sq, in_napi, stats);
+
+ if (stats->xsk)
+ virtnet_xsk_completed(sq, stats->xsk);
+}
+
/* Converting between virtqueue no. and kernel tx/rx queue no.
* 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
*/
@@ -1194,7 +1215,7 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
{
struct virtnet_sq_free_stats stats = {0};
- __free_old_xmit(sq, in_napi, &stats);
+ virtnet_free_old_xmit(sq, in_napi, &stats);
/* Avoid overhead when no packets have been processed
* happens when called speculatively from start_xmit.
@@ -1335,8 +1356,12 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
u64 kicks = 0;
int sent;
+ /* Avoid to wakeup napi meanless, so call __free_old_xmit. */
__free_old_xmit(sq, true, &stats);
+ if (stats.xsk)
+ xsk_tx_completed(sq->xsk.pool, stats.xsk);
+
sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
@@ -1355,6 +1380,16 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
return sent == budget;
}
+static void xsk_wakeup(struct send_queue *sq)
+{
+ if (napi_if_scheduled_mark_missed(&sq->napi))
+ return;
+
+ local_bh_disable();
+ virtqueue_napi_schedule(&sq->napi, sq->vq);
+ local_bh_enable();
+}
+
static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
{
struct virtnet_info *vi = netdev_priv(dev);
@@ -1368,14 +1403,19 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
sq = &vi->sq[qid];
- if (napi_if_scheduled_mark_missed(&sq->napi))
- return 0;
+ xsk_wakeup(sq);
+ return 0;
+}
- local_bh_disable();
- virtqueue_napi_schedule(&sq->napi, sq->vq);
- local_bh_enable();
+static void virtnet_xsk_completed(struct send_queue *sq, int num)
+{
+ xsk_tx_completed(sq->xsk.pool, num);
- return 0;
+ /* If this is called by rx poll, start_xmit and xdp xmit we should
+ * wakeup the tx napi to consume the xsk tx queue, because the tx
+ * interrupt may not be triggered.
+ */
+ xsk_wakeup(sq);
}
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
@@ -1491,7 +1531,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
}
/* Free up any pending old buffers before queueing new ones. */
- __free_old_xmit(sq, false, &stats);
+ virtnet_free_old_xmit(sq, false, &stats);
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
@@ -5839,6 +5879,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
case VIRTNET_XMIT_TYPE_DMA:
virtnet_sq_unmap(sq, &buf);
goto retry;
+
+ case VIRTNET_XMIT_TYPE_XSK:
+ xsk_tx_completed(sq->xsk.pool, 1);
+ break;
}
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH net-next v4 14/15] virtio_net: xsk: rx: support fill with xsk buffer
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (12 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 13/15] virtio_net: xsk: tx: handle the transmitted xsk buffer Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
2024-06-11 11:41 ` [PATCH net-next v4 15/15] virtio_net: xsk: rx: support recv small mode Xuan Zhuo
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
Implement the logic of filling rq with XSK buffers.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 64 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 62 insertions(+), 2 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d38b4ef80f6f..a780dcc4dd4f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -387,6 +387,8 @@ struct receive_queue {
/* xdp rxq used by xsk */
struct xdp_rxq_info xdp_rxq;
+
+ struct xdp_buff **xsk_buffs;
} xsk;
};
@@ -531,6 +533,8 @@ struct virtio_net_common_hdr {
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
static void virtnet_xsk_completed(struct send_queue *sq, int num);
+static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq,
+ struct xsk_buff_pool *pool, gfp_t gfp);
enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
@@ -1295,6 +1299,47 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
sg->length = len;
}
+static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq,
+ struct xsk_buff_pool *pool, gfp_t gfp)
+{
+ struct xdp_buff **xsk_buffs;
+ dma_addr_t addr;
+ u32 len, i;
+ int err = 0;
+ int num;
+
+ xsk_buffs = rq->xsk.xsk_buffs;
+
+ num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free);
+ if (!num)
+ return -ENOMEM;
+
+ len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
+
+ for (i = 0; i < num; ++i) {
+ /* use the part of XDP_PACKET_HEADROOM as the virtnet hdr space */
+ addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len;
+
+ sg_init_table(rq->sg, 1);
+ sg_fill_dma(rq->sg, addr, len);
+
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, xsk_buffs[i], gfp);
+ if (err)
+ goto err;
+ }
+
+ return num;
+
+err:
+ if (i)
+ err = i;
+
+ for (; i < num; ++i)
+ xsk_buff_free(xsk_buffs[i]);
+
+ return err;
+}
+
static int virtnet_xsk_xmit_one(struct send_queue *sq,
struct xsk_buff_pool *pool,
struct xdp_desc *desc)
@@ -2551,6 +2596,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
int err;
bool oom;
+ if (rq->xsk.pool) {
+ err = virtnet_add_recvbuf_xsk(vi, rq, rq->xsk.pool, gfp);
+ goto kick;
+ }
+
do {
if (vi->mergeable_rx_bufs)
err = add_recvbuf_mergeable(vi, rq, gfp);
@@ -2559,10 +2609,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
else
err = add_recvbuf_small(vi, rq, gfp);
- oom = err == -ENOMEM;
if (err)
break;
} while (rq->vq->num_free);
+
+kick:
if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
unsigned long flags;
@@ -2571,6 +2622,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
}
+ oom = err == -ENOMEM;
return !oom;
}
@@ -5440,7 +5492,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
struct send_queue *sq;
struct device *dma_dev;
dma_addr_t hdr_dma;
- int err;
+ int err, size;
/* In big_packets mode, xdp cannot work, so there is no need to
* initialize xsk of rq.
@@ -5475,6 +5527,12 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
if (!dma_dev)
return -EPERM;
+ size = virtqueue_get_vring_size(rq->vq);
+
+ rq->xsk.xsk_buffs = kcalloc(size, sizeof(*rq->xsk.xsk_buffs), GFP_KERNEL);
+ if (!rq->xsk.xsk_buffs)
+ return -ENOMEM;
+
hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE);
if (dma_mapping_error(dma_dev, hdr_dma))
return -ENOMEM;
@@ -5533,6 +5591,8 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE);
+ kfree(rq->xsk.xsk_buffs);
+
return err1 | err2;
}
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH net-next v4 15/15] virtio_net: xsk: rx: support recv small mode
2024-06-11 11:41 [PATCH net-next v4 00/15] virtio-net: support AF_XDP zero copy Xuan Zhuo
` (13 preceding siblings ...)
2024-06-11 11:41 ` [PATCH net-next v4 14/15] virtio_net: xsk: rx: support fill with " Xuan Zhuo
@ 2024-06-11 11:41 ` Xuan Zhuo
14 siblings, 0 replies; 27+ messages in thread
From: Xuan Zhuo @ 2024-06-11 11:41 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, virtualization, bpf
The virtnet_xdp_handler() is re-used. But:
1. We need to copy the data to create an skb for XDP_PASS.
2. We need to call xsk_buff_free() to release the buffer.
3. The handling of the xdp_buff is different.
If we pushed this logic into the existing receive handlers (merge and small),
we would have to maintain code scattered across merge and small (and big).
So I think it is a better choice to put the xsk code into an independent
function.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
drivers/net/virtio_net.c | 142 +++++++++++++++++++++++++++++++++++++--
1 file changed, 138 insertions(+), 4 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a780dcc4dd4f..1a4fd8e744e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -533,8 +533,10 @@ struct virtio_net_common_hdr {
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
static void virtnet_xsk_completed(struct send_queue *sq, int num);
-static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq,
- struct xsk_buff_pool *pool, gfp_t gfp);
+static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
+ struct net_device *dev,
+ unsigned int *xdp_xmit,
+ struct virtnet_rq_stats *stats);
enum virtnet_xmit_type {
VIRTNET_XMIT_TYPE_SKB,
@@ -1209,6 +1211,11 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
rq = &vi->rq[i];
+ if (rq->xsk.pool) {
+ xsk_buff_free((struct xdp_buff *)buf);
+ return;
+ }
+
if (!vi->big_packets || vi->mergeable_rx_bufs)
virtnet_rq_unmap(rq, buf, 0);
@@ -1299,6 +1306,120 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
sg->length = len;
}
+static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi,
+ struct receive_queue *rq, void *buf, u32 len)
+{
+ struct xdp_buff *xdp;
+ u32 bufsize;
+
+ xdp = (struct xdp_buff *)buf;
+
+ bufsize = xsk_pool_get_rx_frame_size(rq->xsk.pool) + vi->hdr_len;
+
+ if (unlikely(len > bufsize)) {
+ pr_debug("%s: rx error: len %u exceeds truesize %u\n",
+ vi->dev->name, len, bufsize);
+ DEV_STATS_INC(vi->dev, rx_length_errors);
+ xsk_buff_free(xdp);
+ return NULL;
+ }
+
+ xsk_buff_set_size(xdp, len);
+ xsk_buff_dma_sync_for_cpu(xdp);
+
+ return xdp;
+}
+
+static struct sk_buff *xdp_construct_skb(struct receive_queue *rq,
+ struct xdp_buff *xdp)
+{
+ unsigned int metasize = xdp->data - xdp->data_meta;
+ struct sk_buff *skb;
+ unsigned int size;
+
+ size = xdp->data_end - xdp->data_hard_start;
+ skb = napi_alloc_skb(&rq->napi, size);
+ if (unlikely(!skb)) {
+ xsk_buff_free(xdp);
+ return NULL;
+ }
+
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ size = xdp->data_end - xdp->data_meta;
+ memcpy(__skb_put(skb, size), xdp->data_meta, size);
+
+ if (metasize) {
+ __skb_pull(skb, metasize);
+ skb_metadata_set(skb, metasize);
+ }
+
+ xsk_buff_free(xdp);
+
+ return skb;
+}
+
+static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi,
+ struct receive_queue *rq, struct xdp_buff *xdp,
+ unsigned int *xdp_xmit,
+ struct virtnet_rq_stats *stats)
+{
+ struct bpf_prog *prog;
+ u32 ret;
+
+ ret = XDP_PASS;
+ rcu_read_lock();
+ prog = rcu_dereference(rq->xdp_prog);
+ if (prog)
+ ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case XDP_PASS:
+ return xdp_construct_skb(rq, xdp);
+
+ case XDP_TX:
+ case XDP_REDIRECT:
+ return NULL;
+
+ default:
+ /* drop packet */
+ xsk_buff_free(xdp);
+ u64_stats_inc(&stats->drops);
+ return NULL;
+ }
+}
+
+static struct sk_buff *virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq,
+ void *buf, u32 len,
+ unsigned int *xdp_xmit,
+ struct virtnet_rq_stats *stats)
+{
+ struct net_device *dev = vi->dev;
+ struct sk_buff *skb = NULL;
+ struct xdp_buff *xdp;
+
+ len -= vi->hdr_len;
+
+ u64_stats_add(&stats->bytes, len);
+
+ xdp = buf_to_xdp(vi, rq, buf, len);
+ if (!xdp)
+ return NULL;
+
+ if (unlikely(len < ETH_HLEN)) {
+ pr_debug("%s: short packet %i\n", dev->name, len);
+ DEV_STATS_INC(dev, rx_length_errors);
+ xsk_buff_free(xdp);
+ return NULL;
+ }
+
+ if (!vi->mergeable_rx_bufs)
+ skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats);
+
+ return skb;
+}
+
static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq,
struct xsk_buff_pool *pool, gfp_t gfp)
{
@@ -2704,9 +2825,22 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
void *buf;
int i;
- if (!vi->big_packets || vi->mergeable_rx_bufs) {
- void *ctx;
+ if (rq->xsk.pool) {
+ struct sk_buff *skb;
+
+ while (packets < budget) {
+ buf = virtqueue_get_buf(rq->vq, &len);
+ if (!buf)
+ break;
+ skb = virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, &stats);
+ if (skb)
+ virtnet_receive_done(vi, rq, skb);
+
+ packets++;
+ }
+ } else if (!vi->big_packets || vi->mergeable_rx_bufs) {
+ void *ctx;
while (packets < budget &&
(buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
--
2.32.0.3.g01195cf9f
^ permalink raw reply related [flat|nested] 27+ messages in thread