linux-kernel.vger.kernel.org archive mirror
* [RFC PATCH net-next v2 0/2] virtio-net: support zerocopy multi buffer XDP in mergeable
@ 2025-05-27 16:19 Bui Quang Minh
  2025-05-27 16:19 ` [RFC PATCH net-next v2 1/2] " Bui Quang Minh
  2025-05-27 16:19 ` [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net Bui Quang Minh
  0 siblings, 2 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-27 16:19 UTC (permalink / raw)
  To: netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf, Bui Quang Minh

Currently, in zerocopy mode with mergeable receive buffers, virtio-net
supports only a single buffer per packet, not multiple buffers. This
series adds support for multiple mergeable receive buffers in the
zerocopy XDP path by using an XDP buffer with frags. This case arises
when, for example, the tap device's MTU is set to 9000 so that a packet
can exceed a single XDP buffer; such a packet is then split across
multiple buffers and becomes a multi-buffer XDP frame.
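
(As a rough, illustrative calculation, assuming a 4096-byte UMEM chunk:
after reserving XDP_PACKET_HEADROOM and the skb_shared_info tailroom
that this series adds, each rx buffer carries roughly 3.5 KB of frame
data, so a single 9000-byte packet spans three buffers, i.e. a head
buffer plus two frags.)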

This series also adds a test for virtio-net rx when an XDP socket is
bound to the interface. The test exercises both copy and zerocopy
modes; the tap device's MTU can be adjusted to cover both single-buffer
and multi-buffer XDP.

Changes in RFC v2:
- Return XDP_DROP when a multi-buffer XDP frame is received but the BPF
program does not support frags
- Add selftest
- Link to RFC v1: https://lore.kernel.org/netdev/20250426082752.43222-1-minhquangbui99@gmail.com/

Thanks,
Quang Minh.

Bui Quang Minh (2):
  virtio-net: support zerocopy multi buffer XDP in mergeable
  selftests: net: add XDP socket tests for virtio-net

 drivers/net/virtio_net.c                      | 123 +++---
 .../selftests/drivers/net/hw/.gitignore       |   3 +
 .../testing/selftests/drivers/net/hw/Makefile |  12 +-
 .../drivers/net/hw/xsk_receive.bpf.c          |  43 ++
 .../selftests/drivers/net/hw/xsk_receive.c    | 398 ++++++++++++++++++
 .../selftests/drivers/net/hw/xsk_receive.py   |  75 ++++
 6 files changed, 596 insertions(+), 58 deletions(-)
 create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
 create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.c
 create mode 100755 tools/testing/selftests/drivers/net/hw/xsk_receive.py

-- 
2.43.0



* [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-05-27 16:19 [RFC PATCH net-next v2 0/2] virtio-net: support zerocopy multi buffer XDP in mergeable Bui Quang Minh
@ 2025-05-27 16:19 ` Bui Quang Minh
  2025-05-28 16:44   ` ALOK TIWARI
  2025-05-29  5:59   ` Jason Wang
  2025-05-27 16:19 ` [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net Bui Quang Minh
  1 sibling, 2 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-27 16:19 UTC (permalink / raw)
  To: netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf, Bui Quang Minh

Currently, in zerocopy mode with mergeable receive buffers, virtio-net
supports only a single buffer per packet, not multiple buffers. This
commit adds support for multiple mergeable receive buffers in the
zerocopy XDP path by using an XDP buffer with frags.
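
For context, below is a minimal sketch (illustrative only, not part of
this patch; the program name is made up) of a multi-buffer-aware XDP
program. Loading it from an "xdp.frags" section makes libbpf set
BPF_F_XDP_HAS_FRAGS, i.e. prog->aux->xdp_has_frags; programs without
that flag now see their multi-buffer packets dropped by this patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp.frags")
int frags_aware_prog(struct xdp_md *ctx)	/* hypothetical name */
{
	/* Total frame length across all frags, not just the linear
	 * part described by ctx->data/ctx->data_end.
	 */
	__u64 len = bpf_xdp_get_buff_len(ctx);

	return len ? XDP_PASS : XDP_DROP;
}

char _license[] SEC("license") = "GPL";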

Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
 drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 57 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e53ba600605a..a9558650f205 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
 #define VIRTIO_XDP_TX		BIT(0)
 #define VIRTIO_XDP_REDIR	BIT(1)
 
+#define VIRTNET_MAX_ZC_SEGS	8
+
 /* RX packet size EWMA. The average packet size is used to determine the packet
  * buffer size when refilling RX rings. As the entire RX ring may be refilled
  * at once, the weight is chosen so that the EWMA will be insensitive to short-
@@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
 	}
 }
 
-static int xsk_append_merge_buffer(struct virtnet_info *vi,
-				   struct receive_queue *rq,
-				   struct sk_buff *head_skb,
-				   u32 num_buf,
-				   struct virtio_net_hdr_mrg_rxbuf *hdr,
-				   struct virtnet_rq_stats *stats)
+static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
+				      struct receive_queue *rq,
+				      u32 num_buf,
+				      struct xdp_buff *xdp,
+				      struct virtnet_rq_stats *stats)
 {
-	struct sk_buff *curr_skb;
-	struct xdp_buff *xdp;
-	u32 len, truesize;
-	struct page *page;
+	unsigned int len;
 	void *buf;
 
-	curr_skb = head_skb;
+	if (num_buf < 2)
+		return 0;
+
+	while (num_buf > 1) {
+		struct xdp_buff *new_xdp;
 
-	while (--num_buf) {
 		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
-			pr_debug("%s: rx error: %d buffers out of %d missing\n",
-				 vi->dev->name, num_buf,
-				 virtio16_to_cpu(vi->vdev,
-						 hdr->num_buffers));
+		if (!unlikely(buf)) {
+			pr_debug("%s: rx error: %d buffers missing\n",
+				 vi->dev->name, num_buf);
 			DEV_STATS_INC(vi->dev, rx_length_errors);
-			return -EINVAL;
-		}
-
-		u64_stats_add(&stats->bytes, len);
-
-		xdp = buf_to_xdp(vi, rq, buf, len);
-		if (!xdp)
-			goto err;
-
-		buf = napi_alloc_frag(len);
-		if (!buf) {
-			xsk_buff_free(xdp);
-			goto err;
+			return -1;
 		}
 
-		memcpy(buf, xdp->data - vi->hdr_len, len);
-
-		xsk_buff_free(xdp);
+		new_xdp = buf_to_xdp(vi, rq, buf, len);
+		if (!new_xdp)
+			goto drop_bufs;
 
-		page = virt_to_page(buf);
+		/* In virtnet_add_recvbuf_xsk(), we ask the host to fill from
+		 * xdp->data - vi->hdr_len with both virtio_net_hdr and data.
+		 * However, only the first packet has the virtio_net_hdr, the
+		 * following ones do not. So we need to adjust the following
+		 * packets' data pointer to the correct place.
+		 */
+		new_xdp->data -= vi->hdr_len;
+		new_xdp->data_end = new_xdp->data + len;
 
-		truesize = len;
+		if (!xsk_buff_add_frag(xdp, new_xdp))
+			goto drop_bufs;
 
-		curr_skb  = virtnet_skb_append_frag(head_skb, curr_skb, page,
-						    buf, len, truesize);
-		if (!curr_skb) {
-			put_page(page);
-			goto err;
-		}
+		num_buf--;
 	}
 
 	return 0;
 
-err:
+drop_bufs:
 	xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
-	return -EINVAL;
+	return -1;
 }
 
 static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
@@ -1307,23 +1297,42 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
 	num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 
 	ret = XDP_PASS;
+	if (virtnet_build_xsk_buff_mrg(vi, rq, num_buf, xdp, stats))
+		goto drop;
+
 	rcu_read_lock();
 	prog = rcu_dereference(rq->xdp_prog);
-	/* TODO: support multi buffer. */
-	if (prog && num_buf == 1)
-		ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
+	if (prog) {
+		/* We are in zerocopy mode so we cannot copy the multi-buffer
+		 * xdp buff to a single linear xdp buff. If we do so, in case
+		 * the BPF program decides to redirect to a XDP socket (XSK),
+		 * it will trigger the zerocopy receive logic in XDP socket.
+		 * The receive logic thinks it receives zerocopy buffer while
+		 * in fact, it is the copy one and everything is messed up.
+		 * So just drop the packet here if we have a multi-buffer xdp
+		 * buff and the BPF program does not support it.
+		 */
+		if (xdp_buff_has_frags(xdp) && !prog->aux->xdp_has_frags)
+			ret = XDP_DROP;
+		else
+			ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
+						  stats);
+	}
 	rcu_read_unlock();
 
 	switch (ret) {
 	case XDP_PASS:
-		skb = xsk_construct_skb(rq, xdp);
+		skb = xdp_build_skb_from_zc(xdp);
 		if (!skb)
-			goto drop_bufs;
+			break;
 
-		if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
-			dev_kfree_skb(skb);
-			goto drop;
-		}
+		/* Later, in virtnet_receive_done(), eth_type_trans()
+		 * is called. However, in xdp_build_skb_from_zc(), it is called
+		 * already. As a result, we need to reset the data to before
+		 * the mac header so that the later call in
+		 * virtnet_receive_done() works correctly.
+		 */
+		skb_push(skb, ETH_HLEN);
 
 		return skb;
 
@@ -1332,14 +1341,11 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
 		return NULL;
 
 	default:
-		/* drop packet */
-		xsk_buff_free(xdp);
+		break;
 	}
 
-drop_bufs:
-	xsk_drop_follow_bufs(dev, rq, num_buf, stats);
-
 drop:
+	xsk_buff_free(xdp);
 	u64_stats_inc(&stats->drops);
 	return NULL;
 }
@@ -1396,6 +1402,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
 		return -ENOMEM;
 
 	len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
+	/* Reserve some space for skb_shared_info */
+	len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	for (i = 0; i < num; ++i) {
 		/* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
@@ -6734,6 +6742,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	dev->netdev_ops = &virtnet_netdev;
 	dev->stat_ops = &virtnet_stat_ops;
 	dev->features = NETIF_F_HIGHDMA;
+	dev->xdp_zc_max_segs = VIRTNET_MAX_ZC_SEGS;
 
 	dev->ethtool_ops = &virtnet_ethtool_ops;
 	SET_NETDEV_DEV(dev, &vdev->dev);
-- 
2.43.0



* [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-27 16:19 [RFC PATCH net-next v2 0/2] virtio-net: support zerocopy multi buffer XDP in mergeable Bui Quang Minh
  2025-05-27 16:19 ` [RFC PATCH net-next v2 1/2] " Bui Quang Minh
@ 2025-05-27 16:19 ` Bui Quang Minh
  2025-05-28 17:04   ` ALOK TIWARI
  2025-05-29 11:18   ` Maciej Fijalkowski
  1 sibling, 2 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-27 16:19 UTC (permalink / raw)
  To: netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf, Bui Quang Minh

This adds a test for virtio-net rx when an XDP socket is bound to the
interface. It covers both copy and zerocopy modes, both when the XDP
program returns XDP_PASS and when it redirects to an XDP socket.
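
For reference, a sketch of how a user-space consumer can detect the
multi-buffer case (assumptions: the ring variables mirror the setup in
xsk_receive.c below; count_packets is a made-up helper). With
XDP_USE_SG, one packet may span several RX descriptors, and every
descriptor except the last one has XDP_PKT_CONTD set in its options
field:

#include <stdint.h>
#include <linux/if_xdp.h>

/* Count complete packets in [consumer, producer); num_entries (256 in
 * the test below) must be a power of two.
 */
static unsigned int count_packets(void *rx_ring,
				  const struct xdp_mmap_offsets *off,
				  uint32_t consumer, uint32_t producer)
{
	struct xdp_desc *descs = (void *)((char *)rx_ring + off->rx.desc);
	unsigned int pkts = 0;

	for (uint32_t i = consumer; i != producer; i++) {
		const struct xdp_desc *d = &descs[i & (256 - 1)];

		/* XDP_PKT_CONTD => more frags of this packet follow */
		if (!(d->options & XDP_PKT_CONTD))
			pkts++;
	}

	return pkts;
}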

Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
 .../selftests/drivers/net/hw/.gitignore       |   3 +
 .../testing/selftests/drivers/net/hw/Makefile |  12 +-
 .../drivers/net/hw/xsk_receive.bpf.c          |  43 ++
 .../selftests/drivers/net/hw/xsk_receive.c    | 398 ++++++++++++++++++
 .../selftests/drivers/net/hw/xsk_receive.py   |  75 ++++
 5 files changed, 530 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
 create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.c
 create mode 100755 tools/testing/selftests/drivers/net/hw/xsk_receive.py

diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore
index 6942bf575497..c32271faecff 100644
--- a/tools/testing/selftests/drivers/net/hw/.gitignore
+++ b/tools/testing/selftests/drivers/net/hw/.gitignore
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 iou-zcrx
 ncdevmem
+xsk_receive.skel.h
+xsk_receive
+tools
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index df2c047ffa90..964edbb3b79f 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0+ OR MIT
 
-TEST_GEN_FILES = iou-zcrx
+TEST_GEN_FILES = \
+	iou-zcrx \
+	xsk_receive \
+	#
 
 TEST_PROGS = \
 	csum.py \
@@ -20,6 +23,7 @@ TEST_PROGS = \
 	rss_input_xfrm.py \
 	tso.py \
 	xsk_reconfig.py \
+	xsk_receive.py \
 	#
 
 TEST_FILES := \
@@ -48,3 +52,9 @@ include ../../../net/ynl.mk
 include ../../../net/bpf.mk
 
 $(OUTPUT)/iou-zcrx: LDLIBS += -luring
+
+$(OUTPUT)/xsk_receive.skel.h: xsk_receive.bpf.o
+	bpftool gen skeleton xsk_receive.bpf.o > xsk_receive.skel.h
+
+$(OUTPUT)/xsk_receive: xsk_receive.skel.h
+$(OUTPUT)/xsk_receive: LDLIBS += -lbpf
diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c b/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
new file mode 100644
index 000000000000..462046d95bfe
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_XSKMAP);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} xsk_map SEC(".maps");
+
+SEC("xdp.frags")
+int dummy_prog(struct xdp_md *ctx)
+{
+	return XDP_PASS;
+}
+
+SEC("xdp.frags")
+int redirect_xsk_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	struct iphdr *iph;
+
+	if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+		return XDP_PASS;
+
+	if (bpf_htons(eth->h_proto) != ETH_P_IP)
+		return XDP_PASS;
+
+	iph = data + sizeof(*eth);
+	if (iph->protocol != IPPROTO_UDP)
+		return XDP_PASS;
+
+	return bpf_redirect_map(&xsk_map, 0, XDP_DROP);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.c b/tools/testing/selftests/drivers/net/hw/xsk_receive.c
new file mode 100644
index 000000000000..96213ceeda5c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <poll.h>
+#include <stdatomic.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <linux/if_xdp.h>
+
+#include "xsk_receive.skel.h"
+
+#define load_acquire(p) \
+	atomic_load_explicit((_Atomic typeof(*(p)) *)(p), memory_order_acquire)
+
+#define store_release(p, v) \
+	atomic_store_explicit((_Atomic typeof(*(p)) *)(p), v, \
+			      memory_order_release)
+
+#define UMEM_CHUNK_SIZE 0x1000
+#define BUFFER_SIZE 0x2000
+
+#define SERVER_PORT 8888
+#define CLIENT_PORT 9999
+
+const int num_entries = 256;
+const char *pass_msg = "PASS";
+
+int cfg_client;
+int cfg_server;
+char *cfg_server_ip;
+char *cfg_client_ip;
+int cfg_ifindex;
+int cfg_redirect;
+int cfg_zerocopy;
+
+struct xdp_sock_context {
+	int xdp_sock;
+	void *umem_region;
+	void *rx_ring;
+	void *fill_ring;
+	struct xdp_mmap_offsets off;
+};
+
+struct xdp_sock_context *setup_xdp_socket(int ifindex)
+{
+	struct xdp_mmap_offsets off;
+	void *rx_ring, *fill_ring;
+	struct xdp_umem_reg umem_reg = {};
+	int optlen = sizeof(off);
+	int umem_len, sock, ret, i;
+	void *umem_region;
+	uint32_t *fr_producer;
+	uint64_t *addr;
+	struct sockaddr_xdp sxdp = {
+		.sxdp_family = AF_XDP,
+		.sxdp_ifindex = ifindex,
+		.sxdp_queue_id = 0,
+		.sxdp_flags = XDP_USE_SG,
+	};
+	struct xdp_sock_context *ctx;
+
+	ctx = malloc(sizeof(*ctx));
+	if (!ctx)
+		error(1, 0, "malloc()");
+
+	if (cfg_zerocopy)
+		sxdp.sxdp_flags |= XDP_ZEROCOPY;
+	else
+		sxdp.sxdp_flags |= XDP_COPY;
+
+	umem_len = UMEM_CHUNK_SIZE * num_entries;
+	umem_region = mmap(0, umem_len, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (umem_region == MAP_FAILED)
+		error(1, errno, "mmap() umem");
+	ctx->umem_region = umem_region;
+
+	sock = socket(AF_XDP, SOCK_RAW, 0);
+	if (sock < 0)
+		error(1, errno, "socket() XDP");
+	ctx->xdp_sock = sock;
+
+	ret = setsockopt(sock, SOL_XDP, XDP_RX_RING, &num_entries,
+			 sizeof(num_entries));
+	if (ret < 0)
+		error(1, errno, "setsockopt() XDP_RX_RING");
+
+	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_COMPLETION_RING, &num_entries,
+			 sizeof(num_entries));
+	if (ret < 0)
+		error(1, errno, "setsockopt() XDP_UMEM_COMPLETION_RING");
+
+	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_FILL_RING, &num_entries,
+			 sizeof(num_entries));
+	if (ret < 0)
+		error(1, errno, "setsockopt() XDP_UMEM_FILL_RING");
+
+	ret = getsockopt(sock, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (ret < 0)
+		error(1, errno, "getsockopt()");
+	ctx->off = off;
+
+	rx_ring = mmap(0, off.rx.desc + num_entries * sizeof(struct xdp_desc),
+		       PROT_READ | PROT_WRITE, MAP_SHARED, sock,
+		       XDP_PGOFF_RX_RING);
+	if (rx_ring == (void *)-1)
+		error(1, errno, "mmap() rx-ring");
+	ctx->rx_ring = rx_ring;
+
+	fill_ring = mmap(0, off.fr.desc + num_entries * sizeof(uint64_t),
+			 PROT_READ | PROT_WRITE, MAP_SHARED, sock,
+			 XDP_UMEM_PGOFF_FILL_RING);
+	if (fill_ring == (void *)-1)
+		error(1, errno, "mmap() fill-ring");
+	ctx->fill_ring = fill_ring;
+
+	umem_reg.addr = (unsigned long long)ctx->umem_region;
+	umem_reg.len = umem_len;
+	umem_reg.chunk_size = UMEM_CHUNK_SIZE;
+	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_REG, &umem_reg,
+			 sizeof(umem_reg));
+	if (ret < 0)
+		error(1, errno, "setsockopt() XDP_UMEM_REG");
+
+	i = 0;
+	while (1) {
+		ret = bind(sock, (const struct sockaddr *)&sxdp, sizeof(sxdp));
+		if (!ret)
+			break;
+
+		if (errno == EBUSY && i < 3) {
+			i++;
+			sleep(1);
+		} else {
+			error(1, errno, "bind() XDP");
+		}
+	}
+
+	/* Submit all umem entries to fill ring */
+	addr = fill_ring + off.fr.desc;
+	for (i = 0; i < umem_len; i += UMEM_CHUNK_SIZE) {
+		*addr = i;
+		addr++;
+	}
+	fr_producer = fill_ring + off.fr.producer;
+	store_release(fr_producer, num_entries);
+
+	return ctx;
+}
+
+void setup_xdp_prog(int sock, int ifindex, int redirect)
+{
+	struct xsk_receive_bpf *bpf;
+	int key, ret;
+
+	bpf = xsk_receive_bpf__open_and_load();
+	if (!bpf)
+		error(1, 0, "open eBPF");
+
+	key = 0;
+	ret = bpf_map__update_elem(bpf->maps.xsk_map, &key, sizeof(key),
+				   &sock, sizeof(sock), 0);
+	if (ret < 0)
+		error(1, errno, "eBPF map update");
+
+	if (redirect) {
+		ret = bpf_xdp_attach(ifindex,
+				bpf_program__fd(bpf->progs.redirect_xsk_prog),
+				0, NULL);
+		if (ret < 0)
+			error(1, errno, "attach eBPF");
+	} else {
+		ret = bpf_xdp_attach(ifindex,
+				     bpf_program__fd(bpf->progs.dummy_prog),
+				     0, NULL);
+		if (ret < 0)
+			error(1, errno, "attach eBPF");
+	}
+}
+
+void send_pass_msg(int sock)
+{
+	int ret;
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_addr = inet_addr(cfg_client_ip),
+		.sin_port = htons(CLIENT_PORT),
+	};
+
+	ret = sendto(sock, pass_msg, sizeof(pass_msg), 0,
+		     (const struct sockaddr *)&addr, sizeof(addr));
+	if (ret < 0)
+		error(1, errno, "sendto()");
+}
+
+void server_recv_xdp(struct xdp_sock_context *ctx, int udp_sock)
+{
+	int ret;
+	struct pollfd fds = {
+		.fd = ctx->xdp_sock,
+		.events = POLLIN,
+	};
+
+	ret = poll(&fds, 1, -1);
+	if (ret < 0)
+		error(1, errno, "poll()");
+
+	if (fds.revents & POLLIN) {
+		uint32_t *producer_ptr = ctx->rx_ring + ctx->off.rx.producer;
+		uint32_t *consumer_ptr = ctx->rx_ring + ctx->off.rx.consumer;
+		uint32_t producer, consumer;
+		struct xdp_desc *desc;
+
+		producer = load_acquire(producer_ptr);
+		consumer = load_acquire(consumer_ptr);
+
+		printf("Receive %d XDP buffers\n", producer - consumer);
+
+		store_release(consumer_ptr, producer);
+	} else {
+		error(1, 0, "unexpected poll event: %d", fds.revents);
+	}
+
+	send_pass_msg(udp_sock);
+}
+
+void server_recv_udp(int sock)
+{
+	char *buffer;
+	int i, ret;
+
+	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (buffer == MAP_FAILED)
+		error(1, errno, "mmap() send buffer");
+
+	ret = recv(sock, buffer, BUFFER_SIZE, 0);
+	if (ret < 0)
+		error(1, errno, "recv()");
+
+	if (ret != BUFFER_SIZE)
+		error(1, errno, "message is truncated, expected: %d, got: %d",
+		      BUFFER_SIZE, ret);
+
+	for (i = 0; i < BUFFER_SIZE; i++)
+		if (buffer[i] != 'a' + (i % 26))
+			error(1, 0, "message mismatches at %d", i);
+
+	send_pass_msg(sock);
+}
+
+int setup_udp_sock(const char *addr, int port)
+{
+	int sock, ret;
+	struct sockaddr_in saddr = {
+		.sin_family = AF_INET,
+		.sin_addr = inet_addr(addr),
+		.sin_port = htons(port),
+	};
+
+	sock = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sock < 0)
+		error(1, errno, "socket() UDP");
+
+	ret = bind(sock, (const struct sockaddr *)&saddr, sizeof(saddr));
+	if (ret < 0)
+		error(1, errno, "bind() UDP");
+
+	return sock;
+}
+
+void run_server(void)
+{
+	int udp_sock;
+	struct xdp_sock_context *ctx;
+
+	ctx = setup_xdp_socket(cfg_ifindex);
+	setup_xdp_prog(ctx->xdp_sock, cfg_ifindex, cfg_redirect);
+	udp_sock = setup_udp_sock(cfg_server_ip, SERVER_PORT);
+
+	if (cfg_redirect)
+		server_recv_xdp(ctx, udp_sock);
+	else
+		server_recv_udp(udp_sock);
+}
+
+void run_client(void)
+{
+	char *buffer;
+	int sock, ret, i;
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_addr = inet_addr(cfg_server_ip),
+		.sin_port = htons(SERVER_PORT),
+	};
+
+	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (buffer == MAP_FAILED)
+		error(1, errno, "mmap() send buffer");
+
+	for (i = 0; i < BUFFER_SIZE; i++)
+		buffer[i] = 'a' + (i % 26);
+
+	sock = setup_udp_sock(cfg_client_ip, CLIENT_PORT);
+
+	ret = sendto(sock, buffer, BUFFER_SIZE, 0,
+		     (const struct sockaddr *)&addr, sizeof(addr));
+	if (ret < 0)
+		error(1, errno, "sendto()");
+
+	if (ret != BUFFER_SIZE)
+		error(1, 0, "sent buffer is truncated, expected: %d got: %d",
+		      BUFFER_SIZE, ret);
+
+	ret = recv(sock, buffer, BUFFER_SIZE, 0);
+	if (ret < 0)
+		error(1, errno, "recv()");
+
+	if ((ret != sizeof(pass_msg)) || strcmp(buffer, pass_msg))
+		error(1, 0, "message mismatches, expected: %s, got: %s",
+		      pass_msg, buffer);
+}
+
+void print_usage(char *prog)
+{
+	fprintf(stderr, "Usage: %s (-c|-s) -r<server_ip> -l<client_ip>"
+		" -i<server_ifname> [-d] [-z]\n", prog);
+}
+
+void parse_opts(int argc, char **argv)
+{
+	int opt;
+	char *ifname = NULL;
+
+	while ((opt = getopt(argc, argv, "hcsr:l:i:dz")) != -1) {
+		switch (opt) {
+		case 'c':
+			if (cfg_server)
+				error(1, 0, "Pass one of -s or -c");
+
+			cfg_client = 1;
+			break;
+		case 's':
+			if (cfg_client)
+				error(1, 0, "Pass one of -s or -c");
+
+			cfg_server = 1;
+			break;
+		case 'r':
+			cfg_server_ip = optarg;
+			break;
+		case 'l':
+			cfg_client_ip = optarg;
+			break;
+		case 'i':
+			ifname = optarg;
+			break;
+		case 'd':
+			cfg_redirect = 1;
+			break;
+		case 'z':
+			cfg_zerocopy = 1;
+			break;
+		case 'h':
+		default:
+			print_usage(argv[0]);
+			exit(1);
+		}
+	}
+
+	if (!cfg_client && !cfg_server)
+		error(1, 0, "Pass one of -s or -c");
+
+	if (ifname) {
+		cfg_ifindex = if_nametoindex(ifname);
+		if (!cfg_ifindex)
+			error(1, errno, "Invalid interface %s", ifname);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+	if (cfg_client)
+		run_client();
+	else if (cfg_server)
+		run_server();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.py b/tools/testing/selftests/drivers/net/hw/xsk_receive.py
new file mode 100755
index 000000000000..f32cb4477b75
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+# This is a test for virtio-net rx when there is an XDP socket bound to it.
+# The test is expected to be run on the host side.
+#
+# The run example:
+#
+# export NETIF=tap0
+# export LOCAL_V4=192.168.31.1
+# export REMOTE_V4=192.168.31.3
+# export REMOTE_TYPE=ssh
+# export REMOTE_ARGS='root@192.168.31.3'
+# ./ksft-net-drv/run_kselftest.sh -t drivers/net/hw:xsk_receive.py
+#
+# where:
+# - 192.168.31.1 is the IP of tap device in the host
+# - 192.168.31.3 is the IP of virtio-net device in the guest
+#
+# The QEMU command to set up virtio-net
+# -netdev tap,id=hostnet1,vhost=on,script=no,downscript=no
+# -device virtio-net-pci,netdev=hostnet1,iommu_platform=on,disable-legacy=on
+#
+# The MTU of tap device can be adjusted to test more cases:
+# - 1500: single buffer XDP
+# - 9000: multi-buffer XDP
+
+from lib.py import ksft_exit, ksft_run
+from lib.py import KsftSkipEx, KsftFailEx
+from lib.py import NetDrvEpEnv
+from lib.py import bkg, cmd, wait_port_listen
+from os import path
+
+SERVER_PORT = 8888
+CLIENT_PORT = 9999
+
+def test_xdp_pass(cfg, server_cmd, client_cmd):
+    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
+        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
+        cmd(client_cmd)
+
+def test_xdp_pass_zc(cfg, server_cmd, client_cmd):
+    server_cmd += " -z"
+    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
+        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
+        cmd(client_cmd)
+
+def test_xdp_redirect(cfg, server_cmd, client_cmd):
+    server_cmd += " -d"
+    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
+        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
+        cmd(client_cmd)
+
+def test_xdp_redirect_zc(cfg, server_cmd, client_cmd):
+    server_cmd += " -d -z"
+    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
+        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
+        cmd(client_cmd)
+
+def main():
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+        cfg.bin_local = path.abspath(path.dirname(__file__)
+                            + "/../../../drivers/net/hw/xsk_receive")
+        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+
+        server_cmd = f"{cfg.bin_remote} -s -i {cfg.remote_ifname} "
+        server_cmd += f"-r {cfg.remote_addr_v["4"]} -l {cfg.addr_v["4"]}"
+        client_cmd = f"{cfg.bin_local} -c -r {cfg.remote_addr_v["4"]} "
+        client_cmd += f"-l {cfg.addr_v["4"]}"
+
+        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, server_cmd, client_cmd))
+    ksft_exit()
+
+if __name__ == "__main__":
+    main()
-- 
2.43.0



* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-05-27 16:19 ` [RFC PATCH net-next v2 1/2] " Bui Quang Minh
@ 2025-05-28 16:44   ` ALOK TIWARI
  2025-05-29  3:42     ` Bui Quang Minh
  2025-05-29  5:59   ` Jason Wang
  1 sibling, 1 reply; 18+ messages in thread
From: ALOK TIWARI @ 2025-05-28 16:44 UTC (permalink / raw)
  To: Bui Quang Minh, netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf



On 27-05-2025 21:49, Bui Quang Minh wrote:
> Currently, in zerocopy mode with mergeable receive buffers, virtio-net
> supports only a single buffer per packet, not multiple buffers. This
> commit adds support for multiple mergeable receive buffers in the
> zerocopy XDP path by using an XDP buffer with frags.
> 
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
>   drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
>   1 file changed, 66 insertions(+), 57 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index e53ba600605a..a9558650f205 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
>   #define VIRTIO_XDP_TX		BIT(0)
>   #define VIRTIO_XDP_REDIR	BIT(1)
>   
> +#define VIRTNET_MAX_ZC_SEGS	8
> +
>   /* RX packet size EWMA. The average packet size is used to determine the packet
>    * buffer size when refilling RX rings. As the entire RX ring may be refilled
>    * at once, the weight is chosen so that the EWMA will be insensitive to short-
> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
>   	}
>   }
>   
> -static int xsk_append_merge_buffer(struct virtnet_info *vi,
> -				   struct receive_queue *rq,
> -				   struct sk_buff *head_skb,
> -				   u32 num_buf,
> -				   struct virtio_net_hdr_mrg_rxbuf *hdr,
> -				   struct virtnet_rq_stats *stats)
> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
> +				      struct receive_queue *rq,
> +				      u32 num_buf,
> +				      struct xdp_buff *xdp,
> +				      struct virtnet_rq_stats *stats)
>   {
> -	struct sk_buff *curr_skb;
> -	struct xdp_buff *xdp;
> -	u32 len, truesize;
> -	struct page *page;
> +	unsigned int len;
>   	void *buf;
>   
> -	curr_skb = head_skb;
> +	if (num_buf < 2)
> +		return 0;
> +
> +	while (num_buf > 1) {
> +		struct xdp_buff *new_xdp;
>   
> -	while (--num_buf) {
>   		buf = virtqueue_get_buf(rq->vq, &len);
> -		if (unlikely(!buf)) {
> -			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> -				 vi->dev->name, num_buf,
> -				 virtio16_to_cpu(vi->vdev,
> -						 hdr->num_buffers));
> +		if (!unlikely(buf)) {

if (unlikely(!buf)) { ?

> +			pr_debug("%s: rx error: %d buffers missing\n",
> +				 vi->dev->name, num_buf);
>   			DEV_STATS_INC(vi->dev, rx_length_errors);

Thanks,
Alok


* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-27 16:19 ` [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net Bui Quang Minh
@ 2025-05-28 17:04   ` ALOK TIWARI
  2025-05-29  3:44     ` Bui Quang Minh
  2025-05-29 11:18   ` Maciej Fijalkowski
  1 sibling, 1 reply; 18+ messages in thread
From: ALOK TIWARI @ 2025-05-28 17:04 UTC (permalink / raw)
  To: Bui Quang Minh, netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf



On 27-05-2025 21:49, Bui Quang Minh wrote:
> +def main():
> +    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
> +        cfg.bin_local = path.abspath(path.dirname(__file__)
> +                            + "/../../../drivers/net/hw/xsk_receive")
> +        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
> +
> +        server_cmd = f"{cfg.bin_remote} -s -i {cfg.remote_ifname} "
> +        server_cmd += f"-r {cfg.remote_addr_v["4"]} -l {cfg.addr_v["4"]}"
> +        client_cmd = f"{cfg.bin_local} -c -r {cfg.remote_addr_v["4"]} "
> +        client_cmd += f"-l {cfg.addr_v["4"]}"
> +
> +        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, server_cmd, client_cmd))
> +    ksft_exit()

SyntaxError?
The inner ["4"] uses double quotes, which clash with the outer double
quotes of the f-string.

Thanks,
Alok


* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-05-28 16:44   ` ALOK TIWARI
@ 2025-05-29  3:42     ` Bui Quang Minh
  0 siblings, 0 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-29  3:42 UTC (permalink / raw)
  To: ALOK TIWARI, netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 5/28/25 23:44, ALOK TIWARI wrote:
>
>
> On 27-05-2025 21:49, Bui Quang Minh wrote:
>> Currently, in zerocopy mode with mergeable receive buffers, virtio-net
>> supports only a single buffer per packet, not multiple buffers. This
>> commit adds support for multiple mergeable receive buffers in the
>> zerocopy XDP path by using an XDP buffer with frags.
>>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
>>   drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
>>   1 file changed, 66 insertions(+), 57 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index e53ba600605a..a9558650f205 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
>>   #define VIRTIO_XDP_TX        BIT(0)
>>   #define VIRTIO_XDP_REDIR    BIT(1)
>>   +#define VIRTNET_MAX_ZC_SEGS    8
>> +
>>   /* RX packet size EWMA. The average packet size is used to 
>> determine the packet
>>    * buffer size when refilling RX rings. As the entire RX ring may 
>> be refilled
>>    * at once, the weight is chosen so that the EWMA will be 
>> insensitive to short-
>> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct 
>> net_device *dev,
>>       }
>>   }
>>   -static int xsk_append_merge_buffer(struct virtnet_info *vi,
>> -                   struct receive_queue *rq,
>> -                   struct sk_buff *head_skb,
>> -                   u32 num_buf,
>> -                   struct virtio_net_hdr_mrg_rxbuf *hdr,
>> -                   struct virtnet_rq_stats *stats)
>> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
>> +                      struct receive_queue *rq,
>> +                      u32 num_buf,
>> +                      struct xdp_buff *xdp,
>> +                      struct virtnet_rq_stats *stats)
>>   {
>> -    struct sk_buff *curr_skb;
>> -    struct xdp_buff *xdp;
>> -    u32 len, truesize;
>> -    struct page *page;
>> +    unsigned int len;
>>       void *buf;
>>   -    curr_skb = head_skb;
>> +    if (num_buf < 2)
>> +        return 0;
>> +
>> +    while (num_buf > 1) {
>> +        struct xdp_buff *new_xdp;
>>   -    while (--num_buf) {
>>           buf = virtqueue_get_buf(rq->vq, &len);
>> -        if (unlikely(!buf)) {
>> -            pr_debug("%s: rx error: %d buffers out of %d missing\n",
>> -                 vi->dev->name, num_buf,
>> -                 virtio16_to_cpu(vi->vdev,
>> -                         hdr->num_buffers));
>> +        if (!unlikely(buf)) {
>
> if (unlikely(!buf)) { ?

Thanks, I'll fix this in the next version.

>
>> +            pr_debug("%s: rx error: %d buffers missing\n",
>> +                 vi->dev->name, num_buf);
>>               DEV_STATS_INC(vi->dev, rx_length_errors);
>
> Thanks,
> Alok



* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-28 17:04   ` ALOK TIWARI
@ 2025-05-29  3:44     ` Bui Quang Minh
  0 siblings, 0 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-29  3:44 UTC (permalink / raw)
  To: ALOK TIWARI, netdev
  Cc: Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 5/29/25 00:04, ALOK TIWARI wrote:
>
>
> On 27-05-2025 21:49, Bui Quang Minh wrote:
>> +def main():
>> +    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
>> +        cfg.bin_local = path.abspath(path.dirname(__file__)
>> +                            + "/../../../drivers/net/hw/xsk_receive")
>> +        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
>> +
>> +        server_cmd = f"{cfg.bin_remote} -s -i {cfg.remote_ifname} "
>> +        server_cmd += f"-r {cfg.remote_addr_v["4"]} -l 
>> {cfg.addr_v["4"]}"
>> +        client_cmd = f"{cfg.bin_local} -c -r {cfg.remote_addr_v["4"]} "
>> +        client_cmd += f"-l {cfg.addr_v["4"]}"
>> +
>> +        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, 
>> server_cmd, client_cmd))
>> +    ksft_exit()
>
> SyntaxError ?
> inner ["4"] uses double quotes, which clash with the outer double 
> quotes of the f-string

This works fine because the ["4"] is inside the {} replacement field,
where newer Python (3.12+, PEP 701) allows reusing the outer quote
type. But I can change it to avoid confusion.

Thanks,
Quang Minh.



* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-05-27 16:19 ` [RFC PATCH net-next v2 1/2] " Bui Quang Minh
  2025-05-28 16:44   ` ALOK TIWARI
@ 2025-05-29  5:59   ` Jason Wang
  2025-05-29 12:28     ` Bui Quang Minh
  1 sibling, 1 reply; 18+ messages in thread
From: Jason Wang @ 2025-05-29  5:59 UTC (permalink / raw)
  To: Bui Quang Minh
  Cc: netdev, Michael S. Tsirkin, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On Wed, May 28, 2025 at 12:19 AM Bui Quang Minh
<minhquangbui99@gmail.com> wrote:
>
> Currently, in zerocopy mode with mergeable receive buffers, virtio-net
> supports only a single buffer per packet, not multiple buffers. This
> commit adds support for multiple mergeable receive buffers in the
> zerocopy XDP path by using an XDP buffer with frags.
>
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
>  drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
>  1 file changed, 66 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index e53ba600605a..a9558650f205 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
>  #define VIRTIO_XDP_TX          BIT(0)
>  #define VIRTIO_XDP_REDIR       BIT(1)
>
> +#define VIRTNET_MAX_ZC_SEGS    8
> +
>  /* RX packet size EWMA. The average packet size is used to determine the packet
>   * buffer size when refilling RX rings. As the entire RX ring may be refilled
>   * at once, the weight is chosen so that the EWMA will be insensitive to short-
> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
>         }
>  }
>
> -static int xsk_append_merge_buffer(struct virtnet_info *vi,
> -                                  struct receive_queue *rq,
> -                                  struct sk_buff *head_skb,
> -                                  u32 num_buf,
> -                                  struct virtio_net_hdr_mrg_rxbuf *hdr,
> -                                  struct virtnet_rq_stats *stats)
> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
> +                                     struct receive_queue *rq,
> +                                     u32 num_buf,
> +                                     struct xdp_buff *xdp,
> +                                     struct virtnet_rq_stats *stats)
>  {
> -       struct sk_buff *curr_skb;
> -       struct xdp_buff *xdp;
> -       u32 len, truesize;
> -       struct page *page;
> +       unsigned int len;
>         void *buf;
>
> -       curr_skb = head_skb;
> +       if (num_buf < 2)
> +               return 0;
> +
> +       while (num_buf > 1) {
> +               struct xdp_buff *new_xdp;
>
> -       while (--num_buf) {
>                 buf = virtqueue_get_buf(rq->vq, &len);
> -               if (unlikely(!buf)) {
> -                       pr_debug("%s: rx error: %d buffers out of %d missing\n",
> -                                vi->dev->name, num_buf,
> -                                virtio16_to_cpu(vi->vdev,
> -                                                hdr->num_buffers));
> +               if (!unlikely(buf)) {
> +                       pr_debug("%s: rx error: %d buffers missing\n",
> +                                vi->dev->name, num_buf);
>                         DEV_STATS_INC(vi->dev, rx_length_errors);
> -                       return -EINVAL;
> -               }
> -
> -               u64_stats_add(&stats->bytes, len);
> -
> -               xdp = buf_to_xdp(vi, rq, buf, len);
> -               if (!xdp)
> -                       goto err;
> -
> -               buf = napi_alloc_frag(len);
> -               if (!buf) {
> -                       xsk_buff_free(xdp);
> -                       goto err;
> +                       return -1;
>                 }
>
> -               memcpy(buf, xdp->data - vi->hdr_len, len);
> -
> -               xsk_buff_free(xdp);
> +               new_xdp = buf_to_xdp(vi, rq, buf, len);
> +               if (!new_xdp)
> +                       goto drop_bufs;
>
> -               page = virt_to_page(buf);
> +               /* In virtnet_add_recvbuf_xsk(), we ask the host to fill from
> +                * xdp->data - vi->hdr_len with both virtio_net_hdr and data.
> +                * However, only the first packet has the virtio_net_hdr, the
> +                * following ones do not. So we need to adjust the following

Typo here.

> +                * packets' data pointer to the correct place.
> +                */

I wonder what happens if we don't use this trick? I mean, what if we
don't reuse the headroom for the virtio-net header? This seems to be
fine for a mergeable buffer and would let us avoid the trick.

> +               new_xdp->data -= vi->hdr_len;
> +               new_xdp->data_end = new_xdp->data + len;
>
> -               truesize = len;
> +               if (!xsk_buff_add_frag(xdp, new_xdp))
> +                       goto drop_bufs;
>
> -               curr_skb  = virtnet_skb_append_frag(head_skb, curr_skb, page,
> -                                                   buf, len, truesize);
> -               if (!curr_skb) {
> -                       put_page(page);
> -                       goto err;
> -               }
> +               num_buf--;
>         }
>
>         return 0;
>
> -err:
> +drop_bufs:
>         xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
> -       return -EINVAL;
> +       return -1;
>  }
>
>  static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
> @@ -1307,23 +1297,42 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
>         num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
>
>         ret = XDP_PASS;
> +       if (virtnet_build_xsk_buff_mrg(vi, rq, num_buf, xdp, stats))
> +               goto drop;
> +
>         rcu_read_lock();
>         prog = rcu_dereference(rq->xdp_prog);
> -       /* TODO: support multi buffer. */
> -       if (prog && num_buf == 1)
> -               ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);

Without this patch it looks like we had a bug:

        ret = XDP_PASS;
        rcu_read_lock();
        prog = rcu_dereference(rq->xdp_prog);
        /* TODO: support multi buffer. */
        if (prog && num_buf == 1)
                ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
        rcu_read_unlock();

This implies that if num_buf is greater than 1, we will assume XDP_PASS?

> +       if (prog) {
> +               /* We are in zerocopy mode so we cannot copy the multi-buffer
> +                * xdp buff to a single linear xdp buff. If we do so, in case
> +                * the BPF program decides to redirect to a XDP socket (XSK),
> +                * it will trigger the zerocopy receive logic in XDP socket.
> +                * The receive logic thinks it receives zerocopy buffer while
> +                * in fact, it is the copy one and everything is messed up.
> +                * So just drop the packet here if we have a multi-buffer xdp
> +                * buff and the BPF program does not support it.
> +                */
> +               if (xdp_buff_has_frags(xdp) && !prog->aux->xdp_has_frags)
> +                       ret = XDP_DROP;

Could we move the check before trying to build a multi-buffer XDP buff?

> +               else
> +                       ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
> +                                                 stats);
> +       }
>         rcu_read_unlock();
>
>         switch (ret) {
>         case XDP_PASS:
> -               skb = xsk_construct_skb(rq, xdp);
> +               skb = xdp_build_skb_from_zc(xdp);

Would it be better to make this change a separate patch?

>                 if (!skb)
> -                       goto drop_bufs;
> +                       break;
>
> -               if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
> -                       dev_kfree_skb(skb);
> -                       goto drop;
> -               }
> +               /* Later, in virtnet_receive_done(), eth_type_trans()
> +                * is called. However, in xdp_build_skb_from_zc(), it is called
> +                * already. As a result, we need to reset the data to before
> +                * the mac header so that the later call in
> +                * virtnet_receive_done() works correctly.
> +                */
> +               skb_push(skb, ETH_HLEN);
>
>                 return skb;
>
> @@ -1332,14 +1341,11 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
>                 return NULL;
>
>         default:
> -               /* drop packet */
> -               xsk_buff_free(xdp);
> +               break;
>         }
>
> -drop_bufs:
> -       xsk_drop_follow_bufs(dev, rq, num_buf, stats);
> -
>  drop:
> +       xsk_buff_free(xdp);
>         u64_stats_inc(&stats->drops);
>         return NULL;
>  }
> @@ -1396,6 +1402,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
>                 return -ENOMEM;
>
>         len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
> +       /* Reserve some space for skb_shared_info */
> +       len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>
>         for (i = 0; i < num; ++i) {
>                 /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
> @@ -6734,6 +6742,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>         dev->netdev_ops = &virtnet_netdev;
>         dev->stat_ops = &virtnet_stat_ops;
>         dev->features = NETIF_F_HIGHDMA;
> +       dev->xdp_zc_max_segs = VIRTNET_MAX_ZC_SEGS;
>
>         dev->ethtool_ops = &virtnet_ethtool_ops;
>         SET_NETDEV_DEV(dev, &vdev->dev);
> --
> 2.43.0
>

Thanks



* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-27 16:19 ` [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net Bui Quang Minh
  2025-05-28 17:04   ` ALOK TIWARI
@ 2025-05-29 11:18   ` Maciej Fijalkowski
  2025-05-29 14:29     ` Bui Quang Minh
  1 sibling, 1 reply; 18+ messages in thread
From: Maciej Fijalkowski @ 2025-05-29 11:18 UTC (permalink / raw)
  To: Bui Quang Minh
  Cc: netdev, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
> This adds a test for virtio-net rx when an XDP socket is bound to the
> interface. It covers both copy and zerocopy modes, both when the XDP
> program returns XDP_PASS and when it redirects to an XDP socket.
> 
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>

Hi Bui,

have you considered adjusting xskxceiver for your needs? If so, and you
decided to go with another test app, what were the issues with it?

This is yet another approach to xsk testing, when we already have a
test framework.

> ---
>  .../selftests/drivers/net/hw/.gitignore       |   3 +
>  .../testing/selftests/drivers/net/hw/Makefile |  12 +-
>  .../drivers/net/hw/xsk_receive.bpf.c          |  43 ++
>  .../selftests/drivers/net/hw/xsk_receive.c    | 398 ++++++++++++++++++
>  .../selftests/drivers/net/hw/xsk_receive.py   |  75 ++++
>  5 files changed, 530 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
>  create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.c
>  create mode 100755 tools/testing/selftests/drivers/net/hw/xsk_receive.py
> 
> diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore
> index 6942bf575497..c32271faecff 100644
> --- a/tools/testing/selftests/drivers/net/hw/.gitignore
> +++ b/tools/testing/selftests/drivers/net/hw/.gitignore
> @@ -1,3 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  iou-zcrx
>  ncdevmem
> +xsk_receive.skel.h
> +xsk_receive
> +tools
> diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
> index df2c047ffa90..964edbb3b79f 100644
> --- a/tools/testing/selftests/drivers/net/hw/Makefile
> +++ b/tools/testing/selftests/drivers/net/hw/Makefile
> @@ -1,6 +1,9 @@
>  # SPDX-License-Identifier: GPL-2.0+ OR MIT
>  
> -TEST_GEN_FILES = iou-zcrx
> +TEST_GEN_FILES = \
> +	iou-zcrx \
> +	xsk_receive \
> +	#
>  
>  TEST_PROGS = \
>  	csum.py \
> @@ -20,6 +23,7 @@ TEST_PROGS = \
>  	rss_input_xfrm.py \
>  	tso.py \
>  	xsk_reconfig.py \
> +	xsk_receive.py \
>  	#
>  
>  TEST_FILES := \
> @@ -48,3 +52,9 @@ include ../../../net/ynl.mk
>  include ../../../net/bpf.mk
>  
>  $(OUTPUT)/iou-zcrx: LDLIBS += -luring
> +
> +$(OUTPUT)/xsk_receive.skel.h: xsk_receive.bpf.o
> +	bpftool gen skeleton xsk_receive.bpf.o > xsk_receive.skel.h
> +
> +$(OUTPUT)/xsk_receive: xsk_receive.skel.h
> +$(OUTPUT)/xsk_receive: LDLIBS += -lbpf
> diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c b/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
> new file mode 100644
> index 000000000000..462046d95bfe
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
> @@ -0,0 +1,43 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_endian.h>
> +#include <linux/if_ether.h>
> +#include <linux/ip.h>
> +#include <linux/in.h>
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_XSKMAP);
> +	__uint(max_entries, 1);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(__u32));
> +} xsk_map SEC(".maps");
> +
> +SEC("xdp.frags")
> +int dummy_prog(struct xdp_md *ctx)
> +{
> +	return XDP_PASS;
> +}
> +
> +SEC("xdp.frags")
> +int redirect_xsk_prog(struct xdp_md *ctx)
> +{
> +	void *data_end = (void *)(long)ctx->data_end;
> +	void *data = (void *)(long)ctx->data;
> +	struct ethhdr *eth = data;
> +	struct iphdr *iph;
> +
> +	if (data + sizeof(*eth) + sizeof(*iph) > data_end)
> +		return XDP_PASS;
> +
> +	if (bpf_htons(eth->h_proto) != ETH_P_IP)
> +		return XDP_PASS;
> +
> +	iph = data + sizeof(*eth);
> +	if (iph->protocol != IPPROTO_UDP)
> +		return XDP_PASS;
> +
> +	return bpf_redirect_map(&xsk_map, 0, XDP_DROP);
> +}
> +
> +char _license[] SEC("license") = "GPL";
> diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.c b/tools/testing/selftests/drivers/net/hw/xsk_receive.c
> new file mode 100644
> index 000000000000..96213ceeda5c
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.c
> @@ -0,0 +1,398 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <error.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <string.h>
> +#include <poll.h>
> +#include <stdatomic.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +#include <net/if.h>
> +#include <netinet/in.h>
> +#include <arpa/inet.h>
> +#include <linux/if_xdp.h>
> +
> +#include "xsk_receive.skel.h"
> +
> +#define load_acquire(p) \
> +	atomic_load_explicit((_Atomic typeof(*(p)) *)(p), memory_order_acquire)
> +
> +#define store_release(p, v) \
> +	atomic_store_explicit((_Atomic typeof(*(p)) *)(p), v, \
> +			      memory_order_release)
> +
> +#define UMEM_CHUNK_SIZE 0x1000
> +#define BUFFER_SIZE 0x2000
> +
> +#define SERVER_PORT 8888
> +#define CLIENT_PORT 9999
> +
> +const int num_entries = 256;
> +const char *pass_msg = "PASS";
> +
> +int cfg_client;
> +int cfg_server;
> +char *cfg_server_ip;
> +char *cfg_client_ip;
> +int cfg_ifindex;
> +int cfg_redirect;
> +int cfg_zerocopy;
> +
> +struct xdp_sock_context {
> +	int xdp_sock;
> +	void *umem_region;
> +	void *rx_ring;
> +	void *fill_ring;
> +	struct xdp_mmap_offsets off;
> +};
> +
> +struct xdp_sock_context *setup_xdp_socket(int ifindex)
> +{
> +	struct xdp_mmap_offsets off;
> +	void *rx_ring, *fill_ring;
> +	struct xdp_umem_reg umem_reg = {};
> +	int optlen = sizeof(off);
> +	int umem_len, sock, ret, i;
> +	void *umem_region;
> +	uint32_t *fr_producer;
> +	uint64_t *addr;
> +	struct sockaddr_xdp sxdp = {
> +		.sxdp_family = AF_XDP,
> +		.sxdp_ifindex = ifindex,
> +		.sxdp_queue_id = 0,
> +		.sxdp_flags = XDP_USE_SG,
> +	};
> +	struct xdp_sock_context *ctx;
> +
> +	ctx = malloc(sizeof(*ctx));
> +	if (!ctx)
> +		error(1, 0, "malloc()");
> +
> +	if (cfg_zerocopy)
> +		sxdp.sxdp_flags |= XDP_ZEROCOPY;
> +	else
> +		sxdp.sxdp_flags |= XDP_COPY;
> +
> +	umem_len = UMEM_CHUNK_SIZE * num_entries;
> +	umem_region = mmap(0, umem_len, PROT_READ | PROT_WRITE,
> +			   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +	if (umem_region == MAP_FAILED)
> +		error(1, errno, "mmap() umem");
> +	ctx->umem_region = umem_region;
> +
> +	sock = socket(AF_XDP, SOCK_RAW, 0);
> +	if (sock < 0)
> +		error(1, errno, "socket() XDP");
> +	ctx->xdp_sock = sock;
> +
> +	ret = setsockopt(sock, SOL_XDP, XDP_RX_RING, &num_entries,
> +			 sizeof(num_entries));
> +	if (ret < 0)
> +		error(1, errno, "setsockopt() XDP_RX_RING");
> +
> +	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_COMPLETION_RING, &num_entries,
> +			 sizeof(num_entries));
> +	if (ret < 0)
> +		error(1, errno, "setsockopt() XDP_UMEM_COMPLETION_RING");
> +
> +	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_FILL_RING, &num_entries,
> +			 sizeof(num_entries));
> +	if (ret < 0)
> +		error(1, errno, "setsockopt() XDP_UMEM_FILL_RING");
> +
> +	ret = getsockopt(sock, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> +	if (ret < 0)
> +		error(1, errno, "getsockopt()");
> +	ctx->off = off;
> +
> +	rx_ring = mmap(0, off.rx.desc + num_entries * sizeof(struct xdp_desc),
> +		       PROT_READ | PROT_WRITE, MAP_SHARED, sock,
> +		       XDP_PGOFF_RX_RING);
> +	if (rx_ring == (void *)-1)
> +		error(1, errno, "mmap() rx-ring");
> +	ctx->rx_ring = rx_ring;
> +
> +	fill_ring = mmap(0, off.fr.desc + num_entries * sizeof(uint64_t),
> +			 PROT_READ | PROT_WRITE, MAP_SHARED, sock,
> +			 XDP_UMEM_PGOFF_FILL_RING);
> +	if (fill_ring == (void *)-1)
> +		error(1, errno, "mmap() fill-ring");
> +	ctx->fill_ring = fill_ring;
> +
> +	umem_reg.addr = (unsigned long long)ctx->umem_region;
> +	umem_reg.len = umem_len;
> +	umem_reg.chunk_size = UMEM_CHUNK_SIZE;
> +	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_REG, &umem_reg,
> +			 sizeof(umem_reg));
> +	if (ret < 0)
> +		error(1, errno, "setsockopt() XDP_UMEM_REG");
> +
> +	i = 0;
> +	while (1) {
> +		ret = bind(sock, (const struct sockaddr *)&sxdp, sizeof(sxdp));
> +		if (!ret)
> +			break;
> +
> +		if (errno == EBUSY && i < 3) {
> +			i++;
> +			sleep(1);
> +		} else {
> +			error(1, errno, "bind() XDP");
> +		}
> +	}
> +
> +	/* Submit all umem entries to fill ring */
> +	addr = fill_ring + off.fr.desc;
> +	for (i = 0; i < umem_len; i += UMEM_CHUNK_SIZE) {
> +		*addr = i;
> +		addr++;
> +	}
> +	fr_producer = fill_ring + off.fr.producer;
> +	store_release(fr_producer, num_entries);
> +
> +	return ctx;
> +}
> +
> +void setup_xdp_prog(int sock, int ifindex, int redirect)
> +{
> +	struct xsk_receive_bpf *bpf;
> +	int key, ret;
> +
> +	bpf = xsk_receive_bpf__open_and_load();
> +	if (!bpf)
> +		error(1, 0, "open eBPF");
> +
> +	key = 0;
> +	ret = bpf_map__update_elem(bpf->maps.xsk_map, &key, sizeof(key),
> +				   &sock, sizeof(sock), 0);
> +	if (ret < 0)
> +		error(1, errno, "eBPF map update");
> +
> +	if (redirect) {
> +		ret = bpf_xdp_attach(ifindex,
> +				bpf_program__fd(bpf->progs.redirect_xsk_prog),
> +				0, NULL);
> +		if (ret < 0)
> +			error(1, errno, "attach eBPF");
> +	} else {
> +		ret = bpf_xdp_attach(ifindex,
> +				     bpf_program__fd(bpf->progs.dummy_prog),
> +				     0, NULL);
> +		if (ret < 0)
> +			error(1, errno, "attach eBPF");
> +	}
> +}
> +
> +void send_pass_msg(int sock)
> +{
> +	int ret;
> +	struct sockaddr_in addr = {
> +		.sin_family = AF_INET,
> +		.sin_addr = inet_addr(cfg_client_ip),
> +		.sin_port = htons(CLIENT_PORT),
> +	};
> +
> +	ret = sendto(sock, pass_msg, sizeof(pass_msg), 0,
> +		     (const struct sockaddr *)&addr, sizeof(addr));
> +	if (ret < 0)
> +		error(1, errno, "sendto()");
> +}
> +
> +void server_recv_xdp(struct xdp_sock_context *ctx, int udp_sock)
> +{
> +	int ret;
> +	struct pollfd fds = {
> +		.fd = ctx->xdp_sock,
> +		.events = POLLIN,
> +	};
> +
> +	ret = poll(&fds, 1, -1);
> +	if (ret < 0)
> +		error(1, errno, "poll()");
> +
> +	if (fds.revents & POLLIN) {
> +		uint32_t *producer_ptr = ctx->rx_ring + ctx->off.rx.producer;
> +		uint32_t *consumer_ptr = ctx->rx_ring + ctx->off.rx.consumer;
> +		uint32_t producer, consumer;
> +		struct xdp_desc *desc;
> +
> +		producer = load_acquire(producer_ptr);
> +		consumer = load_acquire(consumer_ptr);
> +
> +		printf("Receive %d XDP buffers\n", producer - consumer);
> +
> +		store_release(consumer_ptr, producer);
> +	} else {
> +		error(1, 0, "unexpected poll event: %d", fds.revents);
> +	}
> +
> +	send_pass_msg(udp_sock);
> +}
> +
> +void server_recv_udp(int sock)
> +{
> +	char *buffer;
> +	int i, ret;
> +
> +	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
> +		      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +	if (buffer == MAP_FAILED)
> +		error(1, errno, "mmap() send buffer");
> +
> +	ret = recv(sock, buffer, BUFFER_SIZE, 0);
> +	if (ret < 0)
> +		error(1, errno, "recv()");
> +
> +	if (ret != BUFFER_SIZE)
> +		error(1, errno, "message is truncated, expected: %d, got: %d",
> +		      BUFFER_SIZE, ret);
> +
> +	for (i = 0; i < BUFFER_SIZE; i++)
> +		if (buffer[i] != 'a' + (i % 26))
> +			error(1, 0, "message mismatches at %d", i);
> +
> +	send_pass_msg(sock);
> +}
> +
> +int setup_udp_sock(const char *addr, int port)
> +{
> +	int sock, ret;
> +	struct sockaddr_in saddr = {
> +		.sin_family = AF_INET,
> +		.sin_addr = inet_addr(addr),
> +		.sin_port = htons(port),
> +	};
> +
> +	sock = socket(AF_INET, SOCK_DGRAM, 0);
> +	if (sock < 0)
> +		error(1, errno, "socket() UDP");
> +
> +	ret = bind(sock, (const struct sockaddr *)&saddr, sizeof(saddr));
> +	if (ret < 0)
> +		error(1, errno, "bind() UDP");
> +
> +	return sock;
> +}
> +
> +void run_server(void)
> +{
> +	int udp_sock;
> +	struct xdp_sock_context *ctx;
> +
> +	ctx = setup_xdp_socket(cfg_ifindex);
> +	setup_xdp_prog(ctx->xdp_sock, cfg_ifindex, cfg_redirect);
> +	udp_sock = setup_udp_sock(cfg_server_ip, SERVER_PORT);
> +
> +	if (cfg_redirect)
> +		server_recv_xdp(ctx, udp_sock);
> +	else
> +		server_recv_udp(udp_sock);
> +}
> +
> +void run_client(void)
> +{
> +	char *buffer;
> +	int sock, ret, i;
> +	struct sockaddr_in addr = {
> +		.sin_family = AF_INET,
> +		.sin_addr = inet_addr(cfg_server_ip),
> +		.sin_port = htons(SERVER_PORT),
> +	};
> +
> +	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
> +		      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +	if (buffer == MAP_FAILED)
> +		error(1, errno, "mmap() send buffer");
> +
> +	for (i = 0; i < BUFFER_SIZE; i++)
> +		buffer[i] = 'a' + (i % 26);
> +
> +	sock = setup_udp_sock(cfg_client_ip, CLIENT_PORT);
> +
> +	ret = sendto(sock, buffer, BUFFER_SIZE, 0,
> +		     (const struct sockaddr *)&addr, sizeof(addr));
> +	if (ret < 0)
> +		error(1, errno, "sendto()");
> +
> +	if (ret != BUFFER_SIZE)
> +		error(1, 0, "sent buffer is truncated, expected: %d got: %d",
> +		      BUFFER_SIZE, ret);
> +
> +	ret = recv(sock, buffer, BUFFER_SIZE, 0);
> +	if (ret < 0)
> +		error(1, errno, "recv()");
> +
> +	if ((ret != strlen(pass_msg) + 1) || strcmp(buffer, pass_msg))
> +		error(1, 0, "message mismatches, expected: %s, got: %s",
> +		      pass_msg, buffer);
> +}
> +
> +void print_usage(char *prog)
> +{
> +	fprintf(stderr, "Usage: %s (-c|-s) -r<server_ip> -l<client_ip>"
> +		" -i<server_ifname> [-d] [-z]\n", prog);
> +}
> +
> +void parse_opts(int argc, char **argv)
> +{
> +	int opt;
> +	char *ifname = NULL;
> +
> +	while ((opt = getopt(argc, argv, "hcsr:l:i:dz")) != -1) {
> +		switch (opt) {
> +		case 'c':
> +			if (cfg_server)
> +				error(1, 0, "Pass one of -s or -c");
> +
> +			cfg_client = 1;
> +			break;
> +		case 's':
> +			if (cfg_client)
> +				error(1, 0, "Pass one of -s or -c");
> +
> +			cfg_server = 1;
> +			break;
> +		case 'r':
> +			cfg_server_ip = optarg;
> +			break;
> +		case 'l':
> +			cfg_client_ip = optarg;
> +			break;
> +		case 'i':
> +			ifname = optarg;
> +			break;
> +		case 'd':
> +			cfg_redirect = 1;
> +			break;
> +		case 'z':
> +			cfg_zerocopy = 1;
> +			break;
> +		case 'h':
> +		default:
> +			print_usage(argv[0]);
> +			exit(1);
> +		}
> +	}
> +
> +	if (!cfg_client && !cfg_server)
> +		error(1, 0, "Pass one of -s or -c");
> +
> +	if (ifname) {
> +		cfg_ifindex = if_nametoindex(ifname);
> +		if (!cfg_ifindex)
> +			error(1, errno, "Invalid interface %s", ifname);
> +	}
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	parse_opts(argc, argv);
> +	if (cfg_client)
> +		run_client();
> +	else if (cfg_server)
> +		run_server();
> +
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.py b/tools/testing/selftests/drivers/net/hw/xsk_receive.py
> new file mode 100755
> index 000000000000..f32cb4477b75
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.py
> @@ -0,0 +1,75 @@
> +#!/usr/bin/env python3
> +# SPDX-License-Identifier: GPL-2.0
> +
> +# This is a test for virtio-net rx when there is an XDP socket bound to it.
> +# The test is expected to be run on the host side.
> +#
> +# The run example:
> +#
> +# export NETIF=tap0
> +# export LOCAL_V4=192.168.31.1
> +# export REMOTE_V4=192.168.31.3
> +# export REMOTE_TYPE=ssh
> +# export REMOTE_ARGS='root@192.168.31.3'
> +# ./ksft-net-drv/run_kselftest.sh -t drivers/net/hw:xsk_receive.py
> +#
> +# where:
> +# - 192.168.31.1 is the IP of tap device in the host
> +# - 192.168.31.3 is the IP of virtio-net device in the guest
> +#
> +# The Qemu command to setup virtio-net
> +# -netdev tap,id=hostnet1,vhost=on,script=no,downscript=no
> +# -device virtio-net-pci,netdev=hostnet1,iommu_platform=on,disable-legacy=on
> +#
> +# The MTU of tap device can be adjusted to test more cases:
> +# - 1500: single buffer XDP
> +# - 9000: multi-buffer XDP
> +
> +from lib.py import ksft_exit, ksft_run
> +from lib.py import KsftSkipEx, KsftFailEx
> +from lib.py import NetDrvEpEnv
> +from lib.py import bkg, cmd, wait_port_listen
> +from os import path
> +
> +SERVER_PORT = 8888
> +CLIENT_PORT = 9999
> +
> +def test_xdp_pass(cfg, server_cmd, client_cmd):
> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
> +        cmd(client_cmd)
> +
> +def test_xdp_pass_zc(cfg, server_cmd, client_cmd):
> +    server_cmd += " -z"
> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
> +        cmd(client_cmd)
> +
> +def test_xdp_redirect(cfg, server_cmd, client_cmd):
> +    server_cmd += " -d"
> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
> +        cmd(client_cmd)
> +
> +def test_xdp_redirect_zc(cfg, server_cmd, client_cmd):
> +    server_cmd += " -d -z"
> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
> +        cmd(client_cmd)
> +
> +def main():
> +    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
> +        cfg.bin_local = path.abspath(path.dirname(__file__)
> +                            + "/../../../drivers/net/hw/xsk_receive")
> +        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
> +
> +        server_cmd = f"{cfg.bin_remote} -s -i {cfg.remote_ifname} "
> +        server_cmd += f"-r {cfg.remote_addr_v["4"]} -l {cfg.addr_v["4"]}"
> +        client_cmd = f"{cfg.bin_local} -c -r {cfg.remote_addr_v["4"]} "
> +        client_cmd += f"-l {cfg.addr_v["4"]}"
> +
> +        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, server_cmd, client_cmd))
> +    ksft_exit()
> +
> +if __name__ == "__main__":
> +    main()
> -- 
> 2.43.0
> 
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-05-29  5:59   ` Jason Wang
@ 2025-05-29 12:28     ` Bui Quang Minh
  2025-06-03  2:56       ` Jason Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-29 12:28 UTC (permalink / raw)
  To: Jason Wang
  Cc: netdev, Michael S. Tsirkin, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 5/29/25 12:59, Jason Wang wrote:
> On Wed, May 28, 2025 at 12:19 AM Bui Quang Minh
> <minhquangbui99@gmail.com> wrote:
>> Currently, in zerocopy mode with mergeable receive buffer, virtio-net
>> does not support multi buffer but a single buffer only. This commit adds
>> support for multi mergeable receive buffer in the zerocopy XDP path by
>> utilizing XDP buffer with frags.
>>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
>>   drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
>>   1 file changed, 66 insertions(+), 57 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index e53ba600605a..a9558650f205 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
>>   #define VIRTIO_XDP_TX          BIT(0)
>>   #define VIRTIO_XDP_REDIR       BIT(1)
>>
>> +#define VIRTNET_MAX_ZC_SEGS    8
>> +
>>   /* RX packet size EWMA. The average packet size is used to determine the packet
>>    * buffer size when refilling RX rings. As the entire RX ring may be refilled
>>    * at once, the weight is chosen so that the EWMA will be insensitive to short-
>> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
>>          }
>>   }
>>
>> -static int xsk_append_merge_buffer(struct virtnet_info *vi,
>> -                                  struct receive_queue *rq,
>> -                                  struct sk_buff *head_skb,
>> -                                  u32 num_buf,
>> -                                  struct virtio_net_hdr_mrg_rxbuf *hdr,
>> -                                  struct virtnet_rq_stats *stats)
>> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
>> +                                     struct receive_queue *rq,
>> +                                     u32 num_buf,
>> +                                     struct xdp_buff *xdp,
>> +                                     struct virtnet_rq_stats *stats)
>>   {
>> -       struct sk_buff *curr_skb;
>> -       struct xdp_buff *xdp;
>> -       u32 len, truesize;
>> -       struct page *page;
>> +       unsigned int len;
>>          void *buf;
>>
>> -       curr_skb = head_skb;
>> +       if (num_buf < 2)
>> +               return 0;
>> +
>> +       while (num_buf > 1) {
>> +               struct xdp_buff *new_xdp;
>>
>> -       while (--num_buf) {
>>                  buf = virtqueue_get_buf(rq->vq, &len);
>> -               if (unlikely(!buf)) {
>> -                       pr_debug("%s: rx error: %d buffers out of %d missing\n",
>> -                                vi->dev->name, num_buf,
>> -                                virtio16_to_cpu(vi->vdev,
>> -                                                hdr->num_buffers));
>> +               if (unlikely(!buf)) {
>> +                       pr_debug("%s: rx error: %d buffers missing\n",
>> +                                vi->dev->name, num_buf);
>>                          DEV_STATS_INC(vi->dev, rx_length_errors);
>> -                       return -EINVAL;
>> -               }
>> -
>> -               u64_stats_add(&stats->bytes, len);
>> -
>> -               xdp = buf_to_xdp(vi, rq, buf, len);
>> -               if (!xdp)
>> -                       goto err;
>> -
>> -               buf = napi_alloc_frag(len);
>> -               if (!buf) {
>> -                       xsk_buff_free(xdp);
>> -                       goto err;
>> +                       return -1;
>>                  }
>>
>> -               memcpy(buf, xdp->data - vi->hdr_len, len);
>> -
>> -               xsk_buff_free(xdp);
>> +               new_xdp = buf_to_xdp(vi, rq, buf, len);
>> +               if (!new_xdp)
>> +                       goto drop_bufs;
>>
>> -               page = virt_to_page(buf);
>> +               /* In virtnet_add_recvbuf_xsk(), we ask the host to fill from
>> +                * xdp->data - vi->hdr_len with both virtio_net_hdr and data.
>> +                * However, only the first packet has the virtio_net_hdr, the
>> +                * following ones do not. So we need to adjust the following
> Typo here.

I'm sorry, could you clarify which word contains the typo?

>
>> +                * packets' data pointer to the correct place.
>> +                */
> I wonder what happens if we don't use this trick? I mean, what if we
> don't reuse the header room for the virtio-net header? This seems to be
> fine for a mergeable buffer and can help to avoid the trick.

I don't think reusing the header room for the virtio-net header is what 
creates this case handling. In my opinion, it comes from a slight 
difference in the recvbuf layout between single buffer and 
multi-buffer. When we have n single-buffer packets, each buffer has its 
own virtio-net header. But when we have 1 multi-buffer packet (which 
spans n buffers), only the first buffer has a virtio-net header; the 
following buffers do not.

There are 2 important pointers here: the pointer we announce to the 
vhost side to fill with data, let's call it announced_addr, and 
xdp_buff->data, which is expected to point to the start of the Ethernet 
frame. Currently,

     announced_addr = xdp_buff->data - hdr_len

The host side writes the virtio-net header to announced_addr, then the 
Ethernet frame's data, in the first buffer. In case of a multi-buffer 
packet, in the following buffers, the host side writes the Ethernet 
frame's data to announced_addr with no virtio-net header. So in 
virtio-net, we need to subtract hdr_len from xdp_buff->data; otherwise, 
we lose some of the Ethernet frame's data.

I think a slightly better solution is to set announced_addr = 
xdp_buff->data; then we only need to do xdp_buff->data += hdr_len for 
the first buffer and do not need to adjust xdp_buff->data of the 
following buffers.
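
To make that concrete, here is a rough, untested sketch of the idea 
(helper names as in the current driver code, surrounding logic elided):

	/* fill side, virtnet_add_recvbuf_xsk(): announce xdp->data
	 * itself, i.e. drop the "- vi->hdr_len" adjustment
	 */
	addr = xsk_buff_xdp_get_dma(xsk_buffs[i]);

	/* receive side, head buffer only: step over the virtio-net
	 * header once
	 */
	hdr = xdp->data;
	xdp->data += vi->hdr_len;

	/* receive side, follow buffers: nothing to adjust, xdp->data
	 * already points at the frame data
	 */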

>
>> +               new_xdp->data -= vi->hdr_len;
>> +               new_xdp->data_end = new_xdp->data + len;
>>
>> -               truesize = len;
>> +               if (!xsk_buff_add_frag(xdp, new_xdp))
>> +                       goto drop_bufs;
>>
>> -               curr_skb  = virtnet_skb_append_frag(head_skb, curr_skb, page,
>> -                                                   buf, len, truesize);
>> -               if (!curr_skb) {
>> -                       put_page(page);
>> -                       goto err;
>> -               }
>> +               num_buf--;
>>          }
>>
>>          return 0;
>>
>> -err:
>> +drop_bufs:
>>          xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
>> -       return -EINVAL;
>> +       return -1;
>>   }
>>
>>   static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
>> @@ -1307,23 +1297,42 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
>>          num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
>>
>>          ret = XDP_PASS;
>> +       if (virtnet_build_xsk_buff_mrg(vi, rq, num_buf, xdp, stats))
>> +               goto drop;
>> +
>>          rcu_read_lock();
>>          prog = rcu_dereference(rq->xdp_prog);
>> -       /* TODO: support multi buffer. */
>> -       if (prog && num_buf == 1)
>> -               ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
> Without this patch it looks like we had a bug:
>
>          ret = XDP_PASS;
>          rcu_read_lock();
>          prog = rcu_dereference(rq->xdp_prog);
>          /* TODO: support multi buffer. */
>          if (prog && num_buf == 1)
>                  ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
>          rcu_read_unlock();
>
> This implies if num_buf is greater than 1, we will assume XDP_PASS?

Yes, I think XDP_DROP should be returned in that case.
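
Something along these lines should do it (untested sketch against the 
current code):

	ret = XDP_PASS;
	rcu_read_lock();
	prog = rcu_dereference(rq->xdp_prog);
	if (prog) {
		/* Multi buffer is not supported yet, so do not let
		 * such frames slip through as XDP_PASS.
		 */
		if (num_buf == 1)
			ret = virtnet_xdp_handler(prog, xdp, dev,
						  xdp_xmit, stats);
		else
			ret = XDP_DROP;
	}
	rcu_read_unlock();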

>
>> +       if (prog) {
>> +               /* We are in zerocopy mode so we cannot copy the multi-buffer
>> +                * xdp buff to a single linear xdp buff. If we do so, in case
>> +                * the BPF program decides to redirect to a XDP socket (XSK),
>> +                * it will trigger the zerocopy receive logic in XDP socket.
>> +                * The receive logic thinks it receives zerocopy buffer while
>> +                * in fact, it is the copy one and everything is messed up.
>> +                * So just drop the packet here if we have a multi-buffer xdp
>> +                * buff and the BPF program does not support it.
>> +                */
>> +               if (xdp_buff_has_frags(xdp) && !prog->aux->xdp_has_frags)
>> +                       ret = XDP_DROP;
> Could we move the check before trying to build a multi-buffer XDP buff?

Yes, I'll fix this in the next version.
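
Roughly, the check would move to right after fetching the program, 
before virtnet_build_xsk_buff_mrg() links the follow buffers into a 
frag list (untested sketch):

	prog = rcu_dereference(rq->xdp_prog);
	if (num_buf > 1 && prog && !prog->aux->xdp_has_frags) {
		/* The BPF program cannot handle an XDP buff with
		 * frags, so drop the whole packet up front.
		 */
		xsk_drop_follow_bufs(dev, rq, num_buf, stats);
		goto drop;
	}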

>
>> +               else
>> +                       ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
>> +                                                 stats);
>> +       }
>>          rcu_read_unlock();
>>
>>          switch (ret) {
>>          case XDP_PASS:
>> -               skb = xsk_construct_skb(rq, xdp);
>> +               skb = xdp_build_skb_from_zc(xdp);
> Is this better to make this change a separate patch?

Okay, I'll create a separate patch to convert the current XDP_PASS 
handler to use the xdp_build_skb_from_zc() helper.

>
>>                  if (!skb)
>> -                       goto drop_bufs;
>> +                       break;
>>
>> -               if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
>> -                       dev_kfree_skb(skb);
>> -                       goto drop;
>> -               }
>> +               /* Later, in virtnet_receive_done(), eth_type_trans()
>> +                * is called. However, in xdp_build_skb_from_zc(), it is called
>> +                * already. As a result, we need to reset the data to before
>> +                * the mac header so that the later call in
>> +                * virtnet_receive_done() works correctly.
>> +                */
>> +               skb_push(skb, ETH_HLEN);
>>
>>                  return skb;
>>
>> @@ -1332,14 +1341,11 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
>>                  return NULL;
>>
>>          default:
>> -               /* drop packet */
>> -               xsk_buff_free(xdp);
>> +               break;
>>          }
>>
>> -drop_bufs:
>> -       xsk_drop_follow_bufs(dev, rq, num_buf, stats);
>> -
>>   drop:
>> +       xsk_buff_free(xdp);
>>          u64_stats_inc(&stats->drops);
>>          return NULL;
>>   }
>> @@ -1396,6 +1402,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
>>                  return -ENOMEM;
>>
>>          len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
>> +       /* Reserve some space for skb_shared_info */
>> +       len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>>
>>          for (i = 0; i < num; ++i) {
>>                  /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
>> @@ -6734,6 +6742,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>>          dev->netdev_ops = &virtnet_netdev;
>>          dev->stat_ops = &virtnet_stat_ops;
>>          dev->features = NETIF_F_HIGHDMA;
>> +       dev->xdp_zc_max_segs = VIRTNET_MAX_ZC_SEGS;
>>
>>          dev->ethtool_ops = &virtnet_ethtool_ops;
>>          SET_NETDEV_DEV(dev, &vdev->dev);
>> --
>> 2.43.0
>>
> Thanks
>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-29 11:18   ` Maciej Fijalkowski
@ 2025-05-29 14:29     ` Bui Quang Minh
  2025-05-30 11:45       ` Maciej Fijalkowski
  0 siblings, 1 reply; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-29 14:29 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 5/29/25 18:18, Maciej Fijalkowski wrote:
> On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
>> This adds a test for virtio-net rx when there is an XDP socket bound
>> to it. There are tests for both copy mode and zerocopy mode, covering
>> the cases where the XDP program returns XDP_PASS and XDP_REDIRECT to
>> an XDP socket.
>>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> Hi Bui,
>
> have you considered adjusting xskxceiver for your needs? If yes and you
> decided to go with another test app then what were the issues around it?
>
> This is yet another approach for xsk testing where we already have a
> test framework.

Hi,

I haven't tried very hard to adapt xskxceiver. I did have a look at 
xskxceiver, but I felt the supported topology is not suitable for my 
needs. To test the receive side of virtio-net, I use Qemu to set up 
virtio-net in the guest and vhost-net on the host side. The sending 
side is in the host and the receiving side is in the guest, so I can't 
figure out how to do that with xskxceiver.

Thanks,
Quang Minh.

>
>> ---
>>   .../selftests/drivers/net/hw/.gitignore       |   3 +
>>   .../testing/selftests/drivers/net/hw/Makefile |  12 +-
>>   .../drivers/net/hw/xsk_receive.bpf.c          |  43 ++
>>   .../selftests/drivers/net/hw/xsk_receive.c    | 398 ++++++++++++++++++
>>   .../selftests/drivers/net/hw/xsk_receive.py   |  75 ++++
>>   5 files changed, 530 insertions(+), 1 deletion(-)
>>   create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
>>   create mode 100644 tools/testing/selftests/drivers/net/hw/xsk_receive.c
>>   create mode 100755 tools/testing/selftests/drivers/net/hw/xsk_receive.py
>>
>> diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore
>> index 6942bf575497..c32271faecff 100644
>> --- a/tools/testing/selftests/drivers/net/hw/.gitignore
>> +++ b/tools/testing/selftests/drivers/net/hw/.gitignore
>> @@ -1,3 +1,6 @@
>>   # SPDX-License-Identifier: GPL-2.0-only
>>   iou-zcrx
>>   ncdevmem
>> +xsk_receive.skel.h
>> +xsk_receive
>> +tools
>> diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
>> index df2c047ffa90..964edbb3b79f 100644
>> --- a/tools/testing/selftests/drivers/net/hw/Makefile
>> +++ b/tools/testing/selftests/drivers/net/hw/Makefile
>> @@ -1,6 +1,9 @@
>>   # SPDX-License-Identifier: GPL-2.0+ OR MIT
>>   
>> -TEST_GEN_FILES = iou-zcrx
>> +TEST_GEN_FILES = \
>> +	iou-zcrx \
>> +	xsk_receive \
>> +	#
>>   
>>   TEST_PROGS = \
>>   	csum.py \
>> @@ -20,6 +23,7 @@ TEST_PROGS = \
>>   	rss_input_xfrm.py \
>>   	tso.py \
>>   	xsk_reconfig.py \
>> +	xsk_receive.py \
>>   	#
>>   
>>   TEST_FILES := \
>> @@ -48,3 +52,9 @@ include ../../../net/ynl.mk
>>   include ../../../net/bpf.mk
>>   
>>   $(OUTPUT)/iou-zcrx: LDLIBS += -luring
>> +
>> +$(OUTPUT)/xsk_receive.skel.h: xsk_receive.bpf.o
>> +	bpftool gen skeleton xsk_receive.bpf.o > xsk_receive.skel.h
>> +
>> +$(OUTPUT)/xsk_receive: xsk_receive.skel.h
>> +$(OUTPUT)/xsk_receive: LDLIBS += -lbpf
>> diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c b/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
>> new file mode 100644
>> index 000000000000..462046d95bfe
>> --- /dev/null
>> +++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.bpf.c
>> @@ -0,0 +1,43 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#include <linux/bpf.h>
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_endian.h>
>> +#include <linux/if_ether.h>
>> +#include <linux/ip.h>
>> +#include <linux/in.h>
>> +
>> +struct {
>> +	__uint(type, BPF_MAP_TYPE_XSKMAP);
>> +	__uint(max_entries, 1);
>> +	__uint(key_size, sizeof(__u32));
>> +	__uint(value_size, sizeof(__u32));
>> +} xsk_map SEC(".maps");
>> +
>> +SEC("xdp.frags")
>> +int dummy_prog(struct xdp_md *ctx)
>> +{
>> +	return XDP_PASS;
>> +}
>> +
>> +SEC("xdp.frags")
>> +int redirect_xsk_prog(struct xdp_md *ctx)
>> +{
>> +	void *data_end = (void *)(long)ctx->data_end;
>> +	void *data = (void *)(long)ctx->data;
>> +	struct ethhdr *eth = data;
>> +	struct iphdr *iph;
>> +
>> +	if (data + sizeof(*eth) + sizeof(*iph) > data_end)
>> +		return XDP_PASS;
>> +
>> +	if (bpf_htons(eth->h_proto) != ETH_P_IP)
>> +		return XDP_PASS;
>> +
>> +	iph = data + sizeof(*eth);
>> +	if (iph->protocol != IPPROTO_UDP)
>> +		return XDP_PASS;
>> +
>> +	return bpf_redirect_map(&xsk_map, 0, XDP_DROP);
>> +}
>> +
>> +char _license[] SEC("license") = "GPL";
>> diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.c b/tools/testing/selftests/drivers/net/hw/xsk_receive.c
>> new file mode 100644
>> index 000000000000..96213ceeda5c
>> --- /dev/null
>> +++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.c
>> @@ -0,0 +1,398 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#include <error.h>
>> +#include <errno.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <stdint.h>
>> +#include <string.h>
>> +#include <poll.h>
>> +#include <stdatomic.h>
>> +#include <unistd.h>
>> +#include <sys/mman.h>
>> +#include <net/if.h>
>> +#include <netinet/in.h>
>> +#include <arpa/inet.h>
>> +#include <linux/if_xdp.h>
>> +
>> +#include "xsk_receive.skel.h"
>> +
>> +#define load_acquire(p) \
>> +	atomic_load_explicit((_Atomic typeof(*(p)) *)(p), memory_order_acquire)
>> +
>> +#define store_release(p, v) \
>> +	atomic_store_explicit((_Atomic typeof(*(p)) *)(p), v, \
>> +			      memory_order_release)
>> +
>> +#define UMEM_CHUNK_SIZE 0x1000
>> +#define BUFFER_SIZE 0x2000
>> +
>> +#define SERVER_PORT 8888
>> +#define CLIENT_PORT 9999
>> +
>> +const int num_entries = 256;
>> +const char *pass_msg = "PASS";
>> +
>> +int cfg_client;
>> +int cfg_server;
>> +char *cfg_server_ip;
>> +char *cfg_client_ip;
>> +int cfg_ifindex;
>> +int cfg_redirect;
>> +int cfg_zerocopy;
>> +
>> +struct xdp_sock_context {
>> +	int xdp_sock;
>> +	void *umem_region;
>> +	void *rx_ring;
>> +	void *fill_ring;
>> +	struct xdp_mmap_offsets off;
>> +};
>> +
>> +struct xdp_sock_context *setup_xdp_socket(int ifindex)
>> +{
>> +	struct xdp_mmap_offsets off;
>> +	void *rx_ring, *fill_ring;
>> +	struct xdp_umem_reg umem_reg = {};
>> +	int optlen = sizeof(off);
>> +	int umem_len, sock, ret, i;
>> +	void *umem_region;
>> +	uint32_t *fr_producer;
>> +	uint64_t *addr;
>> +	struct sockaddr_xdp sxdp = {
>> +		.sxdp_family = AF_XDP,
>> +		.sxdp_ifindex = ifindex,
>> +		.sxdp_queue_id = 0,
>> +		.sxdp_flags = XDP_USE_SG,
>> +	};
>> +	struct xdp_sock_context *ctx;
>> +
>> +	ctx = malloc(sizeof(*ctx));
>> +	if (!ctx)
>> +		error(1, 0, "malloc()");
>> +
>> +	if (cfg_zerocopy)
>> +		sxdp.sxdp_flags |= XDP_ZEROCOPY;
>> +	else
>> +		sxdp.sxdp_flags |= XDP_COPY;
>> +
>> +	umem_len = UMEM_CHUNK_SIZE * num_entries;
>> +	umem_region = mmap(0, umem_len, PROT_READ | PROT_WRITE,
>> +			   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>> +	if (umem_region == MAP_FAILED)
>> +		error(1, errno, "mmap() umem");
>> +	ctx->umem_region = umem_region;
>> +
>> +	sock = socket(AF_XDP, SOCK_RAW, 0);
>> +	if (sock < 0)
>> +		error(1, errno, "socket() XDP");
>> +	ctx->xdp_sock = sock;
>> +
>> +	ret = setsockopt(sock, SOL_XDP, XDP_RX_RING, &num_entries,
>> +			 sizeof(num_entries));
>> +	if (ret < 0)
>> +		error(1, errno, "setsockopt() XDP_RX_RING");
>> +
>> +	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_COMPLETION_RING, &num_entries,
>> +			 sizeof(num_entries));
>> +	if (ret < 0)
>> +		error(1, errno, "setsockopt() XDP_UMEM_COMPLETION_RING");
>> +
>> +	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_FILL_RING, &num_entries,
>> +			 sizeof(num_entries));
>> +	if (ret < 0)
>> +		error(1, errno, "setsockopt() XDP_UMEM_FILL_RING");
>> +
>> +	ret = getsockopt(sock, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
>> +	if (ret < 0)
>> +		error(1, errno, "getsockopt()");
>> +	ctx->off = off;
>> +
>> +	rx_ring = mmap(0, off.rx.desc + num_entries * sizeof(struct xdp_desc),
>> +		       PROT_READ | PROT_WRITE, MAP_SHARED, sock,
>> +		       XDP_PGOFF_RX_RING);
>> +	if (rx_ring == MAP_FAILED)
>> +		error(1, errno, "mmap() rx-ring");
>> +	ctx->rx_ring = rx_ring;
>> +
>> +	fill_ring = mmap(0, off.fr.desc + num_entries * sizeof(uint64_t),
>> +			 PROT_READ | PROT_WRITE, MAP_SHARED, sock,
>> +			 XDP_UMEM_PGOFF_FILL_RING);
>> +	if (fill_ring == MAP_FAILED)
>> +		error(1, errno, "mmap() fill-ring");
>> +	ctx->fill_ring = fill_ring;
>> +
>> +	umem_reg.addr = (unsigned long long)ctx->umem_region;
>> +	umem_reg.len = umem_len;
>> +	umem_reg.chunk_size = UMEM_CHUNK_SIZE;
>> +	ret = setsockopt(sock, SOL_XDP, XDP_UMEM_REG, &umem_reg,
>> +			 sizeof(umem_reg));
>> +	if (ret < 0)
>> +		error(1, errno, "setsockopt() XDP_UMEM_REG");
>> +
>> +	i = 0;
>> +	while (1) {
>> +		ret = bind(sock, (const struct sockaddr *)&sxdp, sizeof(sxdp));
>> +		if (!ret)
>> +			break;
>> +
>> +		if (errno == EBUSY && i < 3) {
>> +			i++;
>> +			sleep(1);
>> +		} else {
>> +			error(1, errno, "bind() XDP");
>> +		}
>> +	}
>> +
>> +	/* Submit all umem entries to fill ring */
>> +	addr = fill_ring + off.fr.desc;
>> +	for (i = 0; i < umem_len; i += UMEM_CHUNK_SIZE) {
>> +		*addr = i;
>> +		addr++;
>> +	}
>> +	fr_producer = fill_ring + off.fr.producer;
>> +	store_release(fr_producer, num_entries);
>> +
>> +	return ctx;
>> +}
>> +
>> +void setup_xdp_prog(int sock, int ifindex, int redirect)
>> +{
>> +	struct xsk_receive_bpf *bpf;
>> +	int key, ret;
>> +
>> +	bpf = xsk_receive_bpf__open_and_load();
>> +	if (!bpf)
>> +		error(1, 0, "open eBPF");
>> +
>> +	key = 0;
>> +	ret = bpf_map__update_elem(bpf->maps.xsk_map, &key, sizeof(key),
>> +				   &sock, sizeof(sock), 0);
>> +	if (ret < 0)
>> +		error(1, errno, "eBPF map update");
>> +
>> +	if (redirect) {
>> +		ret = bpf_xdp_attach(ifindex,
>> +				bpf_program__fd(bpf->progs.redirect_xsk_prog),
>> +				0, NULL);
>> +		if (ret < 0)
>> +			error(1, errno, "attach eBPF");
>> +	} else {
>> +		ret = bpf_xdp_attach(ifindex,
>> +				     bpf_program__fd(bpf->progs.dummy_prog),
>> +				     0, NULL);
>> +		if (ret < 0)
>> +			error(1, errno, "attach eBPF");
>> +	}
>> +}
>> +
>> +void send_pass_msg(int sock)
>> +{
>> +	int ret;
>> +	struct sockaddr_in addr = {
>> +		.sin_family = AF_INET,
>> +		.sin_addr = inet_addr(cfg_client_ip),
>> +		.sin_port = htons(CLIENT_PORT),
>> +	};
>> +
>> +	ret = sendto(sock, pass_msg, strlen(pass_msg) + 1, 0,
>> +		     (const struct sockaddr *)&addr, sizeof(addr));
>> +	if (ret < 0)
>> +		error(1, errno, "sendto()");
>> +}
>> +
>> +void server_recv_xdp(struct xdp_sock_context *ctx, int udp_sock)
>> +{
>> +	int ret;
>> +	struct pollfd fds = {
>> +		.fd = ctx->xdp_sock,
>> +		.events = POLLIN,
>> +	};
>> +
>> +	ret = poll(&fds, 1, -1);
>> +	if (ret < 0)
>> +		error(1, errno, "poll()");
>> +
>> +	if (fds.revents & POLLIN) {
>> +		uint32_t *producer_ptr = ctx->rx_ring + ctx->off.rx.producer;
>> +		uint32_t *consumer_ptr = ctx->rx_ring + ctx->off.rx.consumer;
>> +		uint32_t producer, consumer;
>> +		struct xdp_desc *desc;
>> +
>> +		producer = load_acquire(producer_ptr);
>> +		consumer = load_acquire(consumer_ptr);
>> +
>> +		printf("Receive %d XDP buffers\n", producer - consumer);
>> +
>> +		store_release(consumer_ptr, producer);
>> +	} else {
>> +		error(1, 0, "unexpected poll event: %d", fds.revents);
>> +	}
>> +
>> +	send_pass_msg(udp_sock);
>> +}
>> +
>> +void server_recv_udp(int sock)
>> +{
>> +	char *buffer;
>> +	int i, ret;
>> +
>> +	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
>> +		      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>> +	if (buffer == MAP_FAILED)
>> +		error(1, errno, "mmap() send buffer");
>> +
>> +	ret = recv(sock, buffer, BUFFER_SIZE, 0);
>> +	if (ret < 0)
>> +		error(1, errno, "recv()");
>> +
>> +	if (ret != BUFFER_SIZE)
>> +		error(1, errno, "message is truncated, expected: %d, got: %d",
>> +		      BUFFER_SIZE, ret);
>> +
>> +	for (i = 0; i < BUFFER_SIZE; i++)
>> +		if (buffer[i] != 'a' + (i % 26))
>> +			error(1, 0, "message mismatches at %d", i);
>> +
>> +	send_pass_msg(sock);
>> +}
>> +
>> +int setup_udp_sock(const char *addr, int port)
>> +{
>> +	int sock, ret;
>> +	struct sockaddr_in saddr = {
>> +		.sin_family = AF_INET,
>> +		.sin_addr = inet_addr(addr),
>> +		.sin_port = htons(port),
>> +	};
>> +
>> +	sock = socket(AF_INET, SOCK_DGRAM, 0);
>> +	if (sock < 0)
>> +		error(1, errno, "socket() UDP");
>> +
>> +	ret = bind(sock, (const struct sockaddr *)&saddr, sizeof(saddr));
>> +	if (ret < 0)
>> +		error(1, errno, "bind() UDP");
>> +
>> +	return sock;
>> +}
>> +
>> +void run_server(void)
>> +{
>> +	int udp_sock;
>> +	struct xdp_sock_context *ctx;
>> +
>> +	ctx = setup_xdp_socket(cfg_ifindex);
>> +	setup_xdp_prog(ctx->xdp_sock, cfg_ifindex, cfg_redirect);
>> +	udp_sock = setup_udp_sock(cfg_server_ip, SERVER_PORT);
>> +
>> +	if (cfg_redirect)
>> +		server_recv_xdp(ctx, udp_sock);
>> +	else
>> +		server_recv_udp(udp_sock);
>> +}
>> +
>> +void run_client(void)
>> +{
>> +	char *buffer;
>> +	int sock, ret, i;
>> +	struct sockaddr_in addr = {
>> +		.sin_family = AF_INET,
>> +		.sin_addr = inet_addr(cfg_server_ip),
>> +		.sin_port = htons(SERVER_PORT),
>> +	};
>> +
>> +	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
>> +		      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>> +	if (buffer == MAP_FAILED)
>> +		error(1, errno, "mmap() send buffer");
>> +
>> +	for (i = 0; i < BUFFER_SIZE; i++)
>> +		buffer[i] = 'a' + (i % 26);
>> +
>> +	sock = setup_udp_sock(cfg_client_ip, CLIENT_PORT);
>> +
>> +	ret = sendto(sock, buffer, BUFFER_SIZE, 0,
>> +		     (const struct sockaddr *)&addr, sizeof(addr));
>> +	if (ret < 0)
>> +		error(1, errno, "sendto()");
>> +
>> +	if (ret != BUFFER_SIZE)
>> +		error(1, 0, "sent buffer is truncated, expected: %d got: %d",
>> +		      BUFFER_SIZE, ret);
>> +
>> +	ret = recv(sock, buffer, BUFFER_SIZE, 0);
>> +	if (ret < 0)
>> +		error(1, errno, "recv()");
>> +
>> +	if ((ret != strlen(pass_msg) + 1) || strcmp(buffer, pass_msg))
>> +		error(1, 0, "message mismatches, expected: %s, got: %s",
>> +		      pass_msg, buffer);
>> +}
>> +
>> +void print_usage(char *prog)
>> +{
>> +	fprintf(stderr, "Usage: %s (-c|-s) -r<server_ip> -l<client_ip>"
>> +		" -i<server_ifname> [-d] [-z]\n", prog);
>> +}
>> +
>> +void parse_opts(int argc, char **argv)
>> +{
>> +	int opt;
>> +	char *ifname = NULL;
>> +
>> +	while ((opt = getopt(argc, argv, "hcsr:l:i:dz")) != -1) {
>> +		switch (opt) {
>> +		case 'c':
>> +			if (cfg_server)
>> +				error(1, 0, "Pass one of -s or -c");
>> +
>> +			cfg_client = 1;
>> +			break;
>> +		case 's':
>> +			if (cfg_client)
>> +				error(1, 0, "Pass one of -s or -c");
>> +
>> +			cfg_server = 1;
>> +			break;
>> +		case 'r':
>> +			cfg_server_ip = optarg;
>> +			break;
>> +		case 'l':
>> +			cfg_client_ip = optarg;
>> +			break;
>> +		case 'i':
>> +			ifname = optarg;
>> +			break;
>> +		case 'd':
>> +			cfg_redirect = 1;
>> +			break;
>> +		case 'z':
>> +			cfg_zerocopy = 1;
>> +			break;
>> +		case 'h':
>> +		default:
>> +			print_usage(argv[0]);
>> +			exit(1);
>> +		}
>> +	}
>> +
>> +	if (!cfg_client && !cfg_server)
>> +		error(1, 0, "Pass one of -s or -c");
>> +
>> +	if (ifname) {
>> +		cfg_ifindex = if_nametoindex(ifname);
>> +		if (!cfg_ifindex)
>> +			error(1, errno, "Invalid interface %s", ifname);
>> +	}
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +	parse_opts(argc, argv);
>> +	if (cfg_client)
>> +		run_client();
>> +	else if (cfg_server)
>> +		run_server();
>> +
>> +	return 0;
>> +}
>> diff --git a/tools/testing/selftests/drivers/net/hw/xsk_receive.py b/tools/testing/selftests/drivers/net/hw/xsk_receive.py
>> new file mode 100755
>> index 000000000000..f32cb4477b75
>> --- /dev/null
>> +++ b/tools/testing/selftests/drivers/net/hw/xsk_receive.py
>> @@ -0,0 +1,75 @@
>> +#!/usr/bin/env python3
>> +# SPDX-License-Identifier: GPL-2.0
>> +
>> +# This is a test for virtio-net rx when there is an XDP socket bound to it.
>> +# The test is expected to be run on the host side.
>> +#
>> +# The run example:
>> +#
>> +# export NETIF=tap0
>> +# export LOCAL_V4=192.168.31.1
>> +# export REMOTE_V4=192.168.31.3
>> +# export REMOTE_TYPE=ssh
>> +# export REMOTE_ARGS='root@192.168.31.3'
>> +# ./ksft-net-drv/run_kselftest.sh -t drivers/net/hw:xsk_receive.py
>> +#
>> +# where:
>> +# - 192.168.31.1 is the IP of tap device in the host
>> +# - 192.168.31.3 is the IP of virtio-net device in the guest
>> +#
>> +# The Qemu command to setup virtio-net
>> +# -netdev tap,id=hostnet1,vhost=on,script=no,downscript=no
>> +# -device virtio-net-pci,netdev=hostnet1,iommu_platform=on,disable-legacy=on
>> +#
>> +# The MTU of tap device can be adjusted to test more cases:
>> +# - 1500: single buffer XDP
>> +# - 9000: multi-buffer XDP
>> +
>> +from lib.py import ksft_exit, ksft_run
>> +from lib.py import KsftSkipEx, KsftFailEx
>> +from lib.py import NetDrvEpEnv
>> +from lib.py import bkg, cmd, wait_port_listen
>> +from os import path
>> +
>> +SERVER_PORT = 8888
>> +CLIENT_PORT = 9999
>> +
>> +def test_xdp_pass(cfg, server_cmd, client_cmd):
>> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
>> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
>> +        cmd(client_cmd)
>> +
>> +def test_xdp_pass_zc(cfg, server_cmd, client_cmd):
>> +    server_cmd += " -z"
>> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
>> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
>> +        cmd(client_cmd)
>> +
>> +def test_xdp_redirect(cfg, server_cmd, client_cmd):
>> +    server_cmd += " -d"
>> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
>> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
>> +        cmd(client_cmd)
>> +
>> +def test_xdp_redirect_zc(cfg, server_cmd, client_cmd):
>> +    server_cmd += " -d -z"
>> +    with bkg(server_cmd, host=cfg.remote, exit_wait=True):
>> +        wait_port_listen(SERVER_PORT, proto="udp", host=cfg.remote)
>> +        cmd(client_cmd)
>> +
>> +def main():
>> +    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
>> +        cfg.bin_local = path.abspath(path.dirname(__file__)
>> +                            + "/../../../drivers/net/hw/xsk_receive")
>> +        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
>> +
>> +        server_cmd = f"{cfg.bin_remote} -s -i {cfg.remote_ifname} "
>> +        server_cmd += f"-r {cfg.remote_addr_v["4"]} -l {cfg.addr_v["4"]}"
>> +        client_cmd = f"{cfg.bin_local} -c -r {cfg.remote_addr_v["4"]} "
>> +        client_cmd += f"-l {cfg.addr_v["4"]}"
>> +
>> +        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, server_cmd, client_cmd))
>> +    ksft_exit()
>> +
>> +if __name__ == "__main__":
>> +    main()
>> -- 
>> 2.43.0
>>
>>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-29 14:29     ` Bui Quang Minh
@ 2025-05-30 11:45       ` Maciej Fijalkowski
  2025-05-31  8:51         ` Bui Quang Minh
  0 siblings, 1 reply; 18+ messages in thread
From: Maciej Fijalkowski @ 2025-05-30 11:45 UTC (permalink / raw)
  To: Bui Quang Minh
  Cc: netdev, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On Thu, May 29, 2025 at 09:29:14PM +0700, Bui Quang Minh wrote:
> On 5/29/25 18:18, Maciej Fijalkowski wrote:
> > On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
> > > This adds a test for virtio-net rx when there is an XDP socket bound
> > > to it. There are tests for both copy mode and zerocopy mode, covering
> > > the cases where the XDP program returns XDP_PASS and XDP_REDIRECT to
> > > an XDP socket.
> > > 
> > > Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> > Hi Bui,
> > 
> > have you considered adjusting xskxceiver for your needs? If yes and you
> > decided to go with another test app then what were the issues around it?
> > 
> > This is yet another approach for xsk testing where we already have a
> > test framework.
> 
> Hi,
> 
> I haven't tried very hard to adapt xskxceiver. I did have a look at
> xskxceiver, but I felt the supported topology is not suitable for my
> needs. To test the receive side of virtio-net, I use Qemu to set up
> virtio-net in the guest and vhost-net on the host side. The sending
> side is in the host and the receiving side is in the guest, so I can't
> figure out how to do that with xskxceiver.

I see - couldn't the python side be executing xdpsock then instead of your
own app?

I wouldn't like to end up with several xsk tools for testing data path on
different environments.

> 
> Thanks,
> Quang Minh.
> 
> > 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-30 11:45       ` Maciej Fijalkowski
@ 2025-05-31  8:51         ` Bui Quang Minh
  2025-06-02 15:55           ` Maciej Fijalkowski
  0 siblings, 1 reply; 18+ messages in thread
From: Bui Quang Minh @ 2025-05-31  8:51 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 5/30/25 18:45, Maciej Fijalkowski wrote:
> On Thu, May 29, 2025 at 09:29:14PM +0700, Bui Quang Minh wrote:
>> On 5/29/25 18:18, Maciej Fijalkowski wrote:
>>> On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
>>>> This adds a test for virtio-net rx when there is an XDP socket bound
>>>> to it. There are tests for both copy mode and zerocopy mode, covering
>>>> the cases where the XDP program returns XDP_PASS and XDP_REDIRECT to
>>>> an XDP socket.
>>>>
>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>> Hi Bui,
>>>
>>> have you considered adjusting xskxceiver for your needs? If yes and you
>>> decided to go with another test app then what were the issues around it?
>>>
>>> This is yet another approach for xsk testing where we already have a
>>> test framework.
>> Hi,
>>
>> I haven't tried very hard to adapt xskxceiver. I did have a look at
>> xskxceiver, but I felt the supported topology is not suitable for my
>> needs. To test the receive side of virtio-net, I use Qemu to set up
>> virtio-net in the guest and vhost-net on the host side. The sending
>> side is in the host and the receiving side is in the guest, so I can't
>> figure out how to do that with xskxceiver.
> I see - couldn't the python side be executing xdpsock then instead of your
> own app?

I'm not aware of xdpsock. Could you give the path to that file?

> I wouldn't like to end up with several xsk tools for testing data path on
> different environments.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-05-31  8:51         ` Bui Quang Minh
@ 2025-06-02 15:55           ` Maciej Fijalkowski
  2025-06-03 14:18             ` Bui Quang Minh
  0 siblings, 1 reply; 18+ messages in thread
From: Maciej Fijalkowski @ 2025-06-02 15:55 UTC (permalink / raw)
  To: Bui Quang Minh
  Cc: netdev, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On Sat, May 31, 2025 at 03:51:57PM +0700, Bui Quang Minh wrote:
> On 5/30/25 18:45, Maciej Fijalkowski wrote:
> > On Thu, May 29, 2025 at 09:29:14PM +0700, Bui Quang Minh wrote:
> > > On 5/29/25 18:18, Maciej Fijalkowski wrote:
> > > > On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
> > > > > This adds a test for virtio-net rx when there is an XDP socket bound
> > > > > to it. There are tests for both copy mode and zerocopy mode, covering
> > > > > the cases where the XDP program returns XDP_PASS and XDP_REDIRECT to
> > > > > an XDP socket.
> > > > > 
> > > > > Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> > > > Hi Bui,
> > > > 
> > > > have you considered adjusting xskxceiver for your needs? If yes and you
> > > > decided to go with another test app then what were the issues around it?
> > > > 
> > > > This is yet another approach for xsk testing where we already have a
> > > > test framework.
> > > Hi,
> > > 
> > > I haven't tried very hard to adapt xskxceiver. I did have a look at
> > > xskxceiver, but I felt the supported topology is not suitable for my
> > > needs. To test the receive side of virtio-net, I use Qemu to set up
> > > virtio-net in the guest and vhost-net on the host side. The sending
> > > side is in the host and the receiving side is in the guest, so I can't
> > > figure out how to do that with xskxceiver.
> > I see - couldn't the python side be executing xdpsock then instead of your
> > own app?
> 
> I'm not aware of xdpsock. Could you give the path to that file?

https://github.com/xdp-project/bpf-examples/tree/main/AF_XDP-example

this is our go-to app for the userspace side of AF_XDP.

> 
> > I wouldn't like to end up with several xsk tools for testing data path on
> > different environments.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-05-29 12:28     ` Bui Quang Minh
@ 2025-06-03  2:56       ` Jason Wang
  2025-06-03  6:04         ` Lei Yang
  2025-06-03 14:22         ` Bui Quang Minh
  0 siblings, 2 replies; 18+ messages in thread
From: Jason Wang @ 2025-06-03  2:56 UTC (permalink / raw)
  To: Bui Quang Minh
  Cc: netdev, Michael S. Tsirkin, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On Thu, May 29, 2025 at 8:28 PM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>
> On 5/29/25 12:59, Jason Wang wrote:
> > On Wed, May 28, 2025 at 12:19 AM Bui Quang Minh
> > <minhquangbui99@gmail.com> wrote:
> >> Currently, in zerocopy mode with mergeable receive buffer, virtio-net
> >> does not support multi buffer but a single buffer only. This commit adds
> >> support for multi mergeable receive buffer in the zerocopy XDP path by
> >> utilizing XDP buffer with frags.
> >>
> >> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> >> ---
> >>   drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
> >>   1 file changed, 66 insertions(+), 57 deletions(-)
> >>
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index e53ba600605a..a9558650f205 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
> >>   #define VIRTIO_XDP_TX          BIT(0)
> >>   #define VIRTIO_XDP_REDIR       BIT(1)
> >>
> >> +#define VIRTNET_MAX_ZC_SEGS    8
> >> +
> >>   /* RX packet size EWMA. The average packet size is used to determine the packet
> >>    * buffer size when refilling RX rings. As the entire RX ring may be refilled
> >>    * at once, the weight is chosen so that the EWMA will be insensitive to short-
> >> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
> >>          }
> >>   }
> >>
> >> -static int xsk_append_merge_buffer(struct virtnet_info *vi,
> >> -                                  struct receive_queue *rq,
> >> -                                  struct sk_buff *head_skb,
> >> -                                  u32 num_buf,
> >> -                                  struct virtio_net_hdr_mrg_rxbuf *hdr,
> >> -                                  struct virtnet_rq_stats *stats)
> >> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
> >> +                                     struct receive_queue *rq,
> >> +                                     u32 num_buf,
> >> +                                     struct xdp_buff *xdp,
> >> +                                     struct virtnet_rq_stats *stats)
> >>   {
> >> -       struct sk_buff *curr_skb;
> >> -       struct xdp_buff *xdp;
> >> -       u32 len, truesize;
> >> -       struct page *page;
> >> +       unsigned int len;
> >>          void *buf;
> >>
> >> -       curr_skb = head_skb;
> >> +       if (num_buf < 2)
> >> +               return 0;
> >> +
> >> +       while (num_buf > 1) {
> >> +               struct xdp_buff *new_xdp;
> >>
> >> -       while (--num_buf) {
> >>                  buf = virtqueue_get_buf(rq->vq, &len);
> >> -               if (unlikely(!buf)) {
> >> -                       pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >> -                                vi->dev->name, num_buf,
> >> -                                virtio16_to_cpu(vi->vdev,
> >> -                                                hdr->num_buffers));
> >> +               if (unlikely(!buf)) {
> >> +                       pr_debug("%s: rx error: %d buffers missing\n",
> >> +                                vi->dev->name, num_buf);
> >>                          DEV_STATS_INC(vi->dev, rx_length_errors);
> >> -                       return -EINVAL;
> >> -               }
> >> -
> >> -               u64_stats_add(&stats->bytes, len);
> >> -
> >> -               xdp = buf_to_xdp(vi, rq, buf, len);
> >> -               if (!xdp)
> >> -                       goto err;
> >> -
> >> -               buf = napi_alloc_frag(len);
> >> -               if (!buf) {
> >> -                       xsk_buff_free(xdp);
> >> -                       goto err;
> >> +                       return -1;
> >>                  }
> >>
> >> -               memcpy(buf, xdp->data - vi->hdr_len, len);
> >> -
> >> -               xsk_buff_free(xdp);
> >> +               new_xdp = buf_to_xdp(vi, rq, buf, len);
> >> +               if (!new_xdp)
> >> +                       goto drop_bufs;
> >>
> >> -               page = virt_to_page(buf);
> >> +               /* In virtnet_add_recvbuf_xsk(), we ask the host to fill from
> >> +                * xdp->data - vi->hdr_len with both virtio_net_hdr and data.
> >> +                * However, only the first packet has the virtio_net_hdr, the
> >> +                * following ones do not. So we need to adjust the following
> > Typo here.
>
> I'm sorry, could you clarify which word contains the typo?
>
> >
> >> +                * packets' data pointer to the correct place.
> >> +                */
> > I wonder what happens if we don't use this trick? I mean, what if we
> > don't reuse the header room for the virtio-net header? This seems to be
> > fine for a mergeable buffer and can help to avoid the trick.
>
> I don't think reusing the header room for the virtio-net header is
> what creates this case handling. In my opinion, it comes from a slight
> difference in the recvbuf layout between single buffer and
> multi-buffer. When we have n single-buffer packets, each buffer has
> its own virtio-net header. But when we have 1 multi-buffer packet
> (which spans n buffers), only the first buffer has a virtio-net
> header; the following buffers do not.
>
> There are 2 important pointers here: the pointer we announce to the
> vhost side to fill with data, let's call it announced_addr, and
> xdp_buff->data, which is expected to point to the start of the
> Ethernet frame. Currently,
>
>      announced_addr = xdp_buff->data - hdr_len
>
> The host side writes the virtio-net header to announced_addr, then the
> Ethernet frame's data, in the first buffer. In case of a multi-buffer
> packet, in the following buffers, the host side writes the Ethernet
> frame's data to announced_addr with no virtio-net header. So in
> virtio-net, we need to subtract hdr_len from xdp_buff->data;
> otherwise, we lose some of the Ethernet frame's data.
>
> I think a slightly better solution is to set announced_addr =
> xdp_buff->data; then we only need to do xdp_buff->data += hdr_len for
> the first buffer and do not need to adjust xdp_buff->data of the
> following buffers.

Exactly my point.

>
> >
> >> +               new_xdp->data -= vi->hdr_len;
> >> +               new_xdp->data_end = new_xdp->data + len;
> >>
> >> -               truesize = len;
> >> +               if (!xsk_buff_add_frag(xdp, new_xdp))
> >> +                       goto drop_bufs;
> >>
> >> -               curr_skb  = virtnet_skb_append_frag(head_skb, curr_skb, page,
> >> -                                                   buf, len, truesize);
> >> -               if (!curr_skb) {
> >> -                       put_page(page);
> >> -                       goto err;
> >> -               }
> >> +               num_buf--;
> >>          }
> >>
> >>          return 0;
> >>
> >> -err:
> >> +drop_bufs:
> >>          xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
> >> -       return -EINVAL;
> >> +       return -1;
> >>   }
> >>
> >>   static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
> >> @@ -1307,23 +1297,42 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
> >>          num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
> >>
> >>          ret = XDP_PASS;
> >> +       if (virtnet_build_xsk_buff_mrg(vi, rq, num_buf, xdp, stats))
> >> +               goto drop;
> >> +
> >>          rcu_read_lock();
> >>          prog = rcu_dereference(rq->xdp_prog);
> >> -       /* TODO: support multi buffer. */
> >> -       if (prog && num_buf == 1)
> >> -               ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
> > Without this patch it looks like we had a bug:
> >
> >          ret = XDP_PASS;
> >          rcu_read_lock();
> >          prog = rcu_dereference(rq->xdp_prog);
> >          /* TODO: support multi buffer. */
> >          if (prog && num_buf == 1)
> >                  ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
> >          rcu_read_unlock();
> >
> > This implies if num_buf is greater than 1, we will assume XDP_PASS?
>
> Yes, I think XDP_DROP should be returned in that case.

Care to post a patch and cc stable?

>
> >
> >> +       if (prog) {
> >> +               /* We are in zerocopy mode so we cannot copy the multi-buffer
> >> +                * xdp buff to a single linear xdp buff. If we do so, in case
> >> +                * the BPF program decides to redirect to a XDP socket (XSK),
> >> +                * it will trigger the zerocopy receive logic in XDP socket.
> >> +                * The receive logic thinks it receives zerocopy buffer while
> >> +                * in fact, it is the copy one and everything is messed up.
> >> +                * So just drop the packet here if we have a multi-buffer xdp
> >> +                * buff and the BPF program does not support it.
> >> +                */
> >> +               if (xdp_buff_has_frags(xdp) && !prog->aux->xdp_has_frags)
> >> +                       ret = XDP_DROP;
> > Could we move the check before trying to build a multi-buffer XDP buff?
>
> Yes, I'll fix this in the next version.
>
> >
> >> +               else
> >> +                       ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
> >> +                                                 stats);
> >> +       }
> >>          rcu_read_unlock();
> >>
> >>          switch (ret) {
> >>          case XDP_PASS:
> >> -               skb = xsk_construct_skb(rq, xdp);
> >> +               skb = xdp_build_skb_from_zc(xdp);
> > Is this better to make this change a separate patch?
>
> Okay, I'll create a separate patch to convert the current XDP_PASS
> handler to use the xdp_build_skb_from_zc() helper.

That would be better.

>
> >
> >>                  if (!skb)
> >> -                       goto drop_bufs;
> >> +                       break;
> >>
> >> -               if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
> >> -                       dev_kfree_skb(skb);
> >> -                       goto drop;
> >> -               }
> >> +               /* Later, in virtnet_receive_done(), eth_type_trans()
> >> +                * is called. However, in xdp_build_skb_from_zc(), it is called
> >> +                * already. As a result, we need to reset the data to before
> >> +                * the mac header so that the later call in
> >> +                * virtnet_receive_done() works correctly.
> >> +                */
> >> +               skb_push(skb, ETH_HLEN);
> >>
> >>                  return skb;
> >>
> >> @@ -1332,14 +1341,11 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
> >>                  return NULL;
> >>
> >>          default:
> >> -               /* drop packet */
> >> -               xsk_buff_free(xdp);
> >> +               break;
> >>          }
> >>
> >> -drop_bufs:
> >> -       xsk_drop_follow_bufs(dev, rq, num_buf, stats);
> >> -
> >>   drop:
> >> +       xsk_buff_free(xdp);
> >>          u64_stats_inc(&stats->drops);
> >>          return NULL;
> >>   }
> >> @@ -1396,6 +1402,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
> >>                  return -ENOMEM;
> >>
> >>          len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
> >> +       /* Reserve some space for skb_shared_info */
> >> +       len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >>
> >>          for (i = 0; i < num; ++i) {
> >>                  /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
> >> @@ -6734,6 +6742,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>          dev->netdev_ops = &virtnet_netdev;
> >>          dev->stat_ops = &virtnet_stat_ops;
> >>          dev->features = NETIF_F_HIGHDMA;
> >> +       dev->xdp_zc_max_segs = VIRTNET_MAX_ZC_SEGS;
> >>
> >>          dev->ethtool_ops = &virtnet_ethtool_ops;
> >>          SET_NETDEV_DEV(dev, &vdev->dev);
> >> --
> >> 2.43.0
> >>
> > Thanks
> >
>

Thanks


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-06-03  2:56       ` Jason Wang
@ 2025-06-03  6:04         ` Lei Yang
  2025-06-03 14:22         ` Bui Quang Minh
  1 sibling, 0 replies; 18+ messages in thread
From: Lei Yang @ 2025-06-03  6:04 UTC (permalink / raw)
  To: Bui Quang Minh
  Cc: netdev, Michael S. Tsirkin, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, Jason Wang, John Fastabend,
	virtualization, linux-kernel, bpf

Tested this patch with virtio-net regression tests; everything works fine.

Tested-by: Lei Yang <leiyang@redhat.com>

On Tue, Jun 3, 2025 at 10:57 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, May 29, 2025 at 8:28 PM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >
> > On 5/29/25 12:59, Jason Wang wrote:
> > > On Wed, May 28, 2025 at 12:19 AM Bui Quang Minh
> > > <minhquangbui99@gmail.com> wrote:
> > >> Currently, in zerocopy mode with mergeable receive buffer, virtio-net
> > >> does not support multi buffer but a single buffer only. This commit adds
> > >> support for multi mergeable receive buffer in the zerocopy XDP path by
> > >> utilizing XDP buffer with frags.
> > >>
> > >> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> > >> ---
> > >>   drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
> > >>   1 file changed, 66 insertions(+), 57 deletions(-)
> > >>
> > >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > >> index e53ba600605a..a9558650f205 100644
> > >> --- a/drivers/net/virtio_net.c
> > >> +++ b/drivers/net/virtio_net.c
> > >> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
> > >>   #define VIRTIO_XDP_TX          BIT(0)
> > >>   #define VIRTIO_XDP_REDIR       BIT(1)
> > >>
> > >> +#define VIRTNET_MAX_ZC_SEGS    8
> > >> +
> > >>   /* RX packet size EWMA. The average packet size is used to determine the packet
> > >>    * buffer size when refilling RX rings. As the entire RX ring may be refilled
> > >>    * at once, the weight is chosen so that the EWMA will be insensitive to short-
> > >> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
> > >>          }
> > >>   }
> > >>
> > >> -static int xsk_append_merge_buffer(struct virtnet_info *vi,
> > >> -                                  struct receive_queue *rq,
> > >> -                                  struct sk_buff *head_skb,
> > >> -                                  u32 num_buf,
> > >> -                                  struct virtio_net_hdr_mrg_rxbuf *hdr,
> > >> -                                  struct virtnet_rq_stats *stats)
> > >> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
> > >> +                                     struct receive_queue *rq,
> > >> +                                     u32 num_buf,
> > >> +                                     struct xdp_buff *xdp,
> > >> +                                     struct virtnet_rq_stats *stats)
> > >>   {
> > >> -       struct sk_buff *curr_skb;
> > >> -       struct xdp_buff *xdp;
> > >> -       u32 len, truesize;
> > >> -       struct page *page;
> > >> +       unsigned int len;
> > >>          void *buf;
> > >>
> > >> -       curr_skb = head_skb;
> > >> +       if (num_buf < 2)
> > >> +               return 0;
> > >> +
> > >> +       while (num_buf > 1) {
> > >> +               struct xdp_buff *new_xdp;
> > >>
> > >> -       while (--num_buf) {
> > >>                  buf = virtqueue_get_buf(rq->vq, &len);
> > >> -               if (unlikely(!buf)) {
> > >> -                       pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >> -                                vi->dev->name, num_buf,
> > >> -                                virtio16_to_cpu(vi->vdev,
> > >> -                                                hdr->num_buffers));
> >> +               if (unlikely(!buf)) {
> > >> +                       pr_debug("%s: rx error: %d buffers missing\n",
> > >> +                                vi->dev->name, num_buf);
> > >>                          DEV_STATS_INC(vi->dev, rx_length_errors);
> > >> -                       return -EINVAL;
> > >> -               }
> > >> -
> > >> -               u64_stats_add(&stats->bytes, len);
> > >> -
> > >> -               xdp = buf_to_xdp(vi, rq, buf, len);
> > >> -               if (!xdp)
> > >> -                       goto err;
> > >> -
> > >> -               buf = napi_alloc_frag(len);
> > >> -               if (!buf) {
> > >> -                       xsk_buff_free(xdp);
> > >> -                       goto err;
> > >> +                       return -1;
> > >>                  }
> > >>
> > >> -               memcpy(buf, xdp->data - vi->hdr_len, len);
> > >> -
> > >> -               xsk_buff_free(xdp);
> > >> +               new_xdp = buf_to_xdp(vi, rq, buf, len);
> > >> +               if (!new_xdp)
> > >> +                       goto drop_bufs;
> > >>
> > >> -               page = virt_to_page(buf);
> > >> +               /* In virtnet_add_recvbuf_xsk(), we ask the host to fill from
> > >> +                * xdp->data - vi->hdr_len with both virtio_net_hdr and data.
> > >> +                * However, only the first packet has the virtio_net_hdr, the
> > >> +                * following ones do not. So we need to adjust the following
> > > Typo here.
> >
> > I'm sorry, could you clarify which word contains the typo?
> >
> > >
> > >> +                * packets' data pointer to the correct place.
> > >> +                */
> > > I wonder what happens if we don't use this trick? I meant we don't
> > > reuse the header room for the virtio-net header. This seems to be fine
> > > for a mergeable buffer and can help to reduce the trick.
> >
> > I don't think using the header room for the virtio-net header creates this
> > case handling. In my opinion, it comes from the slight difference in
> > the recvbuf between single buffer and multi-buffer. When we have n
> > single-buffer packets, each buffer will have its own virtio-net header.
> > But when we have 1 multi-buffer packet (which spans across n buffers),
> > only the first buffer has a virtio-net header; the following buffers do not.
> >
> > There are 2 important pointers here. The pointer we announce to the vhost
> > side to fill the data, let's call it announced_addr, and xdp_buff->data,
> > which is expected to point to the start of the Ethernet frame. Currently,
> >
> >      announced_addr = xdp_buff->data - hdr_len
> >
> > The host side will write the virtio-net header to announced_addr, then
> > the Ethernet frame's data, in the first buffer. In case of a multi-buffer
> > packet, in the following buffers, the host side writes the Ethernet
> > frame's data to announced_addr with no virtio-net header. So in
> > virtio-net, we need to subtract hdr_len from xdp_buff->data, otherwise
> > we lose some of the Ethernet frame's data.
> >
> > I think a slightly better solution is to set announced_addr =
> > xdp_buff->data; then we only need to do xdp_buff->data += hdr_len for the
> > first buffer and do not need to adjust xdp_buff->data of the following
> > buffers.
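
A sketch of that alternative (assumed shape, not the posted code), where
the address given to the device is xdp_buff->data itself and only the
head buffer needs its data pointer moved:

        /* Refill path: post xdp->data directly, so the device writes
         * the virtio-net header (head buffer) or the Ethernet frame
         * (follow-up buffers) starting at that address.
         */
        announced_addr = xdp->data;

        /* Receive path, head buffer only: step over the header. */
        hdr = (struct virtio_net_hdr_mrg_rxbuf *)xdp->data;
        xdp->data += vi->hdr_len;

        /* Follow-up buffers already start at frame data, so their
         * xdp->data needs no adjustment.
         */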
>
> Exactly my point.
>
> >
> > >
> > >> +               new_xdp->data -= vi->hdr_len;
> > >> +               new_xdp->data_end = new_xdp->data + len;
> > >>
> > >> -               truesize = len;
> > >> +               if (!xsk_buff_add_frag(xdp, new_xdp))
> > >> +                       goto drop_bufs;
> > >>
> > >> -               curr_skb  = virtnet_skb_append_frag(head_skb, curr_skb, page,
> > >> -                                                   buf, len, truesize);
> > >> -               if (!curr_skb) {
> > >> -                       put_page(page);
> > >> -                       goto err;
> > >> -               }
> > >> +               num_buf--;
> > >>          }
> > >>
> > >>          return 0;
> > >>
> > >> -err:
> > >> +drop_bufs:
> > >>          xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
> > >> -       return -EINVAL;
> > >> +       return -1;
> > >>   }
> > >>
> > >>   static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
> > >> @@ -1307,23 +1297,42 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
> > >>          num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
> > >>
> > >>          ret = XDP_PASS;
> > >> +       if (virtnet_build_xsk_buff_mrg(vi, rq, num_buf, xdp, stats))
> > >> +               goto drop;
> > >> +
> > >>          rcu_read_lock();
> > >>          prog = rcu_dereference(rq->xdp_prog);
> > >> -       /* TODO: support multi buffer. */
> > >> -       if (prog && num_buf == 1)
> > >> -               ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
> > > Without this patch it looks like we had a bug:
> > >
> > >          ret = XDP_PASS;
> > >          rcu_read_lock();
> > >          prog = rcu_dereference(rq->xdp_prog);
> > >          /* TODO: support multi buffer. */
> > >          if (prog && num_buf == 1)
> > >                  ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
> > >          rcu_read_unlock();
> > >
> > > This implies if num_buf is greater than 1, we will assume XDP_PASS?
> >
> > Yes, I think XDP_DROP should be returned in that case.
>
> Care to post a patch and cc stable?
>
> >
> > >
> > >> +       if (prog) {
> > >> +               /* We are in zerocopy mode so we cannot copy the multi-buffer
> > >> +                * xdp buff to a single linear xdp buff. If we do so, in case
> > >> +                * the BPF program decides to redirect to an XDP socket (XSK),
> > >> +                * it will trigger the zerocopy receive logic in the XDP socket.
> > >> +                * The receive logic thinks it has received a zerocopy buffer
> > >> +                * while, in fact, it is a copied one and everything gets
> > >> +                * messed up. So just drop the packet here if we have a
> > >> +                * multi-buffer xdp buff and the BPF program does not support it.
> > >> +                */
> > >> +               if (xdp_buff_has_frags(xdp) && !prog->aux->xdp_has_frags)
> > >> +                       ret = XDP_DROP;
> > > Could we move the check before trying to build a multi-buffer XDP buff?
> >
> > Yes, I'll fix this in the next version.
> >
> > >
> > >> +               else
> > >> +                       ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
> > >> +                                                 stats);
> > >> +       }
> > >>          rcu_read_unlock();
> > >>
> > >>          switch (ret) {
> > >>          case XDP_PASS:
> > >> -               skb = xsk_construct_skb(rq, xdp);
> > >> +               skb = xdp_build_skb_from_zc(xdp);
> > > Is this better to make this change a separate patch?
> >
> > Okay, I'll create a separate patch to convert the current XDP_PASS
> > handler to use xdp_build_skb_from_zc helper.
>
> That would be better.
>
> >
> > >
> > >>                  if (!skb)
> > >> -                       goto drop_bufs;
> > >> +                       break;
> > >>
> > >> -               if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
> > >> -                       dev_kfree_skb(skb);
> > >> -                       goto drop;
> > >> -               }
> > >> +               /* Later, in virtnet_receive_done(), eth_type_trans()
> > >> +                * is called. However, xdp_build_skb_from_zc() has already
> > >> +                * called it. As a result, we need to reset the data to before
> > >> +                * the MAC header so that the later call in
> > >> +                * virtnet_receive_done() works correctly.
> > >> +                */
> > >> +               skb_push(skb, ETH_HLEN);
> > >>
> > >>                  return skb;
> > >>
> > >> @@ -1332,14 +1341,11 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
> > >>                  return NULL;
> > >>
> > >>          default:
> > >> -               /* drop packet */
> > >> -               xsk_buff_free(xdp);
> > >> +               break;
> > >>          }
> > >>
> > >> -drop_bufs:
> > >> -       xsk_drop_follow_bufs(dev, rq, num_buf, stats);
> > >> -
> > >>   drop:
> > >> +       xsk_buff_free(xdp);
> > >>          u64_stats_inc(&stats->drops);
> > >>          return NULL;
> > >>   }
> > >> @@ -1396,6 +1402,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
> > >>                  return -ENOMEM;
> > >>
> > >>          len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
> > >> +       /* Reserve some space for skb_shared_info */
> > >> +       len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > >>
> > >>          for (i = 0; i < num; ++i) {
> > >>                  /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
> > >> @@ -6734,6 +6742,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> > >>          dev->netdev_ops = &virtnet_netdev;
> > >>          dev->stat_ops = &virtnet_stat_ops;
> > >>          dev->features = NETIF_F_HIGHDMA;
> > >> +       dev->xdp_zc_max_segs = VIRTNET_MAX_ZC_SEGS;
> > >>
> > >>          dev->ethtool_ops = &virtnet_ethtool_ops;
> > >>          SET_NETDEV_DEV(dev, &vdev->dev);
> > >> --
> > >> 2.43.0
> > >>
> > > Thanks
> > >
> >
>
> Thanks
>
>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net
  2025-06-02 15:55           ` Maciej Fijalkowski
@ 2025-06-03 14:18             ` Bui Quang Minh
  0 siblings, 0 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-06-03 14:18 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 6/2/25 22:55, Maciej Fijalkowski wrote:
> On Sat, May 31, 2025 at 03:51:57PM +0700, Bui Quang Minh wrote:
>> On 5/30/25 18:45, Maciej Fijalkowski wrote:
>>> On Thu, May 29, 2025 at 09:29:14PM +0700, Bui Quang Minh wrote:
>>>> On 5/29/25 18:18, Maciej Fijalkowski wrote:
>>>>> On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
>>>>>> This adds a test to test the virtio-net rx when there is a XDP socket
>>>>>> bound to it. There are tests for both copy mode and zerocopy mode, both
>>>>>> cases when XDP program returns XDP_PASS and XDP_REDIRECT to a XDP socket.
>>>>>>
>>>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>>>> Hi Bui,
>>>>>
>>>>> have you considered adjusting xskxceiver for your needs? If yes and you
>>>>> decided to go with another test app then what were the issues around it?
>>>>>
>>>>> This is yet another approach for xsk testing where we already have a
>>>>> test framework.
>>>> Hi,
>>>>
>>>> I haven't tried very hard to adapt xskxceiver. I did have a look at
>>>> xskxceiver, but I felt the supported topology is not suitable for my
>>>> needs. To test the receiving side in virtio-net, I use Qemu to set up
>>>> virtio-net in the guest and vhost-net on the host side. The sending
>>>> side is in the host and the receiving side is in the guest, so I can't
>>>> figure out how to do that with xskxceiver.
>>> I see - couldn't the python side be executing xdpsock then instead of your
>>> own app?
>> I'm not aware of xdpsock. Could you give the path to that file?
> https://github.com/xdp-project/bpf-examples/tree/main/AF_XDP-example
>
> this is our go-to app side of AF_XDP.

Thanks, I'll take a look at it and try to use it for the selftest if
possible in the next version.

Quang Minh.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH net-next v2 1/2] virtio-net: support zerocopy multi buffer XDP in mergeable
  2025-06-03  2:56       ` Jason Wang
  2025-06-03  6:04         ` Lei Yang
@ 2025-06-03 14:22         ` Bui Quang Minh
  1 sibling, 0 replies; 18+ messages in thread
From: Bui Quang Minh @ 2025-06-03 14:22 UTC (permalink / raw)
  To: Jason Wang
  Cc: netdev, Michael S. Tsirkin, Xuan Zhuo, Eugenio Pérez,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization,
	linux-kernel, bpf

On 6/3/25 09:56, Jason Wang wrote:
> On Thu, May 29, 2025 at 8:28 PM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>> On 5/29/25 12:59, Jason Wang wrote:
>>> On Wed, May 28, 2025 at 12:19 AM Bui Quang Minh
>>> <minhquangbui99@gmail.com> wrote:
>>>> Currently, in zerocopy mode with mergeable receive buffer, virtio-net
>>>> does not support multi buffer but a single buffer only. This commit adds
>>>> support for multi mergeable receive buffer in the zerocopy XDP path by
>>>> utilizing XDP buffer with frags.
>>>>
>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>>> ---
>>>>    drivers/net/virtio_net.c | 123 +++++++++++++++++++++------------------
>>>>    1 file changed, 66 insertions(+), 57 deletions(-)
>>>>
>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>> index e53ba600605a..a9558650f205 100644
>>>> --- a/drivers/net/virtio_net.c
>>>> +++ b/drivers/net/virtio_net.c
>>>> @@ -45,6 +45,8 @@ module_param(napi_tx, bool, 0644);
>>>>    #define VIRTIO_XDP_TX          BIT(0)
>>>>    #define VIRTIO_XDP_REDIR       BIT(1)
>>>>
>>>> +#define VIRTNET_MAX_ZC_SEGS    8
>>>> +
>>>>    /* RX packet size EWMA. The average packet size is used to determine the packet
>>>>     * buffer size when refilling RX rings. As the entire RX ring may be refilled
>>>>     * at once, the weight is chosen so that the EWMA will be insensitive to short-
>>>> @@ -1232,65 +1234,53 @@ static void xsk_drop_follow_bufs(struct net_device *dev,
>>>>           }
>>>>    }
>>>>
>>>> -static int xsk_append_merge_buffer(struct virtnet_info *vi,
>>>> -                                  struct receive_queue *rq,
>>>> -                                  struct sk_buff *head_skb,
>>>> -                                  u32 num_buf,
>>>> -                                  struct virtio_net_hdr_mrg_rxbuf *hdr,
>>>> -                                  struct virtnet_rq_stats *stats)
>>>> +static int virtnet_build_xsk_buff_mrg(struct virtnet_info *vi,
>>>> +                                     struct receive_queue *rq,
>>>> +                                     u32 num_buf,
>>>> +                                     struct xdp_buff *xdp,
>>>> +                                     struct virtnet_rq_stats *stats)
>>>>    {
>>>> -       struct sk_buff *curr_skb;
>>>> -       struct xdp_buff *xdp;
>>>> -       u32 len, truesize;
>>>> -       struct page *page;
>>>> +       unsigned int len;
>>>>           void *buf;
>>>>
>>>> -       curr_skb = head_skb;
>>>> +       if (num_buf < 2)
>>>> +               return 0;
>>>> +
>>>> +       while (num_buf > 1) {
>>>> +               struct xdp_buff *new_xdp;
>>>>
>>>> -       while (--num_buf) {
>>>>                   buf = virtqueue_get_buf(rq->vq, &len);
>>>> -               if (unlikely(!buf)) {
>>>> -                       pr_debug("%s: rx error: %d buffers out of %d missing\n",
>>>> -                                vi->dev->name, num_buf,
>>>> -                                virtio16_to_cpu(vi->vdev,
>>>> -                                                hdr->num_buffers));
>>>> +               if (unlikely(!buf)) {
>>>> +                       pr_debug("%s: rx error: %d buffers missing\n",
>>>> +                                vi->dev->name, num_buf);
>>>>                           DEV_STATS_INC(vi->dev, rx_length_errors);
>>>> -                       return -EINVAL;
>>>> -               }
>>>> -
>>>> -               u64_stats_add(&stats->bytes, len);
>>>> -
>>>> -               xdp = buf_to_xdp(vi, rq, buf, len);
>>>> -               if (!xdp)
>>>> -                       goto err;
>>>> -
>>>> -               buf = napi_alloc_frag(len);
>>>> -               if (!buf) {
>>>> -                       xsk_buff_free(xdp);
>>>> -                       goto err;
>>>> +                       return -1;
>>>>                   }
>>>>
>>>> -               memcpy(buf, xdp->data - vi->hdr_len, len);
>>>> -
>>>> -               xsk_buff_free(xdp);
>>>> +               new_xdp = buf_to_xdp(vi, rq, buf, len);
>>>> +               if (!new_xdp)
>>>> +                       goto drop_bufs;
>>>>
>>>> -               page = virt_to_page(buf);
>>>> +               /* In virtnet_add_recvbuf_xsk(), we ask the host to fill from
>>>> +                * xdp->data - vi->hdr_len with both virtio_net_hdr and data.
>>>> +                * However, only the first packet has the virtio_net_hdr, the
>>>> +                * following ones do not. So we need to adjust the following
>>> Typo here.
>> I'm sorry, could you clarify which word contains the typo?
>>
>>>> +                * packets' data pointer to the correct place.
>>>> +                */
>>> I wonder what happens if we don't use this trick? I meant we don't
>>> reuse the header room for the virtio-net header. This seems to be fine
>>> for a mergeable buffer and can help to reduce the trick.
>> I don't think using the header room for the virtio-net header creates this
>> case handling. In my opinion, it comes from the slight difference in
>> the recvbuf between single buffer and multi-buffer. When we have n
>> single-buffer packets, each buffer will have its own virtio-net header.
>> But when we have 1 multi-buffer packet (which spans across n buffers),
>> only the first buffer has a virtio-net header; the following buffers do not.
>>
>> There are 2 important pointers here. The pointer we announce to the vhost
>> side to fill the data, let's call it announced_addr, and xdp_buff->data,
>> which is expected to point to the start of the Ethernet frame. Currently,
>>
>>       announced_addr = xdp_buff->data - hdr_len
>>
>> The host side will write the virtio-net header to announced_addr, then
>> the Ethernet frame's data, in the first buffer. In case of a multi-buffer
>> packet, in the following buffers, the host side writes the Ethernet
>> frame's data to announced_addr with no virtio-net header. So in
>> virtio-net, we need to subtract hdr_len from xdp_buff->data, otherwise
>> we lose some of the Ethernet frame's data.
>>
>> I think a slightly better solution is to set announced_addr =
>> xdp_buff->data; then we only need to do xdp_buff->data += hdr_len for the
>> first buffer and do not need to adjust xdp_buff->data of the following
>> buffers.
> Exactly my point.
>
>>>> +               new_xdp->data -= vi->hdr_len;
>>>> +               new_xdp->data_end = new_xdp->data + len;
>>>>
>>>> -               truesize = len;
>>>> +               if (!xsk_buff_add_frag(xdp, new_xdp))
>>>> +                       goto drop_bufs;
>>>>
>>>> -               curr_skb  = virtnet_skb_append_frag(head_skb, curr_skb, page,
>>>> -                                                   buf, len, truesize);
>>>> -               if (!curr_skb) {
>>>> -                       put_page(page);
>>>> -                       goto err;
>>>> -               }
>>>> +               num_buf--;
>>>>           }
>>>>
>>>>           return 0;
>>>>
>>>> -err:
>>>> +drop_bufs:
>>>>           xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats);
>>>> -       return -EINVAL;
>>>> +       return -1;
>>>>    }
>>>>
>>>>    static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi,
>>>> @@ -1307,23 +1297,42 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
>>>>           num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
>>>>
>>>>           ret = XDP_PASS;
>>>> +       if (virtnet_build_xsk_buff_mrg(vi, rq, num_buf, xdp, stats))
>>>> +               goto drop;
>>>> +
>>>>           rcu_read_lock();
>>>>           prog = rcu_dereference(rq->xdp_prog);
>>>> -       /* TODO: support multi buffer. */
>>>> -       if (prog && num_buf == 1)
>>>> -               ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
>>> Without this patch it looks like we had a bug:
>>>
>>>           ret = XDP_PASS;
>>>           rcu_read_lock();
>>>           prog = rcu_dereference(rq->xdp_prog);
>>>           /* TODO: support multi buffer. */
>>>           if (prog && num_buf == 1)
>>>                   ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats);
>>>           rcu_read_unlock();
>>>
>>> This implies if num_buf is greater than 1, we will assume XDP_PASS?
>> Yes, I think XDP_DROP should be returned in that case.
> Care to post a patch and cc stable?

Okay, I'll submit a patch shortly.

Thanks,
Quang Minh.

>
>>>> +       if (prog) {
>>>> +               /* We are in zerocopy mode so we cannot copy the multi-buffer
>>>> +                * xdp buff to a single linear xdp buff. If we do so, in case
>>>> +                * the BPF program decides to redirect to an XDP socket (XSK),
>>>> +                * it will trigger the zerocopy receive logic in the XDP socket.
>>>> +                * The receive logic thinks it has received a zerocopy buffer
>>>> +                * while, in fact, it is a copied one and everything gets
>>>> +                * messed up. So just drop the packet here if we have a
>>>> +                * multi-buffer xdp buff and the BPF program does not support it.
>>>> +                */
>>>> +               if (xdp_buff_has_frags(xdp) && !prog->aux->xdp_has_frags)
>>>> +                       ret = XDP_DROP;
>>> Could we move the check before trying to build a multi-buffer XDP buff?
>> Yes, I'll fix this in the next version.
>>
>>>> +               else
>>>> +                       ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
>>>> +                                                 stats);
>>>> +       }
>>>>           rcu_read_unlock();
>>>>
>>>>           switch (ret) {
>>>>           case XDP_PASS:
>>>> -               skb = xsk_construct_skb(rq, xdp);
>>>> +               skb = xdp_build_skb_from_zc(xdp);
>>> Is this better to make this change a separate patch?
>> Okay, I'll create a separate patch to convert the current XDP_PASS
>> handler to use xdp_build_skb_from_zc helper.
> That would be better.
>
>>>>                   if (!skb)
>>>> -                       goto drop_bufs;
>>>> +                       break;
>>>>
>>>> -               if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) {
>>>> -                       dev_kfree_skb(skb);
>>>> -                       goto drop;
>>>> -               }
>>>> +               /* Later, in virtnet_receive_done(), eth_type_trans()
>>>> +                * is called. However, xdp_build_skb_from_zc() has already
>>>> +                * called it. As a result, we need to reset the data to before
>>>> +                * the MAC header so that the later call in
>>>> +                * virtnet_receive_done() works correctly.
>>>> +                */
>>>> +               skb_push(skb, ETH_HLEN);
>>>>
>>>>                   return skb;
>>>>
>>>> @@ -1332,14 +1341,11 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct
>>>>                   return NULL;
>>>>
>>>>           default:
>>>> -               /* drop packet */
>>>> -               xsk_buff_free(xdp);
>>>> +               break;
>>>>           }
>>>>
>>>> -drop_bufs:
>>>> -       xsk_drop_follow_bufs(dev, rq, num_buf, stats);
>>>> -
>>>>    drop:
>>>> +       xsk_buff_free(xdp);
>>>>           u64_stats_inc(&stats->drops);
>>>>           return NULL;
>>>>    }
>>>> @@ -1396,6 +1402,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
>>>>                   return -ENOMEM;
>>>>
>>>>           len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
>>>> +       /* Reserve some space for skb_shared_info */
>>>> +       len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>>>>
>>>>           for (i = 0; i < num; ++i) {
>>>>                   /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space.
>>>> @@ -6734,6 +6742,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>>>>           dev->netdev_ops = &virtnet_netdev;
>>>>           dev->stat_ops = &virtnet_stat_ops;
>>>>           dev->features = NETIF_F_HIGHDMA;
>>>> +       dev->xdp_zc_max_segs = VIRTNET_MAX_ZC_SEGS;
>>>>
>>>>           dev->ethtool_ops = &virtnet_ethtool_ops;
>>>>           SET_NETDEV_DEV(dev, &vdev->dev);
>>>> --
>>>> 2.43.0
>>>>
>>> Thanks
>>>
> Thanks
>


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2025-06-03 14:22 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-27 16:19 [RFC PATCH net-next v2 0/2] virtio-net: support zerocopy multi buffer XDP in mergeable Bui Quang Minh
2025-05-27 16:19 ` [RFC PATCH net-next v2 1/2] " Bui Quang Minh
2025-05-28 16:44   ` ALOK TIWARI
2025-05-29  3:42     ` Bui Quang Minh
2025-05-29  5:59   ` Jason Wang
2025-05-29 12:28     ` Bui Quang Minh
2025-06-03  2:56       ` Jason Wang
2025-06-03  6:04         ` Lei Yang
2025-06-03 14:22         ` Bui Quang Minh
2025-05-27 16:19 ` [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net Bui Quang Minh
2025-05-28 17:04   ` ALOK TIWARI
2025-05-29  3:44     ` Bui Quang Minh
2025-05-29 11:18   ` Maciej Fijalkowski
2025-05-29 14:29     ` Bui Quang Minh
2025-05-30 11:45       ` Maciej Fijalkowski
2025-05-31  8:51         ` Bui Quang Minh
2025-06-02 15:55           ` Maciej Fijalkowski
2025-06-03 14:18             ` Bui Quang Minh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).