* [PULL V2 01/17] tap: Add USO support to tap device.
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 02/17] tap: Add check for USO features Jason Wang
` (16 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Andrew Melnychenko, Yuri Benditovich, Jason Wang
From: Andrew Melnychenko <andrew@daynix.com>
Pass additional parameters (USOv4 and USOv6 offloads) when
setting TAP offloads.
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Signed-off-by: Andrew Melnychenko <andrew@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/e1000e_core.c | 2 +-
hw/net/igb_core.c | 2 +-
hw/net/virtio-net.c | 4 +++-
hw/net/vmxnet3.c | 2 ++
include/net/net.h | 4 ++--
net/net.c | 4 ++--
net/netmap.c | 2 +-
net/tap-bsd.c | 2 +-
net/tap-linux.c | 15 ++++++++++++---
net/tap-linux.h | 2 ++
net/tap-solaris.c | 2 +-
net/tap-stub.c | 2 +-
net/tap-win32.c | 2 +-
net/tap.c | 6 +++---
net/tap_int.h | 3 ++-
15 files changed, 35 insertions(+), 19 deletions(-)
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index f8aeafa..d405595 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -2852,7 +2852,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
}
}
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 8b6b75c..389eef1 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -2753,7 +2753,7 @@ igb_update_rx_offloads(IGBCore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
}
}
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7102ec4..d2311e7 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -859,7 +859,9 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
- !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
+ !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
+ !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
+ !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
}
static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 3fb1087..226c077 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
s->lro_supported,
s->lro_supported,
0,
+ 0,
+ 0,
0);
}
}
diff --git a/include/net/net.h b/include/net/net.h
index 1448d00..b5ccfbb 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -58,7 +58,7 @@ typedef bool (HasVnetHdr)(NetClientState *);
typedef bool (HasVnetHdrLen)(NetClientState *, int);
typedef bool (GetUsingVnetHdr)(NetClientState *);
typedef void (UsingVnetHdr)(NetClientState *, bool);
-typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
+typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
typedef int (GetVnetHdrLen)(NetClientState *);
typedef void (SetVnetHdrLen)(NetClientState *, int);
typedef int (SetVnetLE)(NetClientState *, bool);
@@ -192,7 +192,7 @@ bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
bool qemu_get_using_vnet_hdr(NetClientState *nc);
void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
- int ecn, int ufo);
+ int ecn, int ufo, int uso4, int uso6);
int qemu_get_vnet_hdr_len(NetClientState *nc);
void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
int qemu_set_vnet_le(NetClientState *nc, bool is_le);
diff --git a/net/net.c b/net/net.c
index 6492ad5..543e6de 100644
--- a/net/net.c
+++ b/net/net.c
@@ -532,13 +532,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
}
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
- int ecn, int ufo)
+ int ecn, int ufo, int uso4, int uso6)
{
if (!nc || !nc->info->set_offload) {
return;
}
- nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
+ nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
int qemu_get_vnet_hdr_len(NetClientState *nc)
diff --git a/net/netmap.c b/net/netmap.c
index 9e0cec5..241b27c 100644
--- a/net/netmap.c
+++ b/net/netmap.c
@@ -371,7 +371,7 @@ static void netmap_set_vnet_hdr_len(NetClientState *nc, int len)
}
static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
- int ecn, int ufo)
+ int ecn, int ufo, int uso4, int uso6)
{
NetmapState *s = DO_UPCAST(NetmapState, nc, nc);
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index 4c98fdd..abd16a2 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -232,7 +232,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap-linux.c b/net/tap-linux.c
index f54f308..30fcca1 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -237,7 +237,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
unsigned int offload = 0;
@@ -256,13 +256,22 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
offload |= TUN_F_TSO_ECN;
if (ufo)
offload |= TUN_F_UFO;
+ if (uso4) {
+ offload |= TUN_F_USO4;
+ }
+ if (uso6) {
+ offload |= TUN_F_USO6;
+ }
}
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
- offload &= ~TUN_F_UFO;
+ offload &= ~(TUN_F_USO4 | TUN_F_USO6);
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
- fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
+ offload &= ~TUN_F_UFO;
+ if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
+ fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
strerror(errno));
+ }
}
}
}
diff --git a/net/tap-linux.h b/net/tap-linux.h
index bbbb62c..9a58cec 100644
--- a/net/tap-linux.h
+++ b/net/tap-linux.h
@@ -50,5 +50,7 @@
#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
#define TUN_F_UFO 0x10 /* I can handle UFO packets */
+#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */
+#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */
#endif /* QEMU_TAP_LINUX_H */
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
index 38e1502..a617a10 100644
--- a/net/tap-solaris.c
+++ b/net/tap-solaris.c
@@ -236,7 +236,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap-stub.c b/net/tap-stub.c
index a0fa258..ac8dfc0 100644
--- a/net/tap-stub.c
+++ b/net/tap-stub.c
@@ -67,7 +67,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap-win32.c b/net/tap-win32.c
index f327d62..7b8b4be 100644
--- a/net/tap-win32.c
+++ b/net/tap-win32.c
@@ -741,7 +741,7 @@ static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap.c b/net/tap.c
index 1bf085d..14ea4ef 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -307,14 +307,14 @@ static int tap_set_vnet_be(NetClientState *nc, bool is_be)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
if (s->fd < 0) {
return;
}
- tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo);
+ tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
static void tap_exit_notify(Notifier *notifier, void *data)
@@ -414,7 +414,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
s->using_vnet_hdr = false;
s->has_ufo = tap_probe_has_ufo(s->fd);
s->enabled = true;
- tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
+ tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0);
/*
* Make sure host header length is set correctly in tap:
* it might have been modified by another instance of qemu.
diff --git a/net/tap_int.h b/net/tap_int.h
index 547f8a5..d8861d8 100644
--- a/net/tap_int.h
+++ b/net/tap_int.h
@@ -37,7 +37,8 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
int tap_probe_vnet_hdr(int fd, Error **errp);
int tap_probe_vnet_hdr_len(int fd, int len);
int tap_probe_has_ufo(int fd);
-void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
+void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo,
+ int uso4, int uso6);
void tap_fd_set_vnet_hdr_len(int fd, int len);
int tap_fd_set_vnet_le(int fd, int vnet_is_le);
int tap_fd_set_vnet_be(int fd, int vnet_is_be);
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 02/17] tap: Add check for USO features
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
2023-09-18 8:31 ` [PULL V2 01/17] tap: Add USO support to tap device Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 03/17] virtio-net: Add USO flags to vhost support Jason Wang
` (15 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Yuri Benditovich, Andrew Melnychenko, Jason Wang
From: Yuri Benditovich <yuri.benditovich@daynix.com>
Tap indicates support for USO features according to
capabilities of current kernel module.
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Signed-off-by: Andrew Melnychenko <andrew@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/net/net.h | 3 +++
net/net.c | 9 +++++++++
net/tap-bsd.c | 5 +++++
net/tap-linux.c | 12 ++++++++++++
net/tap-solaris.c | 5 +++++
net/tap-stub.c | 5 +++++
net/tap.c | 12 ++++++++++++
net/tap_int.h | 1 +
8 files changed, 52 insertions(+)
diff --git a/include/net/net.h b/include/net/net.h
index b5ccfbb..330d285 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -54,6 +54,7 @@ typedef void (LinkStatusChanged)(NetClientState *);
typedef void (NetClientDestructor)(NetClientState *);
typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
typedef bool (HasUfo)(NetClientState *);
+typedef bool (HasUso)(NetClientState *);
typedef bool (HasVnetHdr)(NetClientState *);
typedef bool (HasVnetHdrLen)(NetClientState *, int);
typedef bool (GetUsingVnetHdr)(NetClientState *);
@@ -84,6 +85,7 @@ typedef struct NetClientInfo {
QueryRxFilter *query_rx_filter;
NetPoll *poll;
HasUfo *has_ufo;
+ HasUso *has_uso;
HasVnetHdr *has_vnet_hdr;
HasVnetHdrLen *has_vnet_hdr_len;
GetUsingVnetHdr *get_using_vnet_hdr;
@@ -187,6 +189,7 @@ void qemu_set_info_str(NetClientState *nc,
const char *fmt, ...) G_GNUC_PRINTF(2, 3);
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
bool qemu_has_ufo(NetClientState *nc);
+bool qemu_has_uso(NetClientState *nc);
bool qemu_has_vnet_hdr(NetClientState *nc);
bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
bool qemu_get_using_vnet_hdr(NetClientState *nc);
diff --git a/net/net.c b/net/net.c
index 543e6de..b110e61 100644
--- a/net/net.c
+++ b/net/net.c
@@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc)
return nc->info->has_ufo(nc);
}
+bool qemu_has_uso(NetClientState *nc)
+{
+ if (!nc || !nc->info->has_uso) {
+ return false;
+ }
+
+ return nc->info->has_uso(nc);
+}
+
bool qemu_has_vnet_hdr(NetClientState *nc)
{
if (!nc || !nc->info->has_vnet_hdr) {
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index abd16a2..274ea7b 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
+int tap_probe_has_uso(int fd)
+{
+ return 0;
+}
+
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
diff --git a/net/tap-linux.c b/net/tap-linux.c
index 30fcca1..c7e514e 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd)
return 1;
}
+int tap_probe_has_uso(int fd)
+{
+ unsigned offload;
+
+ offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;
+
+ if (ioctl(fd, TUNSETOFFLOAD, offload) < 0) {
+ return 0;
+ }
+ return 1;
+}
+
/* Verify that we can assign given length */
int tap_probe_vnet_hdr_len(int fd, int len)
{
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
index a617a10..08b13af 100644
--- a/net/tap-solaris.c
+++ b/net/tap-solaris.c
@@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
+int tap_probe_has_uso(int fd)
+{
+ return 0;
+}
+
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
diff --git a/net/tap-stub.c b/net/tap-stub.c
index ac8dfc0..4b24f61 100644
--- a/net/tap-stub.c
+++ b/net/tap-stub.c
@@ -47,6 +47,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
+int tap_probe_has_uso(int fd)
+{
+ return 0;
+}
+
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
diff --git a/net/tap.c b/net/tap.c
index 14ea4ef..bcea8d0 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -57,6 +57,7 @@ typedef struct TAPState {
bool write_poll;
bool using_vnet_hdr;
bool has_ufo;
+ bool has_uso;
bool enabled;
VHostNetState *vhost_net;
unsigned host_vnet_hdr_len;
@@ -237,6 +238,15 @@ static bool tap_has_ufo(NetClientState *nc)
return s->has_ufo;
}
+static bool tap_has_uso(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+
+ return s->has_uso;
+}
+
static bool tap_has_vnet_hdr(NetClientState *nc)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -384,6 +394,7 @@ static NetClientInfo net_tap_info = {
.poll = tap_poll,
.cleanup = tap_cleanup,
.has_ufo = tap_has_ufo,
+ .has_uso = tap_has_uso,
.has_vnet_hdr = tap_has_vnet_hdr,
.has_vnet_hdr_len = tap_has_vnet_hdr_len,
.get_using_vnet_hdr = tap_get_using_vnet_hdr,
@@ -413,6 +424,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
s->using_vnet_hdr = false;
s->has_ufo = tap_probe_has_ufo(s->fd);
+ s->has_uso = tap_probe_has_uso(s->fd);
s->enabled = true;
tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0);
/*
diff --git a/net/tap_int.h b/net/tap_int.h
index d8861d8..9a21756 100644
--- a/net/tap_int.h
+++ b/net/tap_int.h
@@ -37,6 +37,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
int tap_probe_vnet_hdr(int fd, Error **errp);
int tap_probe_vnet_hdr_len(int fd, int len);
int tap_probe_has_ufo(int fd);
+int tap_probe_has_uso(int fd);
void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo,
int uso4, int uso6);
void tap_fd_set_vnet_hdr_len(int fd, int len);
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 03/17] virtio-net: Add USO flags to vhost support.
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
2023-09-18 8:31 ` [PULL V2 01/17] tap: Add USO support to tap device Jason Wang
2023-09-18 8:31 ` [PULL V2 02/17] tap: Add check for USO features Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 04/17] virtio-net: Add support for USO features Jason Wang
` (14 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Andrew Melnychenko, Yuri Benditovich, Jason Wang
From: Andrew Melnychenko <andrew@daynix.com>
New features are subject to check with vhost-user and vdpa.
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Signed-off-by: Andrew Melnychenko <andrew@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/vhost_net.c | 3 +++
net/vhost-vdpa.c | 3 +++
2 files changed, 6 insertions(+)
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6b958d6..57427a3 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -78,6 +78,9 @@ static const int user_feature_bits[] = {
VIRTIO_F_RING_RESET,
VIRTIO_NET_F_RSS,
VIRTIO_NET_F_HASH_REPORT,
+ VIRTIO_NET_F_GUEST_USO4,
+ VIRTIO_NET_F_GUEST_USO6,
+ VIRTIO_NET_F_HOST_USO,
/* This bit implies RARP isn't sent by QEMU out of band */
VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 34202ca..4e94c50 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -75,11 +75,14 @@ const int vdpa_feature_bits[] = {
VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_UFO,
+ VIRTIO_NET_F_GUEST_USO4,
+ VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_HOST_ECN,
VIRTIO_NET_F_HOST_TSO4,
VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_UFO,
+ VIRTIO_NET_F_HOST_USO,
VIRTIO_NET_F_MQ,
VIRTIO_NET_F_MRG_RXBUF,
VIRTIO_NET_F_MTU,
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 04/17] virtio-net: Add support for USO features
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (2 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 03/17] virtio-net: Add USO flags to vhost support Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 05/17] igb: remove TCP ACK detection Jason Wang
` (13 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Yuri Benditovich, Andrew Melnychenko, Jason Wang
From: Yuri Benditovich <yuri.benditovich@daynix.com>
USO features of the virtio-net device depend on the kernel's ability
to support them; for backward compatibility the features are disabled
by default on machine types 8.0 and earlier.
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Signed-off-by: Andrew Melnychenko <andrew@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/core/machine.c | 4 ++++
hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++--
2 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/hw/core/machine.c b/hw/core/machine.c
index da699cf..230aab8 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -38,6 +38,7 @@
#include "exec/confidential-guest-support.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-net.h"
GlobalProperty hw_compat_8_1[] = {};
const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
@@ -45,6 +46,9 @@ const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
GlobalProperty hw_compat_8_0[] = {
{ "migration", "multifd-flush-after-each-section", "on"},
{ TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
+ { TYPE_VIRTIO_NET, "host_uso", "off"},
+ { TYPE_VIRTIO_NET, "guest_uso4", "off"},
+ { TYPE_VIRTIO_NET, "guest_uso6", "off"},
};
const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index d2311e7..bd0ead9 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
return n->has_ufo;
}
+static int peer_has_uso(VirtIONet *n)
+{
+ if (!peer_has_vnet_hdr(n)) {
+ return 0;
+ }
+
+ return qemu_has_uso(qemu_get_queue(n->nic)->peer);
+}
+
static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
int version_1, int hash_report)
{
@@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
+ virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+
virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
}
@@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
}
+ if (!peer_has_uso(n)) {
+ virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+ }
+
if (!get_vhost_net(nc->peer)) {
return features;
}
@@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
}
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
{
static const uint64_t guest_offloads_mask =
(1ULL << VIRTIO_NET_F_GUEST_CSUM) |
(1ULL << VIRTIO_NET_F_GUEST_TSO4) |
(1ULL << VIRTIO_NET_F_GUEST_TSO6) |
(1ULL << VIRTIO_NET_F_GUEST_ECN) |
- (1ULL << VIRTIO_NET_F_GUEST_UFO);
+ (1ULL << VIRTIO_NET_F_GUEST_UFO) |
+ (1ULL << VIRTIO_NET_F_GUEST_USO4) |
+ (1ULL << VIRTIO_NET_F_GUEST_USO6);
return guest_offloads_mask & features;
}
@@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = {
DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
+ DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
+ VIRTIO_NET_F_GUEST_USO4, true),
+ DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
+ VIRTIO_NET_F_GUEST_USO6, true),
+ DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
+ VIRTIO_NET_F_HOST_USO, true),
DEFINE_PROP_END_OF_LIST(),
};
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 05/17] igb: remove TCP ACK detection
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (3 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 04/17] virtio-net: Add support for USO features Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 06/17] igb: rename E1000E_RingInfo_st Jason Wang
` (12 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
TCP ACK detection is no longer present in igb.
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/igb_core.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 389eef1..a83e4aa 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1327,11 +1327,6 @@ igb_build_rx_metadata(IGBCore *core,
trace_e1000e_rx_metadata_ip_id(*ip_id);
}
- if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
- *status_flags |= E1000_RXD_STAT_ACK;
- trace_e1000e_rx_metadata_ack();
- }
-
if (pkt_info) {
*pkt_info = rss_info->enabled ? rss_info->type : 0;
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 06/17] igb: rename E1000E_RingInfo_st
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (4 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 05/17] igb: remove TCP ACK detection Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 07/17] igb: RX descriptors guest writing refactoring Jason Wang
` (11 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Rename E1000E_RingInfo_st and E1000E_RingInfo according to the QEMU typedefs guide.
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/e1000e_core.c | 34 +++++++++++++++++-----------------
hw/net/igb_core.c | 42 +++++++++++++++++++++---------------------
2 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index d405595..91aae37 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -810,24 +810,24 @@ e1000e_txdesc_writeback(E1000ECore *core, dma_addr_t base,
return e1000e_tx_wb_interrupt_cause(core, queue_idx);
}
-typedef struct E1000E_RingInfo_st {
+typedef struct E1000ERingInfo {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
-} E1000E_RingInfo;
+} E1000ERingInfo;
static inline bool
-e1000e_ring_empty(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_empty(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
-e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_base(E1000ECore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@@ -836,13 +836,13 @@ e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
-e1000e_ring_head_descr(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_head_descr(E1000ECore *core, const E1000ERingInfo *r)
{
return e1000e_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
-e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
+e1000e_ring_advance(E1000ECore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@@ -852,7 +852,7 @@ e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
-e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_free_descr_num(E1000ECore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@@ -871,19 +871,19 @@ e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline bool
-e1000e_ring_enabled(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_enabled(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
static inline uint32_t
-e1000e_ring_len(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_len(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen];
}
typedef struct E1000E_TxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
struct e1000e_tx *tx;
} E1000E_TxRing;
@@ -896,7 +896,7 @@ e1000e_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
{
- static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
+ static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ TDBAH, TDBAL, TDLEN, TDH, TDT, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 }
};
@@ -908,13 +908,13 @@ e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
{
- static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
+ static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 }
};
@@ -930,7 +930,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
dma_addr_t base;
struct e1000_tx_desc desc;
bool ide = false;
- const E1000E_RingInfo *txi = txr->i;
+ const E1000ERingInfo *txi = txr->i;
uint32_t cause = E1000_ICS_TXQE;
if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
@@ -960,7 +960,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
}
static bool
-e1000e_has_rxbufs(E1000ECore *core, const E1000E_RingInfo *r,
+e1000e_has_rxbufs(E1000ECore *core, const E1000ERingInfo *r,
size_t total_size)
{
uint32_t bufs = e1000e_ring_free_descr_num(core, r);
@@ -1460,7 +1460,7 @@ e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size)
}
static inline bool
-e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi)
+e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000ERingInfo *rxi)
{
return e1000e_ring_free_descr_num(core, rxi) ==
e1000e_ring_len(core, rxi) >> core->rxbuf_min_shift;
@@ -1521,7 +1521,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
- const E1000E_RingInfo *rxi;
+ const E1000ERingInfo *rxi;
size_t ps_hdr_len = 0;
bool do_ps = e1000e_do_ps(core, pkt, &ps_hdr_len);
bool is_first = true;
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index a83e4aa..d50e6b1 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -694,24 +694,24 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
}
-typedef struct E1000E_RingInfo_st {
+typedef struct E1000ERingInfo {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
-} E1000E_RingInfo;
+} E1000ERingInfo;
static inline bool
-igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_empty(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
-igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_base(IGBCore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@@ -720,13 +720,13 @@ igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
-igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_head_descr(IGBCore *core, const E1000ERingInfo *r)
{
return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
-igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
+igb_ring_advance(IGBCore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@@ -736,7 +736,7 @@ igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
-igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_free_descr_num(IGBCore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@@ -755,13 +755,13 @@ igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
}
static inline bool
-igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_enabled(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
typedef struct IGB_TxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
struct igb_tx *tx;
} IGB_TxRing;
@@ -774,7 +774,7 @@ igb_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
{
- static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+ static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 },
{ TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 },
@@ -800,13 +800,13 @@ igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
{
- static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+ static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 },
{ RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 },
@@ -833,7 +833,7 @@ igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
static uint32_t
igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
union e1000_adv_tx_desc *tx_desc,
- const E1000E_RingInfo *txi)
+ const E1000ERingInfo *txi)
{
PCIDevice *d;
uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
@@ -866,7 +866,7 @@ igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
}
static inline bool
-igb_tx_enabled(IGBCore *core, const E1000E_RingInfo *txi)
+igb_tx_enabled(IGBCore *core, const E1000ERingInfo *txi)
{
bool vmdq = core->mac[MRQC] & 1;
uint16_t qn = txi->idx;
@@ -883,7 +883,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
PCIDevice *d;
dma_addr_t base;
union e1000_adv_tx_desc desc;
- const E1000E_RingInfo *txi = txr->i;
+ const E1000ERingInfo *txi = txr->i;
uint32_t eic = 0;
if (!igb_tx_enabled(core, txi)) {
@@ -918,7 +918,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
}
static uint32_t
-igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
+igb_rxbufsize(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK;
@@ -930,7 +930,7 @@ igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
}
static bool
-igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
+igb_has_rxbufs(IGBCore *core, const E1000ERingInfo *r, size_t total_size)
{
uint32_t bufs = igb_ring_free_descr_num(core, r);
uint32_t bufsize = igb_rxbufsize(core, r);
@@ -1522,7 +1522,7 @@ igb_write_to_rx_buffers(IGBCore *core,
}
static void
-igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
+igb_update_rx_stats(IGBCore *core, const E1000ERingInfo *rxi,
size_t pkt_size, size_t pkt_fcs_size)
{
eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt);
@@ -1540,7 +1540,7 @@ igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
}
static inline bool
-igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
+igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
{
return igb_ring_free_descr_num(core, rxi) ==
((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
@@ -1562,7 +1562,7 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
- const E1000E_RingInfo *rxi = rxr->i;
+ const E1000ERingInfo *rxi = rxr->i;
size_t bufsize = igb_rxbufsize(core, rxi);
d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
@@ -1643,7 +1643,7 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
}
static bool
-igb_rx_strip_vlan(IGBCore *core, const E1000E_RingInfo *rxi)
+igb_rx_strip_vlan(IGBCore *core, const E1000ERingInfo *rxi)
{
if (core->mac[MRQC] & 1) {
uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS;
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 07/17] igb: RX descriptors guest writing refactoring
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (5 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 06/17] igb: rename E1000E_RingInfo_st Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 08/17] igb: RX payload " Jason Wang
` (10 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Refactoring is done in preparation for support of multiple advanced
descriptors RX modes, especially packet-split modes.
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/igb_core.c | 170 +++++++++++++++++++++++++++-------------------------
hw/net/igb_regs.h | 10 ++--
hw/net/trace-events | 4 +-
3 files changed, 96 insertions(+), 88 deletions(-)
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index d50e6b1..e140358 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1281,15 +1281,11 @@ igb_verify_csum_in_sw(IGBCore *core,
}
static void
-igb_build_rx_metadata(IGBCore *core,
- struct NetRxPkt *pkt,
- bool is_eop,
- const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
- uint16_t *pkt_info, uint16_t *hdr_info,
- uint32_t *rss,
- uint32_t *status_flags,
- uint16_t *ip_id,
- uint16_t *vlan_tag)
+igb_build_rx_metadata_common(IGBCore *core,
+ struct NetRxPkt *pkt,
+ bool is_eop,
+ uint32_t *status_flags,
+ uint16_t *vlan_tag)
{
struct virtio_net_hdr *vhdr;
bool hasip4, hasip6, csum_valid;
@@ -1298,7 +1294,6 @@ igb_build_rx_metadata(IGBCore *core,
*status_flags = E1000_RXD_STAT_DD;
/* No additional metadata needed for non-EOP descriptors */
- /* TODO: EOP apply only to status so don't skip whole function. */
if (!is_eop) {
goto func_exit;
}
@@ -1315,59 +1310,6 @@ igb_build_rx_metadata(IGBCore *core,
trace_e1000e_rx_metadata_vlan(*vlan_tag);
}
- /* Packet parsing results */
- if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
- if (rss_info->enabled) {
- *rss = cpu_to_le32(rss_info->hash);
- trace_igb_rx_metadata_rss(*rss);
- }
- } else if (hasip4) {
- *status_flags |= E1000_RXD_STAT_IPIDV;
- *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
- trace_e1000e_rx_metadata_ip_id(*ip_id);
- }
-
- if (pkt_info) {
- *pkt_info = rss_info->enabled ? rss_info->type : 0;
-
- if (etqf < 8) {
- *pkt_info |= (BIT(11) | etqf) << 4;
- } else {
- if (hasip4) {
- *pkt_info |= E1000_ADVRXD_PKT_IP4;
- }
-
- if (hasip6) {
- *pkt_info |= E1000_ADVRXD_PKT_IP6;
- }
-
- switch (l4hdr_proto) {
- case ETH_L4_HDR_PROTO_TCP:
- *pkt_info |= E1000_ADVRXD_PKT_TCP;
- break;
-
- case ETH_L4_HDR_PROTO_UDP:
- *pkt_info |= E1000_ADVRXD_PKT_UDP;
- break;
-
- case ETH_L4_HDR_PROTO_SCTP:
- *pkt_info |= E1000_ADVRXD_PKT_SCTP;
- break;
-
- default:
- break;
- }
- }
- }
-
- if (hdr_info) {
- *hdr_info = 0;
- }
-
- if (ts) {
- *status_flags |= BIT(16);
- }
-
/* RX CSO information */
if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
trace_e1000e_rx_metadata_ipv6_sum_disabled();
@@ -1423,43 +1365,108 @@ func_exit:
static inline void
igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
struct NetRxPkt *pkt,
- const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
+ const E1000E_RSSInfo *rss_info,
uint16_t length)
{
- uint32_t status_flags, rss;
- uint16_t ip_id;
+ uint32_t status_flags;
assert(!rss_info->enabled);
+
+ memset(desc, 0, sizeof(*desc));
desc->length = cpu_to_le16(length);
- desc->csum = 0;
+ igb_build_rx_metadata_common(core, pkt, pkt != NULL,
+ &status_flags,
+ &desc->special);
- igb_build_rx_metadata(core, pkt, pkt != NULL,
- rss_info, etqf, ts,
- NULL, NULL, &rss,
- &status_flags, &ip_id,
- &desc->special);
desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
desc->status = (uint8_t) le32_to_cpu(status_flags);
}
+static uint16_t
+igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
+{
+ uint16_t pkt_type;
+ bool hasip4, hasip6;
+ EthL4HdrProto l4hdr_proto;
+
+ if (etqf < 8) {
+ pkt_type = BIT(11) | etqf;
+ return pkt_type;
+ }
+
+ net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+ if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
+ pkt_type = E1000_ADVRXD_PKT_IP6;
+ } else if (hasip4) {
+ pkt_type = E1000_ADVRXD_PKT_IP4;
+ } else {
+ pkt_type = 0;
+ }
+
+ switch (l4hdr_proto) {
+ case ETH_L4_HDR_PROTO_TCP:
+ pkt_type |= E1000_ADVRXD_PKT_TCP;
+ break;
+ case ETH_L4_HDR_PROTO_UDP:
+ pkt_type |= E1000_ADVRXD_PKT_UDP;
+ break;
+ case ETH_L4_HDR_PROTO_SCTP:
+ pkt_type |= E1000_ADVRXD_PKT_SCTP;
+ break;
+ default:
+ break;
+ }
+
+ return pkt_type;
+}
+
static inline void
igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
uint16_t length)
{
+ bool hasip4, hasip6;
+ EthL4HdrProto l4hdr_proto;
+ uint16_t rss_type = 0, pkt_type;
+ bool eop = (pkt != NULL);
+ uint32_t adv_desc_status_error = 0;
memset(&desc->wb, 0, sizeof(desc->wb));
desc->wb.upper.length = cpu_to_le16(length);
+ igb_build_rx_metadata_common(core, pkt, eop,
+ &desc->wb.upper.status_error,
+ &desc->wb.upper.vlan);
+
+ if (!eop) {
+ return;
+ }
+
+ net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+ if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
+ if (rss_info->enabled) {
+ desc->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
+ rss_type = rss_info->type;
+ trace_igb_rx_metadata_rss(desc->wb.lower.hi_dword.rss, rss_type);
+ }
+ } else if (hasip4) {
+ adv_desc_status_error |= E1000_RXD_STAT_IPIDV;
+ desc->wb.lower.hi_dword.csum_ip.ip_id =
+ cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
+ trace_e1000e_rx_metadata_ip_id(
+ desc->wb.lower.hi_dword.csum_ip.ip_id);
+ }
+
+ if (ts) {
+ adv_desc_status_error |= BIT(16);
+ }
- igb_build_rx_metadata(core, pkt, pkt != NULL,
- rss_info, etqf, ts,
- &desc->wb.lower.lo_dword.pkt_info,
- &desc->wb.lower.lo_dword.hdr_info,
- &desc->wb.lower.hi_dword.rss,
- &desc->wb.upper.status_error,
- &desc->wb.lower.hi_dword.csum_ip.ip_id,
- &desc->wb.upper.vlan);
+ pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
+ trace_e1000e_rx_metadata_pkt_type(pkt_type);
+ desc->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
+ desc->wb.upper.status_error |= cpu_to_le32(adv_desc_status_error);
}
static inline void
@@ -1468,8 +1475,7 @@ igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
uint16_t etqf, bool ts, uint16_t length)
{
if (igb_rx_use_legacy_descriptor(core)) {
- igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
- etqf, ts, length);
+ igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, length);
} else {
igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
etqf, ts, length);
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index 82ff195..71a8833 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -692,11 +692,11 @@ union e1000_adv_rx_desc {
#define E1000_STATUS_NUM_VFS_SHIFT 14
-#define E1000_ADVRXD_PKT_IP4 BIT(4)
-#define E1000_ADVRXD_PKT_IP6 BIT(6)
-#define E1000_ADVRXD_PKT_TCP BIT(8)
-#define E1000_ADVRXD_PKT_UDP BIT(9)
-#define E1000_ADVRXD_PKT_SCTP BIT(10)
+#define E1000_ADVRXD_PKT_IP4 BIT(0)
+#define E1000_ADVRXD_PKT_IP6 BIT(2)
+#define E1000_ADVRXD_PKT_TCP BIT(4)
+#define E1000_ADVRXD_PKT_UDP BIT(5)
+#define E1000_ADVRXD_PKT_SCTP BIT(6)
static inline uint8_t igb_ivar_entry_rx(uint8_t i)
{
diff --git a/hw/net/trace-events b/hw/net/trace-events
index 6b5ba66..b8305c0 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -280,7 +280,7 @@ igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "
igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
-igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
+igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
@@ -295,6 +295,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
+igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
+
# igbvf.c
igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 08/17] igb: RX payload guest writing refactoring
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (6 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 07/17] igb: RX descriptors guest writing refactoring Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 09/17] igb: add IPv6 extended headers traffic detection Jason Wang
` (9 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Refactoring is done in preparation for support of multiple advanced
descriptors RX modes, especially packet-split modes.
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/e1000e_core.c | 18 ++--
hw/net/igb_core.c | 213 ++++++++++++++++++++++++++++++-----------------
tests/qtest/libqos/igb.c | 5 ++
3 files changed, 150 insertions(+), 86 deletions(-)
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index 91aae37..cc243b7 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
}
static void
-e1000e_write_to_rx_buffers(E1000ECore *core,
- hwaddr ba[MAX_PS_BUFFERS],
- e1000e_ba_state *bastate,
- const char *data,
- dma_addr_t data_len)
+e1000e_write_payload_frag_to_rx_buffers(E1000ECore *core,
+ hwaddr ba[MAX_PS_BUFFERS],
+ e1000e_ba_state *bastate,
+ const char *data,
+ dma_addr_t data_len)
{
while (data_len > 0) {
uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
@@ -1594,8 +1594,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
while (copy_size) {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
- e1000e_write_to_rx_buffers(core, ba, &bastate,
- iov->iov_base + iov_ofs, iov_copy);
+ e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
+ iov->iov_base +
+ iov_ofs,
+ iov_copy);
copy_size -= iov_copy;
iov_ofs += iov_copy;
@@ -1607,7 +1609,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
if (desc_offset + desc_size >= total_size) {
/* Simulate FCS checksum presence in the last descriptor */
- e1000e_write_to_rx_buffers(core, ba, &bastate,
+ e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
(const char *) &fcs_pad, e1000x_fcs_len(core->mac));
}
}
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index e140358..6d2712e 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -941,6 +941,14 @@ igb_has_rxbufs(IGBCore *core, const E1000ERingInfo *r, size_t total_size)
bufsize;
}
+static uint32_t
+igb_rxhdrbufsize(IGBCore *core, const E1000ERingInfo *r)
+{
+ uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
+ return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >>
+ E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+}
+
void
igb_start_recv(IGBCore *core)
{
@@ -1231,6 +1239,21 @@ igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
*buff_addr = le64_to_cpu(desc->read.pkt_addr);
}
+typedef struct IGBPacketRxDMAState {
+ size_t size;
+ size_t total_size;
+ size_t ps_hdr_len;
+ size_t desc_size;
+ size_t desc_offset;
+ uint32_t rx_desc_packet_buf_size;
+ uint32_t rx_desc_header_buf_size;
+ struct iovec *iov;
+ size_t iov_ofs;
+ bool is_first;
+ uint16_t written;
+ hwaddr ba;
+} IGBPacketRxDMAState;
+
static inline void
igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
hwaddr *buff_addr)
@@ -1515,19 +1538,6 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
}
static void
-igb_write_to_rx_buffers(IGBCore *core,
- PCIDevice *d,
- hwaddr ba,
- uint16_t *written,
- const char *data,
- dma_addr_t data_len)
-{
- trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
- pci_dma_write(d, ba + *written, data, data_len);
- *written += data_len;
-}
-
-static void
igb_update_rx_stats(IGBCore *core, const E1000ERingInfo *rxi,
size_t pkt_size, size_t pkt_fcs_size)
{
@@ -1553,6 +1563,93 @@ igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
}
static void
+igb_truncate_to_descriptor_size(IGBPacketRxDMAState *pdma_st, size_t *size)
+{
+ if (*size > pdma_st->rx_desc_packet_buf_size) {
+ *size = pdma_st->rx_desc_packet_buf_size;
+ }
+}
+
+static void
+igb_write_payload_frag_to_rx_buffers(IGBCore *core,
+ PCIDevice *d,
+ hwaddr ba,
+ uint16_t *written,
+ uint32_t cur_buf_len,
+ const char *data,
+ dma_addr_t data_len)
+{
+ trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
+ pci_dma_write(d, ba + *written, data, data_len);
+ *written += data_len;
+}
+
+static void
+igb_write_payload_to_rx_buffers(IGBCore *core,
+ struct NetRxPkt *pkt,
+ PCIDevice *d,
+ IGBPacketRxDMAState *pdma_st,
+ size_t *copy_size)
+{
+ static const uint32_t fcs_pad;
+ size_t iov_copy;
+
+ /* Copy packet payload */
+ while (*copy_size) {
+ iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
+ igb_write_payload_frag_to_rx_buffers(core, d,
+ pdma_st->ba,
+ &pdma_st->written,
+ pdma_st->rx_desc_packet_buf_size,
+ pdma_st->iov->iov_base +
+ pdma_st->iov_ofs,
+ iov_copy);
+
+ *copy_size -= iov_copy;
+ pdma_st->iov_ofs += iov_copy;
+ if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+ pdma_st->iov++;
+ pdma_st->iov_ofs = 0;
+ }
+ }
+
+ if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
+ /* Simulate FCS checksum presence in the last descriptor */
+ igb_write_payload_frag_to_rx_buffers(core, d,
+ pdma_st->ba,
+ &pdma_st->written,
+ pdma_st->rx_desc_packet_buf_size,
+ (const char *) &fcs_pad,
+ e1000x_fcs_len(core->mac));
+ }
+}
+
+static void
+igb_write_to_rx_buffers(IGBCore *core,
+ struct NetRxPkt *pkt,
+ PCIDevice *d,
+ IGBPacketRxDMAState *pdma_st)
+{
+ size_t copy_size;
+
+ if (!pdma_st->ba) {
+ /* as per intel docs; skip descriptors with null buf addr */
+ trace_e1000e_rx_null_descriptor();
+ return;
+ }
+
+ if (pdma_st->desc_offset >= pdma_st->size) {
+ return;
+ }
+
+ pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
+ igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
+ copy_size = pdma_st->size - pdma_st->desc_offset;
+ igb_truncate_to_descriptor_size(pdma_st, &copy_size);
+ igb_write_payload_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
+}
+
+static void
igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
const E1000E_RxRing *rxr,
const E1000E_RSSInfo *rss_info,
@@ -1561,91 +1658,51 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
PCIDevice *d;
dma_addr_t base;
union e1000_rx_desc_union desc;
- size_t desc_size;
- size_t desc_offset = 0;
- size_t iov_ofs = 0;
-
- struct iovec *iov = net_rx_pkt_get_iovec(pkt);
- size_t size = net_rx_pkt_get_total_len(pkt);
- size_t total_size = size + e1000x_fcs_len(core->mac);
- const E1000ERingInfo *rxi = rxr->i;
- size_t bufsize = igb_rxbufsize(core, rxi);
-
+ const E1000ERingInfo *rxi;
+ size_t rx_desc_len;
+
+ IGBPacketRxDMAState pdma_st = {0};
+ pdma_st.is_first = true;
+ pdma_st.size = net_rx_pkt_get_total_len(pkt);
+ pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
+
+ rxi = rxr->i;
+ rx_desc_len = core->rx_desc_len;
+ pdma_st.rx_desc_packet_buf_size = igb_rxbufsize(core, rxi);
+ pdma_st.rx_desc_header_buf_size = igb_rxhdrbufsize(core, rxi);
+ pdma_st.iov = net_rx_pkt_get_iovec(pkt);
d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
if (!d) {
d = core->owner;
}
do {
- hwaddr ba;
- uint16_t written = 0;
+ pdma_st.written = 0;
bool is_last = false;
- desc_size = total_size - desc_offset;
-
- if (desc_size > bufsize) {
- desc_size = bufsize;
- }
-
if (igb_ring_empty(core, rxi)) {
return;
}
base = igb_ring_head_descr(core, rxi);
+ pci_dma_read(d, base, &desc, rx_desc_len);
+ trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
- pci_dma_read(d, base, &desc, core->rx_desc_len);
-
- trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
-
- igb_read_rx_descr(core, &desc, &ba);
-
- if (ba) {
- if (desc_offset < size) {
- static const uint32_t fcs_pad;
- size_t iov_copy;
- size_t copy_size = size - desc_offset;
- if (copy_size > bufsize) {
- copy_size = bufsize;
- }
-
- /* Copy packet payload */
- while (copy_size) {
- iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
-
- igb_write_to_rx_buffers(core, d, ba, &written,
- iov->iov_base + iov_ofs, iov_copy);
+ igb_read_rx_descr(core, &desc, &pdma_st.ba);
- copy_size -= iov_copy;
- iov_ofs += iov_copy;
- if (iov_ofs == iov->iov_len) {
- iov++;
- iov_ofs = 0;
- }
- }
-
- if (desc_offset + desc_size >= total_size) {
- /* Simulate FCS checksum presence in the last descriptor */
- igb_write_to_rx_buffers(core, d, ba, &written,
- (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
- }
- }
- } else { /* as per intel docs; skip descriptors with null buf addr */
- trace_e1000e_rx_null_descriptor();
- }
- desc_offset += desc_size;
- if (desc_offset >= total_size) {
+ igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
+ pdma_st.desc_offset += pdma_st.desc_size;
+ if (pdma_st.desc_offset >= pdma_st.total_size) {
is_last = true;
}
igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
- rss_info, etqf, ts, written);
- igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
-
- igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
-
- } while (desc_offset < total_size);
+ rss_info, etqf, ts, pdma_st.written);
+ igb_pci_dma_write_rx_desc(core, d, base, &desc, rx_desc_len);
+ igb_ring_advance(core, rxi, rx_desc_len / E1000_MIN_RX_DESC_LEN);
+ } while (pdma_st.desc_offset < pdma_st.total_size);
- igb_update_rx_stats(core, rxi, size, total_size);
+ igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
}
static bool
diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
index a603468..f40c4ec 100644
--- a/tests/qtest/libqos/igb.c
+++ b/tests/qtest/libqos/igb.c
@@ -109,6 +109,11 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
E1000_RAH_AV | E1000_RAH_POOL_1 |
le16_to_cpu(*(uint16_t *)(address + 4)));
+ /* Set supported receive descriptor mode */
+ e1000e_macreg_write(&d->e1000e,
+ E1000_SRRCTL(0),
+ E1000_SRRCTL_DESCTYPE_ADV_ONEBUF);
+
/* Enable receive */
e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 09/17] igb: add IPv6 extended headers traffic detection
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (7 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 08/17] igb: RX payload " Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 10/17] igb: packet-split descriptors support Jason Wang
` (8 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/igb_core.c | 4 +++-
hw/net/igb_regs.h | 1 +
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 6d2712e..9f43fe5 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1420,7 +1420,9 @@ igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
- pkt_type = E1000_ADVRXD_PKT_IP6;
+ eth_ip6_hdr_info *ip6hdr_info = net_rx_pkt_get_ip6_info(pkt);
+ pkt_type = ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
+ E1000_ADVRXD_PKT_IP6;
} else if (hasip4) {
pkt_type = E1000_ADVRXD_PKT_IP4;
} else {
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index 71a8833..36763f2 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -694,6 +694,7 @@ union e1000_adv_rx_desc {
#define E1000_ADVRXD_PKT_IP4 BIT(0)
#define E1000_ADVRXD_PKT_IP6 BIT(2)
+#define E1000_ADVRXD_PKT_IP6E BIT(3)
#define E1000_ADVRXD_PKT_TCP BIT(4)
#define E1000_ADVRXD_PKT_UDP BIT(5)
#define E1000_ADVRXD_PKT_SCTP BIT(6)
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 10/17] igb: packet-split descriptors support
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (8 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 09/17] igb: add IPv6 extended headers traffic detection Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 11/17] e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers Jason Wang
` (7 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Packet-split descriptors are used by Linux VF driver for MTU values from 2048
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/igb_core.c | 348 +++++++++++++++++++++++++++++++++++++++++++++-------
hw/net/igb_regs.h | 9 ++
hw/net/trace-events | 2 +-
3 files changed, 316 insertions(+), 43 deletions(-)
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 9f43fe5..f6a5e23 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -267,6 +267,29 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
return false;
}
+typedef struct E1000ERingInfo {
+ int dbah;
+ int dbal;
+ int dlen;
+ int dh;
+ int dt;
+ int idx;
+} E1000ERingInfo;
+
+static uint32_t
+igb_rx_queue_desctyp_get(IGBCore *core, const E1000ERingInfo *r)
+{
+ return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
+}
+
+static bool
+igb_rx_use_ps_descriptor(IGBCore *core, const E1000ERingInfo *r)
+{
+ uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
+ return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT ||
+ desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+}
+
static inline bool
igb_rss_enabled(IGBCore *core)
{
@@ -694,15 +717,6 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
}
-typedef struct E1000ERingInfo {
- int dbah;
- int dbal;
- int dlen;
- int dh;
- int dt;
- int idx;
-} E1000ERingInfo;
-
static inline bool
igb_ring_empty(IGBCore *core, const E1000ERingInfo *r)
{
@@ -1233,12 +1247,31 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
}
static inline void
-igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
- hwaddr *buff_addr)
+igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+ hwaddr *buff_addr)
{
*buff_addr = le64_to_cpu(desc->read.pkt_addr);
}
+static inline void
+igb_read_adv_rx_split_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+ hwaddr *buff_addr)
+{
+ buff_addr[0] = le64_to_cpu(desc->read.hdr_addr);
+ buff_addr[1] = le64_to_cpu(desc->read.pkt_addr);
+}
+
+typedef struct IGBBAState {
+ uint16_t written[IGB_MAX_PS_BUFFERS];
+ uint8_t cur_idx;
+} IGBBAState;
+
+typedef struct IGBSplitDescriptorData {
+ bool sph;
+ bool hbo;
+ size_t hdr_len;
+} IGBSplitDescriptorData;
+
typedef struct IGBPacketRxDMAState {
size_t size;
size_t total_size;
@@ -1249,20 +1282,42 @@ typedef struct IGBPacketRxDMAState {
uint32_t rx_desc_header_buf_size;
struct iovec *iov;
size_t iov_ofs;
+ bool do_ps;
bool is_first;
- uint16_t written;
- hwaddr ba;
+ IGBBAState bastate;
+ hwaddr ba[IGB_MAX_PS_BUFFERS];
+ IGBSplitDescriptorData ps_desc_data;
} IGBPacketRxDMAState;
static inline void
-igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
- hwaddr *buff_addr)
+igb_read_rx_descr(IGBCore *core,
+ union e1000_rx_desc_union *desc,
+ IGBPacketRxDMAState *pdma_st,
+ const E1000ERingInfo *r)
{
+ uint32_t desc_type;
+
if (igb_rx_use_legacy_descriptor(core)) {
- igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
- } else {
- igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
+ igb_read_lgcy_rx_descr(core, &desc->legacy, &pdma_st->ba[1]);
+ pdma_st->ba[0] = 0;
+ return;
+ }
+
+ /* advanced header split descriptor */
+ if (igb_rx_use_ps_descriptor(core, r)) {
+ igb_read_adv_rx_split_buf_descr(core, &desc->adv, &pdma_st->ba[0]);
+ return;
+ }
+
+ /* descriptor replication modes not supported */
+ desc_type = igb_rx_queue_desctyp_get(core, r);
+ if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
+ trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
}
+
+ /* advanced single buffer descriptor */
+ igb_read_adv_rx_single_buf_descr(core, &desc->adv, &pdma_st->ba[1]);
+ pdma_st->ba[0] = 0;
}
static void
@@ -1405,6 +1460,13 @@ igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
desc->status = (uint8_t) le32_to_cpu(status_flags);
}
+static bool
+igb_rx_ps_descriptor_split_always(IGBCore *core, const E1000ERingInfo *r)
+{
+ uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
+ return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+}
+
static uint16_t
igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
{
@@ -1495,15 +1557,54 @@ igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
}
static inline void
-igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
- struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
- uint16_t etqf, bool ts, uint16_t length)
+igb_write_adv_ps_rx_descr(IGBCore *core,
+ union e1000_adv_rx_desc *desc,
+ struct NetRxPkt *pkt,
+ const E1000E_RSSInfo *rss_info,
+ const E1000ERingInfo *r,
+ uint16_t etqf,
+ bool ts,
+ IGBPacketRxDMAState *pdma_st)
+{
+ size_t pkt_len;
+ uint16_t hdr_info = 0;
+
+ if (pdma_st->do_ps) {
+ pkt_len = pdma_st->bastate.written[1];
+ } else {
+ pkt_len = pdma_st->bastate.written[0] + pdma_st->bastate.written[1];
+ }
+
+ igb_write_adv_rx_descr(core, desc, pkt, rss_info, etqf, ts, pkt_len);
+
+ hdr_info = (pdma_st->ps_desc_data.hdr_len << E1000_ADVRXD_HDR_LEN_OFFSET) &
+ E1000_ADVRXD_ADV_HDR_LEN_MASK;
+ hdr_info |= pdma_st->ps_desc_data.sph ? E1000_ADVRXD_HDR_SPH : 0;
+ desc->wb.lower.lo_dword.hdr_info = cpu_to_le16(hdr_info);
+
+ desc->wb.upper.status_error |= cpu_to_le32(
+ pdma_st->ps_desc_data.hbo ? E1000_ADVRXD_ST_ERR_HBO_OFFSET : 0);
+}
+
+static inline void
+igb_write_rx_descr(IGBCore *core,
+ union e1000_rx_desc_union *desc,
+ struct NetRxPkt *pkt,
+ const E1000E_RSSInfo *rss_info,
+ uint16_t etqf,
+ bool ts,
+ IGBPacketRxDMAState *pdma_st,
+ const E1000ERingInfo *r)
{
if (igb_rx_use_legacy_descriptor(core)) {
- igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, length);
+ igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
+ pdma_st->bastate.written[1]);
+ } else if (igb_rx_use_ps_descriptor(core, r)) {
+ igb_write_adv_ps_rx_descr(core, &desc->adv, pkt, rss_info, r, etqf, ts,
+ pdma_st);
} else {
igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
- etqf, ts, length);
+ etqf, ts, pdma_st->bastate.written[1]);
}
}
@@ -1564,26 +1665,179 @@ igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
}
+static bool
+igb_do_ps(IGBCore *core,
+ const E1000ERingInfo *r,
+ struct NetRxPkt *pkt,
+ IGBPacketRxDMAState *pdma_st)
+{
+ bool hasip4, hasip6;
+ EthL4HdrProto l4hdr_proto;
+ bool fragment;
+ bool split_always;
+ size_t bheader_size;
+ size_t total_pkt_len;
+
+ if (!igb_rx_use_ps_descriptor(core, r)) {
+ return false;
+ }
+
+ total_pkt_len = net_rx_pkt_get_total_len(pkt);
+ bheader_size = igb_rxhdrbufsize(core, r);
+ split_always = igb_rx_ps_descriptor_split_always(core, r);
+ if (split_always && total_pkt_len <= bheader_size) {
+ pdma_st->ps_hdr_len = total_pkt_len;
+ pdma_st->ps_desc_data.hdr_len = total_pkt_len;
+ return true;
+ }
+
+ net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+ if (hasip4) {
+ fragment = net_rx_pkt_get_ip4_info(pkt)->fragment;
+ } else if (hasip6) {
+ fragment = net_rx_pkt_get_ip6_info(pkt)->fragment;
+ } else {
+ pdma_st->ps_desc_data.hdr_len = bheader_size;
+ goto header_not_handled;
+ }
+
+ if (fragment && (core->mac[RFCTL] & E1000_RFCTL_IPFRSP_DIS)) {
+ pdma_st->ps_desc_data.hdr_len = bheader_size;
+ goto header_not_handled;
+ }
+
+ /* no header splitting for SCTP */
+ if (!fragment && (l4hdr_proto == ETH_L4_HDR_PROTO_UDP ||
+ l4hdr_proto == ETH_L4_HDR_PROTO_TCP)) {
+ pdma_st->ps_hdr_len = net_rx_pkt_get_l5_hdr_offset(pkt);
+ } else {
+ pdma_st->ps_hdr_len = net_rx_pkt_get_l4_hdr_offset(pkt);
+ }
+
+ pdma_st->ps_desc_data.sph = true;
+ pdma_st->ps_desc_data.hdr_len = pdma_st->ps_hdr_len;
+
+ if (pdma_st->ps_hdr_len > bheader_size) {
+ pdma_st->ps_desc_data.hbo = true;
+ goto header_not_handled;
+ }
+
+ return true;
+
+header_not_handled:
+ if (split_always) {
+ pdma_st->ps_hdr_len = bheader_size;
+ return true;
+ }
+
+ return false;
+}
+
static void
igb_truncate_to_descriptor_size(IGBPacketRxDMAState *pdma_st, size_t *size)
{
- if (*size > pdma_st->rx_desc_packet_buf_size) {
- *size = pdma_st->rx_desc_packet_buf_size;
+ if (pdma_st->do_ps && pdma_st->is_first) {
+ if (*size > pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len) {
+ *size = pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len;
+ }
+ } else {
+ if (*size > pdma_st->rx_desc_packet_buf_size) {
+ *size = pdma_st->rx_desc_packet_buf_size;
+ }
+ }
+}
+
+static inline void
+igb_write_hdr_frag_to_rx_buffers(IGBCore *core,
+ PCIDevice *d,
+ IGBPacketRxDMAState *pdma_st,
+ const char *data,
+ dma_addr_t data_len)
+{
+ assert(data_len <= pdma_st->rx_desc_header_buf_size -
+ pdma_st->bastate.written[0]);
+ pci_dma_write(d,
+ pdma_st->ba[0] + pdma_st->bastate.written[0],
+ data, data_len);
+ pdma_st->bastate.written[0] += data_len;
+ pdma_st->bastate.cur_idx = 1;
+}
+
+static void
+igb_write_header_to_rx_buffers(IGBCore *core,
+ struct NetRxPkt *pkt,
+ PCIDevice *d,
+ IGBPacketRxDMAState *pdma_st,
+ size_t *copy_size)
+{
+ size_t iov_copy;
+ size_t ps_hdr_copied = 0;
+
+ if (!pdma_st->is_first) {
+ /* Leave buffer 0 of each descriptor except first */
+ /* empty */
+ pdma_st->bastate.cur_idx = 1;
+ return;
}
+
+ do {
+ iov_copy = MIN(pdma_st->ps_hdr_len - ps_hdr_copied,
+ pdma_st->iov->iov_len - pdma_st->iov_ofs);
+
+ igb_write_hdr_frag_to_rx_buffers(core, d, pdma_st,
+ pdma_st->iov->iov_base,
+ iov_copy);
+
+ *copy_size -= iov_copy;
+ ps_hdr_copied += iov_copy;
+
+ pdma_st->iov_ofs += iov_copy;
+ if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+ pdma_st->iov++;
+ pdma_st->iov_ofs = 0;
+ }
+ } while (ps_hdr_copied < pdma_st->ps_hdr_len);
+
+ pdma_st->is_first = false;
}
static void
igb_write_payload_frag_to_rx_buffers(IGBCore *core,
PCIDevice *d,
- hwaddr ba,
- uint16_t *written,
- uint32_t cur_buf_len,
+ IGBPacketRxDMAState *pdma_st,
const char *data,
dma_addr_t data_len)
{
- trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
- pci_dma_write(d, ba + *written, data, data_len);
- *written += data_len;
+ while (data_len > 0) {
+ assert(pdma_st->bastate.cur_idx < IGB_MAX_PS_BUFFERS);
+
+ uint32_t cur_buf_bytes_left =
+ pdma_st->rx_desc_packet_buf_size -
+ pdma_st->bastate.written[pdma_st->bastate.cur_idx];
+ uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
+
+ trace_igb_rx_desc_buff_write(
+ pdma_st->bastate.cur_idx,
+ pdma_st->ba[pdma_st->bastate.cur_idx],
+ pdma_st->bastate.written[pdma_st->bastate.cur_idx],
+ data,
+ bytes_to_write);
+
+ pci_dma_write(d,
+ pdma_st->ba[pdma_st->bastate.cur_idx] +
+ pdma_st->bastate.written[pdma_st->bastate.cur_idx],
+ data, bytes_to_write);
+
+ pdma_st->bastate.written[pdma_st->bastate.cur_idx] += bytes_to_write;
+ data += bytes_to_write;
+ data_len -= bytes_to_write;
+
+ if (pdma_st->bastate.written[pdma_st->bastate.cur_idx] ==
+ pdma_st->rx_desc_packet_buf_size) {
+ pdma_st->bastate.cur_idx++;
+ }
+ }
}
static void
@@ -1600,9 +1854,7 @@ igb_write_payload_to_rx_buffers(IGBCore *core,
while (*copy_size) {
iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
igb_write_payload_frag_to_rx_buffers(core, d,
- pdma_st->ba,
- &pdma_st->written,
- pdma_st->rx_desc_packet_buf_size,
+ pdma_st,
pdma_st->iov->iov_base +
pdma_st->iov_ofs,
iov_copy);
@@ -1618,9 +1870,7 @@ igb_write_payload_to_rx_buffers(IGBCore *core,
if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
/* Simulate FCS checksum presence in the last descriptor */
igb_write_payload_frag_to_rx_buffers(core, d,
- pdma_st->ba,
- &pdma_st->written,
- pdma_st->rx_desc_packet_buf_size,
+ pdma_st,
(const char *) &fcs_pad,
e1000x_fcs_len(core->mac));
}
@@ -1634,7 +1884,7 @@ igb_write_to_rx_buffers(IGBCore *core,
{
size_t copy_size;
- if (!pdma_st->ba) {
+ if (!(pdma_st->ba)[1] || (pdma_st->do_ps && !(pdma_st->ba[0]))) {
/* as per intel docs; skip descriptors with null buf addr */
trace_e1000e_rx_null_descriptor();
return;
@@ -1648,6 +1898,14 @@ igb_write_to_rx_buffers(IGBCore *core,
igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
copy_size = pdma_st->size - pdma_st->desc_offset;
igb_truncate_to_descriptor_size(pdma_st, &copy_size);
+
+ /* For PS mode copy the packet header first */
+ if (pdma_st->do_ps) {
+ igb_write_header_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
+ } else {
+ pdma_st->bastate.cur_idx = 1;
+ }
+
igb_write_payload_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
}
@@ -1678,8 +1936,10 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
d = core->owner;
}
+ pdma_st.do_ps = igb_do_ps(core, rxi, pkt, &pdma_st);
+
do {
- pdma_st.written = 0;
+ memset(&pdma_st.bastate, 0, sizeof(IGBBAState));
bool is_last = false;
if (igb_ring_empty(core, rxi)) {
@@ -1690,7 +1950,7 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
pci_dma_read(d, base, &desc, rx_desc_len);
trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
- igb_read_rx_descr(core, &desc, &pdma_st.ba);
+ igb_read_rx_descr(core, &desc, &pdma_st, rxi);
igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
pdma_st.desc_offset += pdma_st.desc_size;
@@ -1698,8 +1958,12 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
is_last = true;
}
- igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
- rss_info, etqf, ts, pdma_st.written);
+ igb_write_rx_descr(core, &desc,
+ is_last ? pkt : NULL,
+ rss_info,
+ etqf, ts,
+ &pdma_st,
+ rxi);
igb_pci_dma_write_rx_desc(core, d, base, &desc, rx_desc_len);
igb_ring_advance(core, rxi, rx_desc_len / E1000_MIN_RX_DESC_LEN);
} while (pdma_st.desc_offset < pdma_st.total_size);
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index 36763f2..ed7427b 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -452,6 +452,7 @@ union e1000_adv_rx_desc {
#define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00
#define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */
#define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000
+#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT 0x04000000
#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
#define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000
#define E1000_SRRCTL_DROP_EN 0x80000000
@@ -699,6 +700,14 @@ union e1000_adv_rx_desc {
#define E1000_ADVRXD_PKT_UDP BIT(5)
#define E1000_ADVRXD_PKT_SCTP BIT(6)
+#define IGB_MAX_PS_BUFFERS 2
+
+#define E1000_ADVRXD_HDR_LEN_OFFSET (21 - 16)
+#define E1000_ADVRXD_ADV_HDR_LEN_MASK ((BIT(10) - 1) << \
+ E1000_ADVRXD_HDR_LEN_OFFSET)
+#define E1000_ADVRXD_HDR_SPH BIT(15)
+#define E1000_ADVRXD_ST_ERR_HBO_OFFSET BIT(3 + 20)
+
static inline uint8_t igb_ivar_entry_rx(uint8_t i)
{
return i < 8 ? i * 4 : (i - 8) * 4 + 2;
diff --git a/hw/net/trace-events b/hw/net/trace-events
index b8305c0..3abfd65 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -278,7 +278,7 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
-igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
+igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer %u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 11/17] e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (9 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 10/17] igb: packet-split descriptors support Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 12/17] tests: bump libvirt-ci for libasan and libxdp Jason Wang
` (6 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Tomasz Dzieciol, Akihiko Odaki, Jason Wang
From: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Rename e1000e_ba_state according and e1000e_write_hdr_to_rx_buffers for
consistency with IGB.
Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/e1000e_core.c | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index cc243b7..e324c02 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -1397,17 +1397,17 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
}
}
-typedef struct e1000e_ba_state_st {
+typedef struct E1000EBAState {
uint16_t written[MAX_PS_BUFFERS];
uint8_t cur_idx;
-} e1000e_ba_state;
+} E1000EBAState;
static inline void
-e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
- hwaddr ba[MAX_PS_BUFFERS],
- e1000e_ba_state *bastate,
- const char *data,
- dma_addr_t data_len)
+e1000e_write_hdr_frag_to_rx_buffers(E1000ECore *core,
+ hwaddr ba[MAX_PS_BUFFERS],
+ E1000EBAState *bastate,
+ const char *data,
+ dma_addr_t data_len)
{
assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]);
@@ -1420,7 +1420,7 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
static void
e1000e_write_payload_frag_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
- e1000e_ba_state *bastate,
+ E1000EBAState *bastate,
const char *data,
dma_addr_t data_len)
{
@@ -1530,7 +1530,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
do {
hwaddr ba[MAX_PS_BUFFERS];
- e1000e_ba_state bastate = { { 0 } };
+ E1000EBAState bastate = { { 0 } };
bool is_last = false;
desc_size = total_size - desc_offset;
@@ -1568,8 +1568,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
iov_copy = MIN(ps_hdr_len - ps_hdr_copied,
iov->iov_len - iov_ofs);
- e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
- iov->iov_base, iov_copy);
+ e1000e_write_hdr_frag_to_rx_buffers(core, ba,
+ &bastate,
+ iov->iov_base,
+ iov_copy);
copy_size -= iov_copy;
ps_hdr_copied += iov_copy;
@@ -1585,8 +1587,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
} else {
/* Leave buffer 0 of each descriptor except first */
/* empty as per spec 7.1.5.1 */
- e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
- NULL, 0);
+ e1000e_write_hdr_frag_to_rx_buffers(core, ba, &bastate,
+ NULL, 0);
}
}
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 12/17] tests: bump libvirt-ci for libasan and libxdp
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (10 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 11/17] e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 13/17] net: add initial support for AF_XDP network backend Jason Wang
` (5 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Ilya Maximets, Daniel P . Berrangé, Jason Wang
From: Ilya Maximets <i.maximets@ovn.org>
This pulls in the fixes for libasan version as well as support for
libxdp that will be used for af-xdp netdev in the next commits.
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
tests/docker/dockerfiles/debian-amd64-cross.docker | 2 +-
tests/docker/dockerfiles/debian-amd64.docker | 2 +-
tests/docker/dockerfiles/debian-arm64-cross.docker | 2 +-
tests/docker/dockerfiles/debian-armel-cross.docker | 2 +-
tests/docker/dockerfiles/debian-armhf-cross.docker | 2 +-
tests/docker/dockerfiles/debian-ppc64el-cross.docker | 2 +-
tests/docker/dockerfiles/debian-s390x-cross.docker | 2 +-
tests/docker/dockerfiles/opensuse-leap.docker | 2 +-
tests/docker/dockerfiles/ubuntu2004.docker | 2 +-
tests/docker/dockerfiles/ubuntu2204.docker | 2 +-
tests/lcitool/libvirt-ci | 2 +-
11 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/tests/docker/dockerfiles/debian-amd64-cross.docker b/tests/docker/dockerfiles/debian-amd64-cross.docker
index b66b9cc..0cf3ba6 100644
--- a/tests/docker/dockerfiles/debian-amd64-cross.docker
+++ b/tests/docker/dockerfiles/debian-amd64-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-x86-64-linux-gnu \
gcc-x86-64-linux-gnu \
libaio-dev:amd64 \
- libasan5:amd64 \
+ libasan6:amd64 \
libasound2-dev:amd64 \
libattr1-dev:amd64 \
libbpf-dev:amd64 \
diff --git a/tests/docker/dockerfiles/debian-amd64.docker b/tests/docker/dockerfiles/debian-amd64.docker
index 02262bc..e3e1de2 100644
--- a/tests/docker/dockerfiles/debian-amd64.docker
+++ b/tests/docker/dockerfiles/debian-amd64.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
- libasan5 \
+ libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \
diff --git a/tests/docker/dockerfiles/debian-arm64-cross.docker b/tests/docker/dockerfiles/debian-arm64-cross.docker
index a0a968b..d8cd4f8 100644
--- a/tests/docker/dockerfiles/debian-arm64-cross.docker
+++ b/tests/docker/dockerfiles/debian-arm64-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-aarch64-linux-gnu \
gcc-aarch64-linux-gnu \
libaio-dev:arm64 \
- libasan5:arm64 \
+ libasan6:arm64 \
libasound2-dev:arm64 \
libattr1-dev:arm64 \
libbpf-dev:arm64 \
diff --git a/tests/docker/dockerfiles/debian-armel-cross.docker b/tests/docker/dockerfiles/debian-armel-cross.docker
index f1fc34a..75342c0 100644
--- a/tests/docker/dockerfiles/debian-armel-cross.docker
+++ b/tests/docker/dockerfiles/debian-armel-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi \
libaio-dev:armel \
- libasan5:armel \
+ libasan6:armel \
libasound2-dev:armel \
libattr1-dev:armel \
libbpf-dev:armel \
diff --git a/tests/docker/dockerfiles/debian-armhf-cross.docker b/tests/docker/dockerfiles/debian-armhf-cross.docker
index a278578..f45cfed 100644
--- a/tests/docker/dockerfiles/debian-armhf-cross.docker
+++ b/tests/docker/dockerfiles/debian-armhf-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabihf \
gcc-arm-linux-gnueabihf \
libaio-dev:armhf \
- libasan5:armhf \
+ libasan6:armhf \
libasound2-dev:armhf \
libattr1-dev:armhf \
libbpf-dev:armhf \
diff --git a/tests/docker/dockerfiles/debian-ppc64el-cross.docker b/tests/docker/dockerfiles/debian-ppc64el-cross.docker
index 30e5efa..52f8c34 100644
--- a/tests/docker/dockerfiles/debian-ppc64el-cross.docker
+++ b/tests/docker/dockerfiles/debian-ppc64el-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-powerpc64le-linux-gnu \
gcc-powerpc64le-linux-gnu \
libaio-dev:ppc64el \
- libasan5:ppc64el \
+ libasan6:ppc64el \
libasound2-dev:ppc64el \
libattr1-dev:ppc64el \
libbpf-dev:ppc64el \
diff --git a/tests/docker/dockerfiles/debian-s390x-cross.docker b/tests/docker/dockerfiles/debian-s390x-cross.docker
index ee6db7b..208e57b 100644
--- a/tests/docker/dockerfiles/debian-s390x-cross.docker
+++ b/tests/docker/dockerfiles/debian-s390x-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-s390x-linux-gnu \
gcc-s390x-linux-gnu \
libaio-dev:s390x \
- libasan5:s390x \
+ libasan6:s390x \
libasound2-dev:s390x \
libattr1-dev:s390x \
libbpf-dev:s390x \
diff --git a/tests/docker/dockerfiles/opensuse-leap.docker b/tests/docker/dockerfiles/opensuse-leap.docker
index fef8d5a..ed04b4d 100644
--- a/tests/docker/dockerfiles/opensuse-leap.docker
+++ b/tests/docker/dockerfiles/opensuse-leap.docker
@@ -40,7 +40,7 @@ RUN zypper update -y && \
libSDL2-devel \
libSDL2_image-devel \
libaio-devel \
- libasan6 \
+ libasan8 \
libattr-devel \
libbpf-devel \
libbz2-devel \
diff --git a/tests/docker/dockerfiles/ubuntu2004.docker b/tests/docker/dockerfiles/ubuntu2004.docker
index 4180cd8..d3e2120 100644
--- a/tests/docker/dockerfiles/ubuntu2004.docker
+++ b/tests/docker/dockerfiles/ubuntu2004.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
- libasan5 \
+ libasan6 \
libasound2-dev \
libattr1-dev \
libbrlapi-dev \
diff --git a/tests/docker/dockerfiles/ubuntu2204.docker b/tests/docker/dockerfiles/ubuntu2204.docker
index 88493f0..94c2c16 100644
--- a/tests/docker/dockerfiles/ubuntu2204.docker
+++ b/tests/docker/dockerfiles/ubuntu2204.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
- libasan5 \
+ libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \
diff --git a/tests/lcitool/libvirt-ci b/tests/lcitool/libvirt-ci
index bbd55b4..5f84a21 160000
--- a/tests/lcitool/libvirt-ci
+++ b/tests/lcitool/libvirt-ci
@@ -1 +1 @@
-Subproject commit bbd55b4d18cce8f89b5167675e434a6941315634
+Subproject commit 5f84a21881577a5fb56cc956f6fe4e2abd6fcff0
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 13/17] net: add initial support for AF_XDP network backend
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (11 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 12/17] tests: bump libvirt-ci for libasan and libxdp Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 14/17] hw/net/fsl_etsec/rings.c: Avoid variable length array Jason Wang
` (4 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Ilya Maximets, Daniel P . Berrangé, Jason Wang
From: Ilya Maximets <i.maximets@ovn.org>
AF_XDP is a network socket family that allows communication directly
with the network device driver in the kernel, bypassing most or all
of the kernel networking stack. In the essence, the technology is
pretty similar to netmap. But, unlike netmap, AF_XDP is Linux-native
and works with any network interfaces without driver modifications.
Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
require access to character devices or unix sockets. Only access to
the network interface itself is necessary.
This patch implements a network backend that communicates with the
kernel by creating an AF_XDP socket. A chunk of userspace memory
is shared between QEMU and the host kernel. 4 ring buffers (Tx, Rx,
Fill and Completion) are placed in that memory along with a pool of
memory buffers for the packet data. Data transmission is done by
allocating one of the buffers, copying packet data into it and
placing the pointer into Tx ring. After transmission, device will
return the buffer via Completion ring. On Rx, device will take
a buffer form a pre-populated Fill ring, write the packet data into
it and place the buffer into Rx ring.
AF_XDP network backend takes on the communication with the host
kernel and the network interface and forwards packets to/from the
peer device in QEMU.
Usage example:
-device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
-netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
XDP program bridges the socket with a network interface. It can be
attached to the interface in 2 different modes:
1. skb - this mode should work for any interface and doesn't require
driver support. With a caveat of lower performance.
2. native - this does require support from the driver and allows to
bypass skb allocation in the kernel and potentially use
zero-copy while getting packets in/out userspace.
By default, QEMU will try to use native mode and fall back to skb.
Mode can be forced via 'mode' option. To force 'copy' even in native
mode, use 'force-copy=on' option. This might be useful if there is
some issue with the driver.
Option 'queues=N' allows to specify how many device queues should
be open. Note that all the queues that are not open are still
functional and can receive traffic, but it will not be delivered to
QEMU. So, the number of device queues should generally match the
QEMU configuration, unless the device is shared with something
else and the traffic re-direction to appropriate queues is correctly
configured on a device level (e.g. with ethtool -N).
'start-queue=M' option can be used to specify from which queue id
QEMU should start configuring 'N' queues. It might also be necessary
to use this option with certain NICs, e.g. MLX5 NICs. See the docs
for examples.
In a general case QEMU will need CAP_NET_ADMIN and CAP_SYS_ADMIN
or CAP_BPF capabilities in order to load default XSK/XDP programs to
the network interface and configure BPF maps. It is possible, however,
to run with no capabilities. For that to work, an external process
with enough capabilities will need to pre-load default XSK program,
create AF_XDP sockets and pass their file descriptors to QEMU process
on startup via 'sock-fds' option. Network backend will need to be
configured with 'inhibit=on' to avoid loading of the program.
QEMU will need 32 MB of locked memory (RLIMIT_MEMLOCK) per queue
or CAP_IPC_LOCK.
There are few performance challenges with the current network backends.
First is that they do not support IO threads. This means that data
path is handled by the main thread in QEMU and may slow down other
work or may be slowed down by some other work. This also means that
taking advantage of multi-queue is generally not possible today.
Another thing is that data path is going through the device emulation
code, which is not really optimized for performance. The fastest
"frontend" device is virtio-net. But it's not optimized for heavy
traffic either, because it expects such use-cases to be handled via
some implementation of vhost (user, kernel, vdpa). In practice, we
have virtio notifications and rcu lock/unlock on a per-packet basis
and not very efficient accesses to the guest memory. Communication
channels between backend and frontend devices do not allow passing
more than one packet at a time as well.
Some of these challenges can be avoided in the future by adding better
batching into device emulation or by implementing vhost-af-xdp variant.
There are also a few kernel limitations. AF_XDP sockets do not
support any kinds of checksum or segmentation offloading. Buffers
are limited to a page size (4K), i.e. MTU is limited. Multi-buffer
support implementation for AF_XDP is in progress, but not ready yet.
Also, transmission in all non-zero-copy modes is synchronous, i.e.
done in a syscall. That doesn't allow high packet rates on virtual
interfaces.
However, keeping in mind all of these challenges, current implementation
of the AF_XDP backend shows a decent performance while running on top
of a physical NIC with zero-copy support.
Test setup:
2 VMs running on 2 physical hosts connected via ConnectX6-Dx card.
Network backend is configured to open the NIC directly in native mode.
The driver supports zero-copy. NIC is configured to use 1 queue.
Inside a VM - iperf3 for basic TCP performance testing and dpdk-testpmd
for PPS testing.
iperf3 result:
TCP stream : 19.1 Gbps
dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
Tx only : 3.4 Mpps
Rx only : 2.0 Mpps
L2 FWD Loopback : 1.5 Mpps
In skb mode the same setup shows much lower performance, similar to
the setup where pair of physical NICs is replaced with veth pair:
iperf3 result:
TCP stream : 9 Gbps
dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
Tx only : 1.2 Mpps
Rx only : 1.0 Mpps
L2 FWD Loopback : 0.7 Mpps
Results in skb mode or over the veth are close to results of a tap
backend with vhost=on and disabled segmentation offloading bridged
with a NIC.
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> (docker/lcitool)
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
MAINTAINERS | 4 +
hmp-commands.hx | 3 +
meson.build | 9 +
meson_options.txt | 2 +
net/af-xdp.c | 526 ++++++++++++++++++++++++
net/clients.h | 5 +
net/meson.build | 3 +
net/net.c | 6 +
qapi/net.json | 58 +++
qemu-options.hx | 70 +++-
scripts/ci/org.centos/stream/8/x86_64/configure | 1 +
scripts/meson-buildoptions.sh | 3 +
tests/docker/dockerfiles/alpine.docker | 1 +
tests/docker/dockerfiles/centos8.docker | 1 +
tests/docker/dockerfiles/fedora.docker | 1 +
tests/lcitool/projects/qemu.yml | 1 +
16 files changed, 693 insertions(+), 1 deletion(-)
create mode 100644 net/af-xdp.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 00562f9..67cefaa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
S: Maintained
F: net/netmap.c
+AF_XDP network backend
+R: Ilya Maximets <i.maximets@ovn.org>
+F: net/af-xdp.c
+
Host Memory Backends
M: David Hildenbrand <david@redhat.com>
M: Igor Mammedov <imammedo@redhat.com>
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f7..63eac22 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1296,6 +1296,9 @@ ERST
.name = "netdev_add",
.args_type = "netdev:O",
.params = "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+#ifdef CONFIG_AF_XDP
+ "|af-xdp"
+#endif
#ifdef CONFIG_VMNET
"|vmnet-host|vmnet-shared|vmnet-bridged"
#endif
diff --git a/meson.build b/meson.build
index 5150a74..f426861 100644
--- a/meson.build
+++ b/meson.build
@@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
endif
endif
+# libxdp
+libxdp = not_found
+if not get_option('af_xdp').auto() or have_system
+ libxdp = dependency('libxdp', required: get_option('af_xdp'),
+ version: '>=1.4.0', method: 'pkg-config')
+endif
+
# libdw
libdw = not_found
if not get_option('libdw').auto() or \
@@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_pars
config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@@ -4270,6 +4278,7 @@ summary_info = {}
if targetos == 'darwin'
summary_info += {'vmnet.framework support': vmnet}
endif
+summary_info += {'AF_XDP support': libxdp}
summary_info += {'slirp support': slirp}
summary_info += {'vde support': vde}
summary_info += {'netmap support': have_netmap}
diff --git a/meson_options.txt b/meson_options.txt
index f82d88b..2ca40f2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
+option('af_xdp', type : 'feature', value : 'auto',
+ description: 'AF_XDP network backend support')
option('attr', type : 'feature', value : 'auto',
description: 'attr/xattr support')
option('auth_pam', type : 'feature', value : 'auto',
diff --git a/net/af-xdp.c b/net/af-xdp.c
new file mode 100644
index 0000000..6c65028
--- /dev/null
+++ b/net/af-xdp.c
@@ -0,0 +1,526 @@
+/*
+ * AF_XDP network backend.
+ *
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * Authors:
+ * Ilya Maximets <i.maximets@ovn.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+
+#include "qemu/osdep.h"
+#include <bpf/bpf.h>
+#include <inttypes.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <net/if.h>
+#include <xdp/xsk.h>
+
+#include "clients.h"
+#include "monitor/monitor.h"
+#include "net/net.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/memalign.h"
+
+
+typedef struct AFXDPState {
+ NetClientState nc;
+
+ struct xsk_socket *xsk;
+ struct xsk_ring_cons rx;
+ struct xsk_ring_prod tx;
+ struct xsk_ring_cons cq;
+ struct xsk_ring_prod fq;
+
+ char ifname[IFNAMSIZ];
+ int ifindex;
+ bool read_poll;
+ bool write_poll;
+ uint32_t outstanding_tx;
+
+ uint64_t *pool;
+ uint32_t n_pool;
+ char *buffer;
+ struct xsk_umem *umem;
+
+ uint32_t n_queues;
+ uint32_t xdp_flags;
+ bool inhibit;
+} AFXDPState;
+
+#define AF_XDP_BATCH_SIZE 64
+
+static void af_xdp_send(void *opaque);
+static void af_xdp_writable(void *opaque);
+
+/* Set the event-loop handlers for the af-xdp backend. */
+static void af_xdp_update_fd_handler(AFXDPState *s)
+{
+ qemu_set_fd_handler(xsk_socket__fd(s->xsk),
+ s->read_poll ? af_xdp_send : NULL,
+ s->write_poll ? af_xdp_writable : NULL,
+ s);
+}
+
+/* Update the read handler. */
+static void af_xdp_read_poll(AFXDPState *s, bool enable)
+{
+ if (s->read_poll != enable) {
+ s->read_poll = enable;
+ af_xdp_update_fd_handler(s);
+ }
+}
+
+/* Update the write handler. */
+static void af_xdp_write_poll(AFXDPState *s, bool enable)
+{
+ if (s->write_poll != enable) {
+ s->write_poll = enable;
+ af_xdp_update_fd_handler(s);
+ }
+}
+
+static void af_xdp_poll(NetClientState *nc, bool enable)
+{
+ AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+ if (s->read_poll != enable || s->write_poll != enable) {
+ s->write_poll = enable;
+ s->read_poll = enable;
+ af_xdp_update_fd_handler(s);
+ }
+}
+
+static void af_xdp_complete_tx(AFXDPState *s)
+{
+ uint32_t idx = 0;
+ uint32_t done, i;
+ uint64_t *addr;
+
+ done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx);
+
+ for (i = 0; i < done; i++) {
+ addr = (void *) xsk_ring_cons__comp_addr(&s->cq, idx++);
+ s->pool[s->n_pool++] = *addr;
+ s->outstanding_tx--;
+ }
+
+ if (done) {
+ xsk_ring_cons__release(&s->cq, done);
+ }
+}
+
+/*
+ * The fd_write() callback, invoked if the fd is marked as writable
+ * after a poll.
+ */
+static void af_xdp_writable(void *opaque)
+{
+ AFXDPState *s = opaque;
+
+ /* Try to recover buffers that are already sent. */
+ af_xdp_complete_tx(s);
+
+ /*
+ * Unregister the handler, unless we still have packets to transmit
+ * and kernel needs a wake up.
+ */
+ if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
+ af_xdp_write_poll(s, false);
+ }
+
+ /* Flush any buffered packets. */
+ qemu_flush_queued_packets(&s->nc);
+}
+
+static ssize_t af_xdp_receive(NetClientState *nc,
+ const uint8_t *buf, size_t size)
+{
+ AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+ struct xdp_desc *desc;
+ uint32_t idx;
+ void *data;
+
+ /* Try to recover buffers that are already sent. */
+ af_xdp_complete_tx(s);
+
+ if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
+ /* We can't transmit packet this size... */
+ return size;
+ }
+
+ if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &idx)) {
+ /*
+ * Out of buffers or space in tx ring. Poll until we can write.
+ * This will also kick the Tx, if it was waiting on CQ.
+ */
+ af_xdp_write_poll(s, true);
+ return 0;
+ }
+
+ desc = xsk_ring_prod__tx_desc(&s->tx, idx);
+ desc->addr = s->pool[--s->n_pool];
+ desc->len = size;
+
+ data = xsk_umem__get_data(s->buffer, desc->addr);
+ memcpy(data, buf, size);
+
+ xsk_ring_prod__submit(&s->tx, 1);
+ s->outstanding_tx++;
+
+ if (xsk_ring_prod__needs_wakeup(&s->tx)) {
+ af_xdp_write_poll(s, true);
+ }
+
+ return size;
+}
+
+/*
+ * Complete a previous send (backend --> guest) and enable the
+ * fd_read callback.
+ */
+static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
+{
+ AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+ af_xdp_read_poll(s, true);
+}
+
+static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
+{
+ uint32_t i, idx = 0;
+
+ /* Leave one packet for Tx, just in case. */
+ if (s->n_pool < n + 1) {
+ n = s->n_pool;
+ }
+
+ if (!n || !xsk_ring_prod__reserve(&s->fq, n, &idx)) {
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ *xsk_ring_prod__fill_addr(&s->fq, idx++) = s->pool[--s->n_pool];
+ }
+ xsk_ring_prod__submit(&s->fq, n);
+
+ if (xsk_ring_prod__needs_wakeup(&s->fq)) {
+ /* Receive was blocked by not having enough buffers. Wake it up. */
+ af_xdp_read_poll(s, true);
+ }
+}
+
+static void af_xdp_send(void *opaque)
+{
+ uint32_t i, n_rx, idx = 0;
+ AFXDPState *s = opaque;
+
+ n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
+ if (!n_rx) {
+ return;
+ }
+
+ for (i = 0; i < n_rx; i++) {
+ const struct xdp_desc *desc;
+ struct iovec iov;
+
+ desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
+
+ iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
+ iov.iov_len = desc->len;
+
+ s->pool[s->n_pool++] = desc->addr;
+
+ if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
+ af_xdp_send_completed)) {
+ /*
+ * The peer does not receive anymore. Packet is queued, stop
+ * reading from the backend until af_xdp_send_completed().
+ */
+ af_xdp_read_poll(s, false);
+
+ /* Return unused descriptors to not break the ring cache. */
+ xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
+ n_rx = i + 1;
+ break;
+ }
+ }
+
+ /* Release actually sent descriptors and try to re-fill. */
+ xsk_ring_cons__release(&s->rx, n_rx);
+ af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
+}
+
+/* Flush and close. */
+static void af_xdp_cleanup(NetClientState *nc)
+{
+ AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+ qemu_purge_queued_packets(nc);
+
+ af_xdp_poll(nc, false);
+
+ xsk_socket__delete(s->xsk);
+ s->xsk = NULL;
+ g_free(s->pool);
+ s->pool = NULL;
+ xsk_umem__delete(s->umem);
+ s->umem = NULL;
+ qemu_vfree(s->buffer);
+ s->buffer = NULL;
+
+ /* Remove the program if it's the last open queue. */
+ if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
+ && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
+ fprintf(stderr,
+ "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
+ s->ifname, s->ifindex);
+ }
+}
+
+static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
+{
+ struct xsk_umem_config config = {
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+ .frame_headroom = 0,
+ };
+ uint64_t n_descs;
+ uint64_t size;
+ int64_t i;
+ int ret;
+
+ /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
+ n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
+ + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
+ size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
+
+ s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
+ memset(s->buffer, 0, size);
+
+ if (sock_fd < 0) {
+ ret = xsk_umem__create(&s->umem, s->buffer, size,
+ &s->fq, &s->cq, &config);
+ } else {
+ ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
+ &s->fq, &s->cq, &config);
+ }
+
+ if (ret) {
+ qemu_vfree(s->buffer);
+ error_setg_errno(errp, errno,
+ "failed to create umem for %s queue_index: %d",
+ s->ifname, s->nc.queue_index);
+ return -1;
+ }
+
+ s->pool = g_new(uint64_t, n_descs);
+ /* Fill the pool in the opposite order, because it's a LIFO queue. */
+ for (i = n_descs; i >= 0; i--) {
+ s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
+ }
+ s->n_pool = n_descs;
+
+ af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
+
+ return 0;
+}
+
+static int af_xdp_socket_create(AFXDPState *s,
+ const NetdevAFXDPOptions *opts, Error **errp)
+{
+ struct xsk_socket_config cfg = {
+ .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .libxdp_flags = 0,
+ .bind_flags = XDP_USE_NEED_WAKEUP,
+ .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
+ };
+ int queue_id, error = 0;
+
+ s->inhibit = opts->has_inhibit && opts->inhibit;
+ if (s->inhibit) {
+ cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
+ }
+
+ if (opts->has_force_copy && opts->force_copy) {
+ cfg.bind_flags |= XDP_COPY;
+ }
+
+ queue_id = s->nc.queue_index;
+ if (opts->has_start_queue && opts->start_queue > 0) {
+ queue_id += opts->start_queue;
+ }
+
+ if (opts->has_mode) {
+ /* Specific mode requested. */
+ cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
+ ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
+ if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+ s->umem, &s->rx, &s->tx, &cfg)) {
+ error = errno;
+ }
+ } else {
+ /* No mode requested, try native first. */
+ cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+ if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+ s->umem, &s->rx, &s->tx, &cfg)) {
+ /* Can't use native mode, try skb. */
+ cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
+ cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
+
+ if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+ s->umem, &s->rx, &s->tx, &cfg)) {
+ error = errno;
+ }
+ }
+ }
+
+ if (error) {
+ error_setg_errno(errp, error,
+ "failed to create AF_XDP socket for %s queue_id: %d",
+ s->ifname, queue_id);
+ return -1;
+ }
+
+ s->xdp_flags = cfg.xdp_flags;
+
+ return 0;
+}
+
+/* NetClientInfo methods. */
+static NetClientInfo net_af_xdp_info = {
+ .type = NET_CLIENT_DRIVER_AF_XDP,
+ .size = sizeof(AFXDPState),
+ .receive = af_xdp_receive,
+ .poll = af_xdp_poll,
+ .cleanup = af_xdp_cleanup,
+};
+
+static int *parse_socket_fds(const char *sock_fds_str,
+ int64_t n_expected, Error **errp)
+{
+ gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
+ int64_t i, n_sock_fds = g_strv_length(substrings);
+ int *sock_fds = NULL;
+
+ if (n_sock_fds != n_expected) {
+ error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
+ n_expected, n_sock_fds);
+ goto exit;
+ }
+
+ sock_fds = g_new(int, n_sock_fds);
+
+ for (i = 0; i < n_sock_fds; i++) {
+ sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
+ if (sock_fds[i] < 0) {
+ g_free(sock_fds);
+ sock_fds = NULL;
+ goto exit;
+ }
+ }
+
+exit:
+ g_strfreev(substrings);
+ return sock_fds;
+}
+
+/*
+ * The exported init function.
+ *
+ * ... -netdev af-xdp,ifname="..."
+ */
+int net_init_af_xdp(const Netdev *netdev,
+ const char *name, NetClientState *peer, Error **errp)
+{
+ const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
+ NetClientState *nc, *nc0 = NULL;
+ unsigned int ifindex;
+ uint32_t prog_id = 0;
+ int *sock_fds = NULL;
+ int64_t i, queues;
+ Error *err = NULL;
+ AFXDPState *s;
+
+ ifindex = if_nametoindex(opts->ifname);
+ if (!ifindex) {
+ error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
+ opts->ifname);
+ return -1;
+ }
+
+ queues = opts->has_queues ? opts->queues : 1;
+ if (queues < 1) {
+ error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
+ queues, opts->ifname);
+ return -1;
+ }
+
+ if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
+ error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
+ return -1;
+ }
+
+ if (opts->sock_fds) {
+ sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
+ if (!sock_fds) {
+ return -1;
+ }
+ }
+
+ for (i = 0; i < queues; i++) {
+ nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
+ qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
+ nc->queue_index = i;
+
+ if (!nc0) {
+ nc0 = nc;
+ }
+
+ s = DO_UPCAST(AFXDPState, nc, nc);
+
+ pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
+ s->ifindex = ifindex;
+ s->n_queues = queues;
+
+ if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
+ || af_xdp_socket_create(s, opts, errp)) {
+ /* Make sure the XDP program will be removed. */
+ s->n_queues = i;
+ error_propagate(errp, err);
+ goto err;
+ }
+ }
+
+ if (nc0) {
+ s = DO_UPCAST(AFXDPState, nc, nc0);
+ if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
+ error_setg_errno(errp, errno,
+ "no XDP program loaded on '%s', ifindex: %d",
+ s->ifname, s->ifindex);
+ goto err;
+ }
+ }
+
+ af_xdp_read_poll(s, true); /* Initially only poll for reads. */
+
+ return 0;
+
+err:
+ g_free(sock_fds);
+ if (nc0) {
+ qemu_del_net_client(nc0);
+ }
+
+ return -1;
+}
diff --git a/net/clients.h b/net/clients.h
index ed8bdff..be53794 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -64,6 +64,11 @@ int net_init_netmap(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
+#ifdef CONFIG_AF_XDP
+int net_init_af_xdp(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif
+
int net_init_vhost_user(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
diff --git a/net/meson.build b/net/meson.build
index 51caa42..ce99bd4 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -36,6 +36,9 @@ system_ss.add(when: vde, if_true: files('vde.c'))
if have_netmap
system_ss.add(files('netmap.c'))
endif
+
+system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
+
if have_vhost_net_user
system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
system_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c'))
diff --git a/net/net.c b/net/net.c
index b110e61..1c0bfda 100644
--- a/net/net.c
+++ b/net/net.c
@@ -1091,6 +1091,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
#ifdef CONFIG_NETMAP
[NET_CLIENT_DRIVER_NETMAP] = net_init_netmap,
#endif
+#ifdef CONFIG_AF_XDP
+ [NET_CLIENT_DRIVER_AF_XDP] = net_init_af_xdp,
+#endif
#ifdef CONFIG_NET_BRIDGE
[NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge,
#endif
@@ -1195,6 +1198,9 @@ void show_netdevs(void)
#ifdef CONFIG_NETMAP
"netmap",
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp",
+#endif
#ifdef CONFIG_POSIX
"vhost-user",
#endif
diff --git a/qapi/net.json b/qapi/net.json
index 313c8a6..8095b68 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -409,6 +409,60 @@
'*devname': 'str' } }
##
+# @AFXDPMode:
+#
+# Attach mode for a default XDP program
+#
+# @skb: generic mode, no driver support necessary
+#
+# @native: DRV mode, program is attached to a driver, packets are passed to
+# the socket without allocation of skb.
+#
+# Since: 8.2
+##
+{ 'enum': 'AFXDPMode',
+ 'data': [ 'native', 'skb' ],
+ 'if': 'CONFIG_AF_XDP' }
+
+##
+# @NetdevAFXDPOptions:
+#
+# AF_XDP network backend
+#
+# @ifname: The name of an existing network interface.
+#
+# @mode: Attach mode for a default XDP program. If not specified, then
+# 'native' will be tried first, then 'skb'.
+#
+# @force-copy: Force XDP copy mode even if device supports zero-copy.
+# (default: false)
+#
+# @queues: number of queues to be used for multiqueue interfaces (default: 1).
+#
+# @start-queue: Use @queues starting from this queue number (default: 0).
+#
+# @inhibit: Don't load a default XDP program, use one already loaded to
+# the interface (default: false). Requires @sock-fds.
+#
+# @sock-fds: A colon (:) separated list of file descriptors for already open
+# but not bound AF_XDP sockets in the queue order. One fd per queue.
+# These descriptors should already be added into XDP socket map for
+# corresponding queues. Requires @inhibit.
+#
+# Since: 8.2
+##
+{ 'struct': 'NetdevAFXDPOptions',
+ 'data': {
+ 'ifname': 'str',
+ '*mode': 'AFXDPMode',
+ '*force-copy': 'bool',
+ '*queues': 'int',
+ '*start-queue': 'int',
+ '*inhibit': 'bool',
+ '*sock-fds': 'str' },
+ 'if': 'CONFIG_AF_XDP' }
+
+##
# @NetdevVhostUserOptions:
#
# Vhost-user network backend
@@ -642,6 +696,7 @@
# @vmnet-bridged: since 7.1
# @stream: since 7.2
# @dgram: since 7.2
+# @af-xdp: since 8.2
#
# Since: 2.7
##
@@ -649,6 +704,7 @@
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
'vhost-vdpa',
+ { 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
@@ -679,6 +735,8 @@
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
+ 'af-xdp': { 'type': 'NetdevAFXDPOptions',
+ 'if': 'CONFIG_AF_XDP' },
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions',
'vmnet-host': { 'type': 'NetdevVmnetHostOptions',
diff --git a/qemu-options.hx b/qemu-options.hx
index 6be621c..2bcf7e4 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2882,6 +2882,19 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
" VALE port (created on the fly) called 'name' ('nmname' is name of the \n"
" netmap device, defaults to '/dev/netmap')\n"
#endif
+#ifdef CONFIG_AF_XDP
+ "-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
+ " [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
+ " attach to the existing network interface 'name' with AF_XDP socket\n"
+ " use 'mode=MODE' to specify an XDP program attach mode\n"
+ " use 'force-copy=on|off' to force XDP copy mode even if device supports zero-copy (default: off)\n"
+ " use 'inhibit=on|off' to inhibit loading of a default XDP program (default: off)\n"
+ " with inhibit=on,\n"
+ " use 'sock-fds' to provide file descriptors for already open AF_XDP sockets\n"
+ " added to a socket map in XDP program. One socket per queue.\n"
+ " use 'queues=n' to specify how many queues of a multiqueue interface should be used\n"
+ " use 'start-queue=m' to specify the first queue that should be used\n"
+#endif
#ifdef CONFIG_POSIX
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
" configure a vhost-user network, backed by a chardev 'dev'\n"
@@ -2927,6 +2940,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp|"
+#endif
#ifdef CONFIG_POSIX
"vhost-user|"
#endif
@@ -2955,6 +2971,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp|"
+#endif
#ifdef CONFIG_VMNET
"vmnet-host|vmnet-shared|vmnet-bridged|"
#endif
@@ -2962,7 +2981,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
" old way to initialize a host network interface\n"
" (use the -netdev option if possible instead)\n", QEMU_ARCH_ALL)
SRST
-``-nic [tap|bridge|user|l2tpv3|vde|netmap|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
+``-nic [tap|bridge|user|l2tpv3|vde|netmap|af-xdp|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
This option is a shortcut for configuring both the on-board
(default) guest NIC hardware and the host network backend in one go.
The host backend options are the same as with the corresponding
@@ -3376,6 +3395,55 @@ SRST
# launch QEMU instance
|qemu_system| linux.img -nic vde,sock=/tmp/myswitch
+``-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
+ Configure AF_XDP backend to connect to a network interface 'name'
+ using AF_XDP socket. A specific program attach mode for a default
+ XDP program can be forced with 'mode', defaults to best-effort,
+ where the likely most performant mode will be in use. Number of queues
+ 'n' should generally match the number or queues in the interface,
+ defaults to 1. Traffic arriving on non-configured device queues will
+ not be delivered to the network backend.
+
+ .. parsed-literal::
+
+ # set number of queues to 4
+ ethtool -L eth0 combined 4
+ # launch QEMU instance
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=4
+
+ 'start-queue' option can be specified if a particular range of queues
+ [m, m + n] should be in use. For example, this may be necessary in
+ order to use certain NICs in native mode. Kernel allows the driver to
+ create a separate set of XDP queues on top of regular ones, and only
+ these queues can be used for AF_XDP sockets. NICs that work this way
+ may also require an additional traffic redirection with ethtool to these
+ special queues.
+
+ .. parsed-literal::
+
+ # set number of queues to 1
+ ethtool -L eth0 combined 1
+ # redirect all the traffic to the second queue (id: 1)
+ # note: drivers may require non-empty key/mask pair.
+ ethtool -N eth0 flow-type ether \\
+ dst 00:00:00:00:00:00 m FF:FF:FF:FF:FF:FE action 1
+ ethtool -N eth0 flow-type ether \\
+ dst 00:00:00:00:00:01 m FF:FF:FF:FF:FF:FE action 1
+ # launch QEMU instance
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
+
+ XDP program can also be loaded externally. In this case 'inhibit' option
+ should be set to 'on' and 'sock-fds' provided with file descriptors for
+ already open but not bound XDP sockets already added to a socket map for
+ corresponding queues. One socket per queue.
+
+ .. parsed-literal::
+
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
+
``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
Establish a vhost-user netdev, backed by a chardev id. The chardev
should be a unix domain socket backed one. The vhost-user uses a
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index 131f8ee..76781f1 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -35,6 +35,7 @@
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
--tls-priority=@QEMU,SYSTEM \
+--disable-af-xdp \
--disable-attr \
--disable-auth-pam \
--disable-avx2 \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index e1d1783..2301193 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -76,6 +76,7 @@ meson_options_help() {
printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if available'
printf "%s\n" '(unless built with --without-default-features):'
printf "%s\n" ''
+ printf "%s\n" ' af-xdp AF_XDP network backend support'
printf "%s\n" ' alsa ALSA sound support'
printf "%s\n" ' attr attr/xattr support'
printf "%s\n" ' auth-pam PAM access control'
@@ -208,6 +209,8 @@ meson_options_help() {
}
_meson_option_parse() {
case $1 in
+ --enable-af-xdp) printf "%s" -Daf_xdp=enabled ;;
+ --disable-af-xdp) printf "%s" -Daf_xdp=disabled ;;
--enable-alsa) printf "%s" -Dalsa=enabled ;;
--disable-alsa) printf "%s" -Dalsa=disabled ;;
--enable-attr) printf "%s" -Dattr=enabled ;;
diff --git a/tests/docker/dockerfiles/alpine.docker b/tests/docker/dockerfiles/alpine.docker
index fa455f1..d25649c 100644
--- a/tests/docker/dockerfiles/alpine.docker
+++ b/tests/docker/dockerfiles/alpine.docker
@@ -59,6 +59,7 @@ RUN apk update && \
libtasn1-dev \
liburing-dev \
libusb-dev \
+ libxdp-dev \
linux-pam-dev \
llvm \
lttng-ust-dev \
diff --git a/tests/docker/dockerfiles/centos8.docker b/tests/docker/dockerfiles/centos8.docker
index fc18309..68bfe60 100644
--- a/tests/docker/dockerfiles/centos8.docker
+++ b/tests/docker/dockerfiles/centos8.docker
@@ -75,6 +75,7 @@ RUN dnf distro-sync -y && \
libubsan \
liburing-devel \
libusbx-devel \
+ libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \
diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker
index c5b6c96..f00e9e2 100644
--- a/tests/docker/dockerfiles/fedora.docker
+++ b/tests/docker/dockerfiles/fedora.docker
@@ -82,6 +82,7 @@ exec "$@"\n' > /usr/bin/nosync && \
libubsan \
liburing-devel \
libusbx-devel \
+ libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \
diff --git a/tests/lcitool/projects/qemu.yml b/tests/lcitool/projects/qemu.yml
index 584f78c..6f08851 100644
--- a/tests/lcitool/projects/qemu.yml
+++ b/tests/lcitool/projects/qemu.yml
@@ -69,6 +69,7 @@ packages:
- liburing
- libusbx
- libvdeplug
+ - libxdp
- libzstd
- llvm
- lttng-ust
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 14/17] hw/net/fsl_etsec/rings.c: Avoid variable length array
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (12 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 13/17] net: add initial support for AF_XDP network backend Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 15/17] hw/net/rocker: " Jason Wang
` (3 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Peter Maydell, Philippe Mathieu-Daudé, Jason Wang
From: Peter Maydell <peter.maydell@linaro.org>
In fill_rx_bd() we create a variable length array of size
etsec->rx_padding. In fact we know that this will never be
larger than 64 bytes, because rx_padding is set in rx_init_frame()
in a way that ensures it is only that large. Use a fixed sized
array and assert that it is big enough.
Since padd[] is now potentially rather larger than the actual
padding required, adjust the memset() we do on it to match the
size that we write with cpu_physical_memory_write(), rather than
clearing the entire array.
The codebase has very few VLAs, and if we can get rid of them all we
can make the compiler error on new additions. This is a defensive
measure against security bugs where an on-stack dynamic allocation
isn't correctly size-checked (e.g. CVE-2021-3527).
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/fsl_etsec/rings.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
index 788463f..2f2f359 100644
--- a/hw/net/fsl_etsec/rings.c
+++ b/hw/net/fsl_etsec/rings.c
@@ -372,6 +372,12 @@ void etsec_walk_tx_ring(eTSEC *etsec, int ring_nbr)
etsec->regs[TSTAT].value |= 1 << (31 - ring_nbr);
}
+/*
+ * rx_init_frame() ensures we never do more padding than this
+ * (checksum plus minimum data packet size)
+ */
+#define MAX_RX_PADDING 64
+
static void fill_rx_bd(eTSEC *etsec,
eTSEC_rxtx_bd *bd,
const uint8_t **buf,
@@ -380,9 +386,11 @@ static void fill_rx_bd(eTSEC *etsec,
uint16_t to_write;
hwaddr bufptr = bd->bufptr +
((hwaddr)(etsec->regs[TBDBPH].value & 0xF) << 32);
- uint8_t padd[etsec->rx_padding];
+ uint8_t padd[MAX_RX_PADDING];
uint8_t rem;
+ assert(etsec->rx_padding <= MAX_RX_PADDING);
+
RING_DEBUG("eTSEC fill Rx buffer @ 0x%016" HWADDR_PRIx
" size:%zu(padding + crc:%u) + fcb:%u\n",
bufptr, *size, etsec->rx_padding, etsec->rx_fcb_size);
@@ -426,7 +434,7 @@ static void fill_rx_bd(eTSEC *etsec,
rem = MIN(etsec->regs[MRBLR].value - bd->length, etsec->rx_padding);
if (rem > 0) {
- memset(padd, 0x0, sizeof(padd));
+ memset(padd, 0x0, rem);
etsec->rx_padding -= rem;
*size -= rem;
bd->length += rem;
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 15/17] hw/net/rocker: Avoid variable length array
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (13 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 14/17] hw/net/fsl_etsec/rings.c: Avoid variable length array Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 16/17] net/dump: " Jason Wang
` (2 subsequent siblings)
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Peter Maydell, Francisco Iglesias, Jason Wang
From: Peter Maydell <peter.maydell@linaro.org>
Replace an on-stack variable length array in of_dpa_ig() with
a g_autofree heap allocation.
The codebase has very few VLAs, and if we can get rid of them all we
can make the compiler error on new additions. This is a defensive
measure against security bugs where an on-stack dynamic allocation
isn't correctly size-checked (e.g. CVE-2021-3527).
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Francisco Iglesias <frasse.iglesias@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/rocker/rocker_of_dpa.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c
index dfe4754..5e16056 100644
--- a/hw/net/rocker/rocker_of_dpa.c
+++ b/hw/net/rocker/rocker_of_dpa.c
@@ -1043,7 +1043,7 @@ static void of_dpa_flow_ig_tbl(OfDpaFlowContext *fc, uint32_t tbl_id)
static ssize_t of_dpa_ig(World *world, uint32_t pport,
const struct iovec *iov, int iovcnt)
{
- struct iovec iov_copy[iovcnt + 2];
+ g_autofree struct iovec *iov_copy = g_new(struct iovec, iovcnt + 2);
OfDpaFlowContext fc = {
.of_dpa = world_private(world),
.in_pport = pport,
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 16/17] net/dump: Avoid variable length array
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (14 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 15/17] hw/net/rocker: " Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-18 8:31 ` [PULL V2 17/17] net/tap: Avoid variable-length array Jason Wang
2023-09-19 19:12 ` [PULL V2 00/17] Net patches Stefan Hajnoczi
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Peter Maydell, Francisco Iglesias, Jason Wang
From: Peter Maydell <peter.maydell@linaro.org>
Use a g_autofree heap allocation instead of a variable length
array in dump_receive_iov().
The codebase has very few VLAs, and if we can get rid of them all we
can make the compiler error on new additions. This is a defensive
measure against security bugs where an on-stack dynamic allocation
isn't correctly size-checked (e.g. CVE-2021-3527).
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Francisco Iglesias <frasse.iglesias@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
net/dump.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/dump.c b/net/dump.c
index 7d05f16..16073f2 100644
--- a/net/dump.c
+++ b/net/dump.c
@@ -68,7 +68,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
int64_t ts;
int caplen;
size_t size = iov_size(iov, cnt) - offset;
- struct iovec dumpiov[cnt + 1];
+ g_autofree struct iovec *dumpiov = g_new(struct iovec, cnt + 1);
/* Early return in case of previous error. */
if (s->fd < 0) {
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PULL V2 17/17] net/tap: Avoid variable-length array
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (15 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 16/17] net/dump: " Jason Wang
@ 2023-09-18 8:31 ` Jason Wang
2023-09-19 19:12 ` [PULL V2 00/17] Net patches Stefan Hajnoczi
17 siblings, 0 replies; 19+ messages in thread
From: Jason Wang @ 2023-09-18 8:31 UTC (permalink / raw)
To: qemu-devel; +Cc: Peter Maydell, Francisco Iglesias, Jason Wang
From: Peter Maydell <peter.maydell@linaro.org>
Use a heap allocation instead of a variable length array in
tap_receive_iov().
The codebase has very few VLAs, and if we can get rid of them all we
can make the compiler error on new additions. This is a defensive
measure against security bugs where an on-stack dynamic allocation
isn't correctly size-checked (e.g. CVE-2021-3527).
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Francisco Iglesias <frasse.iglesias@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
net/tap.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/tap.c b/net/tap.c
index bcea8d0..c23d032 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -118,10 +118,11 @@ static ssize_t tap_receive_iov(NetClientState *nc, const struct iovec *iov,
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
const struct iovec *iovp = iov;
- struct iovec iov_copy[iovcnt + 1];
+ g_autofree struct iovec *iov_copy = NULL;
struct virtio_net_hdr_mrg_rxbuf hdr = { };
if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
+ iov_copy = g_new(struct iovec, iovcnt + 1);
iov_copy[0].iov_base = &hdr;
iov_copy[0].iov_len = s->host_vnet_hdr_len;
memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
--
2.7.4
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PULL V2 00/17] Net patches
2023-09-18 8:31 [PULL V2 00/17] Net patches Jason Wang
` (16 preceding siblings ...)
2023-09-18 8:31 ` [PULL V2 17/17] net/tap: Avoid variable-length array Jason Wang
@ 2023-09-19 19:12 ` Stefan Hajnoczi
17 siblings, 0 replies; 19+ messages in thread
From: Stefan Hajnoczi @ 2023-09-19 19:12 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel
[-- Attachment #1: Type: text/plain, Size: 115 bytes --]
Applied, thanks.
Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any user-visible changes.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 19+ messages in thread