* [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
@ 2025-05-08 12:34 Daniel Borkmann via
2025-05-08 22:53 ` Ilya Maximets
0 siblings, 1 reply; 8+ messages in thread
From: Daniel Borkmann via @ 2025-05-08 12:34 UTC (permalink / raw)
To: qemu-devel; +Cc: daniel, Ilya Maximets, Jason Wang, Anton Protopopov
Extend inhibit=on setting with the option to specify a pinned XSK map
path along with a starting index (default 0) to push the created XSK
sockets into. Example usage:
# ./build/qemu-system-x86_64 [...] \
-netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
-device virtio-net-pci,netdev=net0 [...]
This is useful for the case where an existing XDP program with XSK map
is present on the AF_XDP supported phys device and the XSK map not yet
populated. Qemu will then push the XSK sockets into the specified map.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ilya Maximets <i.maximets@ovn.org>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Anton Protopopov <aspsk@isovalent.com>
---
net/af-xdp.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------
qapi/net.json | 24 +++++++++++++-------
2 files changed, 72 insertions(+), 15 deletions(-)
diff --git a/net/af-xdp.c b/net/af-xdp.c
index 01c5fb914e..ddc52f1307 100644
--- a/net/af-xdp.c
+++ b/net/af-xdp.c
@@ -51,6 +51,8 @@ typedef struct AFXDPState {
uint32_t n_queues;
uint32_t xdp_flags;
+ char *map_path;
+ uint32_t map_start_index;
bool inhibit;
} AFXDPState;
@@ -261,6 +263,7 @@ static void af_xdp_send(void *opaque)
static void af_xdp_cleanup(NetClientState *nc)
{
AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+ int pin_fd, idx;
qemu_purge_queued_packets(nc);
@@ -282,6 +285,22 @@ static void af_xdp_cleanup(NetClientState *nc)
"af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
s->ifname, s->ifindex);
}
+ if (s->map_path) {
+ pin_fd = bpf_obj_get(s->map_path);
+ if (pin_fd < 0) {
+ fprintf(stderr,
+ "af-xdp: unable to remove %s's XSK sockets from '%s', ifindex: %d\n",
+ s->ifname, s->map_path, s->ifindex);
+ } else {
+ idx = nc->queue_index;
+ if (s->map_start_index > 0) {
+ idx += s->map_start_index;
+ }
+ bpf_map_delete_elem(pin_fd, &idx);
+ close(pin_fd);
+ }
+ }
+ g_free(s->map_path);
}
static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
@@ -343,7 +362,7 @@ static int af_xdp_socket_create(AFXDPState *s,
.bind_flags = XDP_USE_NEED_WAKEUP,
.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
};
- int queue_id, error = 0;
+ int queue_id, pin_fd, xsk_fd, idx, error = 0;
s->inhibit = opts->has_inhibit && opts->inhibit;
if (s->inhibit) {
@@ -384,6 +403,23 @@ static int af_xdp_socket_create(AFXDPState *s,
}
}
+ if (!error && s->map_path) {
+ pin_fd = bpf_obj_get(s->map_path);
+ if (pin_fd < 0) {
+ error = errno;
+ } else {
+ xsk_fd = xsk_socket__fd(s->xsk);
+ idx = s->nc.queue_index;
+ if (s->map_start_index) {
+ idx += s->map_start_index;
+ }
+ if (bpf_map_update_elem(pin_fd, &idx, &xsk_fd, 0)) {
+ error = errno;
+ }
+ close(pin_fd);
+ }
+ }
+
if (error) {
error_setg_errno(errp, error,
"failed to create AF_XDP socket for %s queue_id: %d",
@@ -465,8 +501,8 @@ int net_init_af_xdp(const Netdev *netdev,
return -1;
}
- if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
- error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
+ if ((opts->has_inhibit && opts->inhibit) != !!(opts->sock_fds || opts->map_path)) {
+ error_setg(errp, "'inhibit=on' requires 'sock-fds' or 'map-path' and vice versa");
return -1;
}
@@ -491,6 +527,12 @@ int net_init_af_xdp(const Netdev *netdev,
pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
s->ifindex = ifindex;
s->n_queues = queues;
+ if (opts->map_path) {
+ s->map_path = g_strdup(opts->map_path);
+ if (opts->has_map_start_index && opts->map_start_index > 0) {
+ s->map_start_index = opts->map_start_index;
+ }
+ }
if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
|| af_xdp_socket_create(s, opts, errp)) {
@@ -504,10 +546,17 @@ int net_init_af_xdp(const Netdev *netdev,
if (nc0) {
s = DO_UPCAST(AFXDPState, nc, nc0);
if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
- error_setg_errno(errp, errno,
- "no XDP program loaded on '%s', ifindex: %d",
- s->ifname, s->ifindex);
- goto err;
+ if (!s->map_path) {
+ error_setg_errno(errp, errno,
+ "no XDP program loaded on '%s', ifindex: %d",
+ s->ifname, s->ifindex);
+ goto err;
+ } else {
+ warn_report("no XDP program loaded on '%s', ifindex: %d, "
+ "only %"PRIi64" XSK socket%s loaded into map %s at this point",
+ s->ifname, s->ifindex, queues, queues > 1 ? "s" : "",
+ s->map_path);
+ }
}
}
diff --git a/qapi/net.json b/qapi/net.json
index 310cc4fd19..c750b805e8 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -454,7 +454,7 @@
# (default: 0).
#
# @inhibit: Don't load a default XDP program, use one already loaded
-# to the interface (default: false). Requires @sock-fds.
+# to the interface (default: false). Requires @sock-fds or @map-path.
#
# @sock-fds: A colon (:) separated list of file descriptors for
# already open but not bound AF_XDP sockets in the queue order.
@@ -462,17 +462,25 @@
# into XDP socket map for corresponding queues. Requires
# @inhibit.
#
+# @map-path: The path to a pinned xsk map to push file descriptors
+# for bound AF_XDP sockets into. Requires @inhibit.
+#
+# @map-start-index: Use @map-path to insert xsk sockets starting from
+# this index number (default: 0). Requires @map-path.
+#
# Since: 8.2
##
{ 'struct': 'NetdevAFXDPOptions',
'data': {
- 'ifname': 'str',
- '*mode': 'AFXDPMode',
- '*force-copy': 'bool',
- '*queues': 'int',
- '*start-queue': 'int',
- '*inhibit': 'bool',
- '*sock-fds': 'str' },
+ 'ifname': 'str',
+ '*mode': 'AFXDPMode',
+ '*force-copy': 'bool',
+ '*queues': 'int',
+ '*start-queue': 'int',
+ '*inhibit': 'bool',
+ '*sock-fds': 'str',
+ '*map-path': 'str',
+ '*map-start-index': 'int' },
'if': 'CONFIG_AF_XDP' }
##
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-08 12:34 [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets Daniel Borkmann via
@ 2025-05-08 22:53 ` Ilya Maximets
2025-05-09 14:05 ` Daniel Borkmann via
0 siblings, 1 reply; 8+ messages in thread
From: Ilya Maximets @ 2025-05-08 22:53 UTC (permalink / raw)
To: Daniel Borkmann, qemu-devel; +Cc: i.maximets, Jason Wang, Anton Protopopov
On 5/8/25 2:34 PM, Daniel Borkmann wrote:
> Extend inhibit=on setting with the option to specify a pinned XSK map
> path along with a starting index (default 0) to push the created XSK
> sockets into. Example usage:
>
> # ./build/qemu-system-x86_64 [...] \
> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
> -device virtio-net-pci,netdev=net0 [...]
>
> This is useful for the case where an existing XDP program with XSK map
> is present on the AF_XDP supported phys device and the XSK map not yet
> populated. Qemu will then push the XSK sockets into the specified map.
Hi, Daniel.
Thanks for the patch!
Could you, please, explain the use case a little more? Is this patch
aiming to improve usability? Do you have a specific use case in mind?
The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
of privileges to use the pre-loaded program and the pre-created sockets.
But creating the sockets and setting them into a map doesn't allow us to
run without privileges, IIUC. May be worth mentioning at least in the
commit message.
Also, isn't map-start-index the same thing as start-queue ? Do we need
both of them?
I didn't read much into the code yet, but this patch is missing a few
bits of documentation, e.g. it's missing an update for qemu-options.hx.
See a few other quick comment below.
Best regards, Ilya Maximets.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Ilya Maximets <i.maximets@ovn.org>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Anton Protopopov <aspsk@isovalent.com>
> ---
> net/af-xdp.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------
> qapi/net.json | 24 +++++++++++++-------
> 2 files changed, 72 insertions(+), 15 deletions(-)
>
> diff --git a/net/af-xdp.c b/net/af-xdp.c
> index 01c5fb914e..ddc52f1307 100644
> --- a/net/af-xdp.c
> +++ b/net/af-xdp.c
> @@ -51,6 +51,8 @@ typedef struct AFXDPState {
>
> uint32_t n_queues;
> uint32_t xdp_flags;
> + char *map_path;
> + uint32_t map_start_index;
> bool inhibit;
> } AFXDPState;
>
> @@ -261,6 +263,7 @@ static void af_xdp_send(void *opaque)
> static void af_xdp_cleanup(NetClientState *nc)
> {
> AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
> + int pin_fd, idx;
>
> qemu_purge_queued_packets(nc);
>
> @@ -282,6 +285,22 @@ static void af_xdp_cleanup(NetClientState *nc)
> "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
> s->ifname, s->ifindex);
> }
> + if (s->map_path) {
> + pin_fd = bpf_obj_get(s->map_path);
> + if (pin_fd < 0) {
> + fprintf(stderr,
> + "af-xdp: unable to remove %s's XSK sockets from '%s', ifindex: %d\n",
> + s->ifname, s->map_path, s->ifindex);
> + } else {
> + idx = nc->queue_index;
> + if (s->map_start_index > 0) {
> + idx += s->map_start_index;
> + }
> + bpf_map_delete_elem(pin_fd, &idx);
> + close(pin_fd);
> + }
> + }
> + g_free(s->map_path);
> }
>
> static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
> @@ -343,7 +362,7 @@ static int af_xdp_socket_create(AFXDPState *s,
> .bind_flags = XDP_USE_NEED_WAKEUP,
> .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
> };
> - int queue_id, error = 0;
> + int queue_id, pin_fd, xsk_fd, idx, error = 0;
>
> s->inhibit = opts->has_inhibit && opts->inhibit;
> if (s->inhibit) {
> @@ -384,6 +403,23 @@ static int af_xdp_socket_create(AFXDPState *s,
> }
> }
>
> + if (!error && s->map_path) {
> + pin_fd = bpf_obj_get(s->map_path);
> + if (pin_fd < 0) {
> + error = errno;
> + } else {
> + xsk_fd = xsk_socket__fd(s->xsk);
Indentation is off. Tbas mixed with spaces here and in some other
places in the patch.
> + idx = s->nc.queue_index;
> + if (s->map_start_index) {
> + idx += s->map_start_index;
> + }
> + if (bpf_map_update_elem(pin_fd, &idx, &xsk_fd, 0)) {
> + error = errno;
> + }
> + close(pin_fd);
> + }
> + }
> +
> if (error) {
> error_setg_errno(errp, error,
> "failed to create AF_XDP socket for %s queue_id: %d",
> @@ -465,8 +501,8 @@ int net_init_af_xdp(const Netdev *netdev,
> return -1;
> }
>
> - if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
> - error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
> + if ((opts->has_inhibit && opts->inhibit) != !!(opts->sock_fds || opts->map_path)) {
> + error_setg(errp, "'inhibit=on' requires 'sock-fds' or 'map-path' and vice versa");
This may need some re-wording as 'A requires B or C or vice versa' is
a little hard to understand.
> return -1;
> }
>
> @@ -491,6 +527,12 @@ int net_init_af_xdp(const Netdev *netdev,
> pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
> s->ifindex = ifindex;
> s->n_queues = queues;
> + if (opts->map_path) {
> + s->map_path = g_strdup(opts->map_path);
> + if (opts->has_map_start_index && opts->map_start_index > 0) {
> + s->map_start_index = opts->map_start_index;
> + }
> + }
>
> if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
> || af_xdp_socket_create(s, opts, errp)) {
> @@ -504,10 +546,17 @@ int net_init_af_xdp(const Netdev *netdev,
> if (nc0) {
> s = DO_UPCAST(AFXDPState, nc, nc0);
> if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
> - error_setg_errno(errp, errno,
> - "no XDP program loaded on '%s', ifindex: %d",
> - s->ifname, s->ifindex);
> - goto err;
> + if (!s->map_path) {
> + error_setg_errno(errp, errno,
> + "no XDP program loaded on '%s', ifindex: %d",
> + s->ifname, s->ifindex);
> + goto err;
> + } else {
> + warn_report("no XDP program loaded on '%s', ifindex: %d, "
> + "only %"PRIi64" XSK socket%s loaded into map %s at this point",
> + s->ifname, s->ifindex, queues, queues > 1 ? "s" : "",
> + s->map_path);
How a missing program is not an error? Seems strange.
> + }
> }
> }
>
> diff --git a/qapi/net.json b/qapi/net.json
> index 310cc4fd19..c750b805e8 100644
> --- a/qapi/net.json
> +++ b/qapi/net.json
> @@ -454,7 +454,7 @@
> # (default: 0).
> #
> # @inhibit: Don't load a default XDP program, use one already loaded
> -# to the interface (default: false). Requires @sock-fds.
> +# to the interface (default: false). Requires @sock-fds or @map-path.
> #
> # @sock-fds: A colon (:) separated list of file descriptors for
> # already open but not bound AF_XDP sockets in the queue order.
> @@ -462,17 +462,25 @@
> # into XDP socket map for corresponding queues. Requires
> # @inhibit.
> #
> +# @map-path: The path to a pinned xsk map to push file descriptors
> +# for bound AF_XDP sockets into. Requires @inhibit.
> +#
> +# @map-start-index: Use @map-path to insert xsk sockets starting from
> +# this index number (default: 0). Requires @map-path.
These are new fields that do not exist in the older versions, so
they will need their own 'since' qualifiers.
> +#
> # Since: 8.2
> ##
> { 'struct': 'NetdevAFXDPOptions',
> 'data': {
> - 'ifname': 'str',
> - '*mode': 'AFXDPMode',
> - '*force-copy': 'bool',
> - '*queues': 'int',
> - '*start-queue': 'int',
> - '*inhibit': 'bool',
> - '*sock-fds': 'str' },
> + 'ifname': 'str',
> + '*mode': 'AFXDPMode',
> + '*force-copy': 'bool',
> + '*queues': 'int',
> + '*start-queue': 'int',
> + '*inhibit': 'bool',
> + '*sock-fds': 'str',
> + '*map-path': 'str',
> + '*map-start-index': 'int' },
> 'if': 'CONFIG_AF_XDP' }
>
> ##
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-08 22:53 ` Ilya Maximets
@ 2025-05-09 14:05 ` Daniel Borkmann via
2025-05-12 12:03 ` Ilya Maximets
0 siblings, 1 reply; 8+ messages in thread
From: Daniel Borkmann via @ 2025-05-09 14:05 UTC (permalink / raw)
To: Ilya Maximets, qemu-devel; +Cc: Jason Wang, Anton Protopopov
Hi Ilya,
On 5/9/25 12:53 AM, Ilya Maximets wrote:
> On 5/8/25 2:34 PM, Daniel Borkmann wrote:
>> Extend inhibit=on setting with the option to specify a pinned XSK map
>> path along with a starting index (default 0) to push the created XSK
>> sockets into. Example usage:
>>
>> # ./build/qemu-system-x86_64 [...] \
>> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
>> -device virtio-net-pci,netdev=net0 [...]
>>
>> This is useful for the case where an existing XDP program with XSK map
>> is present on the AF_XDP supported phys device and the XSK map not yet
>> populated. Qemu will then push the XSK sockets into the specified map.
>
> Thanks for the patch!
>
> Could you, please, explain the use case a little more? Is this patch
> aiming to improve usability? Do you have a specific use case in mind?
The use case we have is basically that the phys NIC has an XDP program
already attached which redirects into xsk map (e.g. installed from separate
control plane), the xsk map got pinned during that process into bpf fs,
and now qemu is launched, it creates the xsk sockets and then places them
into the map by gathering the map fd from the pinned bpf fs file.
> The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
> of privileges to use the pre-loaded program and the pre-created sockets.
> But creating the sockets and setting them into a map doesn't allow us to
> run without privileges, IIUC. May be worth mentioning at least in the
> commit message.
Yes, privileges for above use case are still needed. Will clarify in the
description.
> Also, isn't map-start-index the same thing as start-queue ? Do we need
> both of them?
I'd say yes given it does not have to be an exact mapping wrt queue index
to map slot. The default is 0 though and I expect this to be the most used
scenario.
> I didn't read much into the code yet, but this patch is missing a few
> bits of documentation, e.g. it's missing an update for qemu-options.hx.
>
> See a few other quick comment below.
Thanks a lot for the review, appreciate it!
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> Cc: Ilya Maximets <i.maximets@ovn.org>
>> Cc: Jason Wang <jasowang@redhat.com>
>> Cc: Anton Protopopov <aspsk@isovalent.com>
>> ---
>> net/af-xdp.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------
>> qapi/net.json | 24 +++++++++++++-------
>> 2 files changed, 72 insertions(+), 15 deletions(-)
>>
>> diff --git a/net/af-xdp.c b/net/af-xdp.c
>> index 01c5fb914e..ddc52f1307 100644
>> --- a/net/af-xdp.c
>> +++ b/net/af-xdp.c
>> @@ -51,6 +51,8 @@ typedef struct AFXDPState {
>>
>> uint32_t n_queues;
>> uint32_t xdp_flags;
>> + char *map_path;
>> + uint32_t map_start_index;
>> bool inhibit;
>> } AFXDPState;
>>
>> @@ -261,6 +263,7 @@ static void af_xdp_send(void *opaque)
>> static void af_xdp_cleanup(NetClientState *nc)
>> {
>> AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
>> + int pin_fd, idx;
>>
>> qemu_purge_queued_packets(nc);
>>
>> @@ -282,6 +285,22 @@ static void af_xdp_cleanup(NetClientState *nc)
>> "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
>> s->ifname, s->ifindex);
>> }
>> + if (s->map_path) {
>> + pin_fd = bpf_obj_get(s->map_path);
>> + if (pin_fd < 0) {
>> + fprintf(stderr,
>> + "af-xdp: unable to remove %s's XSK sockets from '%s', ifindex: %d\n",
>> + s->ifname, s->map_path, s->ifindex);
>> + } else {
>> + idx = nc->queue_index;
>> + if (s->map_start_index > 0) {
>> + idx += s->map_start_index;
>> + }
>> + bpf_map_delete_elem(pin_fd, &idx);
>> + close(pin_fd);
>> + }
>> + }
>> + g_free(s->map_path);
>> }
>>
>> static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
>> @@ -343,7 +362,7 @@ static int af_xdp_socket_create(AFXDPState *s,
>> .bind_flags = XDP_USE_NEED_WAKEUP,
>> .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
>> };
>> - int queue_id, error = 0;
>> + int queue_id, pin_fd, xsk_fd, idx, error = 0;
>>
>> s->inhibit = opts->has_inhibit && opts->inhibit;
>> if (s->inhibit) {
>> @@ -384,6 +403,23 @@ static int af_xdp_socket_create(AFXDPState *s,
>> }
>> }
>>
>> + if (!error && s->map_path) {
>> + pin_fd = bpf_obj_get(s->map_path);
>> + if (pin_fd < 0) {
>> + error = errno;
>> + } else {
>> + xsk_fd = xsk_socket__fd(s->xsk);
>
> Indentation is off. Tbas mixed with spaces here and in some other
> places in the patch.
Sigh, sorry, will fix (editor was set up for kernel style).
>> + idx = s->nc.queue_index;
>> + if (s->map_start_index) {
>> + idx += s->map_start_index;
>> + }
>> + if (bpf_map_update_elem(pin_fd, &idx, &xsk_fd, 0)) {
>> + error = errno;
>> + }
>> + close(pin_fd);
>> + }
>> + }
>> +
>> if (error) {
>> error_setg_errno(errp, error,
>> "failed to create AF_XDP socket for %s queue_id: %d",
>> @@ -465,8 +501,8 @@ int net_init_af_xdp(const Netdev *netdev,
>> return -1;
>> }
>>
>> - if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
>> - error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
>> + if ((opts->has_inhibit && opts->inhibit) != !!(opts->sock_fds || opts->map_path)) {
>> + error_setg(errp, "'inhibit=on' requires 'sock-fds' or 'map-path' and vice versa");
>
> This may need some re-wording as 'A requires B or C or vice versa' is
> a little hard to understand.
ack, will do
>> return -1;
>> }
>>
>> @@ -491,6 +527,12 @@ int net_init_af_xdp(const Netdev *netdev,
>> pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
>> s->ifindex = ifindex;
>> s->n_queues = queues;
>> + if (opts->map_path) {
>> + s->map_path = g_strdup(opts->map_path);
>> + if (opts->has_map_start_index && opts->map_start_index > 0) {
>> + s->map_start_index = opts->map_start_index;
>> + }
>> + }
>>
>> if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
>> || af_xdp_socket_create(s, opts, errp)) {
>> @@ -504,10 +546,17 @@ int net_init_af_xdp(const Netdev *netdev,
>> if (nc0) {
>> s = DO_UPCAST(AFXDPState, nc, nc0);
>> if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
>> - error_setg_errno(errp, errno,
>> - "no XDP program loaded on '%s', ifindex: %d",
>> - s->ifname, s->ifindex);
>> - goto err;
>> + if (!s->map_path) {
>> + error_setg_errno(errp, errno,
>> + "no XDP program loaded on '%s', ifindex: %d",
>> + s->ifname, s->ifindex);
>> + goto err;
>> + } else {
>> + warn_report("no XDP program loaded on '%s', ifindex: %d, "
>> + "only %"PRIi64" XSK socket%s loaded into map %s at this point",
>> + s->ifname, s->ifindex, queues, queues > 1 ? "s" : "",
>> + s->map_path);
>
> How a missing program is not an error? Seems strange.
Just the xsk map could be populated with the xsk sockets from qemu, but not
yet attached through XDP to the NIC.
>> + }
>> }
>> }
>>
>> diff --git a/qapi/net.json b/qapi/net.json
>> index 310cc4fd19..c750b805e8 100644
>> --- a/qapi/net.json
>> +++ b/qapi/net.json
>> @@ -454,7 +454,7 @@
>> # (default: 0).
>> #
>> # @inhibit: Don't load a default XDP program, use one already loaded
>> -# to the interface (default: false). Requires @sock-fds.
>> +# to the interface (default: false). Requires @sock-fds or @map-path.
>> #
>> # @sock-fds: A colon (:) separated list of file descriptors for
>> # already open but not bound AF_XDP sockets in the queue order.
>> @@ -462,17 +462,25 @@
>> # into XDP socket map for corresponding queues. Requires
>> # @inhibit.
>> #
>> +# @map-path: The path to a pinned xsk map to push file descriptors
>> +# for bound AF_XDP sockets into. Requires @inhibit.
>> +#
>> +# @map-start-index: Use @map-path to insert xsk sockets starting from
>> +# this index number (default: 0). Requires @map-path.
>
> These are new fields that do not exist in the older versions, so
> they will need their own 'since' qualifiers.
Oh didn't know that there's a section for each version, will do.
>> +#
>> # Since: 8.2
>> ##
>> { 'struct': 'NetdevAFXDPOptions',
>> 'data': {
>> - 'ifname': 'str',
>> - '*mode': 'AFXDPMode',
>> - '*force-copy': 'bool',
>> - '*queues': 'int',
>> - '*start-queue': 'int',
>> - '*inhibit': 'bool',
>> - '*sock-fds': 'str' },
>> + 'ifname': 'str',
>> + '*mode': 'AFXDPMode',
>> + '*force-copy': 'bool',
>> + '*queues': 'int',
>> + '*start-queue': 'int',
>> + '*inhibit': 'bool',
>> + '*sock-fds': 'str',
>> + '*map-path': 'str',
>> + '*map-start-index': 'int' },
>> 'if': 'CONFIG_AF_XDP' }
>>
>> ##
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-09 14:05 ` Daniel Borkmann via
@ 2025-05-12 12:03 ` Ilya Maximets
2025-05-12 14:23 ` Daniel Borkmann via
0 siblings, 1 reply; 8+ messages in thread
From: Ilya Maximets @ 2025-05-12 12:03 UTC (permalink / raw)
To: Daniel Borkmann, qemu-devel; +Cc: i.maximets, Jason Wang, Anton Protopopov
On 5/9/25 4:05 PM, Daniel Borkmann wrote:
> Hi Ilya,
>
> On 5/9/25 12:53 AM, Ilya Maximets wrote:
>> On 5/8/25 2:34 PM, Daniel Borkmann wrote:
>>> Extend inhibit=on setting with the option to specify a pinned XSK map
>>> path along with a starting index (default 0) to push the created XSK
>>> sockets into. Example usage:
>>>
>>> # ./build/qemu-system-x86_64 [...] \
>>> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
>>> -device virtio-net-pci,netdev=net0 [...]
>>>
>>> This is useful for the case where an existing XDP program with XSK map
>>> is present on the AF_XDP supported phys device and the XSK map not yet
>>> populated. Qemu will then push the XSK sockets into the specified map.
>>
>> Thanks for the patch!
>>
>> Could you, please, explain the use case a little more? Is this patch
>> aiming to improve usability? Do you have a specific use case in mind?
>
> The use case we have is basically that the phys NIC has an XDP program
> already attached which redirects into xsk map (e.g. installed from separate
> control plane), the xsk map got pinned during that process into bpf fs,
> and now qemu is launched, it creates the xsk sockets and then places them
> into the map by gathering the map fd from the pinned bpf fs file.
OK. That's what I thought. Would be good to expand the commit message
a bit explaining the use case.
>
>> The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
>> of privileges to use the pre-loaded program and the pre-created sockets.
>> But creating the sockets and setting them into a map doesn't allow us to
>> run without privileges, IIUC. May be worth mentioning at least in the
>> commit message.
>
> Yes, privileges for above use case are still needed. Will clarify in the
> description.
OK.
>
>> Also, isn't map-start-index the same thing as start-queue ? Do we need
>> both of them?
>
> I'd say yes given it does not have to be an exact mapping wrt queue index
> to map slot. The default is 0 though and I expect this to be the most used
> scenario.
I'm still not sure about this. For example, libxdp treats queue id as a map
index. And this value is actually not being used much in libxdp when the
program load is inhibited. I see that with a custom XDP program the indexes
inside the map may not directly correspond to queues in the device, and, in
fact, may have no relation to the actual queues in the device at all.
However, we're still calling them "queues" from the QEMU interface (as in the
"queues" parameter of the net/af-xdp device), and QEMU will just treat every
slot in the BPF map as separate queues, as this BPF map is essentially the
network device that QEMU is working with, it doesn't actually know what's
behind it.
So, I think, it should be appropriate to simplify the interface and
just use existing start-queue configuration knob for this.
What do you think?
>
>> I didn't read much into the code yet, but this patch is missing a few
>> bits of documentation, e.g. it's missing an update for qemu-options.hx.
>>
>> See a few other quick comment below.
>
> Thanks a lot for the review, appreciate it!
>
>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>> Cc: Ilya Maximets <i.maximets@ovn.org>
>>> Cc: Jason Wang <jasowang@redhat.com>
>>> Cc: Anton Protopopov <aspsk@isovalent.com>
>>> ---
>>> net/af-xdp.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------
>>> qapi/net.json | 24 +++++++++++++-------
>>> 2 files changed, 72 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/net/af-xdp.c b/net/af-xdp.c
>>> index 01c5fb914e..ddc52f1307 100644
>>> --- a/net/af-xdp.c
>>> +++ b/net/af-xdp.c
>>> @@ -51,6 +51,8 @@ typedef struct AFXDPState {
>>>
>>> uint32_t n_queues;
>>> uint32_t xdp_flags;
>>> + char *map_path;
>>> + uint32_t map_start_index;
>>> bool inhibit;
>>> } AFXDPState;
>>>
>>> @@ -261,6 +263,7 @@ static void af_xdp_send(void *opaque)
>>> static void af_xdp_cleanup(NetClientState *nc)
>>> {
>>> AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
>>> + int pin_fd, idx;
>>>
>>> qemu_purge_queued_packets(nc);
>>>
>>> @@ -282,6 +285,22 @@ static void af_xdp_cleanup(NetClientState *nc)
>>> "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
>>> s->ifname, s->ifindex);
>>> }
>>> + if (s->map_path) {
>>> + pin_fd = bpf_obj_get(s->map_path);
>>> + if (pin_fd < 0) {
>>> + fprintf(stderr,
>>> + "af-xdp: unable to remove %s's XSK sockets from '%s', ifindex: %d\n",
>>> + s->ifname, s->map_path, s->ifindex);
>>> + } else {
>>> + idx = nc->queue_index;
>>> + if (s->map_start_index > 0) {
>>> + idx += s->map_start_index;
>>> + }
>>> + bpf_map_delete_elem(pin_fd, &idx);
>>> + close(pin_fd);
>>> + }
>>> + }
>>> + g_free(s->map_path);
>>> }
>>>
>>> static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
>>> @@ -343,7 +362,7 @@ static int af_xdp_socket_create(AFXDPState *s,
>>> .bind_flags = XDP_USE_NEED_WAKEUP,
>>> .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
>>> };
>>> - int queue_id, error = 0;
>>> + int queue_id, pin_fd, xsk_fd, idx, error = 0;
>>>
>>> s->inhibit = opts->has_inhibit && opts->inhibit;
>>> if (s->inhibit) {
>>> @@ -384,6 +403,23 @@ static int af_xdp_socket_create(AFXDPState *s,
>>> }
>>> }
>>>
>>> + if (!error && s->map_path) {
>>> + pin_fd = bpf_obj_get(s->map_path);
>>> + if (pin_fd < 0) {
>>> + error = errno;
>>> + } else {
>>> + xsk_fd = xsk_socket__fd(s->xsk);
>>
>> Indentation is off. Tbas mixed with spaces here and in some other
>> places in the patch.
>
> Sigh, sorry, will fix (editor was set up for kernel style).
You can use ./scripts/checkpatch.pl to catch this kind of issues.
>
>>> + idx = s->nc.queue_index;
>>> + if (s->map_start_index) {
>>> + idx += s->map_start_index;
>>> + }
>>> + if (bpf_map_update_elem(pin_fd, &idx, &xsk_fd, 0)) {
>>> + error = errno;
>>> + }
>>> + close(pin_fd);
>>> + }
>>> + }
>>> +
>>> if (error) {
>>> error_setg_errno(errp, error,
>>> "failed to create AF_XDP socket for %s queue_id: %d",
>>> @@ -465,8 +501,8 @@ int net_init_af_xdp(const Netdev *netdev,
>>> return -1;
>>> }
>>>
>>> - if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
>>> - error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
>>> + if ((opts->has_inhibit && opts->inhibit) != !!(opts->sock_fds || opts->map_path)) {
>>> + error_setg(errp, "'inhibit=on' requires 'sock-fds' or 'map-path' and vice versa");
>>
>> This may need some re-wording as 'A requires B or C or vice versa' is
>> a little hard to understand.
>
> ack, will do
>
>>> return -1;
>>> }
>>>
>>> @@ -491,6 +527,12 @@ int net_init_af_xdp(const Netdev *netdev,
>>> pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
>>> s->ifindex = ifindex;
>>> s->n_queues = queues;
>>> + if (opts->map_path) {
>>> + s->map_path = g_strdup(opts->map_path);
>>> + if (opts->has_map_start_index && opts->map_start_index > 0) {
>>> + s->map_start_index = opts->map_start_index;
>>> + }
>>> + }
>>>
>>> if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
>>> || af_xdp_socket_create(s, opts, errp)) {
>>> @@ -504,10 +546,17 @@ int net_init_af_xdp(const Netdev *netdev,
>>> if (nc0) {
>>> s = DO_UPCAST(AFXDPState, nc, nc0);
>>> if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
>>> - error_setg_errno(errp, errno,
>>> - "no XDP program loaded on '%s', ifindex: %d",
>>> - s->ifname, s->ifindex);
>>> - goto err;
>>> + if (!s->map_path) {
>>> + error_setg_errno(errp, errno,
>>> + "no XDP program loaded on '%s', ifindex: %d",
>>> + s->ifname, s->ifindex);
>>> + goto err;
>>> + } else {
>>> + warn_report("no XDP program loaded on '%s', ifindex: %d, "
>>> + "only %"PRIi64" XSK socket%s loaded into map %s at this point",
>>> + s->ifname, s->ifindex, queues, queues > 1 ? "s" : "",
>>> + s->map_path);
>>
>> How a missing program is not an error? Seems strange.
>
> Just the xsk map could be populated with the xsk sockets from qemu, but not
> yet attached through XDP to the NIC.
OK, I see. Yeah, that makes sense. But shouldn't that be true for all the
cases where we do not have direct control over the program loading, i.e. all
the cases with inhibit=on ?
>
>>> + }
>>> }
>>> }
>>>
>>> diff --git a/qapi/net.json b/qapi/net.json
>>> index 310cc4fd19..c750b805e8 100644
>>> --- a/qapi/net.json
>>> +++ b/qapi/net.json
>>> @@ -454,7 +454,7 @@
>>> # (default: 0).
>>> #
>>> # @inhibit: Don't load a default XDP program, use one already loaded
>>> -# to the interface (default: false). Requires @sock-fds.
>>> +# to the interface (default: false). Requires @sock-fds or @map-path.
>>> #
>>> # @sock-fds: A colon (:) separated list of file descriptors for
>>> # already open but not bound AF_XDP sockets in the queue order.
>>> @@ -462,17 +462,25 @@
>>> # into XDP socket map for corresponding queues. Requires
>>> # @inhibit.
>>> #
>>> +# @map-path: The path to a pinned xsk map to push file descriptors
>>> +# for bound AF_XDP sockets into. Requires @inhibit.
>>> +#
>>> +# @map-start-index: Use @map-path to insert xsk sockets starting from
>>> +# this index number (default: 0). Requires @map-path.
>>
>> These are new fields that do not exist in the older versions, so
>> they will need their own 'since' qualifiers.
>
> Oh didn't know that there's a section for each version, will do.
>
>>> +#
>>> # Since: 8.2
>>> ##
>>> { 'struct': 'NetdevAFXDPOptions',
>>> 'data': {
>>> - 'ifname': 'str',
>>> - '*mode': 'AFXDPMode',
>>> - '*force-copy': 'bool',
>>> - '*queues': 'int',
>>> - '*start-queue': 'int',
>>> - '*inhibit': 'bool',
>>> - '*sock-fds': 'str' },
>>> + 'ifname': 'str',
>>> + '*mode': 'AFXDPMode',
>>> + '*force-copy': 'bool',
>>> + '*queues': 'int',
>>> + '*start-queue': 'int',
>>> + '*inhibit': 'bool',
>>> + '*sock-fds': 'str',
>>> + '*map-path': 'str',
>>> + '*map-start-index': 'int' },
>>> 'if': 'CONFIG_AF_XDP' }
>>>
>>> ##
>>
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-12 12:03 ` Ilya Maximets
@ 2025-05-12 14:23 ` Daniel Borkmann via
2025-05-14 19:02 ` Ilya Maximets
0 siblings, 1 reply; 8+ messages in thread
From: Daniel Borkmann via @ 2025-05-12 14:23 UTC (permalink / raw)
To: Ilya Maximets, qemu-devel; +Cc: Jason Wang, Anton Protopopov
On 5/12/25 2:03 PM, Ilya Maximets wrote:
> On 5/9/25 4:05 PM, Daniel Borkmann wrote:
>> On 5/9/25 12:53 AM, Ilya Maximets wrote:
>>> On 5/8/25 2:34 PM, Daniel Borkmann wrote:
>>>> Extend inhibit=on setting with the option to specify a pinned XSK map
>>>> path along with a starting index (default 0) to push the created XSK
>>>> sockets into. Example usage:
>>>>
>>>> # ./build/qemu-system-x86_64 [...] \
>>>> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
>>>> -device virtio-net-pci,netdev=net0 [...]
>>>>
>>>> This is useful for the case where an existing XDP program with XSK map
>>>> is present on the AF_XDP supported phys device and the XSK map not yet
>>>> populated. Qemu will then push the XSK sockets into the specified map.
>>>
>>> Thanks for the patch!
>>>
>>> Could you, please, explain the use case a little more? Is this patch
>>> aiming to improve usability? Do you have a specific use case in mind?
>>
>> The use case we have is basically that the phys NIC has an XDP program
>> already attached which redirects into xsk map (e.g. installed from separate
>> control plane), the xsk map got pinned during that process into bpf fs,
>> and now qemu is launched, it creates the xsk sockets and then places them
>> into the map by gathering the map fd from the pinned bpf fs file.
>
> OK. That's what I thought. Would be good to expand the commit message
> a bit explaining the use case.
Ack, I already adjusted locally. Planning to send v2 ~today with your feedback
incorporated. Much appreciated!
>>> The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
>>> of privileges to use the pre-loaded program and the pre-created sockets.
>>> But creating the sockets and setting them into a map doesn't allow us to
>>> run without privileges, IIUC. May be worth mentioning at least in the
>>> commit message.
>>
>> Yes, privileges for above use case are still needed. Will clarify in the
>> description.
>
> OK.
>
>>> Also, isn't map-start-index the same thing as start-queue ? Do we need
>>> both of them?
>>
>> I'd say yes given it does not have to be an exact mapping wrt queue index
>> to map slot. The default is 0 though and I expect this to be the most used
>> scenario.
>
> I'm still not sure about this. For example, libxdp treats queue id as a map
> index. And this value is actually not being used much in libxdp when the
> program load is inhibited. I see that with a custom XDP program the indexes
> inside the map may not directly correspond to queues in the device, and, in
> fact, may have no relation to the actual queues in the device at all.
Right, that's correct.
> However, we're still calling them "queues" from the QEMU interface (as in the
> "queues" parameter of the net/af-xdp device), and QEMU will just treat every
> slot in the BPF map as separate queues, as this BPF map is essentially the
> network device that QEMU is working with, it doesn't actually know what's
> behind it.
>
> So, I think, it should be appropriate to simplify the interface and
> just use existing start-queue configuration knob for this.
>
> What do you think?
I was thinking of an example like the below (plainly taken from the XDP example
programs at github.com/xdp-project/bpf-examples).
struct {
__uint(type, BPF_MAP_TYPE_XSKMAP);
__uint(max_entries, MAX_SOCKS);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
} xsks_map SEC(".maps");
int num_socks = 0;
static unsigned int rr;
SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
{
rr = (rr + 1) & (num_socks - 1);
return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
}
If we'd just reuse the start-queue configuration knob for this, then it wouldn't
work. So I think having the flexibility of where to place the sockets in the map
would make sense. But I can also drop that part of you think it does not warrant
the extra knob and align to start-queue then the map always needs to be of the
same size as the combined NIC queues.
>>> I didn't read much into the code yet, but this patch is missing a few
>>> bits of documentation, e.g. it's missing an update for qemu-options.hx.
>>>
>>> See a few other quick comment below.
>>
>> Thanks a lot for the review, appreciate it!
>>
>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>> Cc: Ilya Maximets <i.maximets@ovn.org>
>>>> Cc: Jason Wang <jasowang@redhat.com>
>>>> Cc: Anton Protopopov <aspsk@isovalent.com>
>>>> ---
>>>> net/af-xdp.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------
>>>> qapi/net.json | 24 +++++++++++++-------
>>>> 2 files changed, 72 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/net/af-xdp.c b/net/af-xdp.c
>>>> index 01c5fb914e..ddc52f1307 100644
>>>> --- a/net/af-xdp.c
>>>> +++ b/net/af-xdp.c
>>>> @@ -51,6 +51,8 @@ typedef struct AFXDPState {
>>>>
>>>> uint32_t n_queues;
>>>> uint32_t xdp_flags;
>>>> + char *map_path;
>>>> + uint32_t map_start_index;
>>>> bool inhibit;
>>>> } AFXDPState;
>>>>
>>>> @@ -261,6 +263,7 @@ static void af_xdp_send(void *opaque)
>>>> static void af_xdp_cleanup(NetClientState *nc)
>>>> {
>>>> AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
>>>> + int pin_fd, idx;
>>>>
>>>> qemu_purge_queued_packets(nc);
>>>>
>>>> @@ -282,6 +285,22 @@ static void af_xdp_cleanup(NetClientState *nc)
>>>> "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
>>>> s->ifname, s->ifindex);
>>>> }
>>>> + if (s->map_path) {
>>>> + pin_fd = bpf_obj_get(s->map_path);
>>>> + if (pin_fd < 0) {
>>>> + fprintf(stderr,
>>>> + "af-xdp: unable to remove %s's XSK sockets from '%s', ifindex: %d\n",
>>>> + s->ifname, s->map_path, s->ifindex);
>>>> + } else {
>>>> + idx = nc->queue_index;
>>>> + if (s->map_start_index > 0) {
>>>> + idx += s->map_start_index;
>>>> + }
>>>> + bpf_map_delete_elem(pin_fd, &idx);
>>>> + close(pin_fd);
>>>> + }
>>>> + }
>>>> + g_free(s->map_path);
>>>> }
>>>>
>>>> static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
>>>> @@ -343,7 +362,7 @@ static int af_xdp_socket_create(AFXDPState *s,
>>>> .bind_flags = XDP_USE_NEED_WAKEUP,
>>>> .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
>>>> };
>>>> - int queue_id, error = 0;
>>>> + int queue_id, pin_fd, xsk_fd, idx, error = 0;
>>>>
>>>> s->inhibit = opts->has_inhibit && opts->inhibit;
>>>> if (s->inhibit) {
>>>> @@ -384,6 +403,23 @@ static int af_xdp_socket_create(AFXDPState *s,
>>>> }
>>>> }
>>>>
>>>> + if (!error && s->map_path) {
>>>> + pin_fd = bpf_obj_get(s->map_path);
>>>> + if (pin_fd < 0) {
>>>> + error = errno;
>>>> + } else {
>>>> + xsk_fd = xsk_socket__fd(s->xsk);
>>>
>>> Indentation is off. Tbas mixed with spaces here and in some other
>>> places in the patch.
>>
>> Sigh, sorry, will fix (editor was set up for kernel style).
>
> You can use ./scripts/checkpatch.pl to catch this kind of issues.
>
>>>> + idx = s->nc.queue_index;
>>>> + if (s->map_start_index) {
>>>> + idx += s->map_start_index;
>>>> + }
>>>> + if (bpf_map_update_elem(pin_fd, &idx, &xsk_fd, 0)) {
>>>> + error = errno;
>>>> + }
>>>> + close(pin_fd);
>>>> + }
>>>> + }
>>>> +
>>>> if (error) {
>>>> error_setg_errno(errp, error,
>>>> "failed to create AF_XDP socket for %s queue_id: %d",
>>>> @@ -465,8 +501,8 @@ int net_init_af_xdp(const Netdev *netdev,
>>>> return -1;
>>>> }
>>>>
>>>> - if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
>>>> - error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
>>>> + if ((opts->has_inhibit && opts->inhibit) != !!(opts->sock_fds || opts->map_path)) {
>>>> + error_setg(errp, "'inhibit=on' requires 'sock-fds' or 'map-path' and vice versa");
>>>
>>> This may need some re-wording as 'A requires B or C or vice versa' is
>>> a little hard to understand.
>>
>> ack, will do
>>
>>>> return -1;
>>>> }
>>>>
>>>> @@ -491,6 +527,12 @@ int net_init_af_xdp(const Netdev *netdev,
>>>> pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
>>>> s->ifindex = ifindex;
>>>> s->n_queues = queues;
>>>> + if (opts->map_path) {
>>>> + s->map_path = g_strdup(opts->map_path);
>>>> + if (opts->has_map_start_index && opts->map_start_index > 0) {
>>>> + s->map_start_index = opts->map_start_index;
>>>> + }
>>>> + }
>>>>
>>>> if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
>>>> || af_xdp_socket_create(s, opts, errp)) {
>>>> @@ -504,10 +546,17 @@ int net_init_af_xdp(const Netdev *netdev,
>>>> if (nc0) {
>>>> s = DO_UPCAST(AFXDPState, nc, nc0);
>>>> if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
>>>> - error_setg_errno(errp, errno,
>>>> - "no XDP program loaded on '%s', ifindex: %d",
>>>> - s->ifname, s->ifindex);
>>>> - goto err;
>>>> + if (!s->map_path) {
>>>> + error_setg_errno(errp, errno,
>>>> + "no XDP program loaded on '%s', ifindex: %d",
>>>> + s->ifname, s->ifindex);
>>>> + goto err;
>>>> + } else {
>>>> + warn_report("no XDP program loaded on '%s', ifindex: %d, "
>>>> + "only %"PRIi64" XSK socket%s loaded into map %s at this point",
>>>> + s->ifname, s->ifindex, queues, queues > 1 ? "s" : "",
>>>> + s->map_path);
>>>
>>> How a missing program is not an error? Seems strange.
>>
>> Just the xsk map could be populated with the xsk sockets from qemu, but not
>> yet attached through XDP to the NIC.
>
> OK, I see. Yeah, that makes sense. But shouldn't that be true for all the
> cases where we do not have direct control over the program loading, i.e. all
> the cases with inhibit=on ?
Agree, makes sense to rather switch it to inhibit=on test.
>>>> + }
>>>> }
>>>> }
>>>>
>>>> diff --git a/qapi/net.json b/qapi/net.json
>>>> index 310cc4fd19..c750b805e8 100644
>>>> --- a/qapi/net.json
>>>> +++ b/qapi/net.json
>>>> @@ -454,7 +454,7 @@
>>>> # (default: 0).
>>>> #
>>>> # @inhibit: Don't load a default XDP program, use one already loaded
>>>> -# to the interface (default: false). Requires @sock-fds.
>>>> +# to the interface (default: false). Requires @sock-fds or @map-path.
>>>> #
>>>> # @sock-fds: A colon (:) separated list of file descriptors for
>>>> # already open but not bound AF_XDP sockets in the queue order.
>>>> @@ -462,17 +462,25 @@
>>>> # into XDP socket map for corresponding queues. Requires
>>>> # @inhibit.
>>>> #
>>>> +# @map-path: The path to a pinned xsk map to push file descriptors
>>>> +# for bound AF_XDP sockets into. Requires @inhibit.
>>>> +#
>>>> +# @map-start-index: Use @map-path to insert xsk sockets starting from
>>>> +# this index number (default: 0). Requires @map-path.
>>>
>>> These are new fields that do not exist in the older versions, so
>>> they will need their own 'since' qualifiers.
>>
>> Oh didn't know that there's a section for each version, will do.
>>
>>>> +#
>>>> # Since: 8.2
>>>> ##
>>>> { 'struct': 'NetdevAFXDPOptions',
>>>> 'data': {
>>>> - 'ifname': 'str',
>>>> - '*mode': 'AFXDPMode',
>>>> - '*force-copy': 'bool',
>>>> - '*queues': 'int',
>>>> - '*start-queue': 'int',
>>>> - '*inhibit': 'bool',
>>>> - '*sock-fds': 'str' },
>>>> + 'ifname': 'str',
>>>> + '*mode': 'AFXDPMode',
>>>> + '*force-copy': 'bool',
>>>> + '*queues': 'int',
>>>> + '*start-queue': 'int',
>>>> + '*inhibit': 'bool',
>>>> + '*sock-fds': 'str',
>>>> + '*map-path': 'str',
>>>> + '*map-start-index': 'int' },
>>>> 'if': 'CONFIG_AF_XDP' }
>>>>
>>>> ##
>>>
>>
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-12 14:23 ` Daniel Borkmann via
@ 2025-05-14 19:02 ` Ilya Maximets
2025-05-15 8:00 ` Daniel Borkmann via
0 siblings, 1 reply; 8+ messages in thread
From: Ilya Maximets @ 2025-05-14 19:02 UTC (permalink / raw)
To: Daniel Borkmann, qemu-devel; +Cc: i.maximets, Jason Wang, Anton Protopopov
On 5/12/25 4:23 PM, Daniel Borkmann wrote:
> On 5/12/25 2:03 PM, Ilya Maximets wrote:
>> On 5/9/25 4:05 PM, Daniel Borkmann wrote:
>>> On 5/9/25 12:53 AM, Ilya Maximets wrote:
>>>> On 5/8/25 2:34 PM, Daniel Borkmann wrote:
>>>>> Extend inhibit=on setting with the option to specify a pinned XSK map
>>>>> path along with a starting index (default 0) to push the created XSK
>>>>> sockets into. Example usage:
>>>>>
>>>>> # ./build/qemu-system-x86_64 [...] \
>>>>> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
>>>>> -device virtio-net-pci,netdev=net0 [...]
>>>>>
>>>>> This is useful for the case where an existing XDP program with XSK map
>>>>> is present on the AF_XDP supported phys device and the XSK map not yet
>>>>> populated. Qemu will then push the XSK sockets into the specified map.
>>>>
>>>> Thanks for the patch!
>>>>
>>>> Could you, please, explain the use case a little more? Is this patch
>>>> aiming to improve usability? Do you have a specific use case in mind?
>>>
>>> The use case we have is basically that the phys NIC has an XDP program
>>> already attached which redirects into xsk map (e.g. installed from separate
>>> control plane), the xsk map got pinned during that process into bpf fs,
>>> and now qemu is launched, it creates the xsk sockets and then places them
>>> into the map by gathering the map fd from the pinned bpf fs file.
>>
>> OK. That's what I thought. Would be good to expand the commit message
>> a bit explaining the use case.
>
> Ack, I already adjusted locally. Planning to send v2 ~today with your feedback
> incorporated. Much appreciated!
>
>>>> The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
>>>> of privileges to use the pre-loaded program and the pre-created sockets.
>>>> But creating the sockets and setting them into a map doesn't allow us to
>>>> run without privileges, IIUC. May be worth mentioning at least in the
>>>> commit message.
>>>
>>> Yes, privileges for above use case are still needed. Will clarify in the
>>> description.
>>
>> OK.
>>
>>>> Also, isn't map-start-index the same thing as start-queue ? Do we need
>>>> both of them?
>>>
>>> I'd say yes given it does not have to be an exact mapping wrt queue index
>>> to map slot. The default is 0 though and I expect this to be the most used
>>> scenario.
>>
>> I'm still not sure about this. For example, libxdp treats queue id as a map
>> index. And this value is actually not being used much in libxdp when the
>> program load is inhibited. I see that with a custom XDP program the indexes
>> inside the map may not directly correspond to queues in the device, and, in
>> fact, may have no relation to the actual queues in the device at all.
>
> Right, that's correct.
>
>> However, we're still calling them "queues" from the QEMU interface (as in the
>> "queues" parameter of the net/af-xdp device), and QEMU will just treat every
>> slot in the BPF map as separate queues, as this BPF map is essentially the
>> network device that QEMU is working with, it doesn't actually know what's
>> behind it.
>>
>> So, I think, it should be appropriate to simplify the interface and
>> just use existing start-queue configuration knob for this.
>>
>> What do you think?
>
> I was thinking of an example like the below (plainly taken from the XDP example
> programs at github.com/xdp-project/bpf-examples).
>
> struct {
> __uint(type, BPF_MAP_TYPE_XSKMAP);
> __uint(max_entries, MAX_SOCKS);
> __uint(key_size, sizeof(int));
> __uint(value_size, sizeof(int));
> } xsks_map SEC(".maps");
>
> int num_socks = 0;
> static unsigned int rr;
>
> SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> {
> rr = (rr + 1) & (num_socks - 1);
> return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
> }
>
> If we'd just reuse the start-queue configuration knob for this, then it wouldn't
> work. So I think having the flexibility of where to place the sockets in the map
> would make sense. But I can also drop that part of you think it does not warrant
> the extra knob and align to start-queue then the map always needs to be of the
> same size as the combined NIC queues.
I'm a little confused here. The 'start-queue' is not used for anything important,
AFAICT, in case of inhibit=on. So, why re-using it instead of adding a new
config option reduces the number of available use cases?
Bets regards, Ilya Maximets.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-14 19:02 ` Ilya Maximets
@ 2025-05-15 8:00 ` Daniel Borkmann via
2025-05-15 19:34 ` Ilya Maximets
0 siblings, 1 reply; 8+ messages in thread
From: Daniel Borkmann via @ 2025-05-15 8:00 UTC (permalink / raw)
To: Ilya Maximets, qemu-devel; +Cc: Jason Wang, Anton Protopopov
On 5/14/25 9:02 PM, Ilya Maximets wrote:
> On 5/12/25 4:23 PM, Daniel Borkmann wrote:
>> On 5/12/25 2:03 PM, Ilya Maximets wrote:
>>> On 5/9/25 4:05 PM, Daniel Borkmann wrote:
>>>> On 5/9/25 12:53 AM, Ilya Maximets wrote:
>>>>> On 5/8/25 2:34 PM, Daniel Borkmann wrote:
>>>>>> Extend inhibit=on setting with the option to specify a pinned XSK map
>>>>>> path along with a starting index (default 0) to push the created XSK
>>>>>> sockets into. Example usage:
>>>>>>
>>>>>> # ./build/qemu-system-x86_64 [...] \
>>>>>> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
>>>>>> -device virtio-net-pci,netdev=net0 [...]
>>>>>>
>>>>>> This is useful for the case where an existing XDP program with XSK map
>>>>>> is present on the AF_XDP supported phys device and the XSK map not yet
>>>>>> populated. Qemu will then push the XSK sockets into the specified map.
>>>>>
>>>>> Thanks for the patch!
>>>>>
>>>>> Could you, please, explain the use case a little more? Is this patch
>>>>> aiming to improve usability? Do you have a specific use case in mind?
>>>>
>>>> The use case we have is basically that the phys NIC has an XDP program
>>>> already attached which redirects into xsk map (e.g. installed from separate
>>>> control plane), the xsk map got pinned during that process into bpf fs,
>>>> and now qemu is launched, it creates the xsk sockets and then places them
>>>> into the map by gathering the map fd from the pinned bpf fs file.
>>>
>>> OK. That's what I thought. Would be good to expand the commit message
>>> a bit explaining the use case.
>>
>> Ack, I already adjusted locally. Planning to send v2 ~today with your feedback
>> incorporated. Much appreciated!
>>
>>>>> The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
>>>>> of privileges to use the pre-loaded program and the pre-created sockets.
>>>>> But creating the sockets and setting them into a map doesn't allow us to
>>>>> run without privileges, IIUC. May be worth mentioning at least in the
>>>>> commit message.
>>>>
>>>> Yes, privileges for above use case are still needed. Will clarify in the
>>>> description.
>>>
>>> OK.
>>>
>>>>> Also, isn't map-start-index the same thing as start-queue ? Do we need
>>>>> both of them?
>>>>
>>>> I'd say yes given it does not have to be an exact mapping wrt queue index
>>>> to map slot. The default is 0 though and I expect this to be the most used
>>>> scenario.
>>>
>>> I'm still not sure about this. For example, libxdp treats queue id as a map
>>> index. And this value is actually not being used much in libxdp when the
>>> program load is inhibited. I see that with a custom XDP program the indexes
>>> inside the map may not directly correspond to queues in the device, and, in
>>> fact, may have no relation to the actual queues in the device at all.
>>
>> Right, that's correct.
>>
>>> However, we're still calling them "queues" from the QEMU interface (as in the
>>> "queues" parameter of the net/af-xdp device), and QEMU will just treat every
>>> slot in the BPF map as separate queues, as this BPF map is essentially the
>>> network device that QEMU is working with, it doesn't actually know what's
>>> behind it.
>>>
>>> So, I think, it should be appropriate to simplify the interface and
>>> just use existing start-queue configuration knob for this.
>>>
>>> What do you think?
>>
>> I was thinking of an example like the below (plainly taken from the XDP example
>> programs at github.com/xdp-project/bpf-examples).
>>
>> struct {
>> __uint(type, BPF_MAP_TYPE_XSKMAP);
>> __uint(max_entries, MAX_SOCKS);
>> __uint(key_size, sizeof(int));
>> __uint(value_size, sizeof(int));
>> } xsks_map SEC(".maps");
>>
>> int num_socks = 0;
>> static unsigned int rr;
>>
>> SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
>> {
>> rr = (rr + 1) & (num_socks - 1);
>> return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
>> }
>>
>> If we'd just reuse the start-queue configuration knob for this, then it wouldn't
>> work. So I think having the flexibility of where to place the sockets in the map
>> would make sense. But I can also drop that part of you think it does not warrant
>> the extra knob and align to start-queue then the map always needs to be of the
>> same size as the combined NIC queues.
>
> I'm a little confused here. The 'start-queue' is not used for anything important,
> AFAICT, in case of inhibit=on. So, why re-using it instead of adding a new
> config option reduces the number of available use cases?
Hm, maybe I'm missing something, but we use inhibit=on and do /not/ pass sock-fds as
a parameter and instead fully rely on qemu to create all related af_xdp sockets. So
the start-queue /is/ relevant for the underlying NIC queue selection as we pass the
queue_id to xsk_socket__create().
Thanks,
Daniel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets
2025-05-15 8:00 ` Daniel Borkmann via
@ 2025-05-15 19:34 ` Ilya Maximets
0 siblings, 0 replies; 8+ messages in thread
From: Ilya Maximets @ 2025-05-15 19:34 UTC (permalink / raw)
To: Daniel Borkmann, qemu-devel; +Cc: i.maximets, Jason Wang, Anton Protopopov
On 5/15/25 10:00 AM, Daniel Borkmann wrote:
> On 5/14/25 9:02 PM, Ilya Maximets wrote:
>> On 5/12/25 4:23 PM, Daniel Borkmann wrote:
>>> On 5/12/25 2:03 PM, Ilya Maximets wrote:
>>>> On 5/9/25 4:05 PM, Daniel Borkmann wrote:
>>>>> On 5/9/25 12:53 AM, Ilya Maximets wrote:
>>>>>> On 5/8/25 2:34 PM, Daniel Borkmann wrote:
>>>>>>> Extend inhibit=on setting with the option to specify a pinned XSK map
>>>>>>> path along with a starting index (default 0) to push the created XSK
>>>>>>> sockets into. Example usage:
>>>>>>>
>>>>>>> # ./build/qemu-system-x86_64 [...] \
>>>>>>> -netdev af-xdp,ifname=eth0,id=net0,mode=native,queues=2,inhibit=on,map-path=/sys/fs/bpf/foo,map-start-index=2
>>>>>>> -device virtio-net-pci,netdev=net0 [...]
>>>>>>>
>>>>>>> This is useful for the case where an existing XDP program with XSK map
>>>>>>> is present on the AF_XDP supported phys device and the XSK map not yet
>>>>>>> populated. Qemu will then push the XSK sockets into the specified map.
>>>>>>
>>>>>> Thanks for the patch!
>>>>>>
>>>>>> Could you, please, explain the use case a little more? Is this patch
>>>>>> aiming to improve usability? Do you have a specific use case in mind?
>>>>>
>>>>> The use case we have is basically that the phys NIC has an XDP program
>>>>> already attached which redirects into xsk map (e.g. installed from separate
>>>>> control plane), the xsk map got pinned during that process into bpf fs,
>>>>> and now qemu is launched, it creates the xsk sockets and then places them
>>>>> into the map by gathering the map fd from the pinned bpf fs file.
>>>>
>>>> OK. That's what I thought. Would be good to expand the commit message
>>>> a bit explaining the use case.
>>>
>>> Ack, I already adjusted locally. Planning to send v2 ~today with your feedback
>>> incorporated. Much appreciated!
>>>
>>>>>> The main idea behind 'inhibit' is that the qemu doesn't need to have a lot
>>>>>> of privileges to use the pre-loaded program and the pre-created sockets.
>>>>>> But creating the sockets and setting them into a map doesn't allow us to
>>>>>> run without privileges, IIUC. May be worth mentioning at least in the
>>>>>> commit message.
>>>>>
>>>>> Yes, privileges for above use case are still needed. Will clarify in the
>>>>> description.
>>>>
>>>> OK.
>>>>
>>>>>> Also, isn't map-start-index the same thing as start-queue ? Do we need
>>>>>> both of them?
>>>>>
>>>>> I'd say yes given it does not have to be an exact mapping wrt queue index
>>>>> to map slot. The default is 0 though and I expect this to be the most used
>>>>> scenario.
>>>>
>>>> I'm still not sure about this. For example, libxdp treats queue id as a map
>>>> index. And this value is actually not being used much in libxdp when the
>>>> program load is inhibited. I see that with a custom XDP program the indexes
>>>> inside the map may not directly correspond to queues in the device, and, in
>>>> fact, may have no relation to the actual queues in the device at all.
>>>
>>> Right, that's correct.
>>>
>>>> However, we're still calling them "queues" from the QEMU interface (as in the
>>>> "queues" parameter of the net/af-xdp device), and QEMU will just treat every
>>>> slot in the BPF map as separate queues, as this BPF map is essentially the
>>>> network device that QEMU is working with, it doesn't actually know what's
>>>> behind it.
>>>>
>>>> So, I think, it should be appropriate to simplify the interface and
>>>> just use existing start-queue configuration knob for this.
>>>>
>>>> What do you think?
>>>
>>> I was thinking of an example like the below (plainly taken from the XDP example
>>> programs at github.com/xdp-project/bpf-examples).
>>>
>>> struct {
>>> __uint(type, BPF_MAP_TYPE_XSKMAP);
>>> __uint(max_entries, MAX_SOCKS);
>>> __uint(key_size, sizeof(int));
>>> __uint(value_size, sizeof(int));
>>> } xsks_map SEC(".maps");
>>>
>>> int num_socks = 0;
>>> static unsigned int rr;
>>>
>>> SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
>>> {
>>> rr = (rr + 1) & (num_socks - 1);
>>> return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
>>> }
>>>
>>> If we'd just reuse the start-queue configuration knob for this, then it wouldn't
>>> work. So I think having the flexibility of where to place the sockets in the map
>>> would make sense. But I can also drop that part of you think it does not warrant
>>> the extra knob and align to start-queue then the map always needs to be of the
>>> same size as the combined NIC queues.
>>
>> I'm a little confused here. The 'start-queue' is not used for anything important,
>> AFAICT, in case of inhibit=on. So, why re-using it instead of adding a new
>> config option reduces the number of available use cases?
>
> Hm, maybe I'm missing something, but we use inhibit=on and do /not/ pass sock-fds as
> a parameter and instead fully rely on qemu to create all related af_xdp sockets. So
> the start-queue /is/ relevant for the underlying NIC queue selection as we pass the
> queue_id to xsk_socket__create().
Hrm, you're right. I always forget that rx and tx paths in AF_XDP have almost
nothing in common and we do indeed bind to actual NIC queues for tx path.
I'll take a closer look at v2.
Best regards, Ilya Maximets.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2025-05-15 19:35 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-08 12:34 [PATCH] net/af-xdp: Support pinned map path for AF_XDP sockets Daniel Borkmann via
2025-05-08 22:53 ` Ilya Maximets
2025-05-09 14:05 ` Daniel Borkmann via
2025-05-12 12:03 ` Ilya Maximets
2025-05-12 14:23 ` Daniel Borkmann via
2025-05-14 19:02 ` Ilya Maximets
2025-05-15 8:00 ` Daniel Borkmann via
2025-05-15 19:34 ` Ilya Maximets
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).