* Re: O_CLOEXEC use for OPEN_TREE_CLOEXEC
From: Christian Brauner @ 2026-01-16 10:00 UTC (permalink / raw)
To: Florian Weimer
Cc: linux-fsdevel, linux-api, linux-kernel, Al Viro, David Howells,
DJ Delorie
In-Reply-To: <lhuwm1ji7bl.fsf@oldenburg.str.redhat.com>
On Thu, Jan 15, 2026 at 09:55:10AM +0100, Florian Weimer wrote:
> * Christian Brauner:
>
> > On Tue, Jan 13, 2026 at 11:40:55PM +0100, Florian Weimer wrote:
> >> In <linux/mount.h>, we have this:
> >>
> >> #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
> >>
> >> This causes a few pain points for us on the glibc side when we mirror
> >> this into <linux/mount.h> because O_CLOEXEC is defined in <fcntl.h>,
> >> which is one of the headers that's completely incompatible with the UAPI
> >> headers.
> >>
> >> The reason why this is painful is because O_CLOEXEC has at least three
> >> different values across architectures: 0x80000, 0x200000, 0x400000
> >>
> >> Even for the UAPI this isn't ideal because it effectively burns three
> >> open_tree flags, unless the flags are made architecture-specific, too.
> >
> > I think that just got cargo-culted... A long time ago some API define as
> > O_CLOEXEC and now a lot of APIs have done the same.
>
> Yes, it looks like inotify is in the same boat.
It's unfortunately not just inotify...:
include/linux/net.h:#define SOCK_CLOEXEC O_CLOEXEC
include/uapi/drm/drm.h:#define DRM_CLOEXEC O_CLOEXEC
include/uapi/linux/eventfd.h:#define EFD_CLOEXEC O_CLOEXEC
include/uapi/linux/eventpoll.h:#define EPOLL_CLOEXEC O_CLOEXEC
include/uapi/linux/inotify.h:#define IN_CLOEXEC O_CLOEXEC
include/uapi/linux/signalfd.h:#define SFD_CLOEXEC O_CLOEXEC
include/uapi/linux/timerfd.h:#define TFD_CLOEXEC O_CLOEXEC
>
> > I'm pretty sure we can't change that now but we can document that this
> > shouldn't be ifdefed and instead be a separate per-syscall bit. But I
> > think that's the best we can do right now.
>
> Maybe add something like this as a safety measure, to ensure that the
> flags don't overlap?
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index c58674a20cad..5bbfd379ec44 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3069,6 +3069,9 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
> bool detached = flags & OPEN_TREE_CLONE;
>
> BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
> + BUILD_BUG_ON(!(O_CLOEXEC & OPEN_TREE_CLONE));
> + BUILD_BUG_ON(!((AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW) &
> + (O_CLOEXEC | OPEN_TREE_CLONE)));
Yeah, we can do something like that!
^ permalink raw reply
* Re: [PATCH bpf-next v5 8/9] libbpf: Add common attr support for map_create
From: Andrii Nakryiko @ 2026-01-16 1:03 UTC (permalink / raw)
To: Leon Hwang
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, linux-kernel, linux-api, linux-kselftest,
kernel-patches-bot
In-Reply-To: <20260112145616.44195-9-leon.hwang@linux.dev>
On Mon, Jan 12, 2026 at 6:59 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> With the previous commit adding common attribute support for
> BPF_MAP_CREATE, users can now retrieve detailed error messages when map
> creation fails via the log_buf field.
>
> Introduce struct bpf_syscall_common_attr_opts with the following fields:
> log_buf, log_size, log_level, and log_true_size.
>
> Extend bpf_map_create_opts with a new field common_attr_opts, allowing
> users to capture and inspect log messages on map creation failures.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
> tools/lib/bpf/bpf.c | 15 ++++++++++++++-
> tools/lib/bpf/bpf.h | 17 ++++++++++++++++-
> 2 files changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index d44e667aaf02..d65df1b7b2be 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -207,6 +207,9 @@ int bpf_map_create(enum bpf_map_type map_type,
> const struct bpf_map_create_opts *opts)
> {
> const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
> + const size_t common_attr_sz = sizeof(struct bpf_common_attr);
> + struct bpf_syscall_common_attr_opts *common_attr_opts;
> + struct bpf_common_attr common_attr;
> union bpf_attr attr;
> int fd;
>
> @@ -240,7 +243,17 @@ int bpf_map_create(enum bpf_map_type map_type,
> attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL));
> attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0);
>
> - fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
> + common_attr_opts = OPTS_GET(opts, common_attr_opts, NULL);
> + if (common_attr_opts && feat_supported(NULL, FEAT_EXTENDED_SYSCALL)) {
> + memset(&common_attr, 0, common_attr_sz);
> + common_attr.log_buf = ptr_to_u64(OPTS_GET(common_attr_opts, log_buf, NULL));
> + common_attr.log_size = OPTS_GET(common_attr_opts, log_size, 0);
> + common_attr.log_level = OPTS_GET(common_attr_opts, log_level, 0);
> + fd = sys_bpf_ext_fd(BPF_MAP_CREATE, &attr, attr_sz, &common_attr, common_attr_sz);
> + OPTS_SET(common_attr_opts, log_true_size, common_attr.log_true_size);
> + } else {
> + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
OPTS_SET(log_true_size) to zero here, maybe?
> + }
> return libbpf_err_errno(fd);
> }
>
> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> index 2c8e88ddb674..c4a26e6b71ea 100644
> --- a/tools/lib/bpf/bpf.h
> +++ b/tools/lib/bpf/bpf.h
> @@ -37,6 +37,18 @@ extern "C" {
>
> LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes);
>
> +struct bpf_syscall_common_attr_opts {
> + size_t sz; /* size of this struct for forward/backward compatibility */
> +
> + char *log_buf;
> + __u32 log_size;
> + __u32 log_level;
> + __u32 log_true_size;
> +
> + size_t :0;
> +};
> +#define bpf_syscall_common_attr_opts__last_field log_true_size
see below, let's drop this struct and just add these 4 fields directly
to bpf_map_create_opts
> +
> struct bpf_map_create_opts {
> size_t sz; /* size of this struct for forward/backward compatibility */
>
> @@ -57,9 +69,12 @@ struct bpf_map_create_opts {
>
> const void *excl_prog_hash;
> __u32 excl_prog_hash_size;
> +
> + struct bpf_syscall_common_attr_opts *common_attr_opts;
maybe let's just add those log_xxx fields here directly? This whole
extra bpf_syscall_common_attr_opts pointer and struct seems like a
cumbersome API.
> +
> size_t :0;
> };
> -#define bpf_map_create_opts__last_field excl_prog_hash_size
> +#define bpf_map_create_opts__last_field common_attr_opts
>
> LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
> const char *map_name,
> --
> 2.52.0
>
^ permalink raw reply
* Re: [PATCH bpf-next v5 4/9] bpf: Add syscall common attributes support for prog_load
From: Andrii Nakryiko @ 2026-01-16 0:54 UTC (permalink / raw)
To: Leon Hwang
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, linux-kernel, linux-api, linux-kselftest,
kernel-patches-bot
In-Reply-To: <20260112145616.44195-5-leon.hwang@linux.dev>
On Mon, Jan 12, 2026 at 6:59 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> The log buffer of common attributes would be confusing with the one in
> 'union bpf_attr' for BPF_PROG_LOAD.
>
> In order to clarify the usage of these two log buffers, they both can be
> used for logging if:
>
> * They are same, including 'log_buf', 'log_level' and 'log_size'.
> * One of them is missing, then another one will be used for logging.
>
> If they both have 'log_buf' but they are not same totally, return -EUSERS.
Why use this special error code that we don't seem to use in the BPF
subsystem at all? What's wrong with -EINVAL? This shouldn't be an easy
mistake to make, tbh.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
> include/linux/bpf_verifier.h | 4 +++-
> kernel/bpf/log.c | 29 ++++++++++++++++++++++++++---
> kernel/bpf/syscall.c | 9 ++++++---
> 3 files changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 4c9632c40059..da2d37ca60e7 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -637,9 +637,11 @@ struct bpf_log_attr {
> u32 log_level;
> struct bpf_attrs *attrs;
> u32 offsetof_log_true_size;
> + struct bpf_attrs *attrs_common;
> };
>
> -int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs);
> +int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
> + struct bpf_attrs *attrs_common);
> int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
>
> #define BPF_MAX_SUBPROGS 256
> diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
> index 457b724c4176..eba60a13e244 100644
> --- a/kernel/bpf/log.c
> +++ b/kernel/bpf/log.c
> @@ -865,23 +865,41 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
> }
>
> static int bpf_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs, u64 log_buf,
> - u32 log_size, u32 log_level, int offsetof_log_true_size)
> + u32 log_size, u32 log_level, int offsetof_log_true_size,
> + struct bpf_attrs *attrs_common)
> {
> + const struct bpf_common_attr *common_attr = attrs_common ? attrs_common->attr : NULL;
> +
There is something to be said about naming choices here :) it's easy
to get lost in attrs_common being actually bpf_attrs, which contains
attr field, which is actually of bpf_common_attr type... It's a bit
disorienting. :)
> memset(log_attr, 0, sizeof(*log_attr));
> log_attr->log_buf = log_buf;
> log_attr->log_size = log_size;
> log_attr->log_level = log_level;
> log_attr->attrs = attrs;
> log_attr->offsetof_log_true_size = offsetof_log_true_size;
> + log_attr->attrs_common = attrs_common;
> +
> + if (log_buf && common_attr && common_attr->log_buf &&
> + (log_buf != common_attr->log_buf ||
> + log_size != common_attr->log_size ||
> + log_level != common_attr->log_level))
> + return -EUSERS;
> +
> + if (!log_buf && common_attr && common_attr->log_buf) {
> + log_attr->log_buf = common_attr->log_buf;
> + log_attr->log_size = common_attr->log_size;
> + log_attr->log_level = common_attr->log_level;
> + }
> +
> return 0;
> }
>
[...]
^ permalink raw reply
* Re: [PATCH bpf-next v5 2/9] libbpf: Add support for extended bpf syscall
From: Andrii Nakryiko @ 2026-01-16 0:42 UTC (permalink / raw)
To: Leon Hwang
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, linux-kernel, linux-api, linux-kselftest,
kernel-patches-bot
In-Reply-To: <20260112145616.44195-3-leon.hwang@linux.dev>
On Mon, Jan 12, 2026 at 6:58 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> To support the extended BPF syscall introduced in the previous commit,
> introduce the following internal APIs:
>
> * 'sys_bpf_ext()'
> * 'sys_bpf_ext_fd()'
> They wrap the raw 'syscall()' interface to support passing extended
> attributes.
> * 'probe_sys_bpf_ext()'
> Check whether current kernel supports the extended attributes.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
> tools/lib/bpf/bpf.c | 34 +++++++++++++++++++++++++++++++++
> tools/lib/bpf/features.c | 8 ++++++++
> tools/lib/bpf/libbpf_internal.h | 3 +++
> 3 files changed, 45 insertions(+)
>
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index 21b57a629916..d44e667aaf02 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -69,6 +69,40 @@ static inline __u64 ptr_to_u64(const void *ptr)
> return (__u64) (unsigned long) ptr;
> }
>
> +static inline int sys_bpf_ext(enum bpf_cmd cmd, union bpf_attr *attr,
> + unsigned int size,
> + struct bpf_common_attr *common_attr,
nit: kernel uses consistent attr_common/size_common pattern, but here
you are inverting attr_common -> common_attr, let's not?
> + unsigned int size_common)
> +{
> + cmd = common_attr ? (cmd | BPF_COMMON_ATTRS) : (cmd & ~BPF_COMMON_ATTRS);
> + return syscall(__NR_bpf, cmd, attr, size, common_attr, size_common);
> +}
> +
> +static inline int sys_bpf_ext_fd(enum bpf_cmd cmd, union bpf_attr *attr,
> + unsigned int size,
> + struct bpf_common_attr *common_attr,
> + unsigned int size_common)
> +{
> + int fd;
> +
> + fd = sys_bpf_ext(cmd, attr, size, common_attr, size_common);
> + return ensure_good_fd(fd);
> +}
> +
> +int probe_sys_bpf_ext(void)
> +{
> + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
> + union bpf_attr attr;
> + int fd;
> +
> + memset(&attr, 0, attr_sz);
> + fd = syscall(__NR_bpf, BPF_PROG_LOAD | BPF_COMMON_ATTRS, &attr, attr_sz, NULL,
> + sizeof(struct bpf_common_attr));
> + if (fd >= 0)
> + close(fd);
hm... close can change errno, this is fragile. If fd >= 0, something
is wrong with our detection, just return error right away?
> + return errno == EFAULT;
> +}
> +
> static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
> unsigned int size)
> {
> diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
> index b842b83e2480..d786a815f1ae 100644
> --- a/tools/lib/bpf/features.c
> +++ b/tools/lib/bpf/features.c
> @@ -506,6 +506,11 @@ static int probe_kern_arg_ctx_tag(int token_fd)
> return probe_fd(prog_fd);
> }
>
> +static int probe_kern_extended_syscall(int token_fd)
> +{
> + return probe_sys_bpf_ext();
> +}
> +
> typedef int (*feature_probe_fn)(int /* token_fd */);
>
> static struct kern_feature_cache feature_cache;
> @@ -581,6 +586,9 @@ static struct kern_feature_desc {
> [FEAT_BTF_QMARK_DATASEC] = {
> "BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec,
> },
> + [FEAT_EXTENDED_SYSCALL] = {
> + "Kernel supports extended syscall", probe_kern_extended_syscall,
"extended syscall" is a bit vague... We specifically detect common
attrs support, maybe say that?
> + },
> };
>
> bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
> diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
> index fc59b21b51b5..e2a6ef4b45ae 100644
> --- a/tools/lib/bpf/libbpf_internal.h
> +++ b/tools/lib/bpf/libbpf_internal.h
> @@ -392,6 +392,8 @@ enum kern_feature_id {
> FEAT_ARG_CTX_TAG,
> /* Kernel supports '?' at the front of datasec names */
> FEAT_BTF_QMARK_DATASEC,
> + /* Kernel supports extended syscall */
> + FEAT_EXTENDED_SYSCALL,
FEAT_BPF_COMMON_ATTRS ?
> __FEAT_CNT,
> };
>
> @@ -757,4 +759,5 @@ int probe_fd(int fd);
> #define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64)
>
> void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]);
> +int probe_sys_bpf_ext(void);
> #endif /* __LIBBPF_LIBBPF_INTERNAL_H */
> --
> 2.52.0
>
^ permalink raw reply
* Re: O_CLOEXEC use for OPEN_TREE_CLOEXEC
From: Florian Weimer @ 2026-01-15 8:55 UTC (permalink / raw)
To: Christian Brauner
Cc: linux-fsdevel, linux-api, linux-kernel, Al Viro, David Howells,
DJ Delorie
In-Reply-To: <20260114-alias-riefen-2cb8c09d0ded@brauner>
* Christian Brauner:
> On Tue, Jan 13, 2026 at 11:40:55PM +0100, Florian Weimer wrote:
>> In <linux/mount.h>, we have this:
>>
>> #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
>>
>> This causes a few pain points for us on the glibc side when we mirror
>> this into <linux/mount.h> because O_CLOEXEC is defined in <fcntl.h>,
>> which is one of the headers that's completely incompatible with the UAPI
>> headers.
>>
>> The reason why this is painful is because O_CLOEXEC has at least three
>> different values across architectures: 0x80000, 0x200000, 0x400000
>>
>> Even for the UAPI this isn't ideal because it effectively burns three
>> open_tree flags, unless the flags are made architecture-specific, too.
>
> I think that just got cargo-culted... A long time ago some API define as
> O_CLOEXEC and now a lot of APIs have done the same.
Yes, it looks like inotify is in the same boat.
> I'm pretty sure we can't change that now but we can document that this
> shouldn't be ifdefed and instead be a separate per-syscall bit. But I
> think that's the best we can do right now.
Maybe add something like this as a safety measure, to ensure that the
flags don't overlap?
diff --git a/fs/namespace.c b/fs/namespace.c
index c58674a20cad..5bbfd379ec44 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3069,6 +3069,9 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
bool detached = flags & OPEN_TREE_CLONE;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(!(O_CLOEXEC & OPEN_TREE_CLONE));
+ BUILD_BUG_ON(!((AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW) &
+ (O_CLOEXEC | OPEN_TREE_CLONE)));
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
@@ -3100,7 +3103,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
- return FD_ADD(flags, vfs_open_tree(dfd, filename, flags));
+ return FD_ADD(flags & O_CLOEXEC, vfs_open_tree(dfd, filename, flags));
}
/*
(Completely untested.)
Passing the mix of flags to FD_ADD isn't really future-proof if FD_ADD
ever recognizes more than just O_CLOEXEC.
Thanks,
Florian
^ permalink raw reply related
* Re: O_CLOEXEC use for OPEN_TREE_CLOEXEC
From: Aleksa Sarai @ 2026-01-14 21:18 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Christian Brauner, Florian Weimer, linux-fsdevel, linux-api,
linux-kernel, Al Viro, David Howells, DJ Delorie
In-Reply-To: <CALCETrWMWs3_G5JhJb7+h+JQjpqXxqOh2vNcQaG1HuXjaeCqQw@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 2010 bytes --]
On 2026-01-14, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Jan 14, 2026 at 8:09 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Tue, Jan 13, 2026 at 11:40:55PM +0100, Florian Weimer wrote:
> > > In <linux/mount.h>, we have this:
> > >
> > > #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
> > >
> > > This causes a few pain points for us on the glibc side when we mirror
> > > this into <linux/mount.h> because O_CLOEXEC is defined in <fcntl.h>,
> > > which is one of the headers that's completely incompatible with the UAPI
> > > headers.
> > >
> > > The reason why this is painful is because O_CLOEXEC has at least three
> > > different values across architectures: 0x80000, 0x200000, 0x400000
> > >
> > > Even for the UAPI this isn't ideal because it effectively burns three
> > > open_tree flags, unless the flags are made architecture-specific, too.
> >
> > I think that just got cargo-culted... A long time ago some API define as
> > O_CLOEXEC and now a lot of APIs have done the same. I'm pretty sure we
> > can't change that now but we can document that this shouldn't be ifdefed
> > and instead be a separate per-syscall bit. But I think that's the best
> > we can do right now.
> >
>
> How about, for future syscalls, we make CLOEXEC unconditional? If
> anyone wants an ofd to get inherited across exec, they can F_SETFD it
> themselves.
I believe newer interfaces have already started doing that (e.g., all of
the pidfd stuff is O_CLOEXEC by default) but we should definitely update
the documentation in Documentation/process/adding-syscalls.rst to stop
recommending the inclusion of the O_CLOEXEC flag.
The funniest thing about open_tree(2) is that it actually borrows flag
bits from three distinct namespaces! It has an OPEN_TREE_* namespace,
the AT_* namespace (which now has a concept of "per-syscall flags"), and
O_CLOEXEC. What a fun interface!
--
Aleksa Sarai
https://www.cyphar.com/
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply
* Re: O_CLOEXEC use for OPEN_TREE_CLOEXEC
From: Andy Lutomirski @ 2026-01-14 19:42 UTC (permalink / raw)
To: Christian Brauner
Cc: Florian Weimer, linux-fsdevel, linux-api, linux-kernel, Al Viro,
David Howells, DJ Delorie
In-Reply-To: <20260114-alias-riefen-2cb8c09d0ded@brauner>
On Wed, Jan 14, 2026 at 8:09 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Jan 13, 2026 at 11:40:55PM +0100, Florian Weimer wrote:
> > In <linux/mount.h>, we have this:
> >
> > #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
> >
> > This causes a few pain points for us on the glibc side when we mirror
> > this into <linux/mount.h> because O_CLOEXEC is defined in <fcntl.h>,
> > which is one of the headers that's completely incompatible with the UAPI
> > headers.
> >
> > The reason why this is painful is because O_CLOEXEC has at least three
> > different values across architectures: 0x80000, 0x200000, 0x400000
> >
> > Even for the UAPI this isn't ideal because it effectively burns three
> > open_tree flags, unless the flags are made architecture-specific, too.
>
> I think that just got cargo-culted... A long time ago some API define as
> O_CLOEXEC and now a lot of APIs have done the same. I'm pretty sure we
> can't change that now but we can document that this shouldn't be ifdefed
> and instead be a separate per-syscall bit. But I think that's the best
> we can do right now.
>
How about, for future syscalls, we make CLOEXEC unconditional? If
anyone wants an ofd to get inherited across exec, they can F_SETFD it
themselves.
--Andy
^ permalink raw reply
* Re: [PATCH v8 14/18] mm: memfd_luo: allow preserving memfd
From: Pratyush Yadav @ 2026-01-14 19:02 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Pasha Tatashin, pratyush, jasonmiu, graf, rppt, dmatlack,
rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
saeedm, ajayachandra, parav, leonro, witu, hughd, skhawaja,
chrisl
In-Reply-To: <20260107185414.GG293394@nvidia.com>
On Wed, Jan 07 2026, Jason Gunthorpe wrote:
> On Tue, Nov 25, 2025 at 11:58:44AM -0500, Pasha Tatashin wrote:
>> From: Pratyush Yadav <ptyadav@amazon.de>
>>
>> The ability to preserve a memfd allows userspace to use KHO and LUO to
>> transfer its memory contents to the next kernel. This is useful in many
>> ways. For one, it can be used with IOMMUFD as the backing store for
>> IOMMU page tables. Preserving IOMMUFD is essential for performing a
>> hypervisor live update with passthrough devices. memfd support provides
>> the first building block for making that possible.
>
> I would lead with the use of memfd to back the guest memory pages for
> use with KVM :)
I would assume using 1G-page-backed memfd is the more common use case,
and this patch doesn't come with 1G page support.
Anyway, the patch is now already applied so we can't go back and fix
the commit message...
--
Regards,
Pratyush Yadav
^ permalink raw reply
* Re: [PATCH v8 14/18] mm: memfd_luo: allow preserving memfd
From: Pratyush Yadav @ 2026-01-14 18:59 UTC (permalink / raw)
To: Mike Rapoport
Cc: Chris Mason, Pratyush Yadav, Pasha Tatashin, jasonmiu, graf,
dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
roman.gushchin, chenridong, axboe, mark.rutland, jannh,
vincent.guittot, hannes, dan.j.williams, david, joel.granados,
rostedt, anna.schumaker, song, linux, linux-kernel, linux-doc,
linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael,
dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
saeedm, ajayachandra, jgg, parav, leonro, witu, hughd, skhawaja,
chrisl
In-Reply-To: <aWfLS48tG7XInpNN@kernel.org>
On Wed, Jan 14 2026, Mike Rapoport wrote:
> On Tue, Jan 13, 2026 at 06:09:23AM -0800, Chris Mason wrote:
>> On Tue, 25 Nov 2025 11:58:44 -0500 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>>
>> Hi everyone,
>>
>> I'm running the v6.19 mm commits through patch review automation, and this
>> commit was flagged. I don't know this code well, but it looks like it
>> might be a real bug.
>>
>> [AI review output below]
>>
>> > diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
>> > new file mode 100644
>> > index 0000000000000..4f6ba63b43105
>> > --- /dev/null
>> > +++ b/mm/memfd_luo.c
>>
>> [ ... ]
>>
>> > +static int memfd_luo_retrieve_folios(struct file *file,
>> > + struct memfd_luo_folio_ser *folios_ser,
>> > + u64 nr_folios)
>> > +{
>> > + struct inode *inode = file_inode(file);
>> > + struct address_space *mapping = inode->i_mapping;
>> > + struct folio *folio;
>> > + int err = -EIO;
>> > + long i;
>> > +
>> > + for (i = 0; i < nr_folios; i++) {
>>
>> [ ... ]
>>
>> > + err = shmem_add_to_page_cache(folio, mapping, index, NULL,
>> > + mapping_gfp_mask(mapping));
>> > + if (err) {
>> > + pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
>> > + i, err);
>> > + goto unlock_folio;
>> > + }
>> > +
>> > + if (flags & MEMFD_LUO_FOLIO_UPTODATE)
>> > + folio_mark_uptodate(folio);
>> > + if (flags & MEMFD_LUO_FOLIO_DIRTY)
>> > + folio_mark_dirty(folio);
>> > +
>> > + err = shmem_inode_acct_blocks(inode, 1);
>> > + if (err) {
>> > + pr_err("shmem: failed to account folio index %ld: %d\n",
>> > + i, err);
>> > + goto unlock_folio;
>> > + }
>>
>> When shmem_inode_acct_blocks() fails here, the folio has already been
>> added to the page cache by shmem_add_to_page_cache(). Should the folio be
>> removed from the page cache before going to unlock_folio?
>>
>> Looking at shmem_alloc_and_add_folio() in mm/shmem.c, when
>> shmem_inode_acct_blocks() fails after the folio has been added, it calls
>> filemap_remove_folio() to remove it:
>>
>> error = shmem_inode_acct_blocks(inode, pages);
>> if (error) {
>> ...
>> if (error) {
>> filemap_remove_folio(folio);
>> goto unlock;
>> }
>> }
>>
>> Without this, the folio remains in the page cache (counted in
>> mapping->nrpages) but info->alloced is not incremented (since
>> shmem_recalc_inode is not called). This could cause shmem accounting
>> inconsistency.
>
> My understanding that if anything fails in memfd_luo_retrieve_folios() the
> file is destroyed anyway and the accounting wouldn't matter.
>
> But to be on the safe side we should fix the error handling here.
> @Pratyush, what do you say?
Yeah, I don't think the inode's alloced accounting is a real issue here
since the file will be destroyed immediately after. This is why I didn't
want to add the extra complexity of the error handling.
But now that I think of it, perhaps the lingering unaccounted folio
might cause an underflow in vm_committed_as. shmem_inode_acct_blocks()
cleans up the vm_acct_memory() call in case of failure. But perhaps the
iput() triggers an extra shmem_unacct_memory() because of the lingering
folio.
I am not 100% sure that can actually happen since the code is a bit
complex. Let me check and get back to you.
--
Regards,
Pratyush Yadav
^ permalink raw reply
* Re: [PATCH v8 14/18] mm: memfd_luo: allow preserving memfd
From: Mike Rapoport @ 2026-01-14 16:58 UTC (permalink / raw)
To: Chris Mason, Pratyush Yadav
Cc: Pasha Tatashin, jasonmiu, graf, dmatlack, rientjes, corbet,
rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
david, joel.granados, rostedt, anna.schumaker, song, linux,
linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, ptyadav,
lennart, brauner, linux-api, linux-fsdevel, saeedm, ajayachandra,
jgg, parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <20260113140927.1074142-1-clm@meta.com>
On Tue, Jan 13, 2026 at 06:09:23AM -0800, Chris Mason wrote:
> On Tue, 25 Nov 2025 11:58:44 -0500 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> Hi everyone,
>
> I'm running the v6.19 mm commits through patch review automation, and this
> commit was flagged. I don't know this code well, but it looks like it
> might be a real bug.
>
> [AI review output below]
>
> > diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
> > new file mode 100644
> > index 0000000000000..4f6ba63b43105
> > --- /dev/null
> > +++ b/mm/memfd_luo.c
>
> [ ... ]
>
> > +static int memfd_luo_retrieve_folios(struct file *file,
> > + struct memfd_luo_folio_ser *folios_ser,
> > + u64 nr_folios)
> > +{
> > + struct inode *inode = file_inode(file);
> > + struct address_space *mapping = inode->i_mapping;
> > + struct folio *folio;
> > + int err = -EIO;
> > + long i;
> > +
> > + for (i = 0; i < nr_folios; i++) {
>
> [ ... ]
>
> > + err = shmem_add_to_page_cache(folio, mapping, index, NULL,
> > + mapping_gfp_mask(mapping));
> > + if (err) {
> > + pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
> > + i, err);
> > + goto unlock_folio;
> > + }
> > +
> > + if (flags & MEMFD_LUO_FOLIO_UPTODATE)
> > + folio_mark_uptodate(folio);
> > + if (flags & MEMFD_LUO_FOLIO_DIRTY)
> > + folio_mark_dirty(folio);
> > +
> > + err = shmem_inode_acct_blocks(inode, 1);
> > + if (err) {
> > + pr_err("shmem: failed to account folio index %ld: %d\n",
> > + i, err);
> > + goto unlock_folio;
> > + }
>
> When shmem_inode_acct_blocks() fails here, the folio has already been
> added to the page cache by shmem_add_to_page_cache(). Should the folio be
> removed from the page cache before going to unlock_folio?
>
> Looking at shmem_alloc_and_add_folio() in mm/shmem.c, when
> shmem_inode_acct_blocks() fails after the folio has been added, it calls
> filemap_remove_folio() to remove it:
>
> error = shmem_inode_acct_blocks(inode, pages);
> if (error) {
> ...
> if (error) {
> filemap_remove_folio(folio);
> goto unlock;
> }
> }
>
> Without this, the folio remains in the page cache (counted in
> mapping->nrpages) but info->alloced is not incremented (since
> shmem_recalc_inode is not called). This could cause shmem accounting
> inconsistency.
My understanding is that if anything fails in memfd_luo_retrieve_folios(), the
file is destroyed anyway and the accounting wouldn't matter.
But to be on the safe side we should fix the error handling here.
@Pratyush, what do you say?
--
Sincerely yours,
Mike.
^ permalink raw reply
* Re: O_CLOEXEC use for OPEN_TREE_CLOEXEC
From: Christian Brauner @ 2026-01-14 16:03 UTC (permalink / raw)
To: Florian Weimer
Cc: linux-fsdevel, linux-api, linux-kernel, Al Viro, David Howells,
DJ Delorie
In-Reply-To: <lhupl7dcf0o.fsf@oldenburg.str.redhat.com>
On Tue, Jan 13, 2026 at 11:40:55PM +0100, Florian Weimer wrote:
> In <linux/mount.h>, we have this:
>
> #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
>
> This causes a few pain points for us to on the glibc side when we mirror
> this into <linux/mount.h> becuse O_CLOEXEC is defined in <fcntl.h>,
> which is one of the headers that's completely incompatible with the UAPI
> headers.
>
> The reason why this is painful is because O_CLOEXEC has at least three
> different values across architectures: 0x80000, 0x200000, 0x400000
>
> Even for the UAPI this isn't ideal because it effectively burns three
> open_tree flags, unless the flags are made architecture-specific, too.
I think that just got cargo-culted... A long time ago some API defined its
flag as O_CLOEXEC and now a lot of APIs have done the same. I'm pretty sure we
can't change that now but we can document that this shouldn't be ifdefed
and instead be a separate per-syscall bit. But I think that's the best
we can do right now.
^ permalink raw reply
* O_CLOEXEC use for OPEN_TREE_CLOEXEC
From: Florian Weimer @ 2026-01-13 22:40 UTC (permalink / raw)
To: linux-fsdevel; +Cc: linux-api, linux-kernel, Al Viro, David Howells, DJ Delorie
In <linux/mount.h>, we have this:
#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
This causes a few pain points for us on the glibc side when we mirror
this into <linux/mount.h> because O_CLOEXEC is defined in <fcntl.h>,
which is one of the headers that's completely incompatible with the UAPI
headers.
The reason why this is painful is because O_CLOEXEC has at least three
different values across architectures: 0x80000, 0x200000, 0x400000
Even for the UAPI this isn't ideal because it effectively burns three
open_tree flags, unless the flags are made architecture-specific, too.
Thanks,
Florian
^ permalink raw reply
* Re: [PATCH v8 14/18] mm: memfd_luo: allow preserving memfd
From: Chris Mason @ 2026-01-13 14:09 UTC (permalink / raw)
To: Pasha Tatashin
Cc: Chris Mason, jasonmiu, graf, rppt, dmatlack, rientjes, corbet,
rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
david, joel.granados, rostedt, anna.schumaker, song, linux,
linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, ptyadav,
lennart, brauner, linux-api, linux-fsdevel, saeedm, ajayachandra,
jgg, parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <20251125165850.3389713-15-pasha.tatashin@soleen.com>
On Tue, 25 Nov 2025 11:58:44 -0500 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
Hi everyone,
I'm running the v6.19 mm commits through patch review automation, and this
commit was flagged. I don't know this code well, but it looks like it
might be a real bug.
[AI review output below]
> diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
> new file mode 100644
> index 0000000000000..4f6ba63b43105
> --- /dev/null
> +++ b/mm/memfd_luo.c
[ ... ]
> +static int memfd_luo_retrieve_folios(struct file *file,
> + struct memfd_luo_folio_ser *folios_ser,
> + u64 nr_folios)
> +{
> + struct inode *inode = file_inode(file);
> + struct address_space *mapping = inode->i_mapping;
> + struct folio *folio;
> + int err = -EIO;
> + long i;
> +
> + for (i = 0; i < nr_folios; i++) {
[ ... ]
> + err = shmem_add_to_page_cache(folio, mapping, index, NULL,
> + mapping_gfp_mask(mapping));
> + if (err) {
> + pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
> + i, err);
> + goto unlock_folio;
> + }
> +
> + if (flags & MEMFD_LUO_FOLIO_UPTODATE)
> + folio_mark_uptodate(folio);
> + if (flags & MEMFD_LUO_FOLIO_DIRTY)
> + folio_mark_dirty(folio);
> +
> + err = shmem_inode_acct_blocks(inode, 1);
> + if (err) {
> + pr_err("shmem: failed to account folio index %ld: %d\n",
> + i, err);
> + goto unlock_folio;
> + }
When shmem_inode_acct_blocks() fails here, the folio has already been
added to the page cache by shmem_add_to_page_cache(). Should the folio be
removed from the page cache before going to unlock_folio?
Looking at shmem_alloc_and_add_folio() in mm/shmem.c, when
shmem_inode_acct_blocks() fails after the folio has been added, it calls
filemap_remove_folio() to remove it:
error = shmem_inode_acct_blocks(inode, pages);
if (error) {
...
if (error) {
filemap_remove_folio(folio);
goto unlock;
}
}
Without this, the folio remains in the page cache (counted in
mapping->nrpages) but info->alloced is not incremented (since
shmem_recalc_inode is not called). This could cause shmem accounting
inconsistency.
^ permalink raw reply
* Re: [PATCHSET v5] fs: generic file IO error reporting
From: Christian Brauner @ 2026-01-13 8:58 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christian Brauner, linux-api, jack, hch, linux-xfs, linux-ext4,
linux-fsdevel, gabriel, amir73il, Gao Xiang
In-Reply-To: <176826402528.3490369.2415315475116356277.stgit@frogsfrogsfrogs>
On Mon, 12 Jan 2026 16:31:03 -0800, Darrick J. Wong wrote:
> This patchset adds some generic helpers so that filesystems can report
> errors to fsnotify in a standard way. Then it adapts iomap to use the
> generic helpers so that any iomap-enabled filesystem can report I/O
> errors through this mechanism as well. Finally, it makes XFS report
> metadata errors through this mechanism in much the same way that ext4
> does now.
>
> [...]
Applied to the vfs-7.0.fserror branch of the vfs/vfs.git tree.
Patches in the vfs-7.0.fserror branch should appear in linux-next soon.
Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.
It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.
Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.
tree: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.0.fserror
[1/6] uapi: promote EFSCORRUPTED and EUCLEAN to errno.h
https://git.kernel.org/vfs/vfs/c/602544773763
[2/6] fs: report filesystem and file I/O errors to fsnotify
https://git.kernel.org/vfs/vfs/c/21945e6cb516
[3/6] iomap: report file I/O errors to the VFS
https://git.kernel.org/vfs/vfs/c/a9d573ee88af
[4/6] xfs: report fs metadata errors via fsnotify
https://git.kernel.org/vfs/vfs/c/efd87a100729
[5/6] xfs: translate fsdax media errors into file "data lost" errors when convenient
https://git.kernel.org/vfs/vfs/c/94503211d2fd
[6/6] ext4: convert to new fserror helpers
https://git.kernel.org/vfs/vfs/c/81d2e13a57c9
^ permalink raw reply
* [PATCH 1/6] uapi: promote EFSCORRUPTED and EUCLEAN to errno.h
From: Darrick J. Wong @ 2026-01-13 0:31 UTC (permalink / raw)
To: djwong, brauner
Cc: hch, hsiangkao, jack, linux-api, linux-xfs, jack, linux-ext4,
linux-fsdevel, gabriel, hch, amir73il
In-Reply-To: <176826402528.3490369.2415315475116356277.stgit@frogsfrogsfrogs>
From: Darrick J. Wong <djwong@kernel.org>
Stop defining these privately and instead move them to the uapi
errno.h so that they become canonical instead of copy pasta.
Cc: linux-api@vger.kernel.org
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
---
arch/alpha/include/uapi/asm/errno.h | 2 ++
arch/mips/include/uapi/asm/errno.h | 2 ++
arch/parisc/include/uapi/asm/errno.h | 2 ++
arch/sparc/include/uapi/asm/errno.h | 2 ++
fs/erofs/internal.h | 2 --
fs/ext2/ext2.h | 1 -
fs/ext4/ext4.h | 3 ---
fs/f2fs/f2fs.h | 3 ---
fs/minix/minix.h | 2 --
fs/udf/udf_sb.h | 2 --
fs/xfs/xfs_linux.h | 2 --
include/linux/jbd2.h | 3 ---
include/uapi/asm-generic/errno.h | 2 ++
tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
tools/arch/mips/include/uapi/asm/errno.h | 2 ++
tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
tools/include/uapi/asm-generic/errno.h | 2 ++
18 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
index 3d265f6babaf0a..6791f6508632ee 100644
--- a/arch/alpha/include/uapi/asm/errno.h
+++ b/arch/alpha/include/uapi/asm/errno.h
@@ -55,6 +55,7 @@
#define ENOSR 82 /* Out of streams resources */
#define ETIME 83 /* Timer expired */
#define EBADMSG 84 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EPROTO 85 /* Protocol error */
#define ENODATA 86 /* No data available */
#define ENOSTR 87 /* Device not a stream */
@@ -96,6 +97,7 @@
#define EREMCHG 115 /* Remote address changed */
#define EUCLEAN 117 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 118 /* Not a XENIX named type file */
#define ENAVAIL 119 /* No XENIX semaphores available */
#define EISNAM 120 /* Is a named type file */
diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
index 2fb714e2d6d8fc..c01ed91b1ef44b 100644
--- a/arch/mips/include/uapi/asm/errno.h
+++ b/arch/mips/include/uapi/asm/errno.h
@@ -50,6 +50,7 @@
#define EDOTDOT 73 /* RFS specific error */
#define EMULTIHOP 74 /* Multihop attempted */
#define EBADMSG 77 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define ENAMETOOLONG 78 /* File name too long */
#define EOVERFLOW 79 /* Value too large for defined data type */
#define ENOTUNIQ 80 /* Name not unique on network */
@@ -88,6 +89,7 @@
#define EISCONN 133 /* Transport endpoint is already connected */
#define ENOTCONN 134 /* Transport endpoint is not connected */
#define EUCLEAN 135 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 137 /* Not a XENIX named type file */
#define ENAVAIL 138 /* No XENIX semaphores available */
#define EISNAM 139 /* Is a named type file */
diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
index 8d94739d75c67c..8cbc07c1903e4c 100644
--- a/arch/parisc/include/uapi/asm/errno.h
+++ b/arch/parisc/include/uapi/asm/errno.h
@@ -36,6 +36,7 @@
#define EDOTDOT 66 /* RFS specific error */
#define EBADMSG 67 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EUSERS 68 /* Too many users */
#define EDQUOT 69 /* Quota exceeded */
#define ESTALE 70 /* Stale file handle */
@@ -62,6 +63,7 @@
#define ERESTART 175 /* Interrupted system call should be restarted */
#define ESTRPIPE 176 /* Streams pipe error */
#define EUCLEAN 177 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 178 /* Not a XENIX named type file */
#define ENAVAIL 179 /* No XENIX semaphores available */
#define EISNAM 180 /* Is a named type file */
diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
index 81a732b902ee38..4a41e7835fd5b8 100644
--- a/arch/sparc/include/uapi/asm/errno.h
+++ b/arch/sparc/include/uapi/asm/errno.h
@@ -48,6 +48,7 @@
#define ENOSR 74 /* Out of streams resources */
#define ENOMSG 75 /* No message of desired type */
#define EBADMSG 76 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EIDRM 77 /* Identifier removed */
#define EDEADLK 78 /* Resource deadlock would occur */
#define ENOLCK 79 /* No record locks available */
@@ -91,6 +92,7 @@
#define ENOTUNIQ 115 /* Name not unique on network */
#define ERESTART 116 /* Interrupted syscall should be restarted */
#define EUCLEAN 117 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 118 /* Not a XENIX named type file */
#define ENAVAIL 119 /* No XENIX semaphores available */
#define EISNAM 120 /* Is a named type file */
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index f7f622836198da..d06e99baf5d5ae 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -541,6 +541,4 @@ long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
-
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index cf97b76e9fd3e9..5e0c6c5fcb6cd6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -357,7 +357,6 @@ struct ext2_inode {
*/
#define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT2_ERROR_FS 0x0002 /* Errors detected */
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
/*
* Mount flags
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56112f201cace7..62c091b52bacdf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3938,7 +3938,4 @@ extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
get_block_t *get_block);
#endif /* __KERNEL__ */
-#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
-
#endif /* _EXT4_H */
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20edbb99b814a7..9f3aa3c7f12613 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -5004,7 +5004,4 @@ static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
f2fs_invalidate_compress_pages_range(sbi, blkaddr, len);
}
-#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
-
#endif /* _LINUX_F2FS_H */
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 2bfaf377f2086c..7e1f652f16d311 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -175,6 +175,4 @@ static inline int minix_test_bit(int nr, const void *vaddr)
__minix_error_inode((inode), __func__, __LINE__, \
(fmt), ##__VA_ARGS__)
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
-
#endif /* FS_MINIX_H */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 08ec8756b9487b..8399accc788dea 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -55,8 +55,6 @@
#define MF_DUPLICATE_MD 0x01
#define MF_MIRROR_FE_LOADED 0x02
-#define EFSCORRUPTED EUCLEAN
-
struct udf_meta_data {
__u32 s_meta_file_loc;
__u32 s_mirror_file_loc;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 4dd747bdbccab2..55064228c4d574 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -121,8 +121,6 @@ typedef __u32 xfs_nlink_t;
#define ENOATTR ENODATA /* Attribute not found */
#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
-#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define __return_address __builtin_return_address(0)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f5eaf76198f377..a53a00d36228ce 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1815,7 +1815,4 @@ static inline int jbd2_handle_buffer_credits(handle_t *handle)
#endif /* __KERNEL__ */
-#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
-
#endif /* _LINUX_JBD2_H */
diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h
index cf9c51ac49f97e..92e7ae493ee315 100644
--- a/include/uapi/asm-generic/errno.h
+++ b/include/uapi/asm-generic/errno.h
@@ -55,6 +55,7 @@
#define EMULTIHOP 72 /* Multihop attempted */
#define EDOTDOT 73 /* RFS specific error */
#define EBADMSG 74 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EOVERFLOW 75 /* Value too large for defined data type */
#define ENOTUNIQ 76 /* Name not unique on network */
#define EBADFD 77 /* File descriptor in bad state */
@@ -98,6 +99,7 @@
#define EINPROGRESS 115 /* Operation now in progress */
#define ESTALE 116 /* Stale file handle */
#define EUCLEAN 117 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 118 /* Not a XENIX named type file */
#define ENAVAIL 119 /* No XENIX semaphores available */
#define EISNAM 120 /* Is a named type file */
diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h
index 3d265f6babaf0a..6791f6508632ee 100644
--- a/tools/arch/alpha/include/uapi/asm/errno.h
+++ b/tools/arch/alpha/include/uapi/asm/errno.h
@@ -55,6 +55,7 @@
#define ENOSR 82 /* Out of streams resources */
#define ETIME 83 /* Timer expired */
#define EBADMSG 84 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EPROTO 85 /* Protocol error */
#define ENODATA 86 /* No data available */
#define ENOSTR 87 /* Device not a stream */
@@ -96,6 +97,7 @@
#define EREMCHG 115 /* Remote address changed */
#define EUCLEAN 117 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 118 /* Not a XENIX named type file */
#define ENAVAIL 119 /* No XENIX semaphores available */
#define EISNAM 120 /* Is a named type file */
diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h
index 2fb714e2d6d8fc..c01ed91b1ef44b 100644
--- a/tools/arch/mips/include/uapi/asm/errno.h
+++ b/tools/arch/mips/include/uapi/asm/errno.h
@@ -50,6 +50,7 @@
#define EDOTDOT 73 /* RFS specific error */
#define EMULTIHOP 74 /* Multihop attempted */
#define EBADMSG 77 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define ENAMETOOLONG 78 /* File name too long */
#define EOVERFLOW 79 /* Value too large for defined data type */
#define ENOTUNIQ 80 /* Name not unique on network */
@@ -88,6 +89,7 @@
#define EISCONN 133 /* Transport endpoint is already connected */
#define ENOTCONN 134 /* Transport endpoint is not connected */
#define EUCLEAN 135 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 137 /* Not a XENIX named type file */
#define ENAVAIL 138 /* No XENIX semaphores available */
#define EISNAM 139 /* Is a named type file */
diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h
index 8d94739d75c67c..8cbc07c1903e4c 100644
--- a/tools/arch/parisc/include/uapi/asm/errno.h
+++ b/tools/arch/parisc/include/uapi/asm/errno.h
@@ -36,6 +36,7 @@
#define EDOTDOT 66 /* RFS specific error */
#define EBADMSG 67 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EUSERS 68 /* Too many users */
#define EDQUOT 69 /* Quota exceeded */
#define ESTALE 70 /* Stale file handle */
@@ -62,6 +63,7 @@
#define ERESTART 175 /* Interrupted system call should be restarted */
#define ESTRPIPE 176 /* Streams pipe error */
#define EUCLEAN 177 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 178 /* Not a XENIX named type file */
#define ENAVAIL 179 /* No XENIX semaphores available */
#define EISNAM 180 /* Is a named type file */
diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h
index 81a732b902ee38..4a41e7835fd5b8 100644
--- a/tools/arch/sparc/include/uapi/asm/errno.h
+++ b/tools/arch/sparc/include/uapi/asm/errno.h
@@ -48,6 +48,7 @@
#define ENOSR 74 /* Out of streams resources */
#define ENOMSG 75 /* No message of desired type */
#define EBADMSG 76 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EIDRM 77 /* Identifier removed */
#define EDEADLK 78 /* Resource deadlock would occur */
#define ENOLCK 79 /* No record locks available */
@@ -91,6 +92,7 @@
#define ENOTUNIQ 115 /* Name not unique on network */
#define ERESTART 116 /* Interrupted syscall should be restarted */
#define EUCLEAN 117 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 118 /* Not a XENIX named type file */
#define ENAVAIL 119 /* No XENIX semaphores available */
#define EISNAM 120 /* Is a named type file */
diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h
index cf9c51ac49f97e..92e7ae493ee315 100644
--- a/tools/include/uapi/asm-generic/errno.h
+++ b/tools/include/uapi/asm-generic/errno.h
@@ -55,6 +55,7 @@
#define EMULTIHOP 72 /* Multihop attempted */
#define EDOTDOT 73 /* RFS specific error */
#define EBADMSG 74 /* Not a data message */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EOVERFLOW 75 /* Value too large for defined data type */
#define ENOTUNIQ 76 /* Name not unique on network */
#define EBADFD 77 /* File descriptor in bad state */
@@ -98,6 +99,7 @@
#define EINPROGRESS 115 /* Operation now in progress */
#define ESTALE 116 /* Stale file handle */
#define EUCLEAN 117 /* Structure needs cleaning */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define ENOTNAM 118 /* Not a XENIX named type file */
#define ENAVAIL 119 /* No XENIX semaphores available */
#define EISNAM 120 /* Is a named type file */
^ permalink raw reply related
* [PATCHSET v5] fs: generic file IO error reporting
From: Darrick J. Wong @ 2026-01-13 0:31 UTC (permalink / raw)
To: djwong, brauner
Cc: linux-api, jack, hch, hsiangkao, linux-xfs, jack, linux-ext4,
linux-fsdevel, gabriel, hch, amir73il
Hi all,
This patchset adds some generic helpers so that filesystems can report
errors to fsnotify in a standard way. Then it adapts iomap to use the
generic helpers so that any iomap-enabled filesystem can report I/O
errors through this mechanism as well. Finally, it makes XFS report
metadata errors through this mechanism in much the same way that ext4
does now.
These are a prerequisite for the XFS self-healing series which will
come at a later time.
v5: tidy comments, un-inline the unmount function
v4: drag out of RFC status, finalize the sign of errnos that we accept
If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.
This has been running on the djcloud for months with no problems. Enjoy!
Comments and questions are, as always, welcome.
--D
kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=filesystem-error-reporting
fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=filesystem-error-reporting
---
Commits in this patchset:
* uapi: promote EFSCORRUPTED and EUCLEAN to errno.h
* fs: report filesystem and file I/O errors to fsnotify
* iomap: report file I/O errors to the VFS
* xfs: report fs metadata errors via fsnotify
* xfs: translate fsdax media errors into file "data lost" errors when convenient
* ext4: convert to new fserror helpers
---
arch/alpha/include/uapi/asm/errno.h | 2
arch/mips/include/uapi/asm/errno.h | 2
arch/parisc/include/uapi/asm/errno.h | 2
arch/sparc/include/uapi/asm/errno.h | 2
fs/erofs/internal.h | 2
fs/ext2/ext2.h | 1
fs/ext4/ext4.h | 3
fs/f2fs/f2fs.h | 3
fs/minix/minix.h | 2
fs/udf/udf_sb.h | 2
fs/xfs/xfs_linux.h | 2
include/linux/fs/super_types.h | 7 +
include/linux/fserror.h | 75 +++++++++++
include/linux/jbd2.h | 3
include/uapi/asm-generic/errno.h | 2
tools/arch/alpha/include/uapi/asm/errno.h | 2
tools/arch/mips/include/uapi/asm/errno.h | 2
tools/arch/parisc/include/uapi/asm/errno.h | 2
tools/arch/sparc/include/uapi/asm/errno.h | 2
tools/include/uapi/asm-generic/errno.h | 2
fs/Makefile | 2
fs/ext4/ioctl.c | 2
fs/ext4/super.c | 13 +-
fs/fserror.c | 194 ++++++++++++++++++++++++++++
fs/iomap/buffered-io.c | 23 +++
fs/iomap/direct-io.c | 12 ++
fs/iomap/ioend.c | 6 +
fs/super.c | 3
fs/xfs/xfs_fsops.c | 4 +
fs/xfs/xfs_health.c | 14 ++
fs/xfs/xfs_notify_failure.c | 4 +
31 files changed, 373 insertions(+), 24 deletions(-)
create mode 100644 include/linux/fserror.h
create mode 100644 fs/fserror.c
^ permalink raw reply
* [PATCH bpf-next v5 9/9] selftests/bpf: Add tests to verify map create failure log
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
Add tests to verify that the kernel reports the expected error messages
when map creation fails.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
.../selftests/bpf/prog_tests/map_init.c | 168 ++++++++++++++++++
1 file changed, 168 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c
index 14a31109dd0e..824e2bea74bf 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_init.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_init.c
@@ -212,3 +212,171 @@ void test_map_init(void)
if (test__start_subtest("pcpu_lru_map_init"))
test_pcpu_lru_map_init();
}
+
+#define BPF_LOG_FIXED 8
+
+static void test_map_create(enum bpf_map_type map_type, const char *map_name,
+ struct bpf_map_create_opts *opts, const char *exp_msg)
+{
+ const int key_size = 4, value_size = 4, max_entries = 1;
+ char log_buf[128];
+ int fd;
+ LIBBPF_OPTS(bpf_syscall_common_attr_opts, copts);
+
+ log_buf[0] = '\0';
+ copts.log_buf = log_buf;
+ copts.log_size = sizeof(log_buf);
+ copts.log_level = BPF_LOG_FIXED;
+ opts->common_attr_opts = &copts;
+ fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, opts);
+ if (!ASSERT_LT(fd, 0, "bpf_map_create")) {
+ close(fd);
+ return;
+ }
+
+ ASSERT_STREQ(log_buf, exp_msg, "log_buf");
+ ASSERT_EQ(copts.log_true_size, strlen(exp_msg) + 1, "log_true_size");
+}
+
+static void test_map_create_array(struct bpf_map_create_opts *opts, const char *exp_msg)
+{
+ test_map_create(BPF_MAP_TYPE_ARRAY, "test_map_create", opts, exp_msg);
+}
+
+static void test_invalid_vmlinux_value_type_id_struct_ops(void)
+{
+ const char *msg = "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_vmlinux_value_type_id = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_vmlinux_value_type_id_kv_type_id(void)
+{
+ const char *msg = "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_vmlinux_value_type_id = 1,
+ .btf_key_type_id = 1,
+ );
+
+ test_map_create(BPF_MAP_TYPE_STRUCT_OPS, "test_map_create", &opts, msg);
+}
+
+static void test_invalid_value_type_id(void)
+{
+ const char *msg = "Invalid btf_value_type_id.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_key_type_id = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_extra(void)
+{
+ const char *msg = "Invalid map_extra.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_extra = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_numa_node(void)
+{
+ const char *msg = "Invalid numa_node.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_flags = BPF_F_NUMA_NODE,
+ .numa_node = 0xFF,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_type(void)
+{
+ const char *msg = "Invalid map_type.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+ test_map_create(__MAX_BPF_MAP_TYPE, "test_map_create", &opts, msg);
+}
+
+static void test_invalid_token_fd(void)
+{
+ const char *msg = "Invalid map_token_fd.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_flags = BPF_F_TOKEN_FD,
+ .token_fd = 0xFF,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_name(void)
+{
+ const char *msg = "Invalid map_name.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+ test_map_create(BPF_MAP_TYPE_ARRAY, "test-!@#", &opts, msg);
+}
+
+static void test_invalid_btf_fd(void)
+{
+ const char *msg = "Invalid btf_fd.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_fd = -1,
+ .btf_key_type_id = 1,
+ .btf_value_type_id = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_excl_prog_hash_size_1(void)
+{
+ const char *msg = "Invalid excl_prog_hash_size.\n";
+ const char *hash = "DEADCODE";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .excl_prog_hash = hash,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_excl_prog_hash_size_2(void)
+{
+ const char *msg = "Invalid excl_prog_hash_size.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .excl_prog_hash_size = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+void test_map_create_failure(void)
+{
+ if (test__start_subtest("invalid_vmlinux_value_type_id_struct_ops"))
+ test_invalid_vmlinux_value_type_id_struct_ops();
+ if (test__start_subtest("invalid_vmlinux_value_type_id_kv_type_id"))
+ test_invalid_vmlinux_value_type_id_kv_type_id();
+ if (test__start_subtest("invalid_value_type_id"))
+ test_invalid_value_type_id();
+ if (test__start_subtest("invalid_map_extra"))
+ test_invalid_map_extra();
+ if (test__start_subtest("invalid_numa_node"))
+ test_invalid_numa_node();
+ if (test__start_subtest("invalid_map_type"))
+ test_invalid_map_type();
+ if (test__start_subtest("invalid_token_fd"))
+ test_invalid_token_fd();
+ if (test__start_subtest("invalid_map_name"))
+ test_invalid_map_name();
+ if (test__start_subtest("invalid_btf_fd"))
+ test_invalid_btf_fd();
+ if (test__start_subtest("invalid_excl_prog_hash_size_1"))
+ test_excl_prog_hash_size_1();
+ if (test__start_subtest("invalid_excl_prog_hash_size_2"))
+ test_excl_prog_hash_size_2();
+}
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 8/9] libbpf: Add common attr support for map_create
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
With the previous commit adding common attribute support for
BPF_MAP_CREATE, users can now retrieve detailed error messages when map
creation fails via the log_buf field.
Introduce struct bpf_syscall_common_attr_opts with the following fields:
log_buf, log_size, log_level, and log_true_size.
Extend bpf_map_create_opts with a new field common_attr_opts, allowing
users to capture and inspect log messages on map creation failures.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
tools/lib/bpf/bpf.c | 15 ++++++++++++++-
tools/lib/bpf/bpf.h | 17 ++++++++++++++++-
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index d44e667aaf02..d65df1b7b2be 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -207,6 +207,9 @@ int bpf_map_create(enum bpf_map_type map_type,
const struct bpf_map_create_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
+ const size_t common_attr_sz = sizeof(struct bpf_common_attr);
+ struct bpf_syscall_common_attr_opts *common_attr_opts;
+ struct bpf_common_attr common_attr;
union bpf_attr attr;
int fd;
@@ -240,7 +243,17 @@ int bpf_map_create(enum bpf_map_type map_type,
attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL));
attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0);
- fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
+ common_attr_opts = OPTS_GET(opts, common_attr_opts, NULL);
+ if (common_attr_opts && feat_supported(NULL, FEAT_EXTENDED_SYSCALL)) {
+ memset(&common_attr, 0, common_attr_sz);
+ common_attr.log_buf = ptr_to_u64(OPTS_GET(common_attr_opts, log_buf, NULL));
+ common_attr.log_size = OPTS_GET(common_attr_opts, log_size, 0);
+ common_attr.log_level = OPTS_GET(common_attr_opts, log_level, 0);
+ fd = sys_bpf_ext_fd(BPF_MAP_CREATE, &attr, attr_sz, &common_attr, common_attr_sz);
+ OPTS_SET(common_attr_opts, log_true_size, common_attr.log_true_size);
+ } else {
+ fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
+ }
return libbpf_err_errno(fd);
}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 2c8e88ddb674..c4a26e6b71ea 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -37,6 +37,18 @@ extern "C" {
LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes);
+struct bpf_syscall_common_attr_opts {
+ size_t sz; /* size of this struct for forward/backward compatibility */
+
+ char *log_buf;
+ __u32 log_size;
+ __u32 log_level;
+ __u32 log_true_size;
+
+ size_t :0;
+};
+#define bpf_syscall_common_attr_opts__last_field log_true_size
+
struct bpf_map_create_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
@@ -57,9 +69,12 @@ struct bpf_map_create_opts {
const void *excl_prog_hash;
__u32 excl_prog_hash_size;
+
+ struct bpf_syscall_common_attr_opts *common_attr_opts;
+
size_t :0;
};
-#define bpf_map_create_opts__last_field excl_prog_hash_size
+#define bpf_map_create_opts__last_field common_attr_opts
LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
const char *map_name,
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 7/9] bpf: Add syscall common attributes support for map_create
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
Currently, many BPF_MAP_CREATE failures return -EINVAL without providing
any explanation to userspace.
With extended BPF syscall support, detailed error messages can now be
reported via the log buffer, allowing users to understand the specific
reason for a failed map creation.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/bpf_verifier.h | 2 ++
kernel/bpf/log.c | 30 +++++++++++++++++
kernel/bpf/syscall.c | 65 ++++++++++++++++++++++++++++++------
3 files changed, 87 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 9022e4f515f9..280beca480ea 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -644,6 +644,8 @@ int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs
struct bpf_attrs *attrs_common);
int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
struct bpf_attrs *attrs_common);
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *log_attr,
+ struct bpf_attrs *attrs_common);
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
#define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 0dba014ca055..6586d752970f 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -912,6 +912,36 @@ int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *
attrs_common);
}
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *log_attr,
+ struct bpf_attrs *attrs_common)
+{
+ const struct bpf_common_attr *common_attr = attrs_common->attr;
+ struct bpf_verifier_log *log;
+ int err;
+
+ memset(log_attr, 0, sizeof(*log_attr));
+ log_attr->log_buf = common_attr->log_buf;
+ log_attr->log_size = common_attr->log_size;
+ log_attr->log_level = common_attr->log_level;
+ log_attr->attrs_common = attrs_common;
+
+ if (!log_attr->log_buf)
+ return NULL;
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_vlog_init(log, log_attr->log_level, u64_to_user_ptr(log_attr->log_buf),
+ log_attr->log_size);
+ if (err) {
+ kfree(log);
+ return ERR_PTR(err);
+ }
+
+ return log;
+}
+
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log)
{
u32 log_true_size, off;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d0440e640e40..52e1ab142da9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1370,7 +1370,7 @@ static bool bpf_net_capable(void)
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr, bpfptr_t uattr)
+static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1382,8 +1382,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
- if (err)
+ if (err) {
+ bpf_log(log, "Invalid attr.\n");
return -EINVAL;
+ }
/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
* to avoid per-map type checks tripping on unknown flag
@@ -1392,17 +1394,25 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
attr->map_flags &= ~BPF_F_TOKEN_FD;
if (attr->btf_vmlinux_value_type_id) {
- if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
- attr->btf_key_type_id || attr->btf_value_type_id)
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n");
return -EINVAL;
+ }
+ if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n");
+ return -EINVAL;
+ }
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ bpf_log(log, "Invalid btf_value_type_id.\n");
return -EINVAL;
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
attr->map_type != BPF_MAP_TYPE_ARENA &&
- attr->map_extra != 0)
+ attr->map_extra != 0) {
+ bpf_log(log, "Invalid map_extra.\n");
return -EINVAL;
+ }
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
@@ -1410,13 +1420,17 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
- !node_online(numa_node)))
+ !node_online(numa_node))) {
+ bpf_log(log, "Invalid numa_node.\n");
return -EINVAL;
+ }
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map_type = attr->map_type;
- if (map_type >= ARRAY_SIZE(bpf_map_types))
+ if (map_type >= ARRAY_SIZE(bpf_map_types)) {
+ bpf_log(log, "Invalid map_type.\n");
return -EINVAL;
+ }
map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
ops = bpf_map_types[map_type];
if (!ops)
@@ -1434,8 +1448,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
if (token_flag) {
token = bpf_token_get_from_fd(attr->map_token_fd);
- if (IS_ERR(token))
+ if (IS_ERR(token)) {
+ bpf_log(log, "Invalid map_token_fd.\n");
return PTR_ERR(token);
+ }
/* if current token doesn't grant map creation permissions,
* then we can't use this token, so ignore it and rely on
@@ -1518,8 +1534,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
- if (err < 0)
+ if (err < 0) {
+ bpf_log(log, "Invalid map_name.\n");
goto free_map;
+ }
preempt_disable();
map->cookie = gen_cookie_next(&bpf_map_cookie);
@@ -1542,6 +1560,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
+ bpf_log(log, "Invalid btf_fd.\n");
err = PTR_ERR(btf);
goto free_map;
}
@@ -1569,6 +1588,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ bpf_log(log, "Invalid excl_prog_hash_size.\n");
err = -EINVAL;
goto free_map;
}
@@ -1584,6 +1604,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
goto free_map;
}
} else if (attr->excl_prog_hash_size) {
+ bpf_log(log, "Invalid excl_prog_hash_size.\n");
err = -EINVAL;
goto free_map;
}
@@ -1622,6 +1643,29 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
return err;
}
+static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_attrs *common_attrs)
+{
+ struct bpf_verifier_log *log;
+ struct bpf_log_attr log_attr;
+ int err, ret;
+
+ log = bpf_log_attr_create_vlog(&log_attr, common_attrs);
+ if (IS_ERR(log))
+ return PTR_ERR(log);
+
+ err = __map_create(attr, uattr, log);
+ if (err >= 0)
+ goto free;
+
+ ret = bpf_log_attr_finalize(&log_attr, log);
+ if (ret)
+ err = ret;
+
+free:
+ kfree(log);
+ return err;
+}
+
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
@@ -6218,7 +6262,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr, uattr);
+ bpf_attrs_init(&common_attrs, &common_attr, uattr_common, size_common);
+ err = map_create(&attr, uattr, &common_attrs);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 6/9] bpf: Add syscall common attributes support for btf_load
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
Since bpf_log_attr_init() now supports struct bpf_common_attr, pass the
common attributes to it to enable syscall common attributes support for
BPF_BTF_LOAD.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/bpf_verifier.h | 3 ++-
kernel/bpf/log.c | 5 +++--
kernel/bpf/syscall.c | 8 +++++---
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 44dd60de1966..9022e4f515f9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -642,7 +642,8 @@ struct bpf_log_attr {
int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
struct bpf_attrs *attrs_common);
-int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs);
+int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
+ struct bpf_attrs *attrs_common);
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
#define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 96df089a2c89..0dba014ca055 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -902,13 +902,14 @@ int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs
offsetof(union bpf_attr, log_true_size), attrs_common);
}
-int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs)
+int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
+ struct bpf_attrs *attrs_common)
{
const union bpf_attr *attr = attrs->attr;
return bpf_log_attr_init(log_attr, attrs, attr->btf_log_buf, attr->btf_log_size,
attr->btf_log_level, offsetof(union bpf_attr, btf_log_true_size),
- NULL);
+ attrs_common);
}
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8382fafc8d17..d0440e640e40 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5433,7 +5433,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size,
+ struct bpf_attrs *common_attrs)
{
struct bpf_token *token = NULL;
struct bpf_log_attr log_attr;
@@ -5447,7 +5448,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
return -EINVAL;
bpf_attrs_init(&attrs, attr, uattr, uattr_size);
- err = bpf_btf_load_log_attr_init(&log_attr, &attrs);
+ err = bpf_btf_load_log_attr_init(&log_attr, &attrs, common_attrs);
if (err)
return err;
@@ -6281,7 +6282,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr, size);
+ bpf_attrs_init(&common_attrs, &common_attr, uattr_common, size_common);
+ err = bpf_btf_load(&attr, uattr, size, &common_attrs);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 5/9] bpf: Refactor reporting btf_log_true_size for btf_load
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
The next commit will add support for reporting logs via extended common
attributes for BPF_BTF_LOAD, which includes reporting 'log_true_size'
through those attributes as well.
To prepare for that, refactor how 'btf_log_true_size' is reported so that
'log_true_size' can also be written back via the extended common attributes.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/bpf_verifier.h | 1 +
include/linux/btf.h | 3 ++-
kernel/bpf/btf.c | 32 +++++++++-----------------------
kernel/bpf/log.c | 9 +++++++++
kernel/bpf/syscall.c | 10 +++++++++-
5 files changed, 30 insertions(+), 25 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index da2d37ca60e7..44dd60de1966 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -642,6 +642,7 @@ struct bpf_log_attr {
int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
struct bpf_attrs *attrs_common);
+int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs);
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
#define BPF_MAX_SUBPROGS 256
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 691f09784933..df04843a4635 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -144,7 +144,8 @@ const char *btf_get_name(const struct btf *btf);
void btf_get(struct btf *btf);
void btf_put(struct btf *btf);
const struct btf_header *btf_header(const struct btf *btf);
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
+struct bpf_log_attr;
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *log_attr);
struct btf *btf_get_by_fd(int fd);
int btf_get_info_by_fd(const struct btf *btf,
const union bpf_attr *attr,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 539c9fdea41d..f9973f12a482 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5745,25 +5745,11 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
return 0;
}
-static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
-{
- u32 log_true_size;
- int err;
-
- err = bpf_vlog_finalize(log, &log_true_size);
-
- if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
- copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
- &log_true_size, sizeof(log_true_size)))
- err = -EFAULT;
-
- return err;
-}
-
-static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_log_attr *log_attr)
{
bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
- char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
+ char __user *log_ubuf = u64_to_user_ptr(log_attr->log_buf);
struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
struct btf *btf = NULL;
@@ -5780,8 +5766,8 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
/* user could have requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- err = bpf_vlog_init(&env->log, attr->btf_log_level,
- log_ubuf, attr->btf_log_size);
+ err = bpf_vlog_init(&env->log, log_attr->log_level,
+ log_ubuf, log_attr->log_size);
if (err)
goto errout_free;
@@ -5841,7 +5827,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
}
}
- err = finalize_log(&env->log, uattr, uattr_size);
+ err = bpf_log_attr_finalize(log_attr, &env->log);
if (err)
goto errout_free;
@@ -5853,7 +5839,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
btf_free_struct_meta_tab(btf);
errout:
/* overwrite err with -ENOSPC or -EFAULT */
- ret = finalize_log(&env->log, uattr, uattr_size);
+ ret = bpf_log_attr_finalize(log_attr, &env->log);
if (ret)
err = ret;
errout_free:
@@ -8017,12 +8003,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *log_attr)
{
struct btf *btf;
int ret;
- btf = btf_parse(attr, uattr, uattr_size);
+ btf = btf_parse(attr, uattr, log_attr);
if (IS_ERR(btf))
return PTR_ERR(btf);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index eba60a13e244..96df089a2c89 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -902,6 +902,15 @@ int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs
offsetof(union bpf_attr, log_true_size), attrs_common);
}
+int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs)
+{
+ const union bpf_attr *attr = attrs->attr;
+
+ return bpf_log_attr_init(log_attr, attrs, attr->btf_log_buf, attr->btf_log_size,
+ attr->btf_log_level, offsetof(union bpf_attr, btf_log_true_size),
+ NULL);
+}
+
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log)
{
u32 log_true_size, off;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f369b9ec9d60..8382fafc8d17 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5436,6 +5436,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
struct bpf_token *token = NULL;
+ struct bpf_log_attr log_attr;
+ struct bpf_attrs attrs;
+ int err;
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -5443,6 +5446,11 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
if (attr->btf_flags & ~BPF_F_TOKEN_FD)
return -EINVAL;
+ bpf_attrs_init(&attrs, attr, uattr, uattr_size);
+ err = bpf_btf_load_log_attr_init(&log_attr, &attrs);
+ if (err)
+ return err;
+
if (attr->btf_flags & BPF_F_TOKEN_FD) {
token = bpf_token_get_from_fd(attr->btf_token_fd);
if (IS_ERR(token))
@@ -5460,7 +5468,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
bpf_token_put(token);
- return btf_new_fd(attr, uattr, uattr_size);
+ return btf_new_fd(attr, uattr, &log_attr);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 4/9] bpf: Add syscall common attributes support for prog_load
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
The log buffer in the common attributes could be confused with the one in
'union bpf_attr' for BPF_PROG_LOAD.
To make the interaction between these two log buffers well defined, both
may be supplied for logging if:
* They are identical, i.e. 'log_buf', 'log_level' and 'log_size' all match.
* Only one of them provides 'log_buf', in which case that one is used for logging.
If both provide 'log_buf' but they are not identical, return -EUSERS.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/bpf_verifier.h | 4 +++-
kernel/bpf/log.c | 29 ++++++++++++++++++++++++++---
kernel/bpf/syscall.c | 9 ++++++---
3 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4c9632c40059..da2d37ca60e7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -637,9 +637,11 @@ struct bpf_log_attr {
u32 log_level;
struct bpf_attrs *attrs;
u32 offsetof_log_true_size;
+ struct bpf_attrs *attrs_common;
};
-int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs);
+int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
+ struct bpf_attrs *attrs_common);
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
#define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 457b724c4176..eba60a13e244 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -865,23 +865,41 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
}
static int bpf_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs, u64 log_buf,
- u32 log_size, u32 log_level, int offsetof_log_true_size)
+ u32 log_size, u32 log_level, int offsetof_log_true_size,
+ struct bpf_attrs *attrs_common)
{
+ const struct bpf_common_attr *common_attr = attrs_common ? attrs_common->attr : NULL;
+
memset(log_attr, 0, sizeof(*log_attr));
log_attr->log_buf = log_buf;
log_attr->log_size = log_size;
log_attr->log_level = log_level;
log_attr->attrs = attrs;
log_attr->offsetof_log_true_size = offsetof_log_true_size;
+ log_attr->attrs_common = attrs_common;
+
+ if (log_buf && common_attr && common_attr->log_buf &&
+ (log_buf != common_attr->log_buf ||
+ log_size != common_attr->log_size ||
+ log_level != common_attr->log_level))
+ return -EUSERS;
+
+ if (!log_buf && common_attr && common_attr->log_buf) {
+ log_attr->log_buf = common_attr->log_buf;
+ log_attr->log_size = common_attr->log_size;
+ log_attr->log_level = common_attr->log_level;
+ }
+
return 0;
}
-int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs)
+int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
+ struct bpf_attrs *attrs_common)
{
const union bpf_attr *attr = attrs->attr;
return bpf_log_attr_init(log_attr, attrs, attr->log_buf, attr->log_size, attr->log_level,
- offsetof(union bpf_attr, log_true_size));
+ offsetof(union bpf_attr, log_true_size), attrs_common);
}
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log)
@@ -901,5 +919,10 @@ int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log
copy_to_bpfptr_offset(log_attr->attrs->uattr, off, &log_true_size, size))
err = -EFAULT;
+ off = offsetof(struct bpf_common_attr, log_true_size);
+ if (log_attr->attrs_common && log_attr->attrs_common->size >= off + size &&
+ copy_to_bpfptr_offset(log_attr->attrs_common->uattr, off, &log_true_size, size))
+ err = -EFAULT;
+
return err;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b389bc6add8..f369b9ec9d60 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2865,7 +2865,8 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD keyring_id
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size,
+ struct bpf_attrs *common_attrs)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -3085,7 +3086,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
goto free_prog_sec;
bpf_attrs_init(&attrs, attr, uattr, uattr_size);
- err = bpf_prog_load_log_attr_init(&log_attr, &attrs);
+ err = bpf_prog_load_log_attr_init(&log_attr, &attrs, common_attrs);
if (err < 0)
goto free_used_maps;
@@ -6174,6 +6175,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
bpfptr_t uattr_common, unsigned int size_common)
{
struct bpf_common_attr common_attr;
+ struct bpf_attrs common_attrs;
union bpf_attr attr;
int err;
@@ -6225,7 +6227,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr, size);
+ bpf_attrs_init(&common_attrs, &common_attr, uattr_common, size_common);
+ err = bpf_prog_load(&attr, uattr, size, &common_attrs);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 3/9] bpf: Refactor reporting log_true_size for prog_load
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
The next commit will add support for reporting logs via extended common
attributes, including 'log_true_size'.
To prepare for that, refactor the 'log_true_size' reporting logic by
introducing a new struct bpf_log_attr to encapsulate log-related behavior:
* bpf_prog_load_log_attr_init(): initialize the log fields, which will
support extended common attributes in the next commit.
* bpf_log_attr_finalize(): handle log finalization and write back
'log_true_size' to userspace.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/bpf.h | 19 ++++++++++++++++-
include/linux/bpf_verifier.h | 11 ++++++++++
kernel/bpf/log.c | 40 ++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 9 +++++++-
kernel/bpf/verifier.c | 19 ++++++-----------
5 files changed, 83 insertions(+), 15 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5936f8e2996f..3a525a7e8747 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2867,8 +2867,25 @@ int bpf_get_file_flag(int flags);
int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
size_t actual_size);
+struct bpf_attrs {
+ const void *attr;
+ bpfptr_t uattr;
+ u32 size;
+};
+
+static inline void bpf_attrs_init(struct bpf_attrs *attrs, const void *attr, bpfptr_t uattr,
+ u32 size)
+{
+ memset(attrs, 0, sizeof(*attrs));
+ attrs->attr = attr;
+ attrs->uattr = uattr;
+ attrs->size = size;
+}
+
/* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);
+struct bpf_log_attr;
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_log_attr *log_attr);
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..4c9632c40059 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -631,6 +631,17 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
return log && log->level;
}
+struct bpf_log_attr {
+ u64 log_buf;
+ u32 log_size;
+ u32 log_level;
+ struct bpf_attrs *attrs;
+ u32 offsetof_log_true_size;
+};
+
+int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs);
+int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
+
#define BPF_MAX_SUBPROGS 256
struct bpf_subprog_arg_info {
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index a0c3b35de2ce..457b724c4176 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -863,3 +863,43 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
}
print_verifier_state(env, vstate, frameno, false);
}
+
+static int bpf_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs, u64 log_buf,
+ u32 log_size, u32 log_level, int offsetof_log_true_size)
+{
+ memset(log_attr, 0, sizeof(*log_attr));
+ log_attr->log_buf = log_buf;
+ log_attr->log_size = log_size;
+ log_attr->log_level = log_level;
+ log_attr->attrs = attrs;
+ log_attr->offsetof_log_true_size = offsetof_log_true_size;
+ return 0;
+}
+
+int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs)
+{
+ const union bpf_attr *attr = attrs->attr;
+
+ return bpf_log_attr_init(log_attr, attrs, attr->log_buf, attr->log_size, attr->log_level,
+ offsetof(union bpf_attr, log_true_size));
+}
+
+int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log)
+{
+ u32 log_true_size, off;
+ size_t size;
+ int err;
+
+ if (!log)
+ return 0;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ size = sizeof(log_true_size);
+ off = log_attr->offsetof_log_true_size;
+ if (log_attr->attrs && log_attr->attrs->size >= off + size &&
+ copy_to_bpfptr_offset(log_attr->attrs->uattr, off, &log_true_size, size))
+ err = -EFAULT;
+
+ return err;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index af703f7ea58e..0b389bc6add8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2871,6 +2871,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
struct bpf_token *token = NULL;
+ struct bpf_log_attr log_attr;
+ struct bpf_attrs attrs;
bool bpf_cap;
int err;
char license[128];
@@ -3082,8 +3084,13 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err)
goto free_prog_sec;
+ bpf_attrs_init(&attrs, attr, uattr, uattr_size);
+ err = bpf_prog_load_log_attr_init(&log_attr, &attrs);
+ if (err < 0)
+ goto free_used_maps;
+
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr, uattr_size);
+ err = bpf_check(&prog, attr, uattr, &log_attr);
if (err < 0)
goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 53635ea2e41b..921a7b6a6686 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -25106,12 +25106,12 @@ static int compute_scc(struct bpf_verifier_env *env)
return err;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_log_attr *log_attr)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
int i, len, ret = -EINVAL, err;
- u32 log_true_size;
bool is_priv;
BTF_TYPE_EMIT(enum bpf_features);
@@ -25158,9 +25158,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
/* user could have requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- ret = bpf_vlog_init(&env->log, attr->log_level,
- (char __user *) (unsigned long) attr->log_buf,
- attr->log_size);
+ ret = bpf_vlog_init(&env->log, log_attr->log_level,
+ u64_to_user_ptr(log_attr->log_buf),
+ log_attr->log_size);
if (ret)
goto err_unlock;
@@ -25310,17 +25310,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->prog->aux->verified_insns = env->insn_processed;
/* preserve original error even if log finalization is successful */
- err = bpf_vlog_finalize(&env->log, &log_true_size);
+ err = bpf_log_attr_finalize(log_attr, &env->log);
if (err)
ret = err;
- if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
- copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
- &log_true_size, sizeof(log_true_size))) {
- ret = -EFAULT;
- goto err_release_maps;
- }
-
if (ret)
goto err_release_maps;
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 2/9] libbpf: Add support for extended bpf syscall
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
To support the extended BPF syscall introduced in the previous commit,
introduce the following internal APIs:
* 'sys_bpf_ext()'
* 'sys_bpf_ext_fd()'
They wrap the raw 'syscall()' interface to support passing extended
attributes.
* 'probe_sys_bpf_ext()'
Check whether the current kernel supports the extended attributes.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
tools/lib/bpf/bpf.c | 34 +++++++++++++++++++++++++++++++++
tools/lib/bpf/features.c | 8 ++++++++
tools/lib/bpf/libbpf_internal.h | 3 +++
3 files changed, 45 insertions(+)
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 21b57a629916..d44e667aaf02 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -69,6 +69,40 @@ static inline __u64 ptr_to_u64(const void *ptr)
return (__u64) (unsigned long) ptr;
}
+static inline int sys_bpf_ext(enum bpf_cmd cmd, union bpf_attr *attr,
+ unsigned int size,
+ struct bpf_common_attr *common_attr,
+ unsigned int size_common)
+{
+ cmd = common_attr ? (cmd | BPF_COMMON_ATTRS) : (cmd & ~BPF_COMMON_ATTRS);
+ return syscall(__NR_bpf, cmd, attr, size, common_attr, size_common);
+}
+
+static inline int sys_bpf_ext_fd(enum bpf_cmd cmd, union bpf_attr *attr,
+ unsigned int size,
+ struct bpf_common_attr *common_attr,
+ unsigned int size_common)
+{
+ int fd;
+
+ fd = sys_bpf_ext(cmd, attr, size, common_attr, size_common);
+ return ensure_good_fd(fd);
+}
+
+/*
+ * Probe whether the running kernel accepts the extended bpf() syscall form.
+ *
+ * Issues BPF_PROG_LOAD with BPF_COMMON_ATTRS OR-ed into the command, a zeroed
+ * bpf_attr, and a NULL common-attr pointer paired with a non-zero size. A
+ * failure with EFAULT is taken to mean the kernel tried to read the common
+ * attributes, i.e. it understands the extended form.
+ *
+ * Returns 1 if the extended syscall appears to be supported, 0 otherwise.
+ */
+int probe_sys_bpf_ext(void)
+{
+	const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
+	union bpf_attr attr;
+	int fd;
+
+	memset(&attr, 0, attr_sz);
+	fd = syscall(__NR_bpf, BPF_PROG_LOAD | BPF_COMMON_ATTRS, &attr, attr_sz, NULL,
+		     sizeof(struct bpf_common_attr));
+	if (fd >= 0) {
+		/* The call succeeded, so errno was not set by it; don't consult it. */
+		close(fd);
+		return 0;
+	}
+	return errno == EFAULT;
+}
+
static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
unsigned int size)
{
diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
index b842b83e2480..d786a815f1ae 100644
--- a/tools/lib/bpf/features.c
+++ b/tools/lib/bpf/features.c
@@ -506,6 +506,11 @@ static int probe_kern_arg_ctx_tag(int token_fd)
return probe_fd(prog_fd);
}
+static int probe_kern_extended_syscall(int token_fd)
+{
+ return probe_sys_bpf_ext();
+}
+
typedef int (*feature_probe_fn)(int /* token_fd */);
static struct kern_feature_cache feature_cache;
@@ -581,6 +586,9 @@ static struct kern_feature_desc {
[FEAT_BTF_QMARK_DATASEC] = {
"BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec,
},
+ [FEAT_EXTENDED_SYSCALL] = {
+ "Kernel supports extended syscall", probe_kern_extended_syscall,
+ },
};
bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index fc59b21b51b5..e2a6ef4b45ae 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -392,6 +392,8 @@ enum kern_feature_id {
FEAT_ARG_CTX_TAG,
/* Kernel supports '?' at the front of datasec names */
FEAT_BTF_QMARK_DATASEC,
+ /* Kernel supports extended syscall */
+ FEAT_EXTENDED_SYSCALL,
__FEAT_CNT,
};
@@ -757,4 +759,5 @@ int probe_fd(int fd);
#define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64)
void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]);
+int probe_sys_bpf_ext(void);
#endif /* __LIBBPF_LIBBPF_INTERNAL_H */
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v5 1/9] bpf: Extend BPF syscall with common attributes support
From: Leon Hwang @ 2026-01-12 14:56 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260112145616.44195-1-leon.hwang@linux.dev>
Extend the BPF syscall to support a set of common attributes shared
across all BPF commands:
1. 'log_buf': User-provided buffer for storing logs.
2. 'log_size': Size of the log buffer.
3. 'log_level': Log verbosity level.
4. 'log_true_size': The size of the log reported by the kernel.
These common attributes are passed as the 4th argument to the BPF
syscall, with the 5th argument specifying the size of this structure.
To indicate the use of these common attributes from userspace, a new flag
'BPF_COMMON_ATTRS' ('1 << 16') is introduced. This flag is OR-ed into the
'cmd' field of the syscall.
When 'cmd & BPF_COMMON_ATTRS' is set, the kernel will copy the common
attributes from userspace into kernel space for use.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/syscalls.h | 3 ++-
include/uapi/linux/bpf.h | 8 ++++++++
kernel/bpf/syscall.c | 25 +++++++++++++++++++++----
tools/include/uapi/linux/bpf.h | 8 ++++++++
4 files changed, 39 insertions(+), 5 deletions(-)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf84d98964b2..729659202d77 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -937,7 +937,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
asmlinkage long sys_getrandom(char __user *buf, size_t count,
unsigned int flags);
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
-asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size);
+asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size,
+ struct bpf_common_attr __user *attr_common, unsigned int size_common);
asmlinkage long sys_execveat(int dfd, const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a2ade4be60f..2f83eca0a357 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@ enum bpf_cmd {
BPF_PROG_STREAM_READ_BY_FD,
BPF_PROG_ASSOC_STRUCT_OPS,
__MAX_BPF_CMD,
+ BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying bpf_common_attr. */
};
enum bpf_map_type {
@@ -1491,6 +1492,13 @@ struct bpf_stack_build_id {
};
};
+struct bpf_common_attr {
+ __u64 log_buf;
+ __u32 log_size;
+ __u32 log_level;
+ __u32 log_true_size;
+};
+
#define BPF_OBJ_NAME_LEN 16U
enum {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ecc0929ce462..af703f7ea58e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -6163,8 +6163,10 @@ static int prog_assoc_struct_ops(union bpf_attr *attr)
return ret;
}
-static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
+ bpfptr_t uattr_common, unsigned int size_common)
{
+ struct bpf_common_attr common_attr;
union bpf_attr attr;
int err;
@@ -6178,6 +6180,20 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
+ memset(&common_attr, 0, sizeof(common_attr));
+ if (cmd & BPF_COMMON_ATTRS) {
+ err = bpf_check_uarg_tail_zero(uattr_common, sizeof(common_attr), size_common);
+ if (err)
+ return err;
+
+ cmd &= ~BPF_COMMON_ATTRS;
+ size_common = min_t(u32, size_common, sizeof(common_attr));
+ if (copy_from_bpfptr(&common_attr, uattr_common, size_common) != 0)
+ return -EFAULT;
+ } else {
+ size_common = 0;
+ }
+
err = security_bpf(cmd, &attr, size, uattr.is_kernel);
if (err < 0)
return err;
@@ -6313,9 +6329,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
return err;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size,
+ struct bpf_common_attr __user *, uattr_common, unsigned int, size_common)
{
- return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+ return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common);
}
static bool syscall_prog_is_valid_access(int off, int size,
@@ -6346,7 +6363,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
default:
return -EINVAL;
}
- return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0);
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b816bc53d2e1..2b05c689d51a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@ enum bpf_cmd {
BPF_PROG_STREAM_READ_BY_FD,
BPF_PROG_ASSOC_STRUCT_OPS,
__MAX_BPF_CMD,
+ BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying bpf_common_attr. */
};
enum bpf_map_type {
@@ -1491,6 +1492,13 @@ struct bpf_stack_build_id {
};
};
+struct bpf_common_attr {
+ __u64 log_buf;
+ __u32 log_size;
+ __u32 log_level;
+ __u32 log_true_size;
+};
+
#define BPF_OBJ_NAME_LEN 16U
enum {
--
2.52.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox