* [PATCH v7 01/16] fs: Add case sensitivity flags to file_kattr
From: Chuck Lever @ 2026-01-22 16:02 UTC (permalink / raw)
To: Al Viro, Christian Brauner, Jan Kara
Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever
In-Reply-To: <20260122160311.1117669-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
Enable upper layers such as NFSD to retrieve case sensitivity
information from file systems by adding FS_XFLAG_CASEFOLD and
FS_XFLAG_CASENONPRESERVING flags.
Filesystems report case-insensitive or case-nonpreserving behavior
by setting these flags directly in fa->fsx_xflags. The default
(flags unset) indicates POSIX semantics: case-sensitive and
case-preserving. These flags are read-only; userspace cannot set
them via ioctl.
Remove struct file_kattr initialization from fileattr_fill_xflags()
and fileattr_fill_flags(). Callers at ioctl/syscall entry points
zero-initialize the struct themselves, which allows them to pass
hints (flags_valid, fsx_valid) to the filesystem's ->fileattr_get()
callback via the fa argument. Filesystem handlers that invoke these
fill functions can now set flags directly in fa->fsx_xflags before
calling them, without the fill functions zeroing those values.
Case sensitivity information is exported to userspace via the
fsx_xflags field of the FS_IOC_FSGETXATTR ioctl and the fa_xflags
field of the file_getattr() system call.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/file_attr.c | 12 ++++--------
fs/xfs/xfs_ioctl.c | 2 +-
include/linux/fileattr.h | 3 ++-
include/uapi/linux/fs.h | 2 ++
4 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 13cdb31a3e94..6e37040fc5fa 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -15,12 +15,10 @@
* @fa: fileattr pointer
* @xflags: FS_XFLAG_* flags
*
- * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
- * other fields are zeroed.
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).
*/
void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
{
- memset(fa, 0, sizeof(*fa));
fa->fsx_valid = true;
fa->fsx_xflags = xflags;
if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
@@ -46,11 +44,9 @@ EXPORT_SYMBOL(fileattr_fill_xflags);
* @flags: FS_*_FL flags
*
* Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
- * All other fields are zeroed.
*/
void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
{
- memset(fa, 0, sizeof(*fa));
fa->flags_valid = true;
fa->flags = flags;
if (fa->flags & FS_SYNC_FL)
@@ -323,7 +319,7 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
{
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct dentry *dentry = file->f_path.dentry;
- struct file_kattr fa;
+ struct file_kattr fa = {};
unsigned int flags;
int err;
@@ -355,7 +351,7 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
{
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct dentry *dentry = file->f_path.dentry;
- struct file_kattr fa;
+ struct file_kattr fa = {};
int err;
err = copy_fsxattr_from_user(&fa, argp);
@@ -434,7 +430,7 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
struct filename *name __free(putname) = NULL;
unsigned int lookup_flags = 0;
struct file_attr fattr;
- struct file_kattr fa;
+ struct file_kattr fa = {};
int error;
BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 59eaad774371..f0417c4d1fca 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra(
xfs_inode_t *ip,
void __user *arg)
{
- struct file_kattr fa;
+ struct file_kattr fa = {};
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
index f89dcfad3f8f..709de829659f 100644
--- a/include/linux/fileattr.h
+++ b/include/linux/fileattr.h
@@ -16,7 +16,8 @@
/* Read-only inode flags */
#define FS_XFLAG_RDONLY_MASK \
- (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR)
+ (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | \
+ FS_XFLAG_CASEFOLD | FS_XFLAG_CASENONPRESERVING)
/* Flags to indicate valid value of fsx_ fields */
#define FS_XFLAG_VALUES_MASK \
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 66ca526cf786..919148beaa8c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -253,6 +253,8 @@ struct file_attr {
#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */
#define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */
+#define FS_XFLAG_CASEFOLD 0x00020000 /* case-insensitive lookups */
+#define FS_XFLAG_CASENONPRESERVING 0x00040000 /* case not preserved */
#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
/* the read-only stuff doesn't really belong here, but any other place is
--
2.52.0
^ permalink raw reply related
* [PATCH v7 00/16] Exposing case folding behavior
From: Chuck Lever @ 2026-01-22 16:02 UTC (permalink / raw)
To: Al Viro, Christian Brauner, Jan Kara
Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever
From: Chuck Lever <chuck.lever@oracle.com>
Following on from
https://lore.kernel.org/linux-nfs/20251021-zypressen-bazillus-545a44af57fd@brauner/T/#m0ba197d75b7921d994cf284f3cef3a62abb11aaa
I'm attempting to implement enough support in the Linux VFS to
enable file services like NFSD and ksmbd (and user space
equivalents) to provide the actual status of case folding support
in local file systems. The default behavior for local file systems
not explicitly supported in this series is to reflect the usual
POSIX behaviors:
case-insensitive = false
case-nonpreserving = false
The case-insensitivity and case-nonpreserving booleans can be
consumed immediately by NFSD. These two attributes have been part of
the NFSv3 and NFSv4 protocols for decades, in order to support NFS
client implementations on non-POSIX systems.
Support for user space file servers is why this series exposes case
folding information via a user-space API. I don't know of any other
category of user-space application that requires access to case
folding info.
The Linux NFS community has a growing interest in supporting NFS
clients on Windows and macOS platforms, where file name behavior does
not align with traditional POSIX semantics.
One example of a Windows-based NFS client is [1]. This client
implementation explicitly requires servers to report
FATTR4_WORD0_CASE_INSENSITIVE = TRUE for proper operation, a hard
requirement for Windows client interoperability because Windows
applications expect case-insensitive behavior. When an NFS client
knows the server is case-insensitive, it can avoid issuing multiple
LOOKUP/READDIR requests to search for case variants, and applications
like Win32 programs work correctly without manual workarounds or
code changes.
Even the Linux client can take advantage of this information. Trond
merged patches 4 years ago [2] that introduce support for case
insensitivity, in support of the Hammerspace NFS server. In
particular, when a client detects a case-insensitive NFS share,
negative dentry caching must be disabled (a lookup for "FILE.TXT"
failing shouldn't cache a negative entry when "file.txt" exists)
and directory change invalidation must clear all cached case-folded
file name variants.
Hammerspace servers and several other NFS server implementations
operate in multi-protocol environments, where a single file service
instance caters to both NFS and SMB clients. In those cases, things
work more smoothly for everyone when the NFS client can see and adapt
to the case folding behavior that SMB users rely on and expect. NFSD
needs to support the case-insensitivity and case-nonpreserving
booleans properly in order to participate as a first-class citizen
in such environments.
Series based on v6.19-rc6.
[1] https://github.com/kofemann/ms-nfs41-client
[2] https://patchwork.kernel.org/project/linux-nfs/cover/20211217203658.439352-1-trondmy@kernel.org/
---
Changes since v6:
- Remove the memset from vfs_fileattr_get
Changes since v5:
- Finish the conversion to FS_XFLAGs
- NFSv4 GETATTR now clears the attr mask bit if nfsd_get_case_info()
fails
Changes since v4:
- Observe the MSDOS "nocase" mount option
- Define new FS_XFLAGs for the user API
Changes since v3:
- Change fa->case_preserving to fa_case_nonpreserving
- VFAT is case preserving
- Make new fields available to user space
Changes since v2:
- Remove unicode labels
- Replace vfs_get_case_info
- Add support for several more local file system implementations
- Add support for in-kernel SMB server
Changes since RFC:
- Use file_getattr instead of statx
- Postpone exposing Unicode version until later
- Support NTFS and ext4 in addition to FAT
- Support NFSv4 fattr4 in addition to NFSv3 PATHCONF
Chuck Lever (16):
fs: Add case sensitivity flags to file_kattr
fat: Implement fileattr_get for case sensitivity
exfat: Implement fileattr_get for case sensitivity
ntfs3: Implement fileattr_get for case sensitivity
hfs: Implement fileattr_get for case sensitivity
hfsplus: Report case sensitivity in fileattr_get
ext4: Report case sensitivity in fileattr_get
xfs: Report case sensitivity in fileattr_get
cifs: Implement fileattr_get for case sensitivity
nfs: Implement fileattr_get for case sensitivity
f2fs: Add case sensitivity reporting to fileattr_get
vboxsf: Implement fileattr_get for case sensitivity
isofs: Implement fileattr_get for case sensitivity
nfsd: Report export case-folding via NFSv3 PATHCONF
nfsd: Implement NFSv4 FATTR4_CASE_INSENSITIVE and
FATTR4_CASE_PRESERVING
ksmbd: Report filesystem case sensitivity via FS_ATTRIBUTE_INFORMATION
fs/exfat/exfat_fs.h | 2 ++
fs/exfat/file.c | 16 ++++++++++++++--
fs/exfat/namei.c | 1 +
fs/ext4/ioctl.c | 7 +++++++
fs/f2fs/file.c | 7 +++++++
fs/fat/fat.h | 3 +++
fs/fat/file.c | 21 +++++++++++++++++++++
fs/fat/namei_msdos.c | 1 +
fs/fat/namei_vfat.c | 1 +
fs/file_attr.c | 12 ++++--------
fs/hfs/dir.c | 1 +
fs/hfs/hfs_fs.h | 2 ++
fs/hfs/inode.c | 13 +++++++++++++
fs/hfsplus/inode.c | 8 ++++++++
fs/isofs/dir.c | 15 +++++++++++++++
fs/nfs/client.c | 9 +++++++--
fs/nfs/inode.c | 20 ++++++++++++++++++++
fs/nfs/internal.h | 3 +++
fs/nfs/nfs3proc.c | 2 ++
fs/nfs/nfs3xdr.c | 7 +++++--
fs/nfs/nfs4proc.c | 2 ++
fs/nfs/proc.c | 3 +++
fs/nfs/symlink.c | 3 +++
fs/nfsd/nfs3proc.c | 18 ++++++++++--------
fs/nfsd/nfs4xdr.c | 31 +++++++++++++++++++++++++++----
fs/nfsd/vfs.c | 25 +++++++++++++++++++++++++
fs/nfsd/vfs.h | 2 ++
fs/ntfs3/file.c | 23 +++++++++++++++++++++++
fs/ntfs3/inode.c | 1 +
fs/ntfs3/namei.c | 2 ++
fs/ntfs3/ntfs_fs.h | 1 +
fs/smb/client/cifsfs.c | 20 ++++++++++++++++++++
fs/smb/server/smb2pdu.c | 25 +++++++++++++++++++------
fs/vboxsf/dir.c | 1 +
fs/vboxsf/file.c | 6 ++++--
fs/vboxsf/super.c | 4 ++++
fs/vboxsf/utils.c | 31 +++++++++++++++++++++++++++++++
fs/vboxsf/vfsmod.h | 6 ++++++
fs/xfs/xfs_ioctl.c | 9 ++++++++-
include/linux/fileattr.h | 3 ++-
include/linux/nfs_xdr.h | 2 ++
include/uapi/linux/fs.h | 2 ++
42 files changed, 335 insertions(+), 36 deletions(-)
--
2.52.0
^ permalink raw reply
* Re: [PATCH net-next v2 0/4] net: uapi: Provide an UAPI definition of 'struct sockaddr'
From: Jakub Kicinski @ 2026-01-22 3:27 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
David S. Miller, Simon Horman, Shuah Khan, Matthieu Baerts,
Mat Martineau, Geliang Tang, Mickaël Salaün,
Günther Noack, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Stanislav Fomichev,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Hao Luo, Jiri Olsa, netdev, linux-kernel,
linux-api, Arnd Bergmann, linux-kselftest, mptcp,
linux-security-module, bpf, libc-alpha, Carlos O'Donell,
Adhemerval Zanella, Rich Felker, klibc, Florian Weimer
In-Reply-To: <20260120-uapi-sockaddr-v2-0-63c319111cf6@linutronix.de>
On Tue, 20 Jan 2026 15:10:30 +0100 Thomas Weißschuh wrote:
> Various UAPI headers reference 'struct sockaddr'. Currently the
> definition of this struct is pulled in from the libc header
> sys/socket.h. This is problematic as it introduces a dependency
> on a full userspace toolchain.
>
> Add a definition of 'struct sockaddr' to the UAPI headers.
> Before that, reorder some problematic header inclusions in the selftests.
> include/linux/socket.h | 10 ----------
> include/uapi/linux/if.h | 4 ----
> include/uapi/linux/libc-compat.h | 12 ++++++++++++
> include/uapi/linux/socket.h | 14 ++++++++++++++
> samples/bpf/xdp_adjust_tail_user.c | 6 ++++--
> samples/bpf/xdp_fwd_user.c | 7 ++++---
> samples/bpf/xdp_router_ipv4_user.c | 6 +++---
> samples/bpf/xdp_sample_user.c | 15 ++++++++-------
> samples/bpf/xdp_tx_iptunnel_user.c | 4 ++--
> tools/testing/selftests/landlock/audit.h | 7 ++++---
> tools/testing/selftests/net/af_unix/diag_uid.c | 9 +++++----
> tools/testing/selftests/net/busy_poller.c | 3 ++-
> tools/testing/selftests/net/mptcp/mptcp_diag.c | 11 ++++++-----
> tools/testing/selftests/net/nettest.c | 4 ++--
> tools/testing/selftests/net/tcp_ao/icmps-discard.c | 6 +++---
> tools/testing/selftests/net/tcp_ao/lib/netlink.c | 9 +++++----
> tools/testing/selftests/net/tun.c | 5 +++--
> 17 files changed, 77 insertions(+), 55 deletions(-)
Are all those selftests / samples getting broken by this patch set?
I understand that we should avoid libc dependencies in uAPI but at
least speaking for networking - building selftests without libc is..
not a practical proposition?
^ permalink raw reply
* Re: [PATCH 0/2] mount: add OPEN_TREE_NAMESPACE
From: Rob Landley @ 2026-01-21 19:56 UTC (permalink / raw)
To: Jeff Layton, Andy Lutomirski, Askar Safin
Cc: amir73il, cyphar, jack, josef, linux-fsdevel, viro,
Lennart Poettering, David Howells, Zhang Yunkai, cgel.zte,
Menglong Dong, linux-kernel, initramfs, containers, linux-api,
news, lwn, Jonathan Corbet, emily, Christoph Hellwig
In-Reply-To: <acb859e1684122e1a73f30115f2389d2c9897251.camel@kernel.org>
>>>> You want rootfs to be a NULLFS instead of ramfs. You don't seem to want it to
>>>> actually _be_ a filesystem. Even with your "fix", containers could communicate
>>>> with each _other_ through it if it becomes accessible. If a container can get
>>>> access to an empty initramfs and write into it, it can ask/answer the question
>>>> "Are there any other containers on this machine running stux24" and then coordinate.
Or you could just make the ROOT= codepath remount the empty initramfs -o
ro like some switch_root implementations do. If the PID 1 you launch
isn't in initramfs, don't leave initramfs writeable. That seems unlikely
to break userspace.
(Having permissions to remount initramfs but _not_ having already
"cracked root" seems... a bit funky? You have waaaaay more faith in
security modules than I do...)
>> I think this new OPEN_TREE_NAMESPACE is nifty, but I don't think the
>> path that gives it sensible behavior should be conditional like this.
>> Either make it *always* mount on top of nullfs (regardless of boot
>> options) or find some way to have it actually be the root. I assume
>> the latter is challenging for some reason.
>
> I think that's the plan. I suggested the same to Christian last week,
> and he was amenable to removing the option and just always doing a
> nullfs_rootfs mount.
Since 2013, initramfs might be ramfs or tmpfs depending on
circumstances. Adding a third option for it to be nullfs when there's no
cpio.gz extracted into it seems reasonable. (You can always mount a
tmpfs _over_ it if you need that later, it's writeable so a PID 1
launched in it has workspace.)
That said, if you are changing the semantics, right now we switch_root
from initramfs instead of pivot_root because initramfs couldn't be
unmounted. With this change would pivot_root become the mechanism for
initramfs too? (If the cpio.gz recipient wasn't actually rootfs but was
an overmount the way ROOT= does it.)
Aside: it would be nice if inaccessible mount points could automatically
be garbage collected. There's already some "lazy umount" plumbing that
does that when explicitly requested to, but last I checked there were
cases that didn't get caught. It's been a while though, might already
have been fixed. Presumably initramfs would always get pinned because
it's PID 0's / reference...
Also, could you guys make CONFIG_DEVTMPFS_MOUNT work with initramfs?
I've posted patches for that on and off since 2017, most recent one's
probably
https://landley.net/bin/mkroot/0.8.13/linux-patches/0003-Wire-up-CONFIG_DEVTMPFS_MOUNT-to-initramfs.patch
(tested on a 6.17 kernel).
> We think that older runtimes should still "just work" with this scheme.
> Out of an abundance of caution, we _might_ want a command-line option
> to make it go back to old way, in case we find some userland stuff that
> doesn't like this for some reason, but hopefully we won't even need
> that.
I assume it will break stuff, but I also assume the systems it breaks
will never upgrade to a 7.x kernel because the kernel itself would
consume all available memory before launching PID 1.
Rob
^ permalink raw reply
* Re: [PATCH 0/2] mount: add OPEN_TREE_NAMESPACE
From: Andy Lutomirski @ 2026-01-21 18:00 UTC (permalink / raw)
To: Jeff Layton
Cc: Askar Safin, brauner, amir73il, cyphar, jack, josef,
linux-fsdevel, viro, Lennart Poettering, David Howells,
Yunkai Zhang, cgel.zte, Menglong Dong, linux-kernel, initramfs,
containers, linux-api, news, lwn, Jonathan Corbet, Rob Landley,
emily, Christoph Hellwig
In-Reply-To: <acb859e1684122e1a73f30115f2389d2c9897251.camel@kernel.org>
> On Jan 19, 2026, at 2:21 PM, Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2026-01-19 at 11:05 -0800, Andy Lutomirski wrote:
>>> On Mon, Jan 19, 2026 at 10:56 AM Askar Safin <safinaskar@gmail.com> wrote:
>>>
>>> Christian Brauner <brauner@kernel.org>:
>>>> Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to
>>>> OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of
>>>> returning a file descriptor referring to that mount tree
>>>> OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor
>>>> to a new mount namespace. In that new mount namespace the copied mount
>>>> tree has been mounted on top of a copy of the real rootfs.
>>>
>>> I want to point at security benefits of this.
>>>
>>> [[ TL;DR: [1] and [2] are very big changes to how mount namespaces work.
>>> I like them, and I think they should get wider exposure. ]]
>>>
>>> If this patchset ([1]) and [2] both land (they are both in "next" now and
>>> likely will be submitted to mainline soon) and "nullfs_rootfs" is passed on
>>> command line, then mount namespace created by open_tree(OPEN_TREE_NAMESPACE) will
>>> usually contain exactly 2 mounts: nullfs and whatever was passed to
>>> open_tree(OPEN_TREE_NAMESPACE).
>>>
>>> This means that even if attacker somehow is able to unmount its root and
>>> get access to underlying mounts, then the only underlying thing they will
>>> get is nullfs.
>>>
>>> Also this means that other mounts are not only hidden in new namespace, they
>>> are fully absent. This prevents attacks discussed here: [3], [4].
>>>
>>> Also this means that (assuming we have both [1] and [2] and "nullfs_rootfs"
>>> is passed), there is no anymore hidden writable mount shared by all containers,
>>> potentially available to attackers. This is concern raised in [5]:
>>>
>>>> You want rootfs to be a NULLFS instead of ramfs. You don't seem to want it to
>>>> actually _be_ a filesystem. Even with your "fix", containers could communicate
>>>> with each _other_ through it if it becomes accessible. If a container can get
>>>> access to an empty initramfs and write into it, it can ask/answer the question
>>>> "Are there any other containers on this machine running stux24" and then coordinate.
>>
>> I think this new OPEN_TREE_NAMESPACE is nifty, but I don't think the
>> path that gives it sensible behavior should be conditional like this.
>> Either make it *always* mount on top of nullfs (regardless of boot
>> options) or find some way to have it actually be the root. I assume
>> the latter is challenging for some reason.
>>
>
> I think that's the plan. I suggested the same to Christian last week,
> and he was amenable to removing the option and just always doing a
> nullfs_rootfs mount.
>
> We think that older runtimes should still "just work" with this scheme.
> Out of an abundance of caution, we _might_ want a command-line option
> to make it go back to old way, in case we find some userland stuff that
> doesn't like this for some reason, but hopefully we won't even need
> that.
What I mean is: even if for some reason the kernel is running in a
mode where the *initial* rootfs is a real fs, I think it would be nice
for OPEN_TREE_NAMESPACE to use nullfs.
^ permalink raw reply
* Re: [PATCH 0/2] mount: add OPEN_TREE_NAMESPACE
From: Christian Brauner @ 2026-01-21 10:20 UTC (permalink / raw)
To: Jeff Layton
Cc: Andy Lutomirski, Askar Safin, amir73il, cyphar, jack, josef,
linux-fsdevel, viro, Lennart Poettering, David Howells,
Zhang Yunkai, cgel.zte, Menglong Dong, linux-kernel, initramfs,
containers, linux-api, news, lwn, Jonathan Corbet, Rob Landley,
emily, Christoph Hellwig
In-Reply-To: <acb859e1684122e1a73f30115f2389d2c9897251.camel@kernel.org>
On Mon, Jan 19, 2026 at 05:21:30PM -0500, Jeff Layton wrote:
> On Mon, 2026-01-19 at 11:05 -0800, Andy Lutomirski wrote:
> > On Mon, Jan 19, 2026 at 10:56 AM Askar Safin <safinaskar@gmail.com> wrote:
> > >
> > > Christian Brauner <brauner@kernel.org>:
> > > > Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to
> > > > OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of
> > > > returning a file descriptor referring to that mount tree
> > > > OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor
> > > > to a new mount namespace. In that new mount namespace the copied mount
> > > > tree has been mounted on top of a copy of the real rootfs.
> > >
> > > I want to point at security benefits of this.
> > >
> > > [[ TL;DR: [1] and [2] are very big changes to how mount namespaces work.
> > > I like them, and I think they should get wider exposure. ]]
> > >
> > > If this patchset ([1]) and [2] both land (they are both in "next" now and
> > > likely will be submitted to mainline soon) and "nullfs_rootfs" is passed on
> > > command line, then mount namespace created by open_tree(OPEN_TREE_NAMESPACE) will
> > > usually contain exactly 2 mounts: nullfs and whatever was passed to
> > > open_tree(OPEN_TREE_NAMESPACE).
> > >
> > > This means that even if attacker somehow is able to unmount its root and
> > > get access to underlying mounts, then the only underlying thing they will
> > > get is nullfs.
> > >
> > > Also this means that other mounts are not only hidden in new namespace, they
> > > are fully absent. This prevents attacks discussed here: [3], [4].
> > >
> > > Also this means that (assuming we have both [1] and [2] and "nullfs_rootfs"
> > > is passed), there is no anymore hidden writable mount shared by all containers,
> > > potentially available to attackers. This is concern raised in [5]:
> > >
> > > > You want rootfs to be a NULLFS instead of ramfs. You don't seem to want it to
> > > > actually _be_ a filesystem. Even with your "fix", containers could communicate
> > > > with each _other_ through it if it becomes accessible. If a container can get
> > > > access to an empty initramfs and write into it, it can ask/answer the question
> > > > "Are there any other containers on this machine running stux24" and then coordinate.
> >
> > I think this new OPEN_TREE_NAMESPACE is nifty, but I don't think the
> > path that gives it sensible behavior should be conditional like this.
> > Either make it *always* mount on top of nullfs (regardless of boot
> > options) or find some way to have it actually be the root. I assume
> > the latter is challenging for some reason.
> >
>
> I think that's the plan. I suggested the same to Christian last week,
> and he was amenable to removing the option and just always doing a
> nullfs_rootfs mount.
Whether the underlying mount is nullfs or not is irrelevant. If
it's not nullfs but a regular tmpfs it works just as well. If it has any
locked overmounts the new rootfs will become locked as well; the same
applies if it'll be owned by a new userns.
^ permalink raw reply
* Re: [klibc] [PATCH net-next] net: uapi: Provide an UAPI definition of 'struct sockaddr'
From: H. Peter Anvin @ 2026-01-20 23:20 UTC (permalink / raw)
To: Arnd Bergmann, Thomas Weißschuh, Eric Dumazet,
Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn, Xi Ruoyao,
Carlos O'Donell, Adhemerval Zanella Netto, Rich Felker
Cc: Netdev, linux-kernel, linux-api, klibc
In-Reply-To: <212d2e51-abc1-47bb-8666-755917cad889@app.fastmail.com>
On 2026-01-20 14:31, Arnd Bergmann wrote:
>>
> I must have accidentally cut that from my reply, sorry.
> Looking at it again now, I think I ran into problems with the
> flexible array that was removed from the in-kernel sockaddr
> structure in commit 2b5e9f9b7e41 ("net: Convert struct sockaddr
> to fixed-size "sa_data[14]""), so there is a good chance it works
> now with the (once more) fixed-size version.
>
> The other problem is that the structures that embed 'sockaddr'
> are used a lot inside of the kernel, in particular in 'ifreq',
> so changing the uapi sockaddr to __kernel_sockaddr requires
> additional changes wherever the struct members are passed
> by reference.
>
Well, the kernel should do what opting-in libcs do:
#define sockaddr __kernel_sockaddr
or
struct sockaddr { struct __kernel_sockaddr; };
... now that we have ms extensions turned on. Unfortunately gcc/clang don't
support them without the option even with __extension__, so user space is
limited to using macros.
-hpa
^ permalink raw reply
* Re: [PATCH v6 01/16] fs: Add case sensitivity flags to file_kattr
From: Darrick J. Wong @ 2026-01-20 22:58 UTC (permalink / raw)
To: Chuck Lever
Cc: Christian Brauner, Alexander Viro, Jan Kara, linux-fsdevel,
linux-ext4, linux-xfs, linux-cifs, linux-nfs, linux-f2fs-devel,
OGAWA Hirofumi, Namjae Jeon, Sungjong Seo, Yuezhang Mo,
almaz.alexandrovich, Viacheslav Dubeyko, glaubitz, frank.li,
Theodore Tso, adilger.kernel, Carlos Maiolino, Steve French,
Paulo Alcantara, Ronnie Sahlberg, Shyam Prasad N, Trond Myklebust,
Anna Schumaker, Jaegeuk Kim, Chao Yu, Hans de Goede, senozhatsky,
Chuck Lever, linux-api
In-Reply-To: <38bf1452-8cf8-477b-bcc3-9fe442033bc5@app.fastmail.com>
[uapi change, so add linux-api]
On Tue, Jan 20, 2026 at 04:48:31PM -0500, Chuck Lever wrote:
>
>
> On Tue, Jan 20, 2026, at 12:26 PM, Darrick J. Wong wrote:
> > On Tue, Jan 20, 2026 at 09:24:24AM -0500, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >>
> >> Enable upper layers such as NFSD to retrieve case sensitivity
> >> information from file systems by adding FS_XFLAG_CASEFOLD and
> >> FS_XFLAG_CASENONPRESERVING flags.
> >>
> >> Filesystems report case-insensitive or case-nonpreserving behavior
> >> by setting these flags directly in fa->fsx_xflags. The default
> >> (flags unset) indicates POSIX semantics: case-sensitive and
> >> case-preserving. These flags are read-only; userspace cannot set
> >> them via ioctl.
> >>
> >> Relocate struct file_kattr initialization from fileattr_fill_xflags()
> >> and fileattr_fill_flags() to vfs_fileattr_get() and the ioctl/syscall
> >> call sites. This allows filesystem ->fileattr_get() callbacks to set
> >> flags directly in fa->fsx_xflags before invoking the fill functions,
> >> which previously would have zeroed those values. Callers that bypass
> >> vfs_fileattr_get() must now zero-initialize the struct themselves.
> >>
> >> Case sensitivity information is exported to userspace via the
> >> fa_xflags field in the FS_IOC_FSGETXATTR ioctl and file_getattr()
> >> system call.
> >>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> fs/file_attr.c | 14 ++++++--------
> >> fs/xfs/xfs_ioctl.c | 2 +-
> >> include/linux/fileattr.h | 3 ++-
> >> include/uapi/linux/fs.h | 2 ++
> >
> > This ought to go to linux-api because you're changing the userspace api.
> > Granted it's only adding a flag definition to an existing ioctl, but
> > FS_XFLAG_CASEFOLD /does/ collide with Andrey's fsverity xflag patch...
> >
> > (The rest of the changes looks ok to me.)
>
> Process question for Christian: Do you want to see a v7 of this
> series with Cc: linux-api before proceeding, or are you taking
> both Andrey's and mine and can resolve the conflict, or ... ?
Well, now that I've griped at both patchsets on the same public list,
I'll at least chime in that I'd be satisfied if whoever decides to merge
them fixes up the conflicts when they merge them. :)
--D
>
> > --D
> >
> >> 4 files changed, 11 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/fs/file_attr.c b/fs/file_attr.c
> >> index 13cdb31a3e94..2700200c5b9c 100644
> >> --- a/fs/file_attr.c
> >> +++ b/fs/file_attr.c
> >> @@ -15,12 +15,10 @@
> >> * @fa: fileattr pointer
> >> * @xflags: FS_XFLAG_* flags
> >> *
> >> - * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
> >> - * other fields are zeroed.
> >> + * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).
> >> */
> >> void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
> >> {
> >> - memset(fa, 0, sizeof(*fa));
> >> fa->fsx_valid = true;
> >> fa->fsx_xflags = xflags;
> >> if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
> >> @@ -46,11 +44,9 @@ EXPORT_SYMBOL(fileattr_fill_xflags);
> >> * @flags: FS_*_FL flags
> >> *
> >> * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
> >> - * All other fields are zeroed.
> >> */
> >> void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
> >> {
> >> - memset(fa, 0, sizeof(*fa));
> >> fa->flags_valid = true;
> >> fa->flags = flags;
> >> if (fa->flags & FS_SYNC_FL)
> >> @@ -84,6 +80,8 @@ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
> >> struct inode *inode = d_inode(dentry);
> >> int error;
> >>
> >> + memset(fa, 0, sizeof(*fa));
> >> +
> >> if (!inode->i_op->fileattr_get)
> >> return -ENOIOCTLCMD;
> >>
> >> @@ -323,7 +321,7 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
> >> {
> >> struct mnt_idmap *idmap = file_mnt_idmap(file);
> >> struct dentry *dentry = file->f_path.dentry;
> >> - struct file_kattr fa;
> >> + struct file_kattr fa = {};
> >> unsigned int flags;
> >> int err;
> >>
> >> @@ -355,7 +353,7 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
> >> {
> >> struct mnt_idmap *idmap = file_mnt_idmap(file);
> >> struct dentry *dentry = file->f_path.dentry;
> >> - struct file_kattr fa;
> >> + struct file_kattr fa = {};
> >> int err;
> >>
> >> err = copy_fsxattr_from_user(&fa, argp);
> >> @@ -434,7 +432,7 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
> >> struct filename *name __free(putname) = NULL;
> >> unsigned int lookup_flags = 0;
> >> struct file_attr fattr;
> >> - struct file_kattr fa;
> >> + struct file_kattr fa = {};
> >> int error;
> >>
> >> BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
> >> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> >> index 59eaad774371..f0417c4d1fca 100644
> >> --- a/fs/xfs/xfs_ioctl.c
> >> +++ b/fs/xfs/xfs_ioctl.c
> >> @@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra(
> >> xfs_inode_t *ip,
> >> void __user *arg)
> >> {
> >> - struct file_kattr fa;
> >> + struct file_kattr fa = {};
> >>
> >> xfs_ilock(ip, XFS_ILOCK_SHARED);
> >> xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
> >> diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
> >> index f89dcfad3f8f..709de829659f 100644
> >> --- a/include/linux/fileattr.h
> >> +++ b/include/linux/fileattr.h
> >> @@ -16,7 +16,8 @@
> >>
> >> /* Read-only inode flags */
> >> #define FS_XFLAG_RDONLY_MASK \
> >> - (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR)
> >> + (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | \
> >> + FS_XFLAG_CASEFOLD | FS_XFLAG_CASENONPRESERVING)
> >>
> >> /* Flags to indicate valid value of fsx_ fields */
> >> #define FS_XFLAG_VALUES_MASK \
> >> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> >> index 66ca526cf786..919148beaa8c 100644
> >> --- a/include/uapi/linux/fs.h
> >> +++ b/include/uapi/linux/fs.h
> >> @@ -253,6 +253,8 @@ struct file_attr {
> >> #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
> >> #define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */
> >> #define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */
> >> +#define FS_XFLAG_CASEFOLD 0x00020000 /* case-insensitive lookups */
> >> +#define FS_XFLAG_CASENONPRESERVING 0x00040000 /* case not preserved */
> >> #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
> >>
> >> /* the read-only stuff doesn't really belong here, but any other place is
> >> --
> >> 2.52.0
> >>
> >>
>
> --
> Chuck Lever
>
^ permalink raw reply
* Re: [klibc] [PATCH net-next] net: uapi: Provide an UAPI definition of 'struct sockaddr'
From: Arnd Bergmann @ 2026-01-20 22:31 UTC (permalink / raw)
To: H. Peter Anvin, Thomas Weißschuh, Eric Dumazet,
Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn, Xi Ruoyao,
Carlos O'Donell, Adhemerval Zanella Netto, Rich Felker
Cc: Netdev, linux-kernel, linux-api, klibc
In-Reply-To: <0fc5b4ef-c468-416f-a065-f64989d75378@zytor.com>
On Tue, Jan 20, 2026, at 19:50, H. Peter Anvin wrote:
> On 2026-01-05 05:50, Arnd Bergmann wrote:
>>
>> This looks like the right approach to me. I have previously
>> tried to introduce a 'struct __kernel_sockaddr' structure and
>> use that in uapi headers in place of the libc sockaddr, but
> >> that seemed worse in the end, and introduced the same problems
>> as using the existing __kernel_sockaddr_storage.
>>
>
> You say "the same problems". It's not clear to me what that means.
I must have accidentally cut that from my reply, sorry.
Looking at it again now, I think I ran into problems with the
flexible array that was removed from the in-kernel sockaddr
structure in commit 2b5e9f9b7e41 ("net: Convert struct sockaddr
to fixed-size "sa_data[14]""), so there is a good chance it works
now with the (once more) fixed-size version.
The other problem is that the structures that embed 'sockaddr'
are used a lot inside of the kernel, in particular in 'ifreq',
so changing the uapi sockaddr to __kernel_sockaddr requires
additional changes wherever the struct members are passed
by reference.
> Based on my own libc experience, hacking both a minimal (klibc) and a
> maximal (glibc) libc, there are a *lot* of advantages to having
> __kernel_* definitions available, *regardless* of if they are exported
> into the libc namespace at all.
Absolutely, I am totally in agreement about this in general, which
is why I tried this approach first.
Arnd
^ permalink raw reply
* Re: [PATCH net-next v2 2/4] selftests/landlock: Move some UAPI header inclusions after libc ones
From: Mickaël Salaün @ 2026-01-20 21:46 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
David S. Miller, Jakub Kicinski, Simon Horman, Shuah Khan,
Matthieu Baerts, Mat Martineau, Geliang Tang, Günther Noack,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Stanislav Fomichev, Andrii Nakryiko,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
KP Singh, Hao Luo, Jiri Olsa, netdev, linux-kernel, linux-api,
Arnd Bergmann, linux-kselftest, mptcp, linux-security-module, bpf,
libc-alpha, Carlos O'Donell, Adhemerval Zanella, Rich Felker,
klibc, Florian Weimer
In-Reply-To: <20260120-uapi-sockaddr-v2-2-63c319111cf6@linutronix.de>
On Tue, Jan 20, 2026 at 03:10:32PM +0100, Thomas Weißschuh wrote:
> Interleaving inclusions of UAPI headers and libc headers is problematic.
> Both sets of headers define conflicting symbols. To enable their
> coexistence a compatibility-mechanism is in place.
>
> An upcoming change will define 'struct sockaddr' from linux/socket.h.
> However sys/socket.h from libc does not yet handle this case and a
> symbol conflict will arise.
>
> Move the inclusion of all UAPI headers after the inclusion of the glibc
> ones, so the compatibility mechanism from the UAPI headers is used.
>
> Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Thanks!
> ---
> tools/testing/selftests/landlock/audit.h | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/selftests/landlock/audit.h b/tools/testing/selftests/landlock/audit.h
> index 44eb433e9666..c12b16c690dc 100644
> --- a/tools/testing/selftests/landlock/audit.h
> +++ b/tools/testing/selftests/landlock/audit.h
> @@ -7,9 +7,6 @@
>
> #define _GNU_SOURCE
> #include <errno.h>
> -#include <linux/audit.h>
> -#include <linux/limits.h>
> -#include <linux/netlink.h>
> #include <regex.h>
> #include <stdbool.h>
> #include <stdint.h>
> @@ -20,6 +17,10 @@
> #include <sys/time.h>
> #include <unistd.h>
>
> +#include <linux/audit.h>
> +#include <linux/limits.h>
> +#include <linux/netlink.h>
> +
> #include "kselftest.h"
>
> #ifndef ARRAY_SIZE
>
> --
> 2.52.0
>
>
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Alejandro Colomar @ 2026-01-20 20:42 UTC (permalink / raw)
To: Rich Felker
Cc: Zack Weinberg, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <aW_jz7nucPBjhu0C@devuan>
[-- Attachment #1: Type: text/plain, Size: 9759 bytes --]
On Tue, Jan 20, 2026 at 09:35:43PM +0100, Alejandro Colomar wrote:
> Hi Rich, Zack,
>
> On Tue, Jan 20, 2026 at 12:46:59PM -0500, Rich Felker wrote:
> > On Tue, Jan 20, 2026 at 12:05:52PM -0500, Zack Weinberg wrote:
> > > > On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
>
> [...]
>
> > > Now, the abstract correct behavior is secondary to the fact that we
> > > know there are both systems where close should not be retried after
> > > EINTR (Linux) and systems where the fd is still open after EINTR
> > > (HP-UX). But it is my position that *portable code* should assume the
> > > Linux behavior, because that is the safest option. If you assume the
> > > HP-UX behavior on a machine that implements the Linux behavior, you
> > > might close some unrelated file out from under yourself (probably but
> > > not necessarily a different thread). If you assume the Linux behavior
> > > on a machine that implements the HP-UX behavior, you have leaked a
> > > file descriptor; the worst things that can do are much less severe.
> >
> > Unfortunately, regardless of what happens, code portable to old
> > systems needs to avoid getting in the situation to begin with. By
> > either not installing interrupting signal handlers or blocking EINTR
> > around close.
>
> [...]
>
> > > > While I agree with all of this, I think the tone is way too
> > > > proscriptive. The man pages are to document the behaviors, not tell
> > > > people how to program.
> > >
> > > I could be persuaded to tone it down a little but in this case I think
> > > the man page's job *is* to tell people how to program. We know lots of
> > > existing code has gotten the fine details of close() wrong and we are
> > > trying to document how to do it right.
> >
> > No, the job of the man pages absolutely is not "to tell people how to
> > program". It's to document behaviors. They are not a programming
> > tutorial. They are not polemic diatribes. They are unbiased statements
> > of facts. Facts of what the standards say and what implementations do,
> > that equip programmers with the knowledge they need to make their own
> > informed decisions, rather than blindly following what someone who
> > thinks they know better told them to do.
>
> This reminds me a little bit of the realloc(p,0) fiasco of C89 and
> glibc.
>
> In most cases, I agree with you that manual pages are and should be
> aseptic, but there are cases where I think the manual page needs to be
> tutorial. Especially when there's such a mess, we need to both explain
> all the possible behaviors (or at least mention them to some degree).
... and guide programmers about how to best use the API.
I forgot to finish the sentence.
>
> But for example, there's the case of realloc(p,0), where we have
> a fiasco that was pushed by a compoundment of wrong decisions by the
> C Committee, and prior to that from System V. We're a bit lucky that
> C17 accidentally broke it so badly that we now have it as UB, and that
> gives us the opportunity to fix it now (which BTW might also be the case
> for close(2)).
>
> In the case of realloc(3), I went and documented in the manual page that
> glibc is broken, and that ISO C is also broken.
>
> STANDARDS
> malloc()
> free()
> calloc()
> realloc()
> C23, POSIX.1‐2024.
>
> reallocarray()
> POSIX.1‐2024.
>
> realloc(p, 0)
> The behavior of realloc(p, 0) in glibc doesn’t conform to
> any of C99, C11, POSIX.1‐2001, POSIX.1‐2004, POSIX.1‐2008,
> POSIX.1‐2013, POSIX.1‐2017, or POSIX.1‐2024. The C17
> specification was changed to make it conforming, but that
> specification made it impossible to write code that reli‐
> ably determines if the input pointer is freed after real‐
> loc(p, 0), and C23 changed it again to make this undefined
> behavior, acknowledging that the C17 specification was
> broad enough, so that undefined behavior wasn’t worse than
> that.
>
> reallocarray() suffers the same issues in glibc.
>
> musl libc and the BSDs conform to all versions of ISO C
> and POSIX.1.
>
> gnulib provides the realloc‐posix module, which provides
> wrappers realloc() and reallocarray() that conform to all
> versions of ISO C and POSIX.1.
>
> There’s a proposal to standardize the BSD behavior: https:
> //www.open-std.org/jtc1/sc22/wg14/www/docs/n3621.txt.
>
> HISTORY
> malloc()
> free()
> calloc()
> realloc()
> POSIX.1‐2001, C89.
>
> reallocarray()
> glibc 2.26. OpenBSD 5.6, FreeBSD 11.0.
>
> malloc() and related functions rejected sizes greater than
> PTRDIFF_MAX starting in glibc 2.30.
>
> free() preserved errno starting in glibc 2.33.
>
> realloc(p, 0)
> C89 was ambiguous in its specification of realloc(p, 0).
> C99 partially fixed this.
>
> The original implementation in glibc would have been con‐
> forming to C99. However, and ironically, trying to comply
> with C99 before the standard was released, glibc changed
> its behavior in glibc 2.1.1 into something that ended up
> not conforming to the final C99 specification (but this is
> debated, as the wording of the standard seems self‐contra‐
> dicting).
>
> ...
>
> BUGS
> Programmers would naturally expect by induction that
> realloc(p, size) is consistent with free(p) and mal‐
> loc(size), as that is the behavior in the general case.
> This is not explicitly required by POSIX.1‐2024 or C11,
> but all conforming implementations are consistent with
> that.
>
> The glibc implementation of realloc() is not consistent
> with that, and as a consequence, it is dangerous to call
> realloc(p, 0) in glibc.
>
> A trivial workaround for glibc is calling it as
> realloc(p, size?size:1).
>
> The workaround for reallocarray() in glibc ——which shares
> the same bug—— would be
> reallocarray(p, n?n:1, size?size:1).
>
>
> Apart from documenting that glibc and ISO C are broken, we document how
> to best deal with it (see the last paragraph in BUGS). This is
> necessary because I fear that just by documenting the different
> behaviors, programmers would still not know what to do with that.
> Just take into account that even several members of the committee don't
> know how to deal with it.
>
> I'd be willing to have something similar for close(2).
>
>
> Have a lovely night!
> Alex
>
> P.S.: I have great news about realloc(p,0)! Microsoft is on-board with
> the change. They told me they like the proposal, and are willing to
> fix their realloc(3) implementation. They'll now conduct tests to make
> sure it doesn't break anything too badly, and will come back to me with
> any feedback they have from those tests.
>
> I'll put the standards proposal for realloc(3) on hold, waiting for
> Microsoft's feedback.
>
> > > > Aside: the reason EINTR *has to* be specified this way is that pthread
> > > > cancellation is aligned with EINTR. If EINTR were defined to have
> > > > closed the fd, then acting on cancellation during close would also
> > > > have closed the fd, but the cancellation handler would have no way to
> > > > distinguish this, leading to a situation where you're forced to either
> > > > leak fds or introduce a double-close vuln.
> > >
> > > The correct way to address this would be to make close() not be a
> > > cancellation point.
> >
> > This would also be a desirable change, one I would support if other
> > implementors are on-board with pushing for it.
> >
> > > > An outline of what I'd like to see instead:
> > > >
> > > > - Clear explanation of why double-close is a serious bug that must
> > > > always be avoided. (I think we all agree on this.)
> > > >
> > > > - Statement that the historical Linux/glibc behavior and current POSIX
> > > > requirement differ, without language that tries to paint the POSIX
> > > > behavior as a HP-UX bug/quirk. Possibly citing real sources/history
> > > > of the issue (Austin Group tracker items 529, 614; maybe others).
> > > >
> > > > - Consequence of just assuming the Linux behavior (fd leaks on
> > > > conforming systems).
> > > >
> > > > - Consequences of assuming the POSIX behavior (double-close vulns on
> > > > GNU/Linux, maybe others).
> > > >
> > > > - Survey of methods for avoiding the problem (ways to preclude EINTR,
> > > > possibly ways to infer behavior, etc).
> > >
> > > This outline seems more or less reasonable to me but, if it's me
> > > writing the text, I _will_ characterize what POSIX currently says
> > > about EINTR returns from close() as a bug in POSIX. As far as I'm
> > > concerned, that is a fact, not polemic.
> > >
> > > I have found that arguing with you in particular, Rich, is generally
> > > not worth the effort. Therefore, unless you reply and _accept_ that
> > > the final version of the close manpage will say that POSIX is buggy,
> > > I am not going to write another version of this text, nor will I be
> > > drawn into further debate.
> >
> > I will not accept that because it's a gross violation of the
> > responsibility of document writing.
> >
> > Rich
>
> --
> <https://www.alejandro-colomar.es>
--
<https://www.alejandro-colomar.es>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Alejandro Colomar @ 2026-01-20 20:35 UTC (permalink / raw)
To: Rich Felker
Cc: Zack Weinberg, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <20260120174659.GE6263@brightrain.aerifal.cx>
[-- Attachment #1: Type: text/plain, Size: 9114 bytes --]
Hi Rich, Zack,
On Tue, Jan 20, 2026 at 12:46:59PM -0500, Rich Felker wrote:
> On Tue, Jan 20, 2026 at 12:05:52PM -0500, Zack Weinberg wrote:
> > > On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
[...]
> > Now, the abstract correct behavior is secondary to the fact that we
> > know there are both systems where close should not be retried after
> > EINTR (Linux) and systems where the fd is still open after EINTR
> > (HP-UX). But it is my position that *portable code* should assume the
> > Linux behavior, because that is the safest option. If you assume the
> > HP-UX behavior on a machine that implements the Linux behavior, you
> > might close some unrelated file out from under yourself (probably but
> > not necessarily a different thread). If you assume the Linux behavior
> > on a machine that implements the HP-UX behavior, you have leaked a
> > file descriptor; the worst things that can do are much less severe.
>
> Unfortunately, regardless of what happens, code portable to old
> systems needs to avoid getting in the situation to begin with. By
> either not installing interrupting signal handlers or blocking EINTR
> around close.
[...]
> > > While I agree with all of this, I think the tone is way too
> > > proscriptive. The man pages are to document the behaviors, not tell
> > > people how to program.
> >
> > I could be persuaded to tone it down a little but in this case I think
> > the man page's job *is* to tell people how to program. We know lots of
> > existing code has gotten the fine details of close() wrong and we are
> > trying to document how to do it right.
>
> No, the job of the man pages absolutely is not "to tell people how to
> program". It's to document behaviors. They are not a programming
> tutorial. They are not polemic diatribes. They are unbiased statements
> of facts. Facts of what the standards say and what implementations do,
> that equip programmers with the knowledge they need to make their own
> informed decisions, rather than blindly following what someone who
> thinks they know better told them to do.
This reminds me a little bit of the realloc(p,0) fiasco of C89 and
glibc.
In most cases, I agree with you that manual pages are and should be
aseptic, but there are cases where I think the manual page needs to be
tutorial. Especially when there's such a mess, we need to both explain
all the possible behaviors (or at least mention them to some degree).
But for example, there's the case of realloc(p,0), where we have
a fiasco that was pushed by a compoundment of wrong decisions by the
C Committee, and prior to that from System V. We're a bit lucky that
C17 accidentally broke it so badly that we now have it as UB, and that
gives us the opportunity to fix it now (which BTW might also be the case
for close(2)).
In the case of realloc(3), I went and documented in the manual page that
glibc is broken, and that ISO C is also broken.
STANDARDS
malloc()
free()
calloc()
realloc()
C23, POSIX.1‐2024.
reallocarray()
POSIX.1‐2024.
realloc(p, 0)
The behavior of realloc(p, 0) in glibc doesn’t conform to
any of C99, C11, POSIX.1‐2001, POSIX.1‐2004, POSIX.1‐2008,
POSIX.1‐2013, POSIX.1‐2017, or POSIX.1‐2024. The C17
specification was changed to make it conforming, but that
specification made it impossible to write code that reli‐
ably determines if the input pointer is freed after real‐
loc(p, 0), and C23 changed it again to make this undefined
behavior, acknowledging that the C17 specification was
broad enough, so that undefined behavior wasn’t worse than
that.
reallocarray() suffers the same issues in glibc.
musl libc and the BSDs conform to all versions of ISO C
and POSIX.1.
gnulib provides the realloc‐posix module, which provides
wrappers realloc() and reallocarray() that conform to all
versions of ISO C and POSIX.1.
There’s a proposal to standardize the BSD behavior: https:
//www.open-std.org/jtc1/sc22/wg14/www/docs/n3621.txt.
HISTORY
malloc()
free()
calloc()
realloc()
POSIX.1‐2001, C89.
reallocarray()
glibc 2.26. OpenBSD 5.6, FreeBSD 11.0.
malloc() and related functions rejected sizes greater than
PTRDIFF_MAX starting in glibc 2.30.
free() preserved errno starting in glibc 2.33.
realloc(p, 0)
C89 was ambiguous in its specification of realloc(p, 0).
C99 partially fixed this.
The original implementation in glibc would have been con‐
forming to C99. However, and ironically, trying to comply
with C99 before the standard was released, glibc changed
its behavior in glibc 2.1.1 into something that ended up
not conforming to the final C99 specification (but this is
debated, as the wording of the standard seems self‐contra‐
dicting).
...
BUGS
Programmers would naturally expect by induction that
realloc(p, size) is consistent with free(p) and mal‐
loc(size), as that is the behavior in the general case.
This is not explicitly required by POSIX.1‐2024 or C11,
but all conforming implementations are consistent with
that.
The glibc implementation of realloc() is not consistent
with that, and as a consequence, it is dangerous to call
realloc(p, 0) in glibc.
A trivial workaround for glibc is calling it as
realloc(p, size?size:1).
The workaround for reallocarray() in glibc ——which shares
the same bug—— would be
reallocarray(p, n?n:1, size?size:1).
Apart from documenting that glibc and ISO C are broken, we document how
to best deal with it (see the last paragraph in BUGS). This is
necessary because I fear that just by documenting the different
behaviors, programmers would still not know what to do with that.
Just take into account that even several members of the committee don't
know how to deal with it.
I'd be willing to have something similar for close(2).
Have a lovely night!
Alex
P.S.: I have great news about realloc(p,0)! Microsoft is on-board with
the change. They told me they like the proposal, and are willing to
fix their realloc(3) implementation. They'll now conduct tests to make
sure it doesn't break anything too badly, and will come back to me with
any feedback they have from those tests.
I'll put the standards proposal for realloc(3) on hold, waiting for
Microsoft's feedback.
> > > Aside: the reason EINTR *has to* be specified this way is that pthread
> > > cancellation is aligned with EINTR. If EINTR were defined to have
> > > closed the fd, then acting on cancellation during close would also
> > > have closed the fd, but the cancellation handler would have no way to
> > > distinguish this, leading to a situation where you're forced to either
> > > leak fds or introduce a double-close vuln.
> >
> > The correct way to address this would be to make close() not be a
> > cancellation point.
>
> This would also be a desirable change, one I would support if other
> implementors are on-board with pushing for it.
>
> > > An outline of what I'd like to see instead:
> > >
> > > - Clear explanation of why double-close is a serious bug that must
> > > always be avoided. (I think we all agree on this.)
> > >
> > > - Statement that the historical Linux/glibc behavior and current POSIX
> > > requirement differ, without language that tries to paint the POSIX
> > > behavior as a HP-UX bug/quirk. Possibly citing real sources/history
> > > of the issue (Austin Group tracker items 529, 614; maybe others).
> > >
> > > - Consequence of just assuming the Linux behavior (fd leaks on
> > > conforming systems).
> > >
> > > - Consequences of assuming the POSIX behavior (double-close vulns on
> > > GNU/Linux, maybe others).
> > >
> > > - Survey of methods for avoiding the problem (ways to preclude EINTR,
> > > possibly ways to infer behavior, etc).
> >
> > This outline seems more or less reasonable to me but, if it's me
> > writing the text, I _will_ characterize what POSIX currently says
> > about EINTR returns from close() as a bug in POSIX. As far as I'm
> > concerned, that is a fact, not polemic.
> >
> > I have found that arguing with you in particular, Rich, is generally
> > not worth the effort. Therefore, unless you reply and _accept_ that
> > the final version of the close manpage will say that POSIX is buggy,
> > I am not going to write another version of this text, nor will I be
> > drawn into further debate.
>
> I will not accept that because it's a gross violation of the
> responsibility of document writing.
>
> Rich
--
<https://www.alejandro-colomar.es>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Paul Eggert @ 2026-01-20 20:11 UTC (permalink / raw)
To: Rich Felker
Cc: Alejandro Colomar, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development,
Zack Weinberg
In-Reply-To: <20260120174659.GE6263@brightrain.aerifal.cx>
On 2026-01-20 09:46, Rich Felker wrote:
> the job of the man pages absolutely is not "to tell people how to
> program". It's to document behaviors.
In practice man pages do both. When I type "man close" on GNU/Linux I
see text like the text quoted below, and as a C programmer I appreciate
getting advice like this when the situation is sufficiently tricky.
----
Any record locks (see fcntl(2)) held on the file it was associated with,
and owned by the process, are removed regardless of the file descriptor
that was used to obtain the lock. This has some unfortunate consequences
and one should be extra careful when using advisory record locking. See
fcntl(2) for discussion of the risks and consequences as well as for the
(probably preferred) open file description locks.
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Florian Weimer @ 2026-01-20 20:05 UTC (permalink / raw)
To: Rich Felker
Cc: Zack Weinberg, Alejandro Colomar, Vincent Lefevre, Jan Kara,
Alexander Viro, Christian Brauner, linux-fsdevel, linux-api,
GNU libc development
In-Reply-To: <20260120190010.GF6263@brightrain.aerifal.cx>
* Rich Felker:
> On Tue, Jan 20, 2026 at 07:39:48PM +0100, Florian Weimer wrote:
>> * Rich Felker:
>>
>> > On Tue, Jan 20, 2026 at 12:05:52PM -0500, Zack Weinberg wrote:
>> >> > On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
>> >> >> close() always succeeds. That is, after it returns, _fd_ has
>> >> >> always been disconnected from the open file it formerly referred
>> >> >> to, and its number can be recycled to refer to some other file.
>> >> >> Furthermore, if _fd_ was the last reference to the underlying
>> >> >> open file description, the resources associated with the open file
>> >> >> description will always have been scheduled to be released.
>> >> ...
>> >> >> EINPROGRESS
>> >> >> EINTR
>> >> >> There are no delayed errors to report, but the kernel is
>> >> >> still doing some clean-up work in the background. This
>> >> >> situation should be treated the same as if close() had
>> >> >> returned zero. Do not retry the close(), and do not report
>> >> >> an error to the user.
>> >> >
>> >> > Since this behavior for EINTR is non-conforming (and even prior to the
>> >> > POSIX 2024 update, it was contrary to the general semantics for EINTR,
>> >> > that no non-ignoreable side-effects have taken place), it should be
>> >> > noted that it's Linux/glibc-specific.
>> >>
>> >> I am prepared to take your word for it that POSIX says this is
>> >> non-conforming, but in that case, POSIX is wrong, and I will not be
>> >> convinced otherwise by any argument. Operations that release a
>> >> resource must always succeed.
>> >
>> > There are two conflicting requirements here:
>> >
>> > 1. Operations that release a resource must always succeed.
>> > 2. Failure with EINTR must not have side effects.
>> >
>> > The right conclusion is that operations that release resources must
>> > not be able to fail with EINTR. And that's how POSIX should have
>> > resolved the situation -- by getting rid of support for the silly
>> > legacy synchronous-tape-drive-rewinding behavior of close on some
>> > systems, and requiring close to succeed immediately with no waiting
>> > for anything.
>>
>> What about SO_LINGER? Isn't this relevant in context?
>
> shutdown should be used for this, not close. So that the acts of
> waiting for the operation to finish, and releasing the resource handle
> needed to observe if it's finished, are separate.
I think shutdown on TCP sockets is non-blocking under Linux. It doesn't
wait until the peer has acknowledged the FIN segment, as far as I
understand it. Other systems may behave differently.
>> As far as I know, there is no other way besides SO_LINGER to get
>> notification if the packet buffers are actually gone. If you don't use
>> it, memory can pile up in the kernel without the application's
>> knowledge.
>
> The way Linux's EINTR behaves, using close can't ensure this memory
> doesn't pile up, because on EINTR you lose the ability to wait for it.
Can't the application reliably avoid EINTR by blocking signals?
Thanks,
Florian
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Al Viro @ 2026-01-20 19:17 UTC (permalink / raw)
To: Rich Felker
Cc: Zack Weinberg, Alejandro Colomar, Vincent Lefevre, Jan Kara,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <20260120163634.GD6263@brightrain.aerifal.cx>
On Tue, Jan 20, 2026 at 11:36:34AM -0500, Rich Felker wrote:
> On Tue, Jan 20, 2026 at 11:15:15AM -0500, Zack Weinberg wrote:
> > Rich and I have an irreconcilable disagreement on what the semantics of close
> > _should_ be. I'm not going to do any more work on this until/unless he
> > changes his mind.
>
> It's been way too long since I read this thread to recall what our
> point of disagreement is or what point glibc might be at in
> reconciling the Linux kernel disagreement with POSIX.
It's not so much disagreement as breakage of internal POSIX decision
process that has led to POSIX irrelevance in this particular area.
POSIX authority derives from the agreement of actual behaviour of
Unices. Always has been; witness the number of underspecified
areas where various vendor implementations had different semantics,
for exactly that reason.
They (or anybody else, really) can argue that such-and-such behaviour
ought to change. In quite a few cases that has succeeded. What they
can't do is to force such change by fiat. Especially not when Linux
and *BSD happen to agree on behaviour that differs from what they
wish it to be.
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Rich Felker @ 2026-01-20 19:00 UTC (permalink / raw)
To: Florian Weimer
Cc: Zack Weinberg, Alejandro Colomar, Vincent Lefevre, Jan Kara,
Alexander Viro, Christian Brauner, linux-fsdevel, linux-api,
GNU libc development
In-Reply-To: <lhubjio5dsb.fsf@oldenburg.str.redhat.com>
On Tue, Jan 20, 2026 at 07:39:48PM +0100, Florian Weimer wrote:
> * Rich Felker:
>
> > On Tue, Jan 20, 2026 at 12:05:52PM -0500, Zack Weinberg wrote:
> >> > On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
> >> >> close() always succeeds. That is, after it returns, _fd_ has
> >> >> always been disconnected from the open file it formerly referred
> >> >> to, and its number can be recycled to refer to some other file.
> >> >> Furthermore, if _fd_ was the last reference to the underlying
> >> >> open file description, the resources associated with the open file
> >> >> description will always have been scheduled to be released.
> >> ...
> >> >> EINPROGRESS
> >> >> EINTR
> >> >> There are no delayed errors to report, but the kernel is
> >> >> still doing some clean-up work in the background. This
> >> >> situation should be treated the same as if close() had
> >> >> returned zero. Do not retry the close(), and do not report
> >> >> an error to the user.
> >> >
> >> > Since this behavior for EINTR is non-conforming (and even prior to the
> >> > POSIX 2024 update, it was contrary to the general semantics for EINTR,
> >> > that no non-ignoreable side-effects have taken place), it should be
> >> > noted that it's Linux/glibc-specific.
> >>
> >> I am prepared to take your word for it that POSIX says this is
> >> non-conforming, but in that case, POSIX is wrong, and I will not be
> >> convinced otherwise by any argument. Operations that release a
> >> resource must always succeed.
> >
> > There are two conflicting requirements here:
> >
> > 1. Operations that release a resource must always succeed.
> > 2. Failure with EINTR must not have side effects.
> >
> > The right conclusion is that operations that release resources must
> > not be able to fail with EINTR. And that's how POSIX should have
> > resolved the situation -- by getting rid of support for the silly
> > legacy synchronous-tape-drive-rewinding behavior of close on some
> > systems, and requiring close to succeed immediately with no waiting
> > for anything.
>
> What about SO_LINGER? Isn't this relevant in context?
shutdown should be used for this, not close. So that the acts of
waiting for the operation to finish, and releasing the resource handle
needed to observe if it's finished, are separate.
> As far as I know, there is no other way besides SO_LINGER to get
> notification if the packet buffers are actually gone. If you don't use
> it, memory can pile up in the kernel without the application's
> knowledge.
The way Linux's EINTR behaves, using close can't ensure this memory
doesn't pile up, because on EINTR you lose the ability to wait for it.
Rich
^ permalink raw reply
* Re: [klibc] [PATCH net-next] net: uapi: Provide an UAPI definition of 'struct sockaddr'
From: H. Peter Anvin @ 2026-01-20 18:50 UTC (permalink / raw)
To: Arnd Bergmann, Thomas Weißschuh, Eric Dumazet,
Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn, libc-alpha,
Carlos O'Donell, Adhemerval Zanella, Rich Felker
Cc: Netdev, linux-kernel, linux-api, klibc
In-Reply-To: <934ca004-0aef-49a4-a4f1-3d39a2e71864@app.fastmail.com>
On 2026-01-05 05:50, Arnd Bergmann wrote:
>
> This looks like the right approach to me. I have previously
> tried to introduce a 'struct __kernel_sockaddr' structure and
> use that in uapi headers in place of the libc sockaddr, but
> that seemed worse in the end, and introduced the same problems
> as using the existing __kernel_sockaddr_storage.
>
You say "the same problems". It's not clear to me what that means.
Based on my own libc experience, hacking both a minimal (klibc) and a
maximal (glibc) libc, there are a *lot* of advantages to having
__kernel_* definitions available, *regardless* of if they are exported
into the libc namespace at all.
Specifically:
1. When calling explicit kernel interfaces, like ioctl(), it is the
kernel interfaces, not the libc interfaces, that need to be
used. However, the rest of the application may need to include the
libc headers.
2. The libc *implementation* may need to have both the kernel and the
libc interfaces available.
In the case of struct sockaddr et al, it probably matters less,
because it isn't practical for them to be different for other
reasons, but it has been a real problem for things like termios.
On the flipside, for things where the kernel interfaces are inherently
necessary, we really want the libc authors to be able to use the uapi
headers. However, they have to be concerned about things like
namespace restrictions.
So I have a very, very strong vote for using and even expanding the
use of __kernel_* in the uapi headers. In fact, it would even be nice
to have a variant of "make headers_install" that automatically
transmogrifies symbols; if it isn't doable with simple pattern
matching then perhaps using coccinelle.
Allowing the libc authors to modify those transmogrification rules
would definitely be better than having various kinds of header guards.
-hpa
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Florian Weimer @ 2026-01-20 18:39 UTC (permalink / raw)
To: Rich Felker
Cc: Zack Weinberg, Alejandro Colomar, Vincent Lefevre, Jan Kara,
Alexander Viro, Christian Brauner, linux-fsdevel, linux-api,
GNU libc development
In-Reply-To: <20260120174659.GE6263@brightrain.aerifal.cx>
* Rich Felker:
> On Tue, Jan 20, 2026 at 12:05:52PM -0500, Zack Weinberg wrote:
>> > On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
>> >> close() always succeeds. That is, after it returns, _fd_ has
>> >> always been disconnected from the open file it formerly referred
>> >> to, and its number can be recycled to refer to some other file.
>> >> Furthermore, if _fd_ was the last reference to the underlying
>> >> open file description, the resources associated with the open file
>> >> description will always have been scheduled to be released.
>> ...
>> >> EINPROGRESS
>> >> EINTR
>> >> There are no delayed errors to report, but the kernel is
>> >> still doing some clean-up work in the background. This
>> >> situation should be treated the same as if close() had
>> >> returned zero. Do not retry the close(), and do not report
>> >> an error to the user.
>> >
>> > Since this behavior for EINTR is non-conforming (and even prior to the
>> > POSIX 2024 update, it was contrary to the general semantics for EINTR,
>> > that no non-ignorable side-effects have taken place), it should be
>> > noted that it's Linux/glibc-specific.
>>
>> I am prepared to take your word for it that POSIX says this is
>> non-conforming, but in that case, POSIX is wrong, and I will not be
>> convinced otherwise by any argument. Operations that release a
>> resource must always succeed.
>
> There are two conflicting requirements here:
>
> 1. Operations that release a resource must always succeed.
> 2. Failure with EINTR must not have side effects.
>
> The right conclusion is that operations that release resources must
> not be able to fail with EINTR. And that's how POSIX should have
> resolved the situation -- by getting rid of support for the silly
> legacy synchronous-tape-drive-rewinding behavior of close on some
> systems, and requiring close to succeed immediately with no waiting
> for anything.
What about SO_LINGER? Isn't this relevant in context?
As far as I know, there is no other way besides SO_LINGER to get
notification if the packet buffers are actually gone. If you don't use
it, memory can pile up in the kernel without the application's
knowledge.
Thanks,
Florian
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Rich Felker @ 2026-01-20 17:46 UTC (permalink / raw)
To: Zack Weinberg
Cc: Alejandro Colomar, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <1571b14d-1077-4e81-ab97-36e39099761e@app.fastmail.com>
On Tue, Jan 20, 2026 at 12:05:52PM -0500, Zack Weinberg wrote:
> > On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
> >> close() always succeeds. That is, after it returns, _fd_ has
> >> always been disconnected from the open file it formerly referred
> >> to, and its number can be recycled to refer to some other file.
> >> Furthermore, if _fd_ was the last reference to the underlying
> >> open file description, the resources associated with the open file
> >> description will always have been scheduled to be released.
> ...
> >> EINPROGRESS
> >> EINTR
> >> There are no delayed errors to report, but the kernel is
> >> still doing some clean-up work in the background. This
> >> situation should be treated the same as if close() had
> >> returned zero. Do not retry the close(), and do not report
> >> an error to the user.
> >
> > Since this behavior for EINTR is non-conforming (and even prior to the
> > POSIX 2024 update, it was contrary to the general semantics for EINTR,
> > that no non-ignorable side-effects have taken place), it should be
> > noted that it's Linux/glibc-specific.
>
> I am prepared to take your word for it that POSIX says this is
> non-conforming, but in that case, POSIX is wrong, and I will not be
> convinced otherwise by any argument. Operations that release a
> resource must always succeed.
There are two conflicting requirements here:
1. Operations that release a resource must always succeed.
2. Failure with EINTR must not have side effects.
The right conclusion is that operations that release resources must
not be able to fail with EINTR. And that's how POSIX should have
resolved the situation -- by getting rid of support for the silly
legacy synchronous-tape-drive-rewinding behavior of close on some
systems, and requiring close to succeed immediately with no waiting
for anything. But abandoning requirement 2 is not an option,
especially in light of the relationship between EINTR and thread
cancellation with regard to the contract about side effects.
It's perfectly reasonable for implementations (as musl does, and as I
think glibc either does or intends to do) to just go all the way and
satisfy both 1 and 2 by having close translate the kernel EINTR into
0.
> Now, the abstract correct behavior is secondary to the fact that we
> know there are both systems where close should not be retried after
> EINTR (Linux) and systems where the fd is still open after EINTR
> (HP-UX). But it is my position that *portable code* should assume the
> Linux behavior, because that is the safest option. If you assume the
> HP-UX behavior on a machine that implements the Linux behavior, you
> might close some unrelated file out from under yourself (probably but
> not necessarily a different thread). If you assume the Linux behavior
> on a machine that implements the HP-UX behavior, you have leaked a
> file descriptor; the worst things that can do are much less severe.
Unfortunately, regardless of what happens, code portable to old
systems needs to avoid getting in the situation to begin with. By
either not installing interrupting signal handlers or blocking signals
around close.
> The only way to get it right all the time is to have a big long list
> of #ifdefs for every Unix under the sun, and we don't even have the
> data we would need to write that list.
>
> > While I agree with all of this, I think the tone is way too
> > proscriptive. The man pages are to document the behaviors, not tell
> > people how to program.
>
> I could be persuaded to tone it down a little but in this case I think
> the man page's job *is* to tell people how to program. We know lots of
> existing code has gotten the fine details of close() wrong and we are
> trying to document how to do it right.
No, the job of the man pages absolutely is not "to tell people how to
program". It's to document behaviors. They are not a programming
tutorial. They are not polemic diatribes. They are unbiased statements
of facts. Facts of what the standards say and what implementations do,
that equip programmers with the knowledge they need to make their own
informed decisions, rather than blindly following what someone who
thinks they know better told them to do.
> > Aside: the reason EINTR *has to* be specified this way is that pthread
> > cancellation is aligned with EINTR. If EINTR were defined to have
> > closed the fd, then acting on cancellation during close would also
> > have closed the fd, but the cancellation handler would have no way to
> > distinguish this, leading to a situation where you're forced to either
> > leak fds or introduce a double-close vuln.
>
> The correct way to address this would be to make close() not be a
> cancellation point.
This would also be a desirable change, one I would support if other
implementors are on-board with pushing for it.
> > An outline of what I'd like to see instead:
> >
> > - Clear explanation of why double-close is a serious bug that must
> > always be avoided. (I think we all agree on this.)
> >
> > - Statement that the historical Linux/glibc behavior and current POSIX
> > requirement differ, without language that tries to paint the POSIX
> > behavior as a HP-UX bug/quirk. Possibly citing real sources/history
> > of the issue (Austin Group tracker items 529, 614; maybe others).
> >
> > - Consequence of just assuming the Linux behavior (fd leaks on
> > conforming systems).
> >
> > - Consequences of assuming the POSIX behavior (double-close vulns on
> > GNU/Linux, maybe others).
> >
> > - Survey of methods for avoiding the problem (ways to preclude EINTR,
> > possibly ways to infer behavior, etc).
>
> This outline seems more or less reasonable to me but, if it's me
> writing the text, I _will_ characterize what POSIX currently says
> about EINTR returns from close() as a bug in POSIX. As far as I'm
> concerned, that is a fact, not polemic.
>
> I have found that arguing with you in particular, Rich, is generally
> not worth the effort. Therefore, unless you reply and _accept_ that
> the final version of the close manpage will say that POSIX is buggy,
> I am not going to write another version of this text, nor will I be
> drawn into further debate.
I will not accept that because it's a gross violation of the
responsibility of document writing.
Rich
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Zack Weinberg @ 2026-01-20 17:05 UTC (permalink / raw)
To: Rich Felker
Cc: Alejandro Colomar, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <20250524022416.GB6263@brightrain.aerifal.cx>
> On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
>> close() always succeeds. That is, after it returns, _fd_ has
>> always been disconnected from the open file it formerly referred
>> to, and its number can be recycled to refer to some other file.
>> Furthermore, if _fd_ was the last reference to the underlying
>> open file description, the resources associated with the open file
>> description will always have been scheduled to be released.
...
>> EINPROGRESS
>> EINTR
>> There are no delayed errors to report, but the kernel is
>> still doing some clean-up work in the background. This
>> situation should be treated the same as if close() had
>> returned zero. Do not retry the close(), and do not report
>> an error to the user.
>
> Since this behavior for EINTR is non-conforming (and even prior to the
> POSIX 2024 update, it was contrary to the general semantics for EINTR,
> that no non-ignorable side-effects have taken place), it should be
> noted that it's Linux/glibc-specific.
I am prepared to take your word for it that POSIX says this is
non-conforming, but in that case, POSIX is wrong, and I will not be
convinced otherwise by any argument. Operations that release a
resource must always succeed.
Now, the abstract correct behavior is secondary to the fact that we
know there are both systems where close should not be retried after
EINTR (Linux) and systems where the fd is still open after EINTR
(HP-UX). But it is my position that *portable code* should assume the
Linux behavior, because that is the safest option. If you assume the
HP-UX behavior on a machine that implements the Linux behavior, you
might close some unrelated file out from under yourself (probably but
not necessarily a different thread). If you assume the Linux behavior
on a machine that implements the HP-UX behavior, you have leaked a
file descriptor; the worst things that can do are much less severe.
The only way to get it right all the time is to have a big long list
of #ifdefs for every Unix under the sun, and we don't even have the
data we would need to write that list.
> While I agree with all of this, I think the tone is way too
> proscriptive. The man pages are to document the behaviors, not tell
> people how to program.
I could be persuaded to tone it down a little but in this case I think
the man page's job *is* to tell people how to program. We know lots of
existing code has gotten the fine details of close() wrong and we are
trying to document how to do it right.
> Aside: the reason EINTR *has to* be specified this way is that pthread
> cancellation is aligned with EINTR. If EINTR were defined to have
> closed the fd, then acting on cancellation during close would also
> have closed the fd, but the cancellation handler would have no way to
> distinguish this, leading to a situation where you're forced to either
> leak fds or introduce a double-close vuln.
The correct way to address this would be to make close() not be a
cancellation point.
> It sounds like you are intentionally omitting that POSIX says the
> opposite of what you want it to, and treating the standard behavior
> as a historical HP-UX quirk/bug. This is polemic, not the sort of
> documentation that belongs in a man page.
To be clear, when I wrote all this I thought the POSIX.1-2024 change
did in fact make the semantics be that close() closes the descriptor
no matter what it returns.
However, I insist that the correct behavior is in fact for close to
close the descriptor no matter what it returns, and to the extent
POSIX says anything else, POSIX is wrong. Again, you cannot change
my mind about this.
N.B. I have skimmed the current text of
https://pubs.opengroup.org/onlinepubs/9799919799/functions/close.html
and it appears to me that the committee more or less agrees with me,
but wishes to avoid declaring HP-UX (and any other systems with the
same behavior) nonconformant. So instead of just saying the fd is
closed no matter what, they've invented a new variant on close that
they have more scope to modify the behavior of, and they're nudging
implementations to not return EINTR from (posix_)close at all.
I don't think we (authors of this particular set of manpages) need to
care about the Austin Group's reluctance to declare existing legacy
systems nonconformant.
> An outline of what I'd like to see instead:
>
> - Clear explanation of why double-close is a serious bug that must
> always be avoided. (I think we all agree on this.)
>
> - Statement that the historical Linux/glibc behavior and current POSIX
> requirement differ, without language that tries to paint the POSIX
> behavior as a HP-UX bug/quirk. Possibly citing real sources/history
> of the issue (Austin Group tracker items 529, 614; maybe others).
>
> - Consequence of just assuming the Linux behavior (fd leaks on
> conforming systems).
>
> - Consequences of assuming the POSIX behavior (double-close vulns on
> GNU/Linux, maybe others).
>
> - Survey of methods for avoiding the problem (ways to preclude EINTR,
> possibly ways to infer behavior, etc).
This outline seems more or less reasonable to me but, if it's me
writing the text, I _will_ characterize what POSIX currently says
about EINTR returns from close() as a bug in POSIX. As far as I'm
concerned, that is a fact, not polemic.
I have found that arguing with you in particular, Rich, is generally
not worth the effort. Therefore, unless you reply and _accept_ that
the final version of the close manpage will say that POSIX is buggy,
I am not going to write another version of this text, nor will I be
drawn into further debate.
zw
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Rich Felker @ 2026-01-20 16:36 UTC (permalink / raw)
To: Zack Weinberg
Cc: Alejandro Colomar, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <60c77e5c-dbab-4cca-8d0d-9857875c73fb@app.fastmail.com>
On Tue, Jan 20, 2026 at 11:15:15AM -0500, Zack Weinberg wrote:
> Rich and I have an irreconcilable disagreement on what the semantics of close
> _should_ be. I'm not going to do any more work on this until/unless he
> changes his mind.
It's been way too long since I read this thread to recall what our
point of disagreement is or what point glibc might be at in
reconciling the Linux kernel disagreement with POSIX.
I believe my position is basically this:
1. Documentation should reflect that the EINTR behavior on raw Linux
syscall and traditional glibc is non-conforming to POSIX, but make
applications aware of it and that it's unsafe to retry close on
these systems.
2. Documentation should be descriptive not polemic or proscriptive of
coding style or practices. When there is a disagreement like this
it should document that and faithfully represent the different
positions, not represent the author's views on which one is
correct.
3. It may be helpful to have further information on what types of
errors can actually be expected from close on Linux, and under what
conditions, but only if these behaviors can actually be guaranteed.
If it's just documenting what Linux currently happens to do, but
without any existing promise to preserve that for new file types
etc., then this is stepping out of line of the role of
documentation into defining the specification, and that requires
input from other folks.
4. If musl behavior is being documented, it should be noted that we do
not have the non-conforming EINTR issue. If the kernel produces
EINTR, we return 0. From 0.9.7 to 1.1.6 we produced EINPROGRESS,
but this was changed in 1.1.7 as it was found that applications
would treat EINPROGRESS as an error condition.
Rich
^ permalink raw reply
* Re: [RFC v1] man/man2/close.2: CAVEATS: Document divergence from POSIX.1-2024
From: Zack Weinberg @ 2026-01-20 16:15 UTC (permalink / raw)
To: Alejandro Colomar
Cc: Rich Felker, Vincent Lefevre, Jan Kara, Alexander Viro,
Christian Brauner, linux-fsdevel, linux-api, GNU libc development
In-Reply-To: <aW1dE9j91WAte1gf@devuan>
Rich and I have an irreconcilable disagreement on what the semantics of close
_should_ be. I'm not going to do any more work on this until/unless he
changes his mind.
On Sun, Jan 18, 2026, at 5:23 PM, Alejandro Colomar wrote:
> Hi Zack and others,
>
> Just a gentle ping. It would be nice to have an agreement for some
> patch.
>
>
> Have a lovely night!
> Alex
>
> On Fri, May 23, 2025 at 02:10:57PM -0400, Zack Weinberg wrote:
>> Taking everything said in this thread into account, I have attempted to
>> wordsmith new language for the close(2) manpage. Please let me know
>> what you think, and please help me with the bits marked in square
>> brackets. I can make this into a proper patch for the manpages
>> when everyone is happy with it.
>>
>> zw
>>
>> ---
>>
>> DESCRIPTION
>> ... existing text ...
>>
>> close() always succeeds. That is, after it returns, _fd_ has
>> always been disconnected from the open file it formerly referred
>> to, and its number can be recycled to refer to some other file.
>> Furthermore, if _fd_ was the last reference to the underlying
>> open file description, the resources associated with the open file
>> description will always have been scheduled to be released.
>>
>> However, close may report _delayed errors_ from a previous I/O
>> operation. Therefore, its return value should not be ignored.
>>
>> RETURN VALUE
>> close() returns zero if there are no delayed errors to report,
>> or -1 if there _might_ be delayed errors.
>>
>> When close() returns -1, check _errno_ to see what the situation
>> actually is. Most, but not all, _errno_ codes indicate a delayed
>> I/O error that should be reported to the user. See ERRORS and
>> NOTES for more detail.
>>
>> [QUERY: Is it ever possible to get delayed errors on close() from
>> a file that was opened with O_RDONLY? What about a file that was
>> opened with O_RDWR but never actually written to? If people only
>> have to worry about delayed errors if the file was actually
>> written to, we should say so at this point.
>>
>> It would also be good to mention whether it is possible to get a
>> delayed error on close() even if a previous call to fsync() or
>> fdatasync() succeeded and there haven’t been any more writes to
>> that file *description* (not necessarily via the fd being closed)
>> since.]
>>
>> ERRORS
>> EBADF _fd_ wasn’t open in the first place, or is outside the
>> valid numeric range for file descriptors.
>>
>> EINPROGRESS
>> EINTR
>> There are no delayed errors to report, but the kernel is
>> still doing some clean-up work in the background. This
>> situation should be treated the same as if close() had
>> returned zero. Do not retry the close(), and do not report
>> an error to the user.
>>
>> EDQUOT
>> EFBIG
>> EIO
>> ENOSPC
>> These are the most common errno codes associated with
>> delayed I/O errors. They should be treated as a hard
>> failure to write to the file that was formerly associated
>> with _fd_, the same as if an earlier write(2) had failed
>> with one of these codes. The file has still been closed!
>> Do not retry the close(). But do report an error to the user.
>>
>> Depending on the underlying file, close() may return other errno
>> codes; these should generally also be treated as delayed I/O errors.
>>
>> NOTES
>> Dealing with error returns from close()
>>
>> As discussed above, close() always closes the file. Except when
>> errno is set to EBADF, EINPROGRESS, or EINTR, an error return from
>> close() reports a _delayed I/O error_ from a previous write()
>> operation.
>>
>> It is vital to report delayed I/O errors to the user; failing to
>> check the return value of close() can cause _silent_ loss of data.
>> The most common situations where this actually happens involve
>> networked filesystems, where, in the name of throughput, write()
>> often returns success before the server has actually confirmed a
>> successful write.
>>
>> However, it is also vital to understand that _no matter what_
>> close() returns, and _no matter what_ it sets errno to, when it
>> returns, _the file descriptor passed to close() has been closed_,
>> and its number is _immediately_ available for reuse by open(2),
>> dup(2), etc. Therefore, one should never retry a close(), not
>> even if it set errno to a value that normally indicates the
>> operation needs to be retried (e.g. EINTR). Retrying a close()
>> is a serious bug, particularly in a multithreaded program; if
>> the file descriptor number has already been reused, _that file_
>> will get closed out from under whatever other thread opened it.
>>
>> [Possibly something about fsync/fdatasync here?]
>>
>> BUGS
>> Prior to POSIX.1-2024, there was no official guarantee that
>> close() would always close the file descriptor, even on error.
>> Linux has always closed the file descriptor, even on error,
>> but other implementations might not have.
>>
>> The only such implementation we have heard of is HP-UX; at least
>> some versions of HP-UX’s man page for close() said it should be
>> retried if it returned -1 with errno set to EINTR. (If you know
>> exactly which versions of HP-UX are affected, or of any other
>> Unix where close() doesn’t always close the file descriptor,
>> please contact us about it.)
>>
>> Portable code should nonetheless never retry a failed close(); the
>> consequences of a file descriptor leak are far less dangerous than
>> the consequences of closing a file out from under another thread.
>
> --
> <https://www.alejandro-colomar.es>
>
> Attachments:
> * signature.asc
^ permalink raw reply
* [RESEND PATCH bpf-next v6 9/9] selftests/bpf: Add tests to verify map create failure log
From: Leon Hwang @ 2026-01-20 15:24 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260120152424.40766-1-leon.hwang@linux.dev>
Add tests to verify that the kernel reports the expected error messages
when map creation fails.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
.../selftests/bpf/prog_tests/map_init.c | 168 ++++++++++++++++++
1 file changed, 168 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c
index 14a31109dd0e..89e6daf2fcfd 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_init.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_init.c
@@ -212,3 +212,171 @@ void test_map_init(void)
if (test__start_subtest("pcpu_lru_map_init"))
test_pcpu_lru_map_init();
}
+
+#define BPF_LOG_FIXED 8
+
+static void test_map_create(enum bpf_map_type map_type, const char *map_name,
+ struct bpf_map_create_opts *opts, const char *exp_msg)
+{
+ const int key_size = 4, value_size = 4, max_entries = 1;
+ char log_buf[128];
+ int fd;
+ LIBBPF_OPTS(bpf_log_opts, log_opts);
+
+ log_buf[0] = '\0';
+ log_opts.log_buf = log_buf;
+ log_opts.log_size = sizeof(log_buf);
+ log_opts.log_level = BPF_LOG_FIXED;
+ opts->log_opts = &log_opts;
+ fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, opts);
+ if (!ASSERT_LT(fd, 0, "bpf_map_create")) {
+ close(fd);
+ return;
+ }
+
+ ASSERT_STREQ(log_buf, exp_msg, "log_buf");
+ ASSERT_EQ(log_opts.log_true_size, strlen(exp_msg) + 1, "log_true_size");
+}
+
+static void test_map_create_array(struct bpf_map_create_opts *opts, const char *exp_msg)
+{
+ test_map_create(BPF_MAP_TYPE_ARRAY, "test_map_create", opts, exp_msg);
+}
+
+static void test_invalid_vmlinux_value_type_id_struct_ops(void)
+{
+ const char *msg = "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_vmlinux_value_type_id = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_vmlinux_value_type_id_kv_type_id(void)
+{
+ const char *msg = "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_vmlinux_value_type_id = 1,
+ .btf_key_type_id = 1,
+ );
+
+ test_map_create(BPF_MAP_TYPE_STRUCT_OPS, "test_map_create", &opts, msg);
+}
+
+static void test_invalid_value_type_id(void)
+{
+ const char *msg = "Invalid btf_value_type_id.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_key_type_id = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_extra(void)
+{
+ const char *msg = "Invalid map_extra.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_extra = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_numa_node(void)
+{
+ const char *msg = "Invalid numa_node.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_flags = BPF_F_NUMA_NODE,
+ .numa_node = 0xFF,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_type(void)
+{
+ const char *msg = "Invalid map_type.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+ test_map_create(__MAX_BPF_MAP_TYPE, "test_map_create", &opts, msg);
+}
+
+static void test_invalid_token_fd(void)
+{
+ const char *msg = "Invalid map_token_fd.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_flags = BPF_F_TOKEN_FD,
+ .token_fd = 0xFF,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_name(void)
+{
+ const char *msg = "Invalid map_name.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+ test_map_create(BPF_MAP_TYPE_ARRAY, "test-!@#", &opts, msg);
+}
+
+static void test_invalid_btf_fd(void)
+{
+ const char *msg = "Invalid btf_fd.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .btf_fd = -1,
+ .btf_key_type_id = 1,
+ .btf_value_type_id = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_excl_prog_hash_size_1(void)
+{
+ const char *msg = "Invalid excl_prog_hash_size.\n";
+ const char *hash = "DEADCODE";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .excl_prog_hash = hash,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+static void test_excl_prog_hash_size_2(void)
+{
+ const char *msg = "Invalid excl_prog_hash_size.\n";
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .excl_prog_hash_size = 1,
+ );
+
+ test_map_create_array(&opts, msg);
+}
+
+void test_map_create_failure(void)
+{
+ if (test__start_subtest("invalid_vmlinux_value_type_id_struct_ops"))
+ test_invalid_vmlinux_value_type_id_struct_ops();
+ if (test__start_subtest("invalid_vmlinux_value_type_id_kv_type_id"))
+ test_invalid_vmlinux_value_type_id_kv_type_id();
+ if (test__start_subtest("invalid_value_type_id"))
+ test_invalid_value_type_id();
+ if (test__start_subtest("invalid_map_extra"))
+ test_invalid_map_extra();
+ if (test__start_subtest("invalid_numa_node"))
+ test_invalid_numa_node();
+ if (test__start_subtest("invalid_map_type"))
+ test_invalid_map_type();
+ if (test__start_subtest("invalid_token_fd"))
+ test_invalid_token_fd();
+ if (test__start_subtest("invalid_map_name"))
+ test_invalid_map_name();
+ if (test__start_subtest("invalid_btf_fd"))
+ test_invalid_btf_fd();
+ if (test__start_subtest("invalid_excl_prog_hash_size_1"))
+ test_excl_prog_hash_size_1();
+ if (test__start_subtest("invalid_excl_prog_hash_size_2"))
+ test_excl_prog_hash_size_2();
+}
--
2.52.0
^ permalink raw reply related
* [RESEND PATCH bpf-next v6 8/9] libbpf: Add common attr support for map_create
From: Leon Hwang @ 2026-01-20 15:24 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260120152424.40766-1-leon.hwang@linux.dev>
With the previous commit adding common attribute support for
BPF_MAP_CREATE, users can now retrieve detailed error messages when map
creation fails via the log_buf field.
Introduce struct bpf_log_opts with the following fields:
log_buf, log_size, log_level, and log_true_size.
Extend bpf_map_create_opts with a new field log_opts, allowing users to
capture and inspect log messages on map creation failures.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
tools/lib/bpf/bpf.c | 16 +++++++++++++++-
tools/lib/bpf/bpf.h | 17 ++++++++++++++++-
2 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index ed9c6eaeb656..ac634d5ad4e6 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -205,6 +205,9 @@ int bpf_map_create(enum bpf_map_type map_type,
const struct bpf_map_create_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
+ const size_t attr_common_sz = sizeof(struct bpf_common_attr);
+ struct bpf_common_attr attr_common;
+ struct bpf_log_opts *log_opts;
union bpf_attr attr;
int fd;
@@ -238,7 +241,18 @@ int bpf_map_create(enum bpf_map_type map_type,
attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL));
attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0);
- fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
+ log_opts = OPTS_GET(opts, log_opts, NULL);
+ if (log_opts && feat_supported(NULL, FEAT_BPF_SYSCALL_COMMON_ATTRS)) {
+ memset(&attr_common, 0, attr_common_sz);
+ attr_common.log_buf = ptr_to_u64(OPTS_GET(log_opts, log_buf, NULL));
+ attr_common.log_size = OPTS_GET(log_opts, log_size, 0);
+ attr_common.log_level = OPTS_GET(log_opts, log_level, 0);
+ fd = sys_bpf_ext_fd(BPF_MAP_CREATE, &attr, attr_sz, &attr_common, attr_common_sz);
+ OPTS_SET(log_opts, log_true_size, attr_common.log_true_size);
+ } else {
+ fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
+ OPTS_SET(log_opts, log_true_size, 0);
+ }
return libbpf_err_errno(fd);
}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 2c8e88ddb674..59673f094f86 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -37,6 +37,18 @@ extern "C" {
LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes);
+struct bpf_log_opts {
+ size_t sz; /* size of this struct for forward/backward compatibility */
+
+ char *log_buf;
+ __u32 log_size;
+ __u32 log_level;
+ __u32 log_true_size;
+
+ size_t :0;
+};
+#define bpf_log_opts__last_field log_true_size
+
struct bpf_map_create_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
@@ -57,9 +69,12 @@ struct bpf_map_create_opts {
const void *excl_prog_hash;
__u32 excl_prog_hash_size;
+
+ struct bpf_log_opts *log_opts;
+
size_t :0;
};
-#define bpf_map_create_opts__last_field excl_prog_hash_size
+#define bpf_map_create_opts__last_field log_opts
LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
const char *map_name,
--
2.52.0
^ permalink raw reply related
* [RESEND PATCH bpf-next v6 7/9] bpf: Add syscall common attributes support for map_create
From: Leon Hwang @ 2026-01-20 15:24 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Leon Hwang, Willem de Bruijn, Jason Xing,
Tao Chen, Mykyta Yatsenko, Kumar Kartikeya Dwivedi,
Anton Protopopov, Amery Hung, Rong Tao, linux-kernel, linux-api,
linux-kselftest, kernel-patches-bot
In-Reply-To: <20260120152424.40766-1-leon.hwang@linux.dev>
Currently, many BPF_MAP_CREATE failures return -EINVAL without providing
any explanation to userspace.
With extended BPF syscall support, detailed error messages can now be
reported via the log buffer, allowing users to understand the specific
reason for a failed map creation.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
include/linux/bpf_verifier.h | 2 ++
kernel/bpf/log.c | 30 +++++++++++++++++
kernel/bpf/syscall.c | 65 ++++++++++++++++++++++++++++++------
3 files changed, 87 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 9022e4f515f9..280beca480ea 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -644,6 +644,8 @@ int bpf_prog_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs
struct bpf_attrs *attrs_common);
int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *attrs,
struct bpf_attrs *attrs_common);
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *log_attr,
+ struct bpf_attrs *attrs_common);
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log);
#define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 3cccb0c5e482..d7933a412c36 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -912,6 +912,36 @@ int bpf_btf_load_log_attr_init(struct bpf_log_attr *log_attr, struct bpf_attrs *
attrs_common);
}
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *log_attr,
+ struct bpf_attrs *attrs_common)
+{
+ const struct bpf_common_attr *common = attrs_common->attr;
+ struct bpf_verifier_log *log;
+ int err;
+
+ memset(log_attr, 0, sizeof(*log_attr));
+ log_attr->log_buf = common->log_buf;
+ log_attr->log_size = common->log_size;
+ log_attr->log_level = common->log_level;
+ log_attr->attrs_common = attrs_common;
+
+ if (!log_attr->log_buf)
+ return NULL;
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_vlog_init(log, log_attr->log_level, u64_to_user_ptr(log_attr->log_buf),
+ log_attr->log_size);
+ if (err) {
+ kfree(log);
+ return ERR_PTR(err);
+ }
+
+ return log;
+}
+
int bpf_log_attr_finalize(struct bpf_log_attr *log_attr, struct bpf_verifier_log *log)
{
u32 log_true_size, off;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d4a93c008c92..7cc33a48b2bf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1370,7 +1370,7 @@ static bool bpf_net_capable(void)
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr, bpfptr_t uattr)
+static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1382,8 +1382,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
- if (err)
+ if (err) {
+ bpf_log(log, "Invalid attr.\n");
return -EINVAL;
+ }
/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
* to avoid per-map type checks tripping on unknown flag
@@ -1392,17 +1394,25 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
attr->map_flags &= ~BPF_F_TOKEN_FD;
if (attr->btf_vmlinux_value_type_id) {
- if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
- attr->btf_key_type_id || attr->btf_value_type_id)
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n");
return -EINVAL;
+ }
+ if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n");
+ return -EINVAL;
+ }
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ bpf_log(log, "Invalid btf_value_type_id.\n");
return -EINVAL;
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
attr->map_type != BPF_MAP_TYPE_ARENA &&
- attr->map_extra != 0)
+ attr->map_extra != 0) {
+ bpf_log(log, "Invalid map_extra.\n");
return -EINVAL;
+ }
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
@@ -1410,13 +1420,17 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
- !node_online(numa_node)))
+ !node_online(numa_node))) {
+ bpf_log(log, "Invalid numa_node.\n");
return -EINVAL;
+ }
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map_type = attr->map_type;
- if (map_type >= ARRAY_SIZE(bpf_map_types))
+ if (map_type >= ARRAY_SIZE(bpf_map_types)) {
+ bpf_log(log, "Invalid map_type.\n");
return -EINVAL;
+ }
map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
ops = bpf_map_types[map_type];
if (!ops)
@@ -1434,8 +1448,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
if (token_flag) {
token = bpf_token_get_from_fd(attr->map_token_fd);
- if (IS_ERR(token))
+ if (IS_ERR(token)) {
+ bpf_log(log, "Invalid map_token_fd.\n");
return PTR_ERR(token);
+ }
/* if current token doesn't grant map creation permissions,
* then we can't use this token, so ignore it and rely on
@@ -1518,8 +1534,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
- if (err < 0)
+ if (err < 0) {
+ bpf_log(log, "Invalid map_name.\n");
goto free_map;
+ }
preempt_disable();
map->cookie = gen_cookie_next(&bpf_map_cookie);
@@ -1542,6 +1560,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
+ bpf_log(log, "Invalid btf_fd.\n");
err = PTR_ERR(btf);
goto free_map;
}
@@ -1569,6 +1588,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ bpf_log(log, "Invalid excl_prog_hash_size.\n");
err = -EINVAL;
goto free_map;
}
@@ -1584,6 +1604,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
goto free_map;
}
} else if (attr->excl_prog_hash_size) {
+ bpf_log(log, "Invalid excl_prog_hash_size.\n");
err = -EINVAL;
goto free_map;
}
@@ -1622,6 +1643,29 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
return err;
}
+static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_attrs *attrs_common)
+{
+ struct bpf_verifier_log *log;
+ struct bpf_log_attr log_attr;
+ int err, ret;
+
+ log = bpf_log_attr_create_vlog(&log_attr, attrs_common);
+ if (IS_ERR(log))
+ return PTR_ERR(log);
+
+ err = __map_create(attr, uattr, log);
+ if (err >= 0)
+ goto free;
+
+ ret = bpf_log_attr_finalize(&log_attr, log);
+ if (ret)
+ err = ret;
+
+free:
+ kfree(log);
+ return err;
+}
+
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
@@ -6218,7 +6262,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr, uattr);
+ bpf_attrs_init(&attrs_common, &attr_common, uattr_common, size_common);
+ err = map_create(&attr, uattr, &attrs_common);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
--
2.52.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox