* [PATCH v5 3/4] sparc/fcntl.h: convert O_* flag macros from hex to octal
From: Dorjoy Chowdhury @ 2026-03-07 14:06 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260307140726.70219-1-dorjoychy111@gmail.com>
Convert the O_* flag macros from hex to octal, following the convention
used in include/uapi/asm-generic/fcntl.h and the other
architecture-specific arch/*/include/uapi/asm/fcntl.h files.
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
arch/sparc/include/uapi/asm/fcntl.h | 36 ++++++++++++++---------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
index bb6e9fa94bc9..33ce58ec57f6 100644
--- a/arch/sparc/include/uapi/asm/fcntl.h
+++ b/arch/sparc/include/uapi/asm/fcntl.h
@@ -2,23 +2,23 @@
#ifndef _SPARC_FCNTL_H
#define _SPARC_FCNTL_H
-#define O_APPEND 0x0008
-#define FASYNC 0x0040 /* fcntl, for BSD compatibility */
-#define O_CREAT 0x0200 /* not fcntl */
-#define O_TRUNC 0x0400 /* not fcntl */
-#define O_EXCL 0x0800 /* not fcntl */
-#define O_DSYNC 0x2000 /* used to be O_SYNC, see below */
-#define O_NONBLOCK 0x4000
+#define O_APPEND 0000000010
+#define FASYNC 0000000100 /* fcntl, for BSD compatibility */
+#define O_CREAT 0000001000 /* not fcntl */
+#define O_TRUNC 0000002000 /* not fcntl */
+#define O_EXCL 0000004000 /* not fcntl */
+#define O_DSYNC 0000020000 /* used to be O_SYNC, see below */
+#define O_NONBLOCK 0000040000
#if defined(__sparc__) && defined(__arch64__)
-#define O_NDELAY 0x0004
+#define O_NDELAY 0000000004
#else
-#define O_NDELAY (0x0004 | O_NONBLOCK)
+#define O_NDELAY (0000000004 | O_NONBLOCK)
#endif
-#define O_NOCTTY 0x8000 /* not fcntl */
-#define O_LARGEFILE 0x40000
-#define O_DIRECT 0x100000 /* direct disk access hint */
-#define O_NOATIME 0x200000
-#define O_CLOEXEC 0x400000
+#define O_NOCTTY 0000100000 /* not fcntl */
+#define O_LARGEFILE 0001000000
+#define O_DIRECT 0004000000 /* direct disk access hint */
+#define O_NOATIME 0010000000
+#define O_CLOEXEC 0020000000
/*
* Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using
* the O_SYNC flag. We continue to use the existing numerical value
@@ -32,12 +32,12 @@
*
* Note: __O_SYNC must never be used directly.
*/
-#define __O_SYNC 0x800000
+#define __O_SYNC 0040000000
#define O_SYNC (__O_SYNC|O_DSYNC)
-#define O_PATH 0x1000000
-#define __O_TMPFILE 0x2000000
-#define OPENAT2_REGULAR 0x4000000
+#define O_PATH 0100000000
+#define __O_TMPFILE 0200000000
+#define OPENAT2_REGULAR 0400000000
#define F_GETOWN 5 /* for sockets. */
#define F_SETOWN 6 /* for sockets. */
--
2.53.0
^ permalink raw reply related
* [PATCH v5 2/4] kselftest/openat2: test for OPENAT2_REGULAR flag
From: Dorjoy Chowdhury @ 2026-03-07 14:06 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260307140726.70219-1-dorjoychy111@gmail.com>
Just a happy path test.
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
.../testing/selftests/openat2/openat2_test.c | 37 ++++++++++++++++++-
1 file changed, 36 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c
index 0e161ef9e9e4..e8847f7d416c 100644
--- a/tools/testing/selftests/openat2/openat2_test.c
+++ b/tools/testing/selftests/openat2/openat2_test.c
@@ -320,8 +320,42 @@ void test_openat2_flags(void)
}
}
+#ifndef OPENAT2_REGULAR
+#define OPENAT2_REGULAR 040000000
+#endif
+
+#ifndef EFTYPE
+#define EFTYPE 134
+#endif
+
+void test_openat2_regular_flag(void)
+{
+ if (!openat2_supported) {
+ ksft_test_result_skip("Skipping %s as openat2 is not supported\n", __func__);
+ return;
+ }
+
+ struct open_how how = {
+ .flags = OPENAT2_REGULAR | O_RDONLY
+ };
+
+ int fd = sys_openat2(AT_FDCWD, "/dev/null", &how);
+
+ if (fd == -ENOENT) {
+ ksft_test_result_skip("Skipping %s as there is no /dev/null\n", __func__);
+ return;
+ }
+
+ if (fd != -EFTYPE) {
+ ksft_test_result_fail("openat2 should return EFTYPE\n");
+ return;
+ }
+
+ ksft_test_result_pass("%s succeeded\n", __func__);
+}
+
#define NUM_TESTS (NUM_OPENAT2_STRUCT_VARIATIONS * NUM_OPENAT2_STRUCT_TESTS + \
- NUM_OPENAT2_FLAG_TESTS)
+ NUM_OPENAT2_FLAG_TESTS + 1)
int main(int argc, char **argv)
{
@@ -330,6 +364,7 @@ int main(int argc, char **argv)
test_openat2_struct();
test_openat2_flags();
+ test_openat2_regular_flag();
if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
ksft_exit_fail();
--
2.53.0
^ permalink raw reply related
* [PATCH v5 1/4] openat2: new OPENAT2_REGULAR flag support
From: Dorjoy Chowdhury @ 2026-03-07 14:06 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260307140726.70219-1-dorjoychy111@gmail.com>
This flag indicates that the path should be opened only if it is a regular file.
This is useful to write secure programs that want to avoid being
tricked into opening device nodes with special semantics while thinking
they operate on regular files. This is a requested feature from the
uapi-group[1].
A corresponding error code EFTYPE has been introduced. For example, if
openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
param, it will return -EFTYPE. EFTYPE is already used in BSD systems
such as FreeBSD and macOS.
When used in combination with O_CREAT, either the regular file is
created, or if the path already exists, it is opened if it's a regular
file. Otherwise, -EFTYPE is returned.
When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
as it doesn't make sense to open a path that is both a directory and a
regular file.
[1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
arch/alpha/include/uapi/asm/errno.h | 2 ++
arch/alpha/include/uapi/asm/fcntl.h | 1 +
arch/mips/include/uapi/asm/errno.h | 2 ++
arch/parisc/include/uapi/asm/errno.h | 2 ++
arch/parisc/include/uapi/asm/fcntl.h | 1 +
arch/sparc/include/uapi/asm/errno.h | 2 ++
arch/sparc/include/uapi/asm/fcntl.h | 1 +
fs/ceph/file.c | 4 ++++
fs/gfs2/inode.c | 6 ++++++
fs/namei.c | 4 ++++
fs/nfs/dir.c | 4 ++++
fs/open.c | 4 +++-
fs/smb/client/dir.c | 14 +++++++++++++-
include/linux/fcntl.h | 2 ++
include/uapi/asm-generic/errno.h | 2 ++
include/uapi/asm-generic/fcntl.h | 4 ++++
tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
tools/arch/mips/include/uapi/asm/errno.h | 2 ++
tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
tools/include/uapi/asm-generic/errno.h | 2 ++
21 files changed, 63 insertions(+), 2 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
index 6791f6508632..1a99f38813c7 100644
--- a/arch/alpha/include/uapi/asm/errno.h
+++ b/arch/alpha/include/uapi/asm/errno.h
@@ -127,4 +127,6 @@
#define EHWPOISON 139 /* Memory page has hardware error */
+#define EFTYPE 140 /* Wrong file type for the intended operation */
+
#endif
diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
index 50bdc8e8a271..fe488bf7c18e 100644
--- a/arch/alpha/include/uapi/asm/fcntl.h
+++ b/arch/alpha/include/uapi/asm/fcntl.h
@@ -34,6 +34,7 @@
#define O_PATH 040000000
#define __O_TMPFILE 0100000000
+#define OPENAT2_REGULAR 0200000000
#define F_GETLK 7
#define F_SETLK 8
diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
index c01ed91b1ef4..1835a50b69ce 100644
--- a/arch/mips/include/uapi/asm/errno.h
+++ b/arch/mips/include/uapi/asm/errno.h
@@ -126,6 +126,8 @@
#define EHWPOISON 168 /* Memory page has hardware error */
+#define EFTYPE 169 /* Wrong file type for the intended operation */
+
#define EDQUOT 1133 /* Quota exceeded */
diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
index 8cbc07c1903e..93194fbb0a80 100644
--- a/arch/parisc/include/uapi/asm/errno.h
+++ b/arch/parisc/include/uapi/asm/errno.h
@@ -124,4 +124,6 @@
#define EHWPOISON 257 /* Memory page has hardware error */
+#define EFTYPE 258 /* Wrong file type for the intended operation */
+
#endif
diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
index 03dee816cb13..d46812f2f0f4 100644
--- a/arch/parisc/include/uapi/asm/fcntl.h
+++ b/arch/parisc/include/uapi/asm/fcntl.h
@@ -19,6 +19,7 @@
#define O_PATH 020000000
#define __O_TMPFILE 040000000
+#define OPENAT2_REGULAR 0100000000
#define F_GETLK64 8
#define F_SETLK64 9
diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
index 4a41e7835fd5..71940ec9130b 100644
--- a/arch/sparc/include/uapi/asm/errno.h
+++ b/arch/sparc/include/uapi/asm/errno.h
@@ -117,4 +117,6 @@
#define EHWPOISON 135 /* Memory page has hardware error */
+#define EFTYPE 136 /* Wrong file type for the intended operation */
+
#endif
diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
index 67dae75e5274..bb6e9fa94bc9 100644
--- a/arch/sparc/include/uapi/asm/fcntl.h
+++ b/arch/sparc/include/uapi/asm/fcntl.h
@@ -37,6 +37,7 @@
#define O_PATH 0x1000000
#define __O_TMPFILE 0x2000000
+#define OPENAT2_REGULAR 0x4000000
#define F_GETOWN 5 /* for sockets. */
#define F_SETOWN 6 /* for sockets. */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66bbf6d517a9..6d8d4c7765e6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
+ if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
+ err = -EFTYPE;
+ goto out_req;
+ }
err = finish_open(file, dentry, ceph_open);
}
out_req:
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8344040ecaf7..4604e2e8a9cc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
error = PTR_ERR(inode);
if (!IS_ERR(inode)) {
+ if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
+ iput(inode);
+ inode = NULL;
+ error = -EFTYPE;
+ goto fail_gunlock;
+ }
if (S_ISDIR(inode->i_mode)) {
iput(inode);
inode = NULL;
diff --git a/fs/namei.c b/fs/namei.c
index 58f715f7657e..2a47289262bd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4651,6 +4651,10 @@ static int do_open(struct nameidata *nd,
if (unlikely(error))
return error;
}
+
+ if ((open_flag & OPENAT2_REGULAR) && !d_is_reg(nd->path.dentry))
+ return -EFTYPE;
+
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2402f57c8e7d..d8037c119317 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2195,6 +2195,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
break;
case -EISDIR:
case -ENOTDIR:
+ if (open_flags & OPENAT2_REGULAR) {
+ err = -EFTYPE;
+ break;
+ }
goto no_open;
case -ELOOP:
if (!(open_flags & O_NOFOLLOW))
diff --git a/fs/open.c b/fs/open.c
index 4f0a76dc8993..026b59af6124 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1195,7 +1195,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
* values before calling build_open_flags(), but openat2(2) checks all
* of its arguments.
*/
- if (flags & ~VALID_OPEN_FLAGS)
+ if (flags & ~VALID_OPENAT2_FLAGS)
return -EINVAL;
if (how->resolve & ~VALID_RESOLVE_FLAGS)
return -EINVAL;
@@ -1234,6 +1234,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
return -EINVAL;
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
+ } else if ((flags & O_DIRECTORY) && (flags & OPENAT2_REGULAR)) {
+ return -EINVAL;
}
if (flags & O_PATH) {
/* O_PATH only permits certain other flags to be set. */
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 953f1fee8cb8..355681ebacf1 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -222,6 +222,13 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
goto cifs_create_get_file_info;
}
+ if ((oflags & OPENAT2_REGULAR) && !S_ISREG(newinode->i_mode)) {
+ CIFSSMBClose(xid, tcon, fid->netfid);
+ iput(newinode);
+ rc = -EFTYPE;
+ goto out;
+ }
+
if (S_ISDIR(newinode->i_mode)) {
CIFSSMBClose(xid, tcon, fid->netfid);
iput(newinode);
@@ -436,11 +443,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
goto out_err;
}
- if (newinode)
+ if (newinode) {
+ if ((oflags & OPENAT2_REGULAR) && !S_ISREG(newinode->i_mode)) {
+ rc = -EFTYPE;
+ goto out_err;
+ }
if (S_ISDIR(newinode->i_mode)) {
rc = -EISDIR;
goto out_err;
}
+ }
d_drop(direntry);
d_add(direntry, newinode);
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index d1bb87ff70e3..a6c692773af8 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -15,6 +15,8 @@
/* upper 32-bit flags (openat2(2) only) */ \
OPENAT2_EMPTY_PATH)
+#define VALID_OPENAT2_FLAGS (VALID_OPEN_FLAGS | OPENAT2_REGULAR)
+
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h
index 92e7ae493ee3..bd78e69e0a43 100644
--- a/include/uapi/asm-generic/errno.h
+++ b/include/uapi/asm-generic/errno.h
@@ -122,4 +122,6 @@
#define EHWPOISON 133 /* Memory page has hardware error */
+#define EFTYPE 134 /* Wrong file type for the intended operation */
+
#endif
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 613475285643..b2c2ddd0edc0 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -88,6 +88,10 @@
#define __O_TMPFILE 020000000
#endif
+#ifndef OPENAT2_REGULAR
+#define OPENAT2_REGULAR 040000000
+#endif
+
/* a horrid kludge trying to make sure that this will fail on old kernels */
#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h
index 6791f6508632..1a99f38813c7 100644
--- a/tools/arch/alpha/include/uapi/asm/errno.h
+++ b/tools/arch/alpha/include/uapi/asm/errno.h
@@ -127,4 +127,6 @@
#define EHWPOISON 139 /* Memory page has hardware error */
+#define EFTYPE 140 /* Wrong file type for the intended operation */
+
#endif
diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h
index c01ed91b1ef4..1835a50b69ce 100644
--- a/tools/arch/mips/include/uapi/asm/errno.h
+++ b/tools/arch/mips/include/uapi/asm/errno.h
@@ -126,6 +126,8 @@
#define EHWPOISON 168 /* Memory page has hardware error */
+#define EFTYPE 169 /* Wrong file type for the intended operation */
+
#define EDQUOT 1133 /* Quota exceeded */
diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h
index 8cbc07c1903e..93194fbb0a80 100644
--- a/tools/arch/parisc/include/uapi/asm/errno.h
+++ b/tools/arch/parisc/include/uapi/asm/errno.h
@@ -124,4 +124,6 @@
#define EHWPOISON 257 /* Memory page has hardware error */
+#define EFTYPE 258 /* Wrong file type for the intended operation */
+
#endif
diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h
index 4a41e7835fd5..71940ec9130b 100644
--- a/tools/arch/sparc/include/uapi/asm/errno.h
+++ b/tools/arch/sparc/include/uapi/asm/errno.h
@@ -117,4 +117,6 @@
#define EHWPOISON 135 /* Memory page has hardware error */
+#define EFTYPE 136 /* Wrong file type for the intended operation */
+
#endif
diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h
index 92e7ae493ee3..bd78e69e0a43 100644
--- a/tools/include/uapi/asm-generic/errno.h
+++ b/tools/include/uapi/asm-generic/errno.h
@@ -122,4 +122,6 @@
#define EHWPOISON 133 /* Memory page has hardware error */
+#define EFTYPE 134 /* Wrong file type for the intended operation */
+
#endif
--
2.53.0
^ permalink raw reply related
* [PATCH v5 0/4] OPENAT2_REGULAR flag support for openat2
From: Dorjoy Chowdhury @ 2026-03-07 14:06 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
Hi,
I came upon this "Ability to only open regular files" uapi feature suggestion
from https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
and thought it would be something I could do as a first patch and get to
know the kernel code a bit better.
The following filesystems have been tested by building and booting the kernel
x86 bzImage in a Fedora 43 VM in QEMU. I have tested with OPENAT2_REGULAR that
regular files can be successfully opened and non-regular files (directory, fifo etc)
return -EFTYPE.
- btrfs
- NFS (loopback)
- SMB (loopback)
Changes in v5:
- mentioned in the commit message that EFTYPE is already used in the BSDs
- consistently return -EFTYPE in all filesystems
Changes in v4:
- changed O_REGULAR to OPENAT2_REGULAR
- OPENAT2_REGULAR does not affect O_PATH
- atomic_open codepaths updated to work properly for OPENAT2_REGULAR
- commit message includes the uapi-group URL
- v3 is at: https://lore.kernel.org/linux-fsdevel/20260127180109.66691-1-dorjoychy111@gmail.com/T/
Changes in v3:
- included motivation about O_REGULAR flag in commit message e.g., programs not wanting to be tricked into opening device nodes
- fixed commit message wrongly referencing ENOTREGULAR instead of ENOTREG
- fixed the O_REGULAR flag in arch/parisc/include/uapi/asm/fcntl.h from 060000000 to 0100000000
- added 2 commits converting arch/{mips,sparc}/include/uapi/asm/fcntl.h O_* macros from hex to octal
- v2 is at: https://lore.kernel.org/linux-fsdevel/20260126154156.55723-1-dorjoychy111@gmail.com/T/
Changes in v2:
- rename ENOTREGULAR to ENOTREG
- define ENOTREG in uapi/asm-generic/errno.h (instead of errno-base.h) and in arch/*/include/uapi/asm/errno.h files
- override O_REGULAR in arch/{alpha,sparc,parisc}/include/uapi/asm/fcntl.h due to clash with include/uapi/asm-generic/fcntl.h
- I have kept the kselftest but now that O_REGULAR and ENOTREG can have different values on different architectures I am not sure if it's right
- v1 is at: https://lore.kernel.org/linux-fsdevel/20260125141518.59493-1-dorjoychy111@gmail.com/T/
Thanks.
Regards,
Dorjoy
Dorjoy Chowdhury (4):
openat2: new OPENAT2_REGULAR flag support
kselftest/openat2: test for OPENAT2_REGULAR flag
sparc/fcntl.h: convert O_* flag macros from hex to octal
mips/fcntl.h: convert O_* flag macros from hex to octal
arch/alpha/include/uapi/asm/errno.h | 2 +
arch/alpha/include/uapi/asm/fcntl.h | 1 +
arch/mips/include/uapi/asm/errno.h | 2 +
arch/mips/include/uapi/asm/fcntl.h | 22 +++++------
arch/parisc/include/uapi/asm/errno.h | 2 +
arch/parisc/include/uapi/asm/fcntl.h | 1 +
arch/sparc/include/uapi/asm/errno.h | 2 +
arch/sparc/include/uapi/asm/fcntl.h | 35 +++++++++---------
fs/ceph/file.c | 4 ++
fs/gfs2/inode.c | 6 +++
fs/namei.c | 4 ++
fs/nfs/dir.c | 4 ++
fs/open.c | 4 +-
fs/smb/client/dir.c | 14 ++++++-
include/linux/fcntl.h | 2 +
include/uapi/asm-generic/errno.h | 2 +
include/uapi/asm-generic/fcntl.h | 4 ++
tools/arch/alpha/include/uapi/asm/errno.h | 2 +
tools/arch/mips/include/uapi/asm/errno.h | 2 +
tools/arch/parisc/include/uapi/asm/errno.h | 2 +
tools/arch/sparc/include/uapi/asm/errno.h | 2 +
tools/include/uapi/asm-generic/errno.h | 2 +
.../testing/selftests/openat2/openat2_test.c | 37 ++++++++++++++++++-
23 files changed, 127 insertions(+), 31 deletions(-)
--
2.53.0
^ permalink raw reply
* [PATCH v2] sched/deadline: document new sched_getattr() feature for retrieving current parameters for DEADLINE tasks
From: Tommaso Cucinotta @ 2026-03-04 10:28 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tommaso Cucinotta, linux-api, Juri Lelli, Jonathan Corbet,
Shuah Khan, Shashank Balaji, linux-doc, linux-kernel
In-Reply-To: <20260304102843.1373905-1-tommaso.cucinotta@santannapisa.it>
Document in Documentation/scheduler/sched-deadline.rst the new capability of
sched_getattr() to retrieve, for DEADLINE tasks, the runtime left and absolute
deadline (setting the flags syscall parameter to 1), in addition to the static
parameters (obtained with flags=0).
Signed-off-by: Tommaso Cucinotta <tommaso.cucinotta@santannapisa.it>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
---
Documentation/scheduler/sched-deadline.rst | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index ec543a12..76fdf435 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -628,10 +628,21 @@ Deadline Task Scheduling
* the new scheduling related syscalls that manipulate it, i.e.,
sched_setattr() and sched_getattr() are implemented.
- For debugging purposes, the leftover runtime and absolute deadline of a
- SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries
- dl.runtime and dl.deadline, both values in ns). A programmatic way to
- retrieve these values from production code is under discussion.
+ The leftover runtime and absolute deadline of a SCHED_DEADLINE task can be
+ read using the sched_getattr() syscall, setting the last syscall parameter
+ flags to the SCHED_GETATTR_FLAG_DL_DYNAMIC=1 value. This updates the
+ runtime left, converts the absolute deadline to the CLOCK_MONOTONIC reference,
+ then returns these parameters to user-space. The absolute deadline is
+ returned as the number of nanoseconds since the CLOCK_MONOTONIC time
+ reference (boot instant), as a u64 in the sched_deadline field of sched_attr,
+ which can represent nearly 585 years since boot time (calling sched_getattr()
+ with flags=0 causes retrieval of the static parameters instead).
+
+ For debugging purposes, these parameters can also be retrieved through
+ /proc/<pid>/sched (entries dl.runtime and dl.deadline, both values in ns),
+ but: this is highly inefficient; the returned runtime left is not updated as
+ done by sched_getattr(); the deadline is provided in kernel rq_clock time
+ reference, that is not directly usable from user-space.
4.3 Default behavior
base-commit: f74d204baf9febf96237af6c1d7eff57fba7de36
--
2.45.2
^ permalink raw reply related
* [PATCH v2] sched/deadline: document new sched_getattr() feature for retrieving current parameters for DEADLINE tasks
From: Tommaso Cucinotta @ 2026-03-04 10:28 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tommaso Cucinotta, linux-api, Jonathan Corbet, Shuah Khan,
Juri Lelli, Shashank Balaji, linux-doc, linux-kernel
In-Reply-To: <20260303104215.1324243-1-tommaso.cucinotta@santannapisa.it>
Compared to the initially submitted documentation patch, this
version addresses the issue highlighted by Juri of the wrong wrapping
of the commit message, and the one found by the chatbot of the wrong
use of quotes around the flags parameter. I'm also adding "v2" in
the subject line, as requested by Randy.
^ permalink raw reply
* Re: [PATCH bpf-next v10 3/8] bpf: Refactor reporting log_true_size for prog_load
From: Leon Hwang @ 2026-03-04 6:17 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, LKML, Linux API,
open list:KERNEL SELFTEST FRAMEWORK, kernel-patches-bot
In-Reply-To: <CAADnVQJ4E5L8rL-K=yJJZpCeRBvEJZcSKOEQP0kg2ztowhGmvA@mail.gmail.com>
On 4/3/26 13:58, Alexei Starovoitov wrote:
> On Tue, Mar 3, 2026 at 9:47 PM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>> On 4/3/26 00:32, Alexei Starovoitov wrote:
>>> On Wed, Feb 11, 2026 at 7:13 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>>>
>>
>> [...]
>>
>>>> @@ -6241,7 +6244,11 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
>>>> err = map_freeze(&attr);
>>>> break;
>>>> case BPF_PROG_LOAD:
>>>> - err = bpf_prog_load(&attr, uattr, size);
>>>> + if (from_user && size >= offsetofend(union bpf_attr, log_true_size))
>>>> + log_true_size = uattr.user + offsetof(union bpf_attr, log_true_size);
>>>
>>> So you added 'from_user' gating because
>>> you replaced copy_to_bpfptr_offset() with copy_to_user()?
>>> This is a drastic change in behavior and you don't even talk about
>>> it in the commit log.
>>> You said "refactor". This is not a refactoring!
>>>
>>> This is v10. The common_attr feature is useful, but
>>> you really need to think harder about what your patches
>>> are doing.
>>>
>>
>> Refactoring should not introduce any functional changes. If a functional
>> change is involved, it should be factored out of the refactoring commit
>> into a separate commit with an explanation in the commit log.
>>
>> I'll add this to my self-review checklist.
>>
>> The intention of 'from_user' was to replace copy_to_bpfptr_offset() with
>> copy_to_user(), since the log is always copied to the user-space buffer
>> when the log level is not BPF_LOG_KERNEL in
>> kernel/bpf/log.c::bpf_verifier_vlog().
>>
>> The 'from_user' gating will be dropped in v12 to keep this patch as pure
>> refactoring.
>
> You were told multiple times to avoid copy pasting AI into your emails.
> Sorry, but this crosses the line for me.
> Your patches will be ignored for 2 weeks.
Oops. The above reply was written by my hand. Possibly, the reply
carried LLM smell because I learnt LLM tongue recently.
As you said, I won't send patches for 2 weeks. :-(
Thanks,
Leon
^ permalink raw reply
* Re: [PATCH bpf-next v10 3/8] bpf: Refactor reporting log_true_size for prog_load
From: Alexei Starovoitov @ 2026-03-04 5:58 UTC (permalink / raw)
To: Leon Hwang
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, LKML, Linux API,
open list:KERNEL SELFTEST FRAMEWORK, kernel-patches-bot
In-Reply-To: <c9cd645f-810b-4dd4-a1ed-27569dca5055@linux.dev>
On Tue, Mar 3, 2026 at 9:47 PM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> On 4/3/26 00:32, Alexei Starovoitov wrote:
> > On Wed, Feb 11, 2026 at 7:13 AM Leon Hwang <leon.hwang@linux.dev> wrote:
> >>
>
> [...]
>
> >> @@ -6241,7 +6244,11 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
> >> err = map_freeze(&attr);
> >> break;
> >> case BPF_PROG_LOAD:
> >> - err = bpf_prog_load(&attr, uattr, size);
> >> + if (from_user && size >= offsetofend(union bpf_attr, log_true_size))
> >> + log_true_size = uattr.user + offsetof(union bpf_attr, log_true_size);
> >
> > So you added 'from_user' gating because
> > you replaced copy_to_bpfptr_offset() with copy_to_user()?
> > This is a drastic change in behavior and you don't even talk about
> > it in the commit log.
> > You said "refactor". This is not a refactoring!
> >
> > This is v10. The common_attr feature is useful, but
> > you really need to think harder about what your patches
> > are doing.
> >
>
> Refactoring should not introduce any functional changes. If a functional
> change is involved, it should be factored out of the refactoring commit
> into a separate commit with an explanation in the commit log.
>
> I'll add this to my self-review checklist.
>
> The intention of 'from_user' was to replace copy_to_bpfptr_offset() with
> copy_to_user(), since the log is always copied to the user-space buffer
> when the log level is not BPF_LOG_KERNEL in
> kernel/bpf/log.c::bpf_verifier_vlog().
>
> The 'from_user' gating will be dropped in v12 to keep this patch as pure
> refactoring.
You were told multiple times to avoid copy pasting AI into your emails.
Sorry, but this crosses the line for me.
Your patches will be ignored for 2 weeks.
^ permalink raw reply
* Re: [PATCH bpf-next v10 3/8] bpf: Refactor reporting log_true_size for prog_load
From: Leon Hwang @ 2026-03-04 5:47 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, LKML, Linux API,
open list:KERNEL SELFTEST FRAMEWORK, kernel-patches-bot
In-Reply-To: <CAADnVQKc5H=k-++CHxs+Y1ggptRSLRcACLgVaMgOmt=QBT=dkA@mail.gmail.com>
On 4/3/26 00:32, Alexei Starovoitov wrote:
> On Wed, Feb 11, 2026 at 7:13 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
[...]
>> @@ -6241,7 +6244,11 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
>> err = map_freeze(&attr);
>> break;
>> case BPF_PROG_LOAD:
>> - err = bpf_prog_load(&attr, uattr, size);
>> + if (from_user && size >= offsetofend(union bpf_attr, log_true_size))
>> + log_true_size = uattr.user + offsetof(union bpf_attr, log_true_size);
>
> So you added 'from_user' gating because
> you replaced copy_to_bpfptr_offset() with copy_to_user()?
> This is a drastic change in behavior and you don't even talk about
> it in the commit log.
> You said "refactor". This is not a refactoring!
>
> This is v10. The common_attr feature is useful, but
> you really need to think harder about what your patches
> are doing.
>
Refactoring should not introduce any functional changes. If a functional
change is involved, it should be factored out of the refactoring commit
into a separate commit with an explanation in the commit log.
I'll add this to my self-review checklist.
The intention of 'from_user' was to replace copy_to_bpfptr_offset() with
copy_to_user(), since the log is always copied to the user-space buffer
when the log level is not BPF_LOG_KERNEL in
kernel/bpf/log.c::bpf_verifier_vlog().
The 'from_user' gating will be dropped in v12 to keep this patch as pure
refactoring.
Thanks,
Leon
^ permalink raw reply
* Re: [PATCH] sched/deadline: document new sched_getattr() feature for retrieving current parameters for DEADLINE tasks
From: Randy Dunlap @ 2026-03-03 23:20 UTC (permalink / raw)
To: Tommaso Cucinotta, Peter Zijlstra
Cc: Tommaso Cucinotta, linux-api, Juri Lelli, Jonathan Corbet,
Shuah Khan, Shashank Balaji, linux-doc, linux-kernel
In-Reply-To: <20260303184313.1356499-1-tommaso.cucinotta@santannapisa.it>
This patch should be marked as v2, with the differences between
v1 and v2 described.
On 3/3/26 10:42 AM, Tommaso Cucinotta wrote:
> Document in Documentation/sched/sched-deadline.rst the new capability of
> sched_getattr() to retrieve, for DEADLINE tasks, the runtime left and absolute
> deadline (setting the flags syscall parameter to 1), in addition to the static
> parameters (obtained with flags=0).
>
> Signed-off-by: Tommaso Cucinotta <tommaso.cucinotta@santannapisa.it>
> Acked-by: Juri Lelli <juri.lelli@redhat.com>
> ---
> Documentation/scheduler/sched-deadline.rst | 19 +++++++++++++++----
> 1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
> index ec543a12..76fdf435 100644
> --- a/Documentation/scheduler/sched-deadline.rst
> +++ b/Documentation/scheduler/sched-deadline.rst
> @@ -628,10 +628,21 @@ Deadline Task Scheduling
> * the new scheduling related syscalls that manipulate it, i.e.,
> sched_setattr() and sched_getattr() are implemented.
>
> - For debugging purposes, the leftover runtime and absolute deadline of a
> - SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries
> - dl.runtime and dl.deadline, both values in ns). A programmatic way to
> - retrieve these values from production code is under discussion.
> + The leftover runtime and absolute deadline of a SCHED_DEADLINE task can be
> + read using the sched_getattr() syscall, setting the last syscall parameter
> + flags to the SCHED_GETATTR_FLAG_DL_DYNAMIC=1 value. This updates the
About the build warning due to the use of `flags':
If you want smart quotes, just use 'flags'.
If you want italics, use `flags`.
If you want a code-look (monotype), use ``flags``.
> + runtime left, converts the absolute deadline in CLOCK_MONOTONIC reference,
> + then returns these parameters to user-space. The absolute deadline is
> + returned as the number of nanoseconds since the CLOCK_MONOTONIC time
> + reference (boot instant), as a u64 in the sched_deadline field of sched_attr,
> + which can represent nearly 585 years since boot time (calling sched_getattr()
> + with flags=0 causes retrieval of the static parameters instead).
> +
> + For debugging purposes, these parameters can also be retrieved through
> + /proc/<pid>/sched (entries dl.runtime and dl.deadline, both values in ns),
> + but: this is highly inefficient; the returned runtime left is not updated as
> + done by sched_getattr(); the deadline is provided in kernel rq_clock time
> + reference, that is not directly usable from user-space.
>
>
> 4.3 Default behavior
>
> base-commit: f74d204baf9febf96237af6c1d7eff57fba7de36
--
~Randy
^ permalink raw reply
* [PATCH] sched/deadline: document new sched_getattr() feature for retrieving current parameters for DEADLINE tasks
From: Tommaso Cucinotta @ 2026-03-03 18:42 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tommaso Cucinotta, linux-api, Juri Lelli, Jonathan Corbet,
Shuah Khan, Shashank Balaji, linux-doc, linux-kernel
In-Reply-To: <20260303104215.1324243-1-tommaso.cucinotta@santannapisa.it>
Document in Documentation/scheduler/sched-deadline.rst the new capability of
sched_getattr() to retrieve, for DEADLINE tasks, the runtime left and absolute
deadline (setting the flags syscall parameter to 1), in addition to the static
parameters (obtained with flags=0).
Signed-off-by: Tommaso Cucinotta <tommaso.cucinotta@santannapisa.it>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
---
Documentation/scheduler/sched-deadline.rst | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index ec543a12..76fdf435 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -628,10 +628,21 @@ Deadline Task Scheduling
* the new scheduling related syscalls that manipulate it, i.e.,
sched_setattr() and sched_getattr() are implemented.
- For debugging purposes, the leftover runtime and absolute deadline of a
- SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries
- dl.runtime and dl.deadline, both values in ns). A programmatic way to
- retrieve these values from production code is under discussion.
+ The leftover runtime and absolute deadline of a SCHED_DEADLINE task can be
+ read using the sched_getattr() syscall, setting the last syscall parameter
+ flags to the SCHED_GETATTR_FLAG_DL_DYNAMIC=1 value. This updates the
+ runtime left, converts the absolute deadline to the CLOCK_MONOTONIC reference,
+ then returns these parameters to user-space. The absolute deadline is
+ returned as the number of nanoseconds since the CLOCK_MONOTONIC time
+ reference (boot instant), as a u64 in the sched_deadline field of sched_attr,
+ which can represent nearly 585 years since boot time (calling sched_getattr()
+ with flags=0 causes retrieval of the static parameters instead).
+
+ For debugging purposes, these parameters can also be retrieved through
+ /proc/<pid>/sched (entries dl.runtime and dl.deadline, both values in ns),
+ but: this is highly inefficient; the returned runtime left is not updated as
+ done by sched_getattr(); the deadline is provided in kernel rq_clock time
+ reference, that is not directly usable from user-space.
4.3 Default behavior
base-commit: f74d204baf9febf96237af6c1d7eff57fba7de36
--
2.45.2
^ permalink raw reply related
* Re: [PATCH bpf-next v10 3/8] bpf: Refactor reporting log_true_size for prog_load
From: Alexei Starovoitov @ 2026-03-03 16:32 UTC (permalink / raw)
To: Leon Hwang
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, Christian Brauner, Seth Forshee, Yuichiro Tsuji,
Andrey Albershteyn, Willem de Bruijn, Jason Xing, Tao Chen,
Mykyta Yatsenko, Kumar Kartikeya Dwivedi, Anton Protopopov,
Amery Hung, Rong Tao, LKML, Linux API,
open list:KERNEL SELFTEST FRAMEWORK, kernel-patches-bot
In-Reply-To: <20260211151115.78013-4-leon.hwang@linux.dev>
On Wed, Feb 11, 2026 at 7:13 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> The next commit will add support for reporting logs via extended common
> attributes, including 'log_true_size'.
>
> To prepare for that, refactor the 'log_true_size' reporting logic by
> introducing a new struct bpf_log_attr to encapsulate log-related behavior:
>
> * bpf_log_attr_init(): initialize log fields, which will support
> extended common attributes in the next commit.
> * bpf_log_attr_finalize(): handle log finalization and write back
> 'log_true_size' to userspace.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
> include/linux/bpf.h | 4 +++-
> include/linux/bpf_verifier.h | 11 +++++++++++
> kernel/bpf/log.c | 25 +++++++++++++++++++++++++
> kernel/bpf/syscall.c | 13 ++++++++++---
> kernel/bpf/verifier.c | 17 ++++-------------
> 5 files changed, 53 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cd9b96434904..d4dbcc7ad156 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2913,7 +2913,9 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
> size_t actual_size);
>
> /* verify correctness of eBPF program */
> -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);
> +struct bpf_log_attr;
> +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr,
> + struct bpf_log_attr *attr_log);
>
> #ifndef CONFIG_BPF_JIT_ALWAYS_ON
> void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index ef8e45a362d9..dbd9bdb955b3 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -635,6 +635,17 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
> return log && log->level;
> }
>
> +struct bpf_log_attr {
> + char __user *log_buf;
> + u32 log_size;
> + u32 log_level;
> + u32 __user *log_true_size;
> +};
> +
> +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level,
> + u32 __user *log_true_size);
> +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log);
> +
> #define BPF_MAX_SUBPROGS 256
>
> struct bpf_subprog_arg_info {
> diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
> index a0c3b35de2ce..e31747b84fe2 100644
> --- a/kernel/bpf/log.c
> +++ b/kernel/bpf/log.c
> @@ -863,3 +863,28 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
> }
> print_verifier_state(env, vstate, frameno, false);
> }
> +
> +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level,
> + u32 __user *log_true_size)
> +{
> + memset(log, 0, sizeof(*log));
> + log->log_buf = u64_to_user_ptr(log_buf);
> + log->log_size = log_size;
> + log->log_level = log_level;
> + log->log_true_size = log_true_size;
> + return 0;
> +}
> +
> +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log)
> +{
> + u32 log_true_size;
> + int err;
> +
> + err = bpf_vlog_finalize(log, &log_true_size);
> +
> + if (attr->log_true_size && copy_to_user(attr->log_true_size, &log_true_size,
> + sizeof(log_true_size)))
> + return -EFAULT;
> +
> + return err;
> +}
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0e231c0b1d04..e86674811996 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2867,7 +2867,7 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
> /* last field in 'union bpf_attr' used by this command */
> #define BPF_PROG_LOAD_LAST_FIELD keyring_id
>
> -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
> {
> enum bpf_prog_type type = attr->prog_type;
> struct bpf_prog *prog, *dst_prog = NULL;
> @@ -3085,7 +3085,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> goto free_prog_sec;
>
> /* run eBPF verifier */
> - err = bpf_check(&prog, attr, uattr, uattr_size);
> + err = bpf_check(&prog, attr, uattr, attr_log);
> if (err < 0)
> goto free_used_maps;
>
> @@ -6189,7 +6189,10 @@ static int prog_assoc_struct_ops(union bpf_attr *attr)
> static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
> bpfptr_t uattr_common, unsigned int size_common)
> {
> + bool from_user = !bpfptr_is_kernel(uattr);
> struct bpf_common_attr attr_common;
> + u32 __user *log_true_size = NULL;
> + struct bpf_log_attr attr_log;
> union bpf_attr attr;
> int err;
>
> @@ -6241,7 +6244,11 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
> err = map_freeze(&attr);
> break;
> case BPF_PROG_LOAD:
> - err = bpf_prog_load(&attr, uattr, size);
> + if (from_user && size >= offsetofend(union bpf_attr, log_true_size))
> + log_true_size = uattr.user + offsetof(union bpf_attr, log_true_size);
So you added 'from_user' gating because
you replaced copy_to_bpfptr_offset() with copy_to_user()?
This is a drastic change in behavior and you don't even talk about
it in the commit log.
You said "refactor". This is not a refactoring!
This is v10. The common_attr feature is useful, but
you really need to think harder about what your patches
are doing.
pw-bot: cr
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Mathieu Desnoyers @ 2026-03-02 16:56 UTC (permalink / raw)
To: Florian Weimer
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel,
libc-alpha@sourceware.org, Arnd Bergmann,
Sebastian Andrzej Siewior
In-Reply-To: <lhuqzq2chdw.fsf@oldenburg.str.redhat.com>
On 2026-03-02 11:42, Florian Weimer wrote:
> * Mathieu Desnoyers:
[...]
>> AFAIU we don't need to evaluate this on context switch. We only need
>> to evaluate it at:
>>
>> (a) Signal delivery,
>> (b) Process exit.
>
> Ah, missed that part. It changes the rules somewhat.
>
>> Also, the tradeoff here is not clear cut to me: the only thing the rseq
>> flag would prevent is comparisons of the instruction pointer against a
>> vDSO range at (a) and (b), which are not as performance critical as
>> context switches. I'm not sure it would warrant the added complexity of
>> the rseq flag, and coupling with rseq. Moreover, I'm not convinced that
>> loading an extra rseq flag field from userspace would be faster than
>> just comparing with a known range of vDSO addresses.
>
> It wouldn't work for the signal case anyway. That would need space in
> rseq for some kind of write-ahead log of the operation before it's being
> carried out, so that it can be completed on signal delivery/process
> exit.
The signal handler case can be dealt with by making sure we clear the
pending ops list on signal delivery. AFAIU with that in place we would
not need a write-ahead log. But even then, I don't think the rseq flag
would bring any benefit over simple vDSO instruction pointer ranges
comparisons.
Also the rseq flag set/clear cannot be done atomically with respect
to the mutex unlock (success) and pending ops clear state transitions,
so we'd need instruction pointer comparisons anyway.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Florian Weimer @ 2026-03-02 16:42 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel,
libc-alpha@sourceware.org, Arnd Bergmann,
Sebastian Andrzej Siewior
In-Reply-To: <3f30f2f0-5173-42e2-aa89-0af9bb391c0e@efficios.com>
* Mathieu Desnoyers:
> On 2026-03-02 10:32, Florian Weimer wrote:
>> * Mathieu Desnoyers:
>>
>>> On 2026-03-02 02:31, Florian Weimer wrote:
>>>> * Mathieu Desnoyers:
>>>>
>>>>> Of course, we'd have to implement the whole transaction in assembler
>>>>> for each architecture.
>>>> Could this be hidden in a vDSO call?
>>>
> [...]
>>> I suspect the IP ranges and associated store-conditional flags I identified
>>> for the rseq_rl_cs approach are pretty much the key states we need to track.
>>> Architectures which support atomic exchange instructions are even simpler.
>>> We'd just have to keep track of this unlock operations steps internally
>>> between the kernel and the vDSO.
>> If the unlock operation is in the vDSO, we need to parameterize it
>> somehow, regarding offsets, values written etc., so that it's not
>> specific to exactly one robust mutex implementation.
>
> Agreed.
>
>>
>>> But you mentioned that rseq would be needed for a flag, so what I am
>>> missing ?
>> It's so that you don't have to figure out that the program counter
>> is
>> somewhere in the special robust mutex unlock code every time a task gets
>> descheduled.
>
> AFAIU we don't need to evaluate this on context switch. We only need
> to evaluate it at:
>
> (a) Signal delivery,
> (b) Process exit.
Ah, missed that part. It changes the rules somewhat.
> Also, the tradeoff here is not clear cut to me: the only thing the rseq
> flag would prevent is comparisons of the instruction pointer against a
> vDSO range at (a) and (b), which are not as performance critical as
> context switches. I'm not sure it would warrant the added complexity of
> the rseq flag, and coupling with rseq. Moreover, I'm not convinced that
> loading an extra rseq flag field from userspace would be faster than
> just comparing with a known range of vDSO addresses.
It wouldn't work for the signal case anyway. That would need space in
rseq for some kind of write-ahead log of the operation before it's being
carried out, so that it can be completed on signal delivery/process
exit.
Thanks,
Florian
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Mathieu Desnoyers @ 2026-03-02 16:32 UTC (permalink / raw)
To: Florian Weimer
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel,
libc-alpha@sourceware.org, Arnd Bergmann,
Sebastian Andrzej Siewior
In-Reply-To: <lhu5x7edz7r.fsf@oldenburg.str.redhat.com>
On 2026-03-02 10:32, Florian Weimer wrote:
> * Mathieu Desnoyers:
>
>> On 2026-03-02 02:31, Florian Weimer wrote:
>>> * Mathieu Desnoyers:
>>>
>>>> Of course, we'd have to implement the whole transaction in assembler
>>>> for each architecture.
>>> Could this be hidden in a vDSO call?
>>
[...]
>> I suspect the IP ranges and associated store-conditional flags I identified
>> for the rseq_rl_cs approach are pretty much the key states we need to track.
>> Architectures which support atomic exchange instructions are even simpler.
>> We'd just have to keep track of this unlock operations steps internally
>> between the kernel and the vDSO.
>
> If the unlock operation is in the vDSO, we need to parameterize it
> somehow, regarding offsets, values written etc., so that it's not
> specific to exactly one robust mutex implementation.
Agreed.
>
>> But you mentioned that rseq would be needed for a flag, so what I am
>> missing ?
>
> It's so that you don't have to figure out that the program counter is
> somewhere in the special robust mutex unlock code every time a task gets
> descheduled.
AFAIU we don't need to evaluate this on context switch. We only need
to evaluate it at:
(a) Signal delivery,
(b) Process exit.
Also, the tradeoff here is not clear cut to me: the only thing the rseq
flag would prevent is comparisons of the instruction pointer against a
vDSO range at (a) and (b), which are not as performance critical as
context switches. I'm not sure it would warrant the added complexity of
the rseq flag, and coupling with rseq. Moreover, I'm not convinced that
loading an extra rseq flag field from userspace would be faster than
just comparing with a known range of vDSO addresses.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Florian Weimer @ 2026-03-02 15:32 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel,
libc-alpha@sourceware.org, Arnd Bergmann,
Sebastian Andrzej Siewior
In-Reply-To: <6bbc7276-4f06-4ec4-ba1a-53425871a6cb@efficios.com>
* Mathieu Desnoyers:
> On 2026-03-02 02:31, Florian Weimer wrote:
>> * Mathieu Desnoyers:
>>
>>> Of course, we'd have to implement the whole transaction in assembler
>>> for each architecture.
>> Could this be hidden in a vDSO call?
>
> Yes, good idea! I think this approach could work as well and reduce coupling
> between kernel and userspace compared to the rseq_rl_cs approach. It's OK
> as long as an extra function call on robust mutex unlock is not an issue
> performance wise.
I don't have a performance concern there. It would be specific to
robust mutexes.
>> The question is whether we can model the unlock operation so that
>> it's sufficiently generic.
>
> I suspect the IP ranges and associated store-conditional flags I identified
> for the rseq_rl_cs approach are pretty much the key states we need to track.
> Architectures which support atomic exchange instructions are even simpler.
> We'd just have to keep track of this unlock operations steps internally
> between the kernel and the vDSO.
If the unlock operation is in the vDSO, we need to parameterize it
somehow, regarding offsets, values written etc., so that it's not
specific to exactly one robust mutex implementation.
> But you mentioned that rseq would be needed for a flag, so what I am
> missing ?
It's so that you don't have to figure out that the program counter is
somewhere in the special robust mutex unlock code every time a task gets
descheduled.
Thanks,
Florian
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Mathieu Desnoyers @ 2026-03-02 14:57 UTC (permalink / raw)
To: Florian Weimer
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel,
libc-alpha@sourceware.org, Arnd Bergmann,
Sebastian Andrzej Siewior
In-Reply-To: <lhufr6ihelv.fsf@oldenburg.str.redhat.com>
On 2026-03-02 02:31, Florian Weimer wrote:
> * Mathieu Desnoyers:
>
>> Of course, we'd have to implement the whole transaction in assembler
>> for each architecture.
>
> Could this be hidden in a vDSO call?
Yes, good idea! I think this approach could work as well and reduce coupling
between kernel and userspace compared to the rseq_rl_cs approach. It's OK
as long as an extra function call on robust mutex unlock is not an issue
performance wise.
> It would have to receive a pointer
> to the rseq area in addition to other arguments that identify the unlock
> operation to be performed. The advantage is that the kernel would know
> the addresses involved, so a single rseq flag should be sufficient.
But if we implement the robust list unlock operation in a vDSO, if we
don't consider signal handlers nesting, then we would not even need a
rseq flag, right ?
Having this in a vDSO makes it so that the kernel knows when it's
terminating a process while it runs specific ranges of instruction
pointers within the vDSO. It even knows about the relevant registers
(e.g. ll/sc success) within specific instruction pointer ranges.
The remaining question is how to handle signal handlers which can
nest over vDSO. When this happens, we can end up terminating a process
while it is running within a signal handler which has been delivered on
top of the vDSO, so the topmost frame's instruction pointer points to
the signal handler code rather than the vDSO.
One possible approach to take care of this would be to add a robust list
pending ops clear on signal delivery. When a signal is delivered
on top of the robust list unlock vDSO range, *and* the mutex is known
to have been successfully unlocked, but the pending ops was not cleared
yet, the signal delivery could clear the pending ops before delivering
the signal.
> It
> could also vary the LL/SC sequence based on architecture capabilities.
Yes. It would be good for selecting dynamically between aarch64 LL/SC vs
LSE atomics.
>
> The question is whether we can model the unlock operation so that it's
> sufficiently generic.
I suspect the IP ranges and associated store-conditional flags I identified
for the rseq_rl_cs approach are pretty much the key states we need to track.
Architectures which support atomic exchange instructions are even simpler.
We'd just have to keep track of the unlock operation's steps internally
between the kernel and the vDSO.
But you mentioned that rseq would be needed for a flag, so what am I
missing?
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply
* Re: [PATCH 1/2] mount: add OPEN_TREE_NAMESPACE
From: Florian Weimer @ 2026-03-02 10:15 UTC (permalink / raw)
To: Christian Brauner
Cc: linux-fsdevel, Jeff Layton, Alexander Viro, Amir Goldstein,
Josef Bacik, Jan Kara, Aleksa Sarai, linux-api, rudi
In-Reply-To: <20260224-kandidat-wohltat-ae8fb7a57738@brauner>
* Christian Brauner:
> On Tue, Feb 24, 2026 at 02:30:37PM +0100, Florian Weimer wrote:
>> * Christian Brauner:
>>
>> > On Tue, Feb 24, 2026 at 12:23:33PM +0100, Florian Weimer wrote:
>> >> * Christian Brauner:
>> >>
>> >> > diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
>> >> > index 5d3f8c9e3a62..acbc22241c9c 100644
>> >> > --- a/include/uapi/linux/mount.h
>> >> > +++ b/include/uapi/linux/mount.h
>> >> > @@ -61,7 +61,8 @@
>> >> > /*
>> >> > * open_tree() flags.
>> >> > */
>> >> > -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
>> >> > +#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */
>> >>
>> >> This change causes pointless -Werror=undef errors in projects that have
>> >> settled on the old definition.
>> >>
>> >> Reported here:
>> >>
>> >> Bug 33921 - Building with Linux-7.0-rc1 errors on OPEN_TREE_CLONE
>> >> <https://sourceware.org/bugzilla/show_bug.cgi?id=33921>
>> >
>> > Send a patch to change it back, please.
>> > Otherwise it might take a few days until I get around to it.
>>
>> Rudi, could you post a patch?
>
> I'm a bit confused though and not super happy that you're basically
> asking us to be so constrained that we aren't even allowed to change 1
> to 1 - just syntactically different.
I'm not happy about it, either. But it has happened before, for the
RENAME_* constants I believe.
We are already including <linux/mount.h> from <sys/mount.h>, so we can
work around this reliably on the glibc side, regardless of header
inclusion order.
Thanks,
Florian
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Florian Weimer @ 2026-03-02 7:31 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel, libc-alpha
In-Reply-To: <694424f4-20d1-4473-8955-859acbad466f@efficios.com>
* Mathieu Desnoyers:
> Of course, we'd have to implement the whole transaction in assembler
> for each architecture.
Could this be hidden in a vDSO call? It would have to receive a pointer
to the rseq area in addition to other arguments that identify the unlock
operation to be performed. The advantage is that the kernel would know
the addresses involved, so a single rseq flag should be sufficient. It
could also vary the LL/SC sequence based on architecture capabilities.
The question is whether we can model the unlock operation so that it's
sufficiently generic.
Thanks,
Florian
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Mathieu Desnoyers @ 2026-03-01 15:49 UTC (permalink / raw)
To: André Almeida
Cc: kernel-dev, Liam R . Howlett, linux-api, Darren Hart,
Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Florian Weimer,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel, libc-alpha,
Arnd Bergmann, Sebastian Andrzej Siewior
In-Reply-To: <bd7a8dd3-8dee-4886-abe6-bdda25fe4a0d@efficios.com>
Hi André,
So it looks like I got a simpler idea on how to solve this at some
point between going to bed and waking up.
Let's extend the rseq system call. Here is how:
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 863c4a00a66b..0592be0c3b32 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -86,6 +86,59 @@ struct rseq_slice_ctrl {
};
};
+/**
+ * rseq_rl_cs - Robust list unlock transaction descriptor
+ *
+ * rseq_rl_cs describes a transaction which begins with a successful
+ * robust mutex unlock followed by clearing a robust list pending ops.
+ *
+ * Userspace prepares for a robust_list unlock transaction by storing
+ * the address of a struct rseq_rl_cs descriptor into its per-thread
+ * rseq area rseq_rl_cs field. After the transaction is over, userspace
+ * clears the rseq_rl_cs pointer.
+ *
+ * A thread is considered to be within a rseq_rl_cs transaction if
+ * either of those conditions are true:
+ *
+ * - ip >= post_cond_store_ip && ip < post_success_ip && ll_sc_success(pt_regs)
+ * - ip >= post_success_ip && ip < post_clear_op_pending_ip
+ *
+ * If the kernel terminates a process within an active robust list
+ * unlock transaction, it should consider the robust list op pending
+ * as empty even if it contains an op pending address.
+ */
+struct rseq_rl_cs {
+ /* Version of this structure. */
+ __u32 version;
+ /* Reserved flags. */
+ __u32 flags;
+ /*
+ * Address immediately after store which unlocks the robust
+ * mutex. This store is usually implemented with an atomic
+ * exchange, or linked-load/store-conditional. In case it is
+ * implemented with ll/sc, the kernel needs to check whether the
+ * conditional store has succeeded with the appropriate registers
+ * or flags, as defined by the architecture ABI.
+ */
+ __u64 post_cond_store_ip;
+ /*
+ * For architectures implementing atomic exchange as ll/sc,
+ * a conditional branch is needed to handle failure.
+ * The unlock success IP is the address immediately after
+ * the conditional branch instruction after which the kernel
+ * can assume that the ll/sc has succeeded without checking
+ * registers or flags. For architectures where the mutex
+ * unlock store instruction cannot fail, this address is equal
+ * to post_cond_store_ip.
+ */
+ __u64 post_success_ip;
+ /*
+ * Address after the instruction which clears the op pending
+ * list. This store is the last instruction of this sequence.
+ */
+ __u64 post_clear_op_pending_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
/*
* struct rseq is aligned on 4 * 8 bytes to ensure it is always
* contained within a single cache-line.
@@ -180,6 +233,28 @@ struct rseq {
*/
struct rseq_slice_ctrl slice_ctrl;
+ /*
+ * Restartable sequences rseq_rl_cs field.
+ *
+ * Contains NULL when no robust list unlock transaction is
+ * active for the current thread, or holds a pointer to the
+ * currently active struct rseq_rl_cs.
+ *
+ * Updated by user-space, which sets the address of the currently
+ * active rseq_rl_cs at some point before the beginning of the
+ * transaction, and set to NULL by user-space at some point
+ * after the transaction has completed.
+ *
+ * Read by the kernel. Set by user-space with single-copy
+ * atomicity semantics. This field should only be updated by the
+ * thread which registered this data structure. Aligned on
+ * 64-bit.
+ *
+ * 32-bit architectures should update the low order bits of the
+ * rseq_rl_cs field, leaving the high order bits initialized to 0.
+ */
+ __u64 rseq_rl_cs;
+
/*
* Flexible array member at end of structure, after last feature field.
*/
Of course, we'd have to implement the whole transaction in assembler for each
architecture.
Feedback is welcome!
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply related
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Suren Baghdasaryan @ 2026-02-27 20:41 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: André Almeida, kernel-dev, Liam R . Howlett, linux-api,
Darren Hart, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
Florian Weimer, Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes,
Rich Felker, Carlos O'Donell, Michal Hocko, linux-kernel,
libc-alpha, Arnd Bergmann, Sebastian Andrzej Siewior, npache
In-Reply-To: <bd7a8dd3-8dee-4886-abe6-bdda25fe4a0d@efficios.com>
On Fri, Feb 27, 2026 at 8:00 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> On 2026-02-27 14:16, André Almeida wrote:
> [...]
> >> Trying to find a backward compatible way to solve this may be tricky.
> >> Here is one possible approach I have in mind: Introduce a new syscall,
> >> e.g. sys_cleanup_robust_list(void *addr)
> >>
> >> This system call would be invoked on pthread_mutex_destroy(3) of
> >> robust mutexes, and do the following:
> >>
> >> - Calculate the offset of @addr within its mapping,
> >> - Iterate on all processes which map the backing store which contain
> >> the lock address @addr.
> >> - Iterate on each thread sibling within each of those processes,
> >> - If the thread has a robust list, and its list_op_pending points
> >> to the same offset within the backing store mapping, clear the
> >> list_op_pending pointer.
> >>
> >> The overhead would be added specifically to pthread_mutex_destroy(3),
> >> and only for robust mutexes.
> >>
> >> Thoughts ?
> >>
> [...]
> >
> > About the system call, we would call sys_cleanup_robust_list() before
> > freeing/unmapping the robust mutex. To guarantee that we check every
> > process that shares the memory region, would we need to check *every*
> > single process? I don't think there's a way to find such maps
> > without checking them all.
>
> We should be able to do it with just an iteration on the struct address_space
> reverse mapping (list of vma which map the shared mapping).
>
> AFAIU we'd want to get the struct address_space associated with the
> __user pointer, then, while holding i_mmap_lock_read(mapping), iterate
> on its reverse mapping (i_mmap field) with vma_interval_tree_foreach. We
> can get each mm_struct through vma->vm_mm.
>
> We'd want to do most of this in a kthread and use other mm_struct through
> use_mm().
>
> For each mm_struct, we go through the owner field to get the thread
> group leader, and iterate on all thread siblings (for_each_thread).
>
> For each of those threads, we'd want to clear the list_op_pending
> if it matches the offset of @addr within the mapping. I suspect we'd
> want to clear that userspace pointer with a futex_atomic_cmpxchg_inatomic
> which only clears the pointer if the old value matches the one we expect.
I've been looking into this problem this week and IIUC Nico Pache
pursued this direction at some point (see [1]). I'm CC'ing him to
share his experience.
FYI, the link also contains an interesting discussion between Thomas
and Michal about difficulty of identifying all the VMAs possibly
involved in the lock chain and some technical challenges.
[1] https://lore.kernel.org/all/bd61369c-ef50-2eb4-2cca-91422fbfa328@redhat.com/
Thanks,
Suren.
>
> Thanks,
>
> Mathieu
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
>
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: Mathieu Desnoyers @ 2026-02-27 19:59 UTC (permalink / raw)
To: André Almeida
Cc: kernel-dev, Liam R . Howlett, linux-api, Darren Hart,
Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Florian Weimer,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel, libc-alpha,
Arnd Bergmann, Sebastian Andrzej Siewior
In-Reply-To: <ed918547-1406-4ae6-8a94-4e03712a4923@igalia.com>
On 2026-02-27 14:16, André Almeida wrote:
[...]
>> Trying to find a backward compatible way to solve this may be tricky.
>> Here is one possible approach I have in mind: Introduce a new syscall,
>> e.g. sys_cleanup_robust_list(void *addr)
>>
>> This system call would be invoked on pthread_mutex_destroy(3) of
>> robust mutexes, and do the following:
>>
>> - Calculate the offset of @addr within its mapping,
>> - Iterate on all processes which map the backing store which contain
>> the lock address @addr.
>> - Iterate on each thread sibling within each of those processes,
>> - If the thread has a robust list, and its list_op_pending points
>> to the same offset within the backing store mapping, clear the
>> list_op_pending pointer.
>>
>> The overhead would be added specifically to pthread_mutex_destroy(3),
>> and only for robust mutexes.
>>
>> Thoughts ?
>>
[...]
>
> About the system call, we would call sys_cleanup_robust_list() before
> freeing/unmapping the robust mutex. To guarantee that we check every
> process that shares the memory region, would we need to check *every*
> single process? I don't think there's a way to find such maps
> without checking them all.
We should be able to do it with just an iteration on the struct address_space
reverse mapping (list of vma which map the shared mapping).
AFAIU we'd want to get the struct address_space associated with the
__user pointer, then, while holding i_mmap_lock_read(mapping), iterate
on its reverse mapping (i_mmap field) with vma_interval_tree_foreach. We
can get each mm_struct through vma->vm_mm.
We'd want to do most of this in a kthread and use other mm_struct through
use_mm().
For each mm_struct, we go through the owner field to get the thread
group leader, and iterate on all thread siblings (for_each_thread).
For each of those threads, we'd want to clear the list_op_pending
if it matches the offset of @addr within the mapping. I suspect we'd
want to clear that userspace pointer with a futex_atomic_cmpxchg_inatomic
which only clears the pointer if the old value matches the one we expect.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: André Almeida @ 2026-02-27 19:16 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: kernel-dev, Liam R . Howlett, linux-api, Darren Hart,
Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Florian Weimer,
Torvald Riegel, Davidlohr Bueso, Lorenzo Stoakes, Rich Felker,
Carlos O'Donell, Michal Hocko, linux-kernel, libc-alpha,
Arnd Bergmann, Sebastian Andrzej Siewior
In-Reply-To: <a1e24288-6ffc-438d-8a2a-d152134c9555@efficios.com>
Hi Mathieu,
Em 20/02/2026 20:17, Mathieu Desnoyers escreveu:
> On 2026-02-20 17:41, Mathieu Desnoyers wrote:
>> On 2026-02-20 16:42, Mathieu Desnoyers wrote:
>>> +CC libc-alpha.
>>>
>>> On 2026-02-20 15:26, André Almeida wrote:
>>>> During LPC 2025, I presented a session about creating a new syscall for
>>>> robust_list[0][1]. However, most of the session discussion wasn't
>>>> much related
>>>> to the new syscall itself, but much more related to an old bug that
>>>> exists in
>>>> the current robust_list mechanism.
>>>>
>>>> Since at least 2012, there's an open bug reporting a race condition, as
>>>> Carlos O'Donell pointed out:
>>>>
>>>> "File corruption race condition in robust mutex unlocking"
>>>> https://sourceware.org/bugzilla/show_bug.cgi?id=14485
>>>>
>>>> To help understand the bug, I've created a reproducer (patch 1/2) and a
>>>> companion kernel hack (patch 2/2) that helps to make the race condition
>>>> more likely. When the bug happens, the reproducer shows a message
>>>> comparing the original memory with the corrupted one:
>>>>
>>>> "Memory was corrupted by the kernel: 8001fe8d8001fe8d vs
>>>> 8001fe8dc0000000"
>>>>
>>>> I'm not sure yet what would be the appropriate approach to fix it,
>>>> so I
>>>> decided to reach the community before moving forward in some direction.
>>>> One suggestion from Peter[2] revolves around serializing the mmap()
>>>> and the
>>>> robust list exit path, which might cause overheads for the common case,
>>>> where list_op_pending is empty.
>>>>
>>>> However, given that there's a new interface being prepared, this could
>>>> also give the opportunity to rethink how list_op_pending works, and get
>>>> rid of the race condition by design.
>>>>
>>>> Feedback is very much welcome.
>>>
>>> Looking at this bug, one thing I'm starting to consider is that it
>>> appears to be an issue inherent to lack of synchronization between
>>> pthread_mutex_destroy(3) and the per-thread list_op_pending fields
>>> and not so much a kernel issue.
>>>
>>> Here is why I think the issue is purely userspace:
>>>
>>> Let's suppose we have a shared memory area across Process 1 and
>>> Process 2,
>>> which internally has its own custom memory allocator in userspace to
>>> allocate/free space within that shared memory.
>>>
>>> Process 1, Thread A stumbles through the scenario highlighted by this
>>> bug, and
>>> basically gets preempted at this FIXME in libc
>>> __pthread_mutex_unlock_full():
>>>
>>> if (__glibc_unlikely ((atomic_exchange_release (&mutex-
>>> >__data.__lock, 0)
>>> & FUTEX_WAITERS) != 0))
>>> futex_wake ((unsigned int *) &mutex->__data.__lock, 1,
>>> private);
>>>
>>> /* We must clear op_pending after we release the mutex.
>>> FIXME However, this violates the mutex destruction
>>> requirements
>>> because another thread could acquire the mutex, destroy it,
>>> and
>>> reuse the memory for something else; then, if this thread
>>> crashes,
>>> and the memory happens to have a value equal to the TID,
>>> the kernel
>>> will believe it is still related to the mutex (which has been
>>> destroyed already) and will modify some other random
>>> object. */
>>> __asm ("" ::: "memory");
>>> THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
>>>
>>> Then Process 1, Thread B runs, grabs the lock, releases it, and based on
>>> program state it knows it can pthread_mutex_destroy() this lock, free
>>> its
>>> associated memory through the custom shared memory allocator, and
>>> allocate
>>> it for other purposes. Then we get to the point where Process 1 is
>>> killed, and where the robust futex kernel code corrupts data in shared
>>> memory because of the dangling list_op_pending pointer.
>>>
>>> That shared memory data is still observable by Process 2, which will
>>> get a
>>> corrupted state.
>>>
>>> Notice how this all happens without any munmap(2)/mmap(2) in the
>>> sequence ?
>>> This is why I think this is purely a userspace issue rather than an
>>> issue
>>> we can solve by adding extra synchronization in the kernel.
>>>
>>> The one point we have in that sequence where I think we can add
>>> synchronization
>>> is pthread_mutex_destroy(3) in libc. One possible "big hammer"
>>> solution would be
>>> to make pthread_mutex_destroy iterate on all other threads
>>> list_op_pending
>>> and busy-wait if it finds that the mutex address is in use. It would
>>> of course
>>> only have to do that for robust futexes.
>>>
>>> If that big hammer solution is not fast enough for many-threaded use-
>>> cases,
>>> then we can think of other approaches such as adding a reference counter
>>> in the mutex structure, or introducing hazard pointers in userspace
>>> to reduce
>>> synchronization iteration from nr_threads to nr_cpus (or even down to
>>> max
>>> rseq mm_cid).
>>
>> To make matters even worse, the pthread_mutex_destroy(3) and reallocation
>> could happen from Process 2 rather than Process 1. So iterating on the
>> threads from Process 1 is not sufficient. We'd need to synchronize
>> pthread_mutex_destroy on something within the mutex structure which is
>> observable from all processes using the lock, for instance a reference
>> count.
> Trying to find a backward compatible way to solve this may be tricky.
> Here is one possible approach I have in mind: Introduce a new syscall,
> e.g. sys_cleanup_robust_list(void *addr)
>
> This system call would be invoked on pthread_mutex_destroy(3) of
> robust mutexes, and do the following:
>
> - Calculate the offset of @addr within its mapping,
> - Iterate on all processes which map the backing store which contains
> the lock address @addr.
> - Iterate on each thread sibling within each of those processes,
> - If the thread has a robust list, and its list_op_pending points
> to the same offset within the backing store mapping, clear the
> list_op_pending pointer.
>
> The overhead would be added specifically to pthread_mutex_destroy(3),
> and only for robust mutexes.
>
> Thoughts ?
>
Right, your explanation makes sense to me. I think the only difference
between alloc/free and map/munmap is that ""freeing" memory does not
actually return it to the operating system for other applications to
use"[1], so I don't know if this custom allocator is violating some
memory rules.
About the system call, we would call sys_cleanup_robust_list() before
freeing/unmapping the robust mutex. To guarantee that we check every
process that shares the memory region, would we need to check *every*
single process? I don't think there's a way to find such maps
without checking them all.
I'm trying to explore the idea about the reference counter. Would the
munmap() be blocked till the refcount goes to zero or something like
that? I've also tried to find more examples of a memory region that's
shared between one or more processes and the kernel at the same time to
get some inspiration, but it seems robust_list is a quite unique design
on its own regarding this memory sharing problem.
[1] https://sourceware.org/glibc/wiki/MallocInternals
^ permalink raw reply
* Re: [RFC PATCH 0/2] futex: how to solve the robust_list race condition?
From: André Almeida @ 2026-02-27 19:15 UTC (permalink / raw)
To: Liam R. Howlett
Cc: Carlos O'Donell, Sebastian Andrzej Siewior, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Thomas Gleixner, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Mathieu Desnoyers, kernel-dev, linux-api, linux-kernel,
Suren Baghdasaryan, Lorenzo Stoakes, Michal Hocko
In-Reply-To: <sn6isqtjcgzix4iwifcg6fy2lq3klfdykezyodzbt7fz7urhcs@dc5sxuzypdoc>
Hi Liam,
Em 20/02/2026 17:51, Liam R. Howlett escreveu:
> +Cc Suren, Lorenzo, and Michal
>
> * André Almeida <andrealmeid@igalia.com> [260220 15:27]:
>> During LPC 2025, I presented a session about creating a new syscall for
>> robust_list[0][1]. However, most of the session discussion wasn't much related
>> to the new syscall itself, but much more related to an old bug that exists in
>> the current robust_list mechanism.
>
> Ah, sorry for hijacking the session, that was not my intention, but this
> needs to be addressed before we propagate the issue into the next
> iteration.
>
No problem! I believe that this reflects the fact that the race
condition is the main concern about this new interface, and that we
should focus our discussion around this.
>>
>> Since at least 2012, there's an open bug reporting a race condition, as
>> Carlos O'Donell pointed out:
>>
>> "File corruption race condition in robust mutex unlocking"
>> https://sourceware.org/bugzilla/show_bug.cgi?id=14485
>>
[...]
>
> There was a delay added to the oom reaper for these tasks [1] by commit
> e4a38402c36e ("oom_kill.c: futex: delay the OOM reaper to allow time for
> proper futex cleanup")
>
> We did discuss marking the vmas as needing to be skipped by the oom
> manager, but no path forward was clear. It's also not clear if
> that's the only area where such a problem exists.
>
> [1]. https://lore.kernel.org/all/20220414144042.677008-1-npache@redhat.com/T/#u
>
So how would you detect which vmas should be skipped? And this won't fix
the issue when the memory is unmapped, right? Just for the OOM case?
^ permalink raw reply
* Re: [PATCH v8 03/17] fat: Implement fileattr_get for case sensitivity
From: Jan Kara @ 2026-02-27 11:41 UTC (permalink / raw)
To: Chuck Lever
Cc: Al Viro, Christian Brauner, Jan Kara, linux-fsdevel, linux-ext4,
linux-xfs, linux-cifs, linux-nfs, linux-api, linux-f2fs-devel,
hirofumi, linkinjeon, sj1557.seo, yuezhang.mo,
almaz.alexandrovich, slava, glaubitz, frank.li, tytso,
adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever
In-Reply-To: <20260217214741.1928576-4-cel@kernel.org>
On Tue 17-02-26 16:47:27, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> Report FAT's case sensitivity behavior via the FS_XFLAG_CASEFOLD
> and FS_XFLAG_CASENONPRESERVING flags. FAT filesystems are
> case-insensitive by default.
>
> MSDOS supports a 'nocase' mount option that enables case-sensitive
> behavior; check this option when reporting case sensitivity.
>
> VFAT long filename entries preserve case; without VFAT, only
> uppercased 8.3 short names are stored. MSDOS with 'nocase' also
> preserves case since the name-formatting code skips upcasing when
> 'nocase' is set. Check both options when reporting case preservation.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Looks good to me from general POV. It would be good to get confirmation
from the FAT maintainer that you've got all the corner cases of FAT configuration
right :) Anyway, feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/fat/fat.h | 3 +++
> fs/fat/file.c | 22 ++++++++++++++++++++++
> fs/fat/namei_msdos.c | 1 +
> fs/fat/namei_vfat.c | 1 +
> 4 files changed, 27 insertions(+)
>
> diff --git a/fs/fat/fat.h b/fs/fat/fat.h
> index 0d269dba897b..c5bcd1063f9c 100644
> --- a/fs/fat/fat.h
> +++ b/fs/fat/fat.h
> @@ -10,6 +10,8 @@
> #include <linux/fs_context.h>
> #include <linux/fs_parser.h>
>
> +struct file_kattr;
> +
> /*
> * vfat shortname flags
> */
> @@ -407,6 +409,7 @@ extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
> extern int fat_getattr(struct mnt_idmap *idmap,
> const struct path *path, struct kstat *stat,
> u32 request_mask, unsigned int flags);
> +int fat_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
> extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
> int datasync);
>
> diff --git a/fs/fat/file.c b/fs/fat/file.c
> index 124d9c5431c8..6823269a8604 100644
> --- a/fs/fat/file.c
> +++ b/fs/fat/file.c
> @@ -17,6 +17,7 @@
> #include <linux/fsnotify.h>
> #include <linux/security.h>
> #include <linux/falloc.h>
> +#include <linux/fileattr.h>
> #include "fat.h"
>
> static long fat_fallocate(struct file *file, int mode,
> @@ -396,6 +397,26 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
> fat_flush_inodes(inode->i_sb, inode, NULL);
> }
>
> +int fat_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
> +{
> + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
> +
> + /*
> + * FAT filesystems are case-insensitive by default. MSDOS
> + * supports a 'nocase' mount option for case-sensitive behavior.
> + *
> + * VFAT long filename entries preserve case. Without VFAT, only
> + * uppercased 8.3 short names are stored. MSDOS with 'nocase'
> + * also preserves case.
> + */
> + if (!sbi->options.nocase)
> + fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
> + if (!sbi->options.isvfat && !sbi->options.nocase)
> + fa->fsx_xflags |= FS_XFLAG_CASENONPRESERVING;
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(fat_fileattr_get);
> +
> int fat_getattr(struct mnt_idmap *idmap, const struct path *path,
> struct kstat *stat, u32 request_mask, unsigned int flags)
> {
> @@ -573,5 +594,6 @@ EXPORT_SYMBOL_GPL(fat_setattr);
> const struct inode_operations fat_file_inode_operations = {
> .setattr = fat_setattr,
> .getattr = fat_getattr,
> + .fileattr_get = fat_fileattr_get,
> .update_time = fat_update_time,
> };
> diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
> index 048c103b506a..4a3db08e51c0 100644
> --- a/fs/fat/namei_msdos.c
> +++ b/fs/fat/namei_msdos.c
> @@ -642,6 +642,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
> .rename = msdos_rename,
> .setattr = fat_setattr,
> .getattr = fat_getattr,
> + .fileattr_get = fat_fileattr_get,
> .update_time = fat_update_time,
> };
>
> diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
> index 2acfe3123a72..18f4c316aa05 100644
> --- a/fs/fat/namei_vfat.c
> +++ b/fs/fat/namei_vfat.c
> @@ -1185,6 +1185,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
> .rename = vfat_rename2,
> .setattr = fat_setattr,
> .getattr = fat_getattr,
> + .fileattr_get = fat_fileattr_get,
> .update_time = fat_update_time,
> };
>
> --
> 2.53.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox