* [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Dorjoy Chowdhury @ 2026-03-28 17:22 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260328172314.45807-1-dorjoychy111@gmail.com>
This flag indicates that the path should be opened only if it is a regular file.
This is useful to write secure programs that want to avoid being
tricked into opening device nodes with special semantics while thinking
they operate on regular files. This is a requested feature from the
uapi-group[1].
A corresponding error code EFTYPE has been introduced. For example, if
openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
param, it will return -EFTYPE. EFTYPE is already used on BSD-derived
systems such as FreeBSD and macOS.
When used in combination with O_CREAT, either the regular file is
created, or if the path already exists, it is opened if it's a regular
file. Otherwise, -EFTYPE is returned.
When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
as it doesn't make sense to open a path that is both a directory and a
regular file.
[1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
arch/alpha/include/uapi/asm/errno.h | 2 ++
arch/alpha/include/uapi/asm/fcntl.h | 1 +
arch/mips/include/uapi/asm/errno.h | 2 ++
arch/parisc/include/uapi/asm/errno.h | 2 ++
arch/parisc/include/uapi/asm/fcntl.h | 1 +
arch/sparc/include/uapi/asm/errno.h | 2 ++
arch/sparc/include/uapi/asm/fcntl.h | 1 +
fs/ceph/file.c | 4 ++++
fs/fcntl.c | 4 ++--
fs/gfs2/inode.c | 6 ++++++
fs/namei.c | 4 ++++
fs/nfs/dir.c | 4 ++++
fs/open.c | 8 +++++---
fs/smb/client/dir.c | 14 +++++++++++++-
include/linux/fcntl.h | 2 ++
include/uapi/asm-generic/errno.h | 2 ++
include/uapi/asm-generic/fcntl.h | 4 ++++
tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
tools/arch/mips/include/uapi/asm/errno.h | 2 ++
tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
tools/include/uapi/asm-generic/errno.h | 2 ++
22 files changed, 67 insertions(+), 6 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
index 6791f6508632..1a99f38813c7 100644
--- a/arch/alpha/include/uapi/asm/errno.h
+++ b/arch/alpha/include/uapi/asm/errno.h
@@ -127,4 +127,6 @@
#define EHWPOISON 139 /* Memory page has hardware error */
+#define EFTYPE 140 /* Wrong file type for the intended operation */
+
#endif
diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
index 50bdc8e8a271..fe488bf7c18e 100644
--- a/arch/alpha/include/uapi/asm/fcntl.h
+++ b/arch/alpha/include/uapi/asm/fcntl.h
@@ -34,6 +34,7 @@
#define O_PATH 040000000
#define __O_TMPFILE 0100000000
+#define OPENAT2_REGULAR 0200000000
#define F_GETLK 7
#define F_SETLK 8
diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
index c01ed91b1ef4..1835a50b69ce 100644
--- a/arch/mips/include/uapi/asm/errno.h
+++ b/arch/mips/include/uapi/asm/errno.h
@@ -126,6 +126,8 @@
#define EHWPOISON 168 /* Memory page has hardware error */
+#define EFTYPE 169 /* Wrong file type for the intended operation */
+
#define EDQUOT 1133 /* Quota exceeded */
diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
index 8cbc07c1903e..93194fbb0a80 100644
--- a/arch/parisc/include/uapi/asm/errno.h
+++ b/arch/parisc/include/uapi/asm/errno.h
@@ -124,4 +124,6 @@
#define EHWPOISON 257 /* Memory page has hardware error */
+#define EFTYPE 258 /* Wrong file type for the intended operation */
+
#endif
diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
index 03dee816cb13..d46812f2f0f4 100644
--- a/arch/parisc/include/uapi/asm/fcntl.h
+++ b/arch/parisc/include/uapi/asm/fcntl.h
@@ -19,6 +19,7 @@
#define O_PATH 020000000
#define __O_TMPFILE 040000000
+#define OPENAT2_REGULAR 0100000000
#define F_GETLK64 8
#define F_SETLK64 9
diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
index 4a41e7835fd5..71940ec9130b 100644
--- a/arch/sparc/include/uapi/asm/errno.h
+++ b/arch/sparc/include/uapi/asm/errno.h
@@ -117,4 +117,6 @@
#define EHWPOISON 135 /* Memory page has hardware error */
+#define EFTYPE 136 /* Wrong file type for the intended operation */
+
#endif
diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
index 67dae75e5274..bb6e9fa94bc9 100644
--- a/arch/sparc/include/uapi/asm/fcntl.h
+++ b/arch/sparc/include/uapi/asm/fcntl.h
@@ -37,6 +37,7 @@
#define O_PATH 0x1000000
#define __O_TMPFILE 0x2000000
+#define OPENAT2_REGULAR 0x4000000
#define F_GETOWN 5 /* for sockets. */
#define F_SETOWN 6 /* for sockets. */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66bbf6d517a9..6d8d4c7765e6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
+ if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
+ err = -EFTYPE;
+ goto out_req;
+ }
err = finish_open(file, dentry, ceph_open);
}
out_req:
diff --git a/fs/fcntl.c b/fs/fcntl.c
index beab8080badf..240bb511557a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
HWEIGHT32(
- (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
+ (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
__FMODE_EXEC));
fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8344040ecaf7..4604e2e8a9cc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
error = PTR_ERR(inode);
if (!IS_ERR(inode)) {
+ if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
+ iput(inode);
+ inode = NULL;
+ error = -EFTYPE;
+ goto fail_gunlock;
+ }
if (S_ISDIR(inode->i_mode)) {
iput(inode);
inode = NULL;
diff --git a/fs/namei.c b/fs/namei.c
index 2113958c3b7a..e557c538c238 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4679,6 +4679,10 @@ static int do_open(struct nameidata *nd,
if (unlikely(error))
return error;
}
+
+ if ((open_flag & OPENAT2_REGULAR) && !d_is_reg(nd->path.dentry))
+ return -EFTYPE;
+
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ddc3789363a5..bfe9470327c8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2195,6 +2195,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
break;
case -EISDIR:
case -ENOTDIR:
+ if (open_flags & OPENAT2_REGULAR) {
+ err = -EFTYPE;
+ break;
+ }
goto no_open;
case -ELOOP:
if (!(open_flags & O_NOFOLLOW))
diff --git a/fs/open.c b/fs/open.c
index 681d405bc61e..a6f445f72181 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -960,7 +960,7 @@ static int do_dentry_open(struct file *f,
if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
f->f_mode |= FMODE_CAN_ODIRECT;
- f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | OPENAT2_REGULAR);
f->f_iocb_flags = iocb_flags(f);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -1183,7 +1183,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
- BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
+ BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPENAT2_FLAGS),
"struct open_flags doesn't yet handle flags > 32 bits");
/*
@@ -1196,7 +1196,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
* values before calling build_open_flags(), but openat2(2) checks all
* of its arguments.
*/
- if (flags & ~VALID_OPEN_FLAGS)
+ if (flags & ~VALID_OPENAT2_FLAGS)
return -EINVAL;
if (how->resolve & ~VALID_RESOLVE_FLAGS)
return -EINVAL;
@@ -1235,6 +1235,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
return -EINVAL;
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
+ } else if ((flags & O_DIRECTORY) && (flags & OPENAT2_REGULAR)) {
+ return -EINVAL;
}
if (flags & O_PATH) {
/* O_PATH only permits certain other flags to be set. */
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 953f1fee8cb8..355681ebacf1 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -222,6 +222,13 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
goto cifs_create_get_file_info;
}
+ if ((oflags & OPENAT2_REGULAR) && !S_ISREG(newinode->i_mode)) {
+ CIFSSMBClose(xid, tcon, fid->netfid);
+ iput(newinode);
+ rc = -EFTYPE;
+ goto out;
+ }
+
if (S_ISDIR(newinode->i_mode)) {
CIFSSMBClose(xid, tcon, fid->netfid);
iput(newinode);
@@ -436,11 +443,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
goto out_err;
}
- if (newinode)
+ if (newinode) {
+ if ((oflags & OPENAT2_REGULAR) && !S_ISREG(newinode->i_mode)) {
+ rc = -EFTYPE;
+ goto out_err;
+ }
if (S_ISDIR(newinode->i_mode)) {
rc = -EISDIR;
goto out_err;
}
+ }
d_drop(direntry);
d_add(direntry, newinode);
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a332e79b3207..a80026718217 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -12,6 +12,8 @@
FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+#define VALID_OPENAT2_FLAGS (VALID_OPEN_FLAGS | OPENAT2_REGULAR)
+
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h
index 92e7ae493ee3..bd78e69e0a43 100644
--- a/include/uapi/asm-generic/errno.h
+++ b/include/uapi/asm-generic/errno.h
@@ -122,4 +122,6 @@
#define EHWPOISON 133 /* Memory page has hardware error */
+#define EFTYPE 134 /* Wrong file type for the intended operation */
+
#endif
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 613475285643..b2c2ddd0edc0 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -88,6 +88,10 @@
#define __O_TMPFILE 020000000
#endif
+#ifndef OPENAT2_REGULAR
+#define OPENAT2_REGULAR 040000000
+#endif
+
/* a horrid kludge trying to make sure that this will fail on old kernels */
#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h
index 6791f6508632..1a99f38813c7 100644
--- a/tools/arch/alpha/include/uapi/asm/errno.h
+++ b/tools/arch/alpha/include/uapi/asm/errno.h
@@ -127,4 +127,6 @@
#define EHWPOISON 139 /* Memory page has hardware error */
+#define EFTYPE 140 /* Wrong file type for the intended operation */
+
#endif
diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h
index c01ed91b1ef4..1835a50b69ce 100644
--- a/tools/arch/mips/include/uapi/asm/errno.h
+++ b/tools/arch/mips/include/uapi/asm/errno.h
@@ -126,6 +126,8 @@
#define EHWPOISON 168 /* Memory page has hardware error */
+#define EFTYPE 169 /* Wrong file type for the intended operation */
+
#define EDQUOT 1133 /* Quota exceeded */
diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h
index 8cbc07c1903e..93194fbb0a80 100644
--- a/tools/arch/parisc/include/uapi/asm/errno.h
+++ b/tools/arch/parisc/include/uapi/asm/errno.h
@@ -124,4 +124,6 @@
#define EHWPOISON 257 /* Memory page has hardware error */
+#define EFTYPE 258 /* Wrong file type for the intended operation */
+
#endif
diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h
index 4a41e7835fd5..71940ec9130b 100644
--- a/tools/arch/sparc/include/uapi/asm/errno.h
+++ b/tools/arch/sparc/include/uapi/asm/errno.h
@@ -117,4 +117,6 @@
#define EHWPOISON 135 /* Memory page has hardware error */
+#define EFTYPE 136 /* Wrong file type for the intended operation */
+
#endif
diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h
index 92e7ae493ee3..bd78e69e0a43 100644
--- a/tools/include/uapi/asm-generic/errno.h
+++ b/tools/include/uapi/asm-generic/errno.h
@@ -122,4 +122,6 @@
#define EHWPOISON 133 /* Memory page has hardware error */
+#define EFTYPE 134 /* Wrong file type for the intended operation */
+
#endif
--
2.53.0
^ permalink raw reply related
* [PATCH v6 2/4] kselftest/openat2: test for OPENAT2_REGULAR flag
From: Dorjoy Chowdhury @ 2026-03-28 17:22 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260328172314.45807-1-dorjoychy111@gmail.com>
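Add a basic test which checks that openat2() with OPENAT2_REGULAR fails
with EFTYPE when called on a non-regular file (/dev/null). The test is
skipped if openat2 is not supported or /dev/null does not exist.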
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
.../testing/selftests/openat2/openat2_test.c | 37 ++++++++++++++++++-
1 file changed, 36 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c
index 0e161ef9e9e4..e8847f7d416c 100644
--- a/tools/testing/selftests/openat2/openat2_test.c
+++ b/tools/testing/selftests/openat2/openat2_test.c
@@ -320,8 +320,42 @@ void test_openat2_flags(void)
}
}
+#ifndef OPENAT2_REGULAR
+#define OPENAT2_REGULAR 040000000
+#endif
+
+#ifndef EFTYPE
+#define EFTYPE 134
+#endif
+
+void test_openat2_regular_flag(void)
+{
+ if (!openat2_supported) {
+ ksft_test_result_skip("Skipping %s as openat2 is not supported\n", __func__);
+ return;
+ }
+
+ struct open_how how = {
+ .flags = OPENAT2_REGULAR | O_RDONLY
+ };
+
+ int fd = sys_openat2(AT_FDCWD, "/dev/null", &how);
+
+ if (fd == -ENOENT) {
+ ksft_test_result_skip("Skipping %s as there is no /dev/null\n", __func__);
+ return;
+ }
+
+ if (fd != -EFTYPE) {
+ ksft_test_result_fail("openat2 should return EFTYPE\n");
+ return;
+ }
+
+ ksft_test_result_pass("%s succeeded\n", __func__);
+}
+
#define NUM_TESTS (NUM_OPENAT2_STRUCT_VARIATIONS * NUM_OPENAT2_STRUCT_TESTS + \
- NUM_OPENAT2_FLAG_TESTS)
+ NUM_OPENAT2_FLAG_TESTS + 1)
int main(int argc, char **argv)
{
@@ -330,6 +364,7 @@ int main(int argc, char **argv)
test_openat2_struct();
test_openat2_flags();
+ test_openat2_regular_flag();
if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
ksft_exit_fail();
--
2.53.0
^ permalink raw reply related
* [PATCH v6 3/4] sparc/fcntl.h: convert O_* flag macros from hex to octal
From: Dorjoy Chowdhury @ 2026-03-28 17:22 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260328172314.45807-1-dorjoychy111@gmail.com>
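Convert the O_* flag macros from hexadecimal to octal, following the
convention used in include/uapi/asm-generic/fcntl.h and in the other
architecture-specific arch/*/include/uapi/asm/fcntl.h files. The numeric
values are unchanged, so there is no functional change.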
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
arch/sparc/include/uapi/asm/fcntl.h | 36 ++++++++++++++---------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
index bb6e9fa94bc9..33ce58ec57f6 100644
--- a/arch/sparc/include/uapi/asm/fcntl.h
+++ b/arch/sparc/include/uapi/asm/fcntl.h
@@ -2,23 +2,23 @@
#ifndef _SPARC_FCNTL_H
#define _SPARC_FCNTL_H
-#define O_APPEND 0x0008
-#define FASYNC 0x0040 /* fcntl, for BSD compatibility */
-#define O_CREAT 0x0200 /* not fcntl */
-#define O_TRUNC 0x0400 /* not fcntl */
-#define O_EXCL 0x0800 /* not fcntl */
-#define O_DSYNC 0x2000 /* used to be O_SYNC, see below */
-#define O_NONBLOCK 0x4000
+#define O_APPEND 0000000010
+#define FASYNC 0000000100 /* fcntl, for BSD compatibility */
+#define O_CREAT 0000001000 /* not fcntl */
+#define O_TRUNC 0000002000 /* not fcntl */
+#define O_EXCL 0000004000 /* not fcntl */
+#define O_DSYNC 0000020000 /* used to be O_SYNC, see below */
+#define O_NONBLOCK 0000040000
#if defined(__sparc__) && defined(__arch64__)
-#define O_NDELAY 0x0004
+#define O_NDELAY 0000000004
#else
-#define O_NDELAY (0x0004 | O_NONBLOCK)
+#define O_NDELAY (0000000004 | O_NONBLOCK)
#endif
-#define O_NOCTTY 0x8000 /* not fcntl */
-#define O_LARGEFILE 0x40000
-#define O_DIRECT 0x100000 /* direct disk access hint */
-#define O_NOATIME 0x200000
-#define O_CLOEXEC 0x400000
+#define O_NOCTTY 0000100000 /* not fcntl */
+#define O_LARGEFILE 0001000000
+#define O_DIRECT 0004000000 /* direct disk access hint */
+#define O_NOATIME 0010000000
+#define O_CLOEXEC 0020000000
/*
* Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using
* the O_SYNC flag. We continue to use the existing numerical value
@@ -32,12 +32,12 @@
*
* Note: __O_SYNC must never be used directly.
*/
-#define __O_SYNC 0x800000
+#define __O_SYNC 0040000000
#define O_SYNC (__O_SYNC|O_DSYNC)
-#define O_PATH 0x1000000
-#define __O_TMPFILE 0x2000000
-#define OPENAT2_REGULAR 0x4000000
+#define O_PATH 0100000000
+#define __O_TMPFILE 0200000000
+#define OPENAT2_REGULAR 0400000000
#define F_GETOWN 5 /* for sockets. */
#define F_SETOWN 6 /* for sockets. */
--
2.53.0
^ permalink raw reply related
* [PATCH v6 4/4] mips/fcntl.h: convert O_* flag macros from hex to octal
From: Dorjoy Chowdhury @ 2026-03-28 17:22 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260328172314.45807-1-dorjoychy111@gmail.com>
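Convert the O_* flag macros from hexadecimal to octal, following the
convention used in include/uapi/asm-generic/fcntl.h and in the other
architecture-specific arch/*/include/uapi/asm/fcntl.h files. The numeric
values are unchanged, so there is no functional change.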
Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
---
arch/mips/include/uapi/asm/fcntl.h | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/arch/mips/include/uapi/asm/fcntl.h b/arch/mips/include/uapi/asm/fcntl.h
index 0369a38e3d4f..6aa3f49df17e 100644
--- a/arch/mips/include/uapi/asm/fcntl.h
+++ b/arch/mips/include/uapi/asm/fcntl.h
@@ -11,15 +11,15 @@
#include <asm/sgidefs.h>
-#define O_APPEND 0x0008
-#define O_DSYNC 0x0010 /* used to be O_SYNC, see below */
-#define O_NONBLOCK 0x0080
-#define O_CREAT 0x0100 /* not fcntl */
-#define O_TRUNC 0x0200 /* not fcntl */
-#define O_EXCL 0x0400 /* not fcntl */
-#define O_NOCTTY 0x0800 /* not fcntl */
-#define FASYNC 0x1000 /* fcntl, for BSD compatibility */
-#define O_LARGEFILE 0x2000 /* allow large file opens */
+#define O_APPEND 0000010
+#define O_DSYNC 0000020 /* used to be O_SYNC, see below */
+#define O_NONBLOCK 0000200
+#define O_CREAT 0000400 /* not fcntl */
+#define O_TRUNC 0001000 /* not fcntl */
+#define O_EXCL 0002000 /* not fcntl */
+#define O_NOCTTY 0004000 /* not fcntl */
+#define FASYNC 0010000 /* fcntl, for BSD compatibility */
+#define O_LARGEFILE 0020000 /* allow large file opens */
/*
* Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using
* the O_SYNC flag. We continue to use the existing numerical value
@@ -33,9 +33,9 @@
*
* Note: __O_SYNC must never be used directly.
*/
-#define __O_SYNC 0x4000
+#define __O_SYNC 0040000
#define O_SYNC (__O_SYNC|O_DSYNC)
-#define O_DIRECT 0x8000 /* direct disk access hint */
+#define O_DIRECT 0100000 /* direct disk access hint */
#define F_GETLK 14
#define F_SETLK 6
--
2.53.0
^ permalink raw reply related
* Re: [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Jeff Layton @ 2026-03-30 11:49 UTC (permalink / raw)
To: Dorjoy Chowdhury, linux-fsdevel
Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
v9fs, linux-kselftest, viro, brauner, jack, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <20260328172314.45807-2-dorjoychy111@gmail.com>
On Sat, 2026-03-28 at 23:22 +0600, Dorjoy Chowdhury wrote:
> This flag indicates the path should be opened if it's a regular file.
> This is useful to write secure programs that want to avoid being
> tricked into opening device nodes with special semantics while thinking
> they operate on regular files. This is a requested feature from the
> uapi-group[1].
>
> A corresponding error code EFTYPE has been introduced. For example, if
> openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> like FreeBSD, macOS.
>
> When used in combination with O_CREAT, either the regular file is
> created, or if the path already exists, it is opened if it's a regular
> file. Otherwise, -EFTYPE is returned.
>
> When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> as it doesn't make sense to open a path that is both a directory and a
> regular file.
>
> [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
>
> Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> ---
> arch/alpha/include/uapi/asm/errno.h | 2 ++
> arch/alpha/include/uapi/asm/fcntl.h | 1 +
> arch/mips/include/uapi/asm/errno.h | 2 ++
> arch/parisc/include/uapi/asm/errno.h | 2 ++
> arch/parisc/include/uapi/asm/fcntl.h | 1 +
> arch/sparc/include/uapi/asm/errno.h | 2 ++
> arch/sparc/include/uapi/asm/fcntl.h | 1 +
> fs/ceph/file.c | 4 ++++
> fs/fcntl.c | 4 ++--
> fs/gfs2/inode.c | 6 ++++++
> fs/namei.c | 4 ++++
> fs/nfs/dir.c | 4 ++++
> fs/open.c | 8 +++++---
> fs/smb/client/dir.c | 14 +++++++++++++-
> include/linux/fcntl.h | 2 ++
> include/uapi/asm-generic/errno.h | 2 ++
> include/uapi/asm-generic/fcntl.h | 4 ++++
> tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
> tools/arch/mips/include/uapi/asm/errno.h | 2 ++
> tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
> tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
> tools/include/uapi/asm-generic/errno.h | 2 ++
> 22 files changed, 67 insertions(+), 6 deletions(-)
>
> diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
> index 6791f6508632..1a99f38813c7 100644
> --- a/arch/alpha/include/uapi/asm/errno.h
> +++ b/arch/alpha/include/uapi/asm/errno.h
> @@ -127,4 +127,6 @@
>
> #define EHWPOISON 139 /* Memory page has hardware error */
>
> +#define EFTYPE 140 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> index 50bdc8e8a271..fe488bf7c18e 100644
> --- a/arch/alpha/include/uapi/asm/fcntl.h
> +++ b/arch/alpha/include/uapi/asm/fcntl.h
> @@ -34,6 +34,7 @@
>
> #define O_PATH 040000000
> #define __O_TMPFILE 0100000000
> +#define OPENAT2_REGULAR 0200000000
>
> #define F_GETLK 7
> #define F_SETLK 8
> diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
> index c01ed91b1ef4..1835a50b69ce 100644
> --- a/arch/mips/include/uapi/asm/errno.h
> +++ b/arch/mips/include/uapi/asm/errno.h
> @@ -126,6 +126,8 @@
>
> #define EHWPOISON 168 /* Memory page has hardware error */
>
> +#define EFTYPE 169 /* Wrong file type for the intended operation */
> +
> #define EDQUOT 1133 /* Quota exceeded */
>
>
> diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
> index 8cbc07c1903e..93194fbb0a80 100644
> --- a/arch/parisc/include/uapi/asm/errno.h
> +++ b/arch/parisc/include/uapi/asm/errno.h
> @@ -124,4 +124,6 @@
>
> #define EHWPOISON 257 /* Memory page has hardware error */
>
> +#define EFTYPE 258 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> index 03dee816cb13..d46812f2f0f4 100644
> --- a/arch/parisc/include/uapi/asm/fcntl.h
> +++ b/arch/parisc/include/uapi/asm/fcntl.h
> @@ -19,6 +19,7 @@
>
> #define O_PATH 020000000
> #define __O_TMPFILE 040000000
> +#define OPENAT2_REGULAR 0100000000
>
> #define F_GETLK64 8
> #define F_SETLK64 9
> diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
> index 4a41e7835fd5..71940ec9130b 100644
> --- a/arch/sparc/include/uapi/asm/errno.h
> +++ b/arch/sparc/include/uapi/asm/errno.h
> @@ -117,4 +117,6 @@
>
> #define EHWPOISON 135 /* Memory page has hardware error */
>
> +#define EFTYPE 136 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> index 67dae75e5274..bb6e9fa94bc9 100644
> --- a/arch/sparc/include/uapi/asm/fcntl.h
> +++ b/arch/sparc/include/uapi/asm/fcntl.h
> @@ -37,6 +37,7 @@
>
> #define O_PATH 0x1000000
> #define __O_TMPFILE 0x2000000
> +#define OPENAT2_REGULAR 0x4000000
>
> #define F_GETOWN 5 /* for sockets. */
> #define F_SETOWN 6 /* for sockets. */
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 66bbf6d517a9..6d8d4c7765e6 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> ceph_init_inode_acls(newino, &as_ctx);
> file->f_mode |= FMODE_CREATED;
> }
> + if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
> + err = -EFTYPE;
> + goto out_req;
> + }
^^^
This doesn't look quite right. Here's a larger chunk of the code:
-------------------------8<--------------------------
if (d_in_lookup(dentry)) {
dn = ceph_finish_lookup(req, dentry, err);
if (IS_ERR(dn))
err = PTR_ERR(dn);
} else {
/* we were given a hashed negative dentry */
dn = NULL;
}
if (err)
goto out_req;
if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
doutc(cl, "finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
if (IS_ENCRYPTED(dir) &&
!fscrypt_has_permitted_context(dir, d_inode(dentry))) {
pr_warn_client(cl,
"Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
goto out_req;
}
doutc(cl, "finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
struct inode *newino = d_inode(dentry);
cache_file_layout(dir, newino);
ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
}
-------------------------8<--------------------------
It looks like this won't handle it correctly if the pathwalk terminates
on a symlink (re: d_is_symlink() case). You should either set up a test
ceph cluster on your own, or reach out to the ceph community and ask
them to test this.
> err = finish_open(file, dentry, ceph_open);
> }
> out_req:
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index beab8080badf..240bb511557a 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
> * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> * is defined as O_NONBLOCK on some platforms and not on others.
> */
> - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> HWEIGHT32(
> - (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> + (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> __FMODE_EXEC));
>
> fasync_cache = kmem_cache_create("fasync_cache",
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 8344040ecaf7..4604e2e8a9cc 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
> error = PTR_ERR(inode);
> if (!IS_ERR(inode)) {
> + if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
Isn't OPENAT2_REGULAR getting masked off in ->f_flags now?
JFYI: it's quite simple to set up a single-node gfs2 fs to test this.
> + iput(inode);
> + inode = NULL;
> + error = -EFTYPE;
> + goto fail_gunlock;
> + }
> if (S_ISDIR(inode->i_mode)) {
> iput(inode);
> inode = NULL;
> diff --git a/fs/namei.c b/fs/namei.c
> index 2113958c3b7a..e557c538c238 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -4679,6 +4679,10 @@ static int do_open(struct nameidata *nd,
> if (unlikely(error))
> return error;
> }
> +
> + if ((open_flag & OPENAT2_REGULAR) && !d_is_reg(nd->path.dentry))
> + return -EFTYPE;
> +
> if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
> return -ENOTDIR;
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index ddc3789363a5..bfe9470327c8 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -2195,6 +2195,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
> break;
> case -EISDIR:
> case -ENOTDIR:
> + if (open_flags & OPENAT2_REGULAR) {
> + err = -EFTYPE;
> + break;
> + }
> goto no_open;
> case -ELOOP:
> if (!(open_flags & O_NOFOLLOW))
> diff --git a/fs/open.c b/fs/open.c
> index 681d405bc61e..a6f445f72181 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -960,7 +960,7 @@ static int do_dentry_open(struct file *f,
> if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
> f->f_mode |= FMODE_CAN_ODIRECT;
>
> - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
> + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | OPENAT2_REGULAR);
> f->f_iocb_flags = iocb_flags(f);
>
> file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
> @@ -1183,7 +1183,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> int lookup_flags = 0;
> int acc_mode = ACC_MODE(flags);
>
> - BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
> + BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPENAT2_FLAGS),
> "struct open_flags doesn't yet handle flags > 32 bits");
>
> /*
> @@ -1196,7 +1196,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> * values before calling build_open_flags(), but openat2(2) checks all
> * of its arguments.
> */
> - if (flags & ~VALID_OPEN_FLAGS)
> + if (flags & ~VALID_OPENAT2_FLAGS)
> return -EINVAL;
> if (how->resolve & ~VALID_RESOLVE_FLAGS)
> return -EINVAL;
> @@ -1235,6 +1235,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> return -EINVAL;
> if (!(acc_mode & MAY_WRITE))
> return -EINVAL;
> + } else if ((flags & O_DIRECTORY) && (flags & OPENAT2_REGULAR)) {
> + return -EINVAL;
> }
> if (flags & O_PATH) {
> /* O_PATH only permits certain other flags to be set. */
> diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
> index 953f1fee8cb8..355681ebacf1 100644
> --- a/fs/smb/client/dir.c
> +++ b/fs/smb/client/dir.c
> @@ -222,6 +222,13 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
> goto cifs_create_get_file_info;
> }
>
> + if ((oflags & OPENAT2_REGULAR) && !S_ISREG(newinode->i_mode)) {
> + CIFSSMBClose(xid, tcon, fid->netfid);
> + iput(newinode);
> + rc = -EFTYPE;
> + goto out;
> + }
> +
> if (S_ISDIR(newinode->i_mode)) {
> CIFSSMBClose(xid, tcon, fid->netfid);
> iput(newinode);
> @@ -436,11 +443,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
> goto out_err;
> }
>
> - if (newinode)
> + if (newinode) {
> + if ((oflags & OPENAT2_REGULAR) && !S_ISREG(newinode->i_mode)) {
> + rc = -EFTYPE;
> + goto out_err;
> + }
> if (S_ISDIR(newinode->i_mode)) {
> rc = -EISDIR;
> goto out_err;
> }
> + }
>
> d_drop(direntry);
> d_add(direntry, newinode);
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index a332e79b3207..a80026718217 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -12,6 +12,8 @@
> FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
> O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
>
> +#define VALID_OPENAT2_FLAGS (VALID_OPEN_FLAGS | OPENAT2_REGULAR)
> +
> /* List of all valid flags for the how->resolve argument: */
> #define VALID_RESOLVE_FLAGS \
> (RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
> diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h
> index 92e7ae493ee3..bd78e69e0a43 100644
> --- a/include/uapi/asm-generic/errno.h
> +++ b/include/uapi/asm-generic/errno.h
> @@ -122,4 +122,6 @@
>
> #define EHWPOISON 133 /* Memory page has hardware error */
>
> +#define EFTYPE 134 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> index 613475285643..b2c2ddd0edc0 100644
> --- a/include/uapi/asm-generic/fcntl.h
> +++ b/include/uapi/asm-generic/fcntl.h
> @@ -88,6 +88,10 @@
> #define __O_TMPFILE 020000000
> #endif
>
> +#ifndef OPENAT2_REGULAR
> +#define OPENAT2_REGULAR 040000000
> +#endif
> +
> /* a horrid kludge trying to make sure that this will fail on old kernels */
> #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
>
> diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h
> index 6791f6508632..1a99f38813c7 100644
> --- a/tools/arch/alpha/include/uapi/asm/errno.h
> +++ b/tools/arch/alpha/include/uapi/asm/errno.h
> @@ -127,4 +127,6 @@
>
> #define EHWPOISON 139 /* Memory page has hardware error */
>
> +#define EFTYPE 140 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h
> index c01ed91b1ef4..1835a50b69ce 100644
> --- a/tools/arch/mips/include/uapi/asm/errno.h
> +++ b/tools/arch/mips/include/uapi/asm/errno.h
> @@ -126,6 +126,8 @@
>
> #define EHWPOISON 168 /* Memory page has hardware error */
>
> +#define EFTYPE 169 /* Wrong file type for the intended operation */
> +
> #define EDQUOT 1133 /* Quota exceeded */
>
>
> diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h
> index 8cbc07c1903e..93194fbb0a80 100644
> --- a/tools/arch/parisc/include/uapi/asm/errno.h
> +++ b/tools/arch/parisc/include/uapi/asm/errno.h
> @@ -124,4 +124,6 @@
>
> #define EHWPOISON 257 /* Memory page has hardware error */
>
> +#define EFTYPE 258 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h
> index 4a41e7835fd5..71940ec9130b 100644
> --- a/tools/arch/sparc/include/uapi/asm/errno.h
> +++ b/tools/arch/sparc/include/uapi/asm/errno.h
> @@ -117,4 +117,6 @@
>
> #define EHWPOISON 135 /* Memory page has hardware error */
>
> +#define EFTYPE 136 /* Wrong file type for the intended operation */
> +
> #endif
> diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h
> index 92e7ae493ee3..bd78e69e0a43 100644
> --- a/tools/include/uapi/asm-generic/errno.h
> +++ b/tools/include/uapi/asm-generic/errno.h
> @@ -122,4 +122,6 @@
>
> #define EHWPOISON 133 /* Memory page has hardware error */
>
> +#define EFTYPE 134 /* Wrong file type for the intended operation */
> +
> #endif
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply
* Re: [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Dorjoy Chowdhury @ 2026-03-30 15:07 UTC (permalink / raw)
To: Jeff Layton
Cc: linux-fsdevel, linux-kernel, linux-api, ceph-devel, gfs2,
linux-nfs, linux-cifs, v9fs, linux-kselftest, viro, brauner, jack,
chuck.lever, alex.aring, arnd, adilger, mjguzik, smfrench,
richard.henderson, mattst88, linmag7, tsbogend, James.Bottomley,
deller, davem, andreas, idryomov, amarkuze, slava, agruenba,
trondmy, anna, sfrench, pc, ronniesahlberg, sprasad, tom,
bharathsm, shuah, miklos, hansg
In-Reply-To: <e526fbdb450a593b575355c1c9ae21f286427275.camel@kernel.org>
On Mon, Mar 30, 2026 at 5:49 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2026-03-28 at 23:22 +0600, Dorjoy Chowdhury wrote:
> > This flag indicates the path should be opened if it's a regular file.
> > This is useful to write secure programs that want to avoid being
> > tricked into opening device nodes with special semantics while thinking
> > they operate on regular files. This is a requested feature from the
> > uapi-group[1].
> >
> > A corresponding error code EFTYPE has been introduced. For example, if
> > openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> > param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> > like FreeBSD, macOS.
> >
> > When used in combination with O_CREAT, either the regular file is
> > created, or if the path already exists, it is opened if it's a regular
> > file. Otherwise, -EFTYPE is returned.
> >
> > When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> > as it doesn't make sense to open a path that is both a directory and a
> > regular file.
> >
> > [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> >
> > Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> > ---
> > arch/alpha/include/uapi/asm/errno.h | 2 ++
> > arch/alpha/include/uapi/asm/fcntl.h | 1 +
> > arch/mips/include/uapi/asm/errno.h | 2 ++
> > arch/parisc/include/uapi/asm/errno.h | 2 ++
> > arch/parisc/include/uapi/asm/fcntl.h | 1 +
> > arch/sparc/include/uapi/asm/errno.h | 2 ++
> > arch/sparc/include/uapi/asm/fcntl.h | 1 +
> > fs/ceph/file.c | 4 ++++
> > fs/fcntl.c | 4 ++--
> > fs/gfs2/inode.c | 6 ++++++
> > fs/namei.c | 4 ++++
> > fs/nfs/dir.c | 4 ++++
> > fs/open.c | 8 +++++---
> > fs/smb/client/dir.c | 14 +++++++++++++-
> > include/linux/fcntl.h | 2 ++
> > include/uapi/asm-generic/errno.h | 2 ++
> > include/uapi/asm-generic/fcntl.h | 4 ++++
> > tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
> > tools/arch/mips/include/uapi/asm/errno.h | 2 ++
> > tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
> > tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
> > tools/include/uapi/asm-generic/errno.h | 2 ++
> > 22 files changed, 67 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
> > index 6791f6508632..1a99f38813c7 100644
> > --- a/arch/alpha/include/uapi/asm/errno.h
> > +++ b/arch/alpha/include/uapi/asm/errno.h
> > @@ -127,4 +127,6 @@
> >
> > #define EHWPOISON 139 /* Memory page has hardware error */
> >
> > +#define EFTYPE 140 /* Wrong file type for the intended operation */
> > +
> > #endif
> > diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> > index 50bdc8e8a271..fe488bf7c18e 100644
> > --- a/arch/alpha/include/uapi/asm/fcntl.h
> > +++ b/arch/alpha/include/uapi/asm/fcntl.h
> > @@ -34,6 +34,7 @@
> >
> > #define O_PATH 040000000
> > #define __O_TMPFILE 0100000000
> > +#define OPENAT2_REGULAR 0200000000
> >
> > #define F_GETLK 7
> > #define F_SETLK 8
> > diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
> > index c01ed91b1ef4..1835a50b69ce 100644
> > --- a/arch/mips/include/uapi/asm/errno.h
> > +++ b/arch/mips/include/uapi/asm/errno.h
> > @@ -126,6 +126,8 @@
> >
> > #define EHWPOISON 168 /* Memory page has hardware error */
> >
> > +#define EFTYPE 169 /* Wrong file type for the intended operation */
> > +
> > #define EDQUOT 1133 /* Quota exceeded */
> >
> >
> > diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
> > index 8cbc07c1903e..93194fbb0a80 100644
> > --- a/arch/parisc/include/uapi/asm/errno.h
> > +++ b/arch/parisc/include/uapi/asm/errno.h
> > @@ -124,4 +124,6 @@
> >
> > #define EHWPOISON 257 /* Memory page has hardware error */
> >
> > +#define EFTYPE 258 /* Wrong file type for the intended operation */
> > +
> > #endif
> > diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> > index 03dee816cb13..d46812f2f0f4 100644
> > --- a/arch/parisc/include/uapi/asm/fcntl.h
> > +++ b/arch/parisc/include/uapi/asm/fcntl.h
> > @@ -19,6 +19,7 @@
> >
> > #define O_PATH 020000000
> > #define __O_TMPFILE 040000000
> > +#define OPENAT2_REGULAR 0100000000
> >
> > #define F_GETLK64 8
> > #define F_SETLK64 9
> > diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
> > index 4a41e7835fd5..71940ec9130b 100644
> > --- a/arch/sparc/include/uapi/asm/errno.h
> > +++ b/arch/sparc/include/uapi/asm/errno.h
> > @@ -117,4 +117,6 @@
> >
> > #define EHWPOISON 135 /* Memory page has hardware error */
> >
> > +#define EFTYPE 136 /* Wrong file type for the intended operation */
> > +
> > #endif
> > diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> > index 67dae75e5274..bb6e9fa94bc9 100644
> > --- a/arch/sparc/include/uapi/asm/fcntl.h
> > +++ b/arch/sparc/include/uapi/asm/fcntl.h
> > @@ -37,6 +37,7 @@
> >
> > #define O_PATH 0x1000000
> > #define __O_TMPFILE 0x2000000
> > +#define OPENAT2_REGULAR 0x4000000
> >
> > #define F_GETOWN 5 /* for sockets. */
> > #define F_SETOWN 6 /* for sockets. */
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 66bbf6d517a9..6d8d4c7765e6 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > ceph_init_inode_acls(newino, &as_ctx);
> > file->f_mode |= FMODE_CREATED;
> > }
> > + if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
> > + err = -EFTYPE;
> > + goto out_req;
> > + }
>
> ^^^
> This doesn't look quite right. Here's a larger chunk of the code:
>
> -------------------------8<--------------------------
> if (d_in_lookup(dentry)) {
> dn = ceph_finish_lookup(req, dentry, err);
> if (IS_ERR(dn))
> err = PTR_ERR(dn);
> } else {
> /* we were given a hashed negative dentry */
> dn = NULL;
> }
> if (err)
> goto out_req;
> if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
> /* make vfs retry on splice, ENOENT, or symlink */
> doutc(cl, "finish_no_open on dn %p\n", dn);
> err = finish_no_open(file, dn);
> } else {
> if (IS_ENCRYPTED(dir) &&
> !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
> pr_warn_client(cl,
> "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
> ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
> goto out_req;
> }
>
> doutc(cl, "finish_open on dn %p\n", dn);
> if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> struct inode *newino = d_inode(dentry);
>
> cache_file_layout(dir, newino);
> ceph_init_inode_acls(newino, &as_ctx);
> file->f_mode |= FMODE_CREATED;
> }
> err = finish_open(file, dentry, ceph_open);
> }
> -------------------------8<--------------------------
>
> It looks like this won't handle it correctly if the pathwalk terminates
> on a symlink (re: d_is_symlink() case). You should either set up a test
> ceph cluster on your own, or reach out to the ceph community and ask
> them to test this.
>
Thanks for reviewing. The d_is_symlink() case seems to be calling
finish_no_open so shouldn't this be okay?
> > err = finish_open(file, dentry, ceph_open);
> > }
> > out_req:
> > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > index beab8080badf..240bb511557a 100644
> > --- a/fs/fcntl.c
> > +++ b/fs/fcntl.c
> > @@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
> > * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> > * is defined as O_NONBLOCK on some platforms and not on others.
> > */
> > - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> > + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > HWEIGHT32(
> > - (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > + (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > __FMODE_EXEC));
> >
> > fasync_cache = kmem_cache_create("fasync_cache",
> > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > index 8344040ecaf7..4604e2e8a9cc 100644
> > --- a/fs/gfs2/inode.c
> > +++ b/fs/gfs2/inode.c
> > @@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> > inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
> > error = PTR_ERR(inode);
> > if (!IS_ERR(inode)) {
> > + if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
>
> Isn't OPENAT2_REGULAR getting masked off in ->f_flags now?
>
Yes, I thought the masking off was happening after this codepath got
executed. Maybe it's better anyway to pass another flags param to this
function and forward the flags from the gfs2_atomic_open function and
in other call sites pass 0 ? What do you think?
Regards,
Dorjoy
^ permalink raw reply
* [RFC PATCH 0/2] vfs: mkdirat_fd() syscall
From: Jori Koolstra @ 2026-03-31 17:19 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner, Jeff Layton,
Chuck Lever, Arnd Bergmann, Shuah Khan, Greg Kroah-Hartman
Cc: H . Peter Anvin, Jan Kara, Alexander Aring, Peter Zijlstra,
Oleg Nesterov, Andrey Albershteyn, Jiri Olsa, Mathieu Desnoyers,
Thomas Weißschuh, Namhyung Kim, Arnaldo Carvalho de Melo,
Aleksa Sarai, linux-kernel, linux-fsdevel, linux-api, linux-arch,
linux-kselftest, cmirabil, Jori Koolstra
This series implements the mkdirat_fd() syscall that was suggested over
at the UAPI group kernel feature page [1] with some tests.
Obviously, if we want this we should also implement mknodat_fd() and
symlinkat_fd(), but their implementation can be done quite similarly, I
believe.
I have added an unsigned int flags argument, as [2] suggests, and an example
flag that we may want to remove (right now it mainly serves an internal
purpose). But it marks where I would want to place the definitions.
This has been compiled and tested on x86 only. [2] is a bit confusing
here and there, so I hope I have added the proper syscall definitions
everywhere they need to be added.
[1]: https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
[2]: https://www.kernel.org/doc/html/latest/process/adding-syscalls.html
Jori Koolstra (2):
vfs: syscalls: add mkdirat_fd()
selftest: add tests for mkdirat_fd()
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/internal.h | 1 +
fs/namei.c | 26 +++-
include/linux/fcntl.h | 2 +
include/linux/syscalls.h | 2 +
include/uapi/asm-generic/fcntl.h | 3 +
include/uapi/asm-generic/unistd.h | 5 +-
scripts/syscall.tbl | 1 +
tools/include/uapi/asm-generic/unistd.h | 5 +-
tools/testing/selftests/filesystems/Makefile | 4 +-
.../selftests/filesystems/mkdirat_fd_test.c | 139 ++++++++++++++++++
11 files changed, 183 insertions(+), 6 deletions(-)
create mode 100644 tools/testing/selftests/filesystems/mkdirat_fd_test.c
--
2.53.0
^ permalink raw reply
* [RFC PATCH 2/2] selftest: add tests for mkdirat_fd()
From: Jori Koolstra @ 2026-03-31 17:19 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner, Jeff Layton,
Chuck Lever, Arnd Bergmann, Shuah Khan, Greg Kroah-Hartman
Cc: H . Peter Anvin, Jan Kara, Alexander Aring, Peter Zijlstra,
Oleg Nesterov, Andrey Albershteyn, Jiri Olsa, Mathieu Desnoyers,
Thomas Weißschuh, Namhyung Kim, Arnaldo Carvalho de Melo,
Aleksa Sarai, linux-kernel, linux-fsdevel, linux-api, linux-arch,
linux-kselftest, cmirabil, Jori Koolstra, Ingo Molnar
In-Reply-To: <20260331172011.3512876-1-jkoolstra@xs4all.nl>
Add some tests for the new mkdirat_fd() syscall to test compliance and
to showcase its behaviour.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
tools/include/uapi/asm-generic/unistd.h | 5 +-
tools/testing/selftests/filesystems/Makefile | 4 +-
.../selftests/filesystems/mkdirat_fd_test.c | 139 ++++++++++++++++++
3 files changed, 145 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/filesystems/mkdirat_fd_test.c
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index a627acc8fb5f..5bae1029f5d9 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -863,8 +863,11 @@ __SYSCALL(__NR_listns, sys_listns)
#define __NR_rseq_slice_yield 471
__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+#define __NR_mkdirat_fd 472
+__SYSCALL(__NR_mkdirat_fd, sys_mkdirat_fd)
+
#undef __NR_syscalls
-#define __NR_syscalls 472
+#define __NR_syscalls 473
/*
* 32 bit systems traditionally used different
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 85427d7f19b9..7357769db57a 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
+CFLAGS += $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog mkdirat_fd_test
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/mkdirat_fd_test.c b/tools/testing/selftests/filesystems/mkdirat_fd_test.c
new file mode 100644
index 000000000000..9058be49dc7b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mkdirat_fd_test.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/stat.h>
+
+#include <asm-generic/unistd.h>
+
+#include "kselftest_harness.h"
+
+#ifndef MKDIRAT_FD_NEED_FD
+#define MKDIRAT_FD_NEED_FD 0x01
+#endif
+
+#define mkdirat_fd_checked(dfd, pathname) ({ \
+ struct stat __st; \
+ int __fd = sys_mkdirat_fd(dfd, pathname, S_IRWXU, MKDIRAT_FD_NEED_FD); \
+ ASSERT_GE(__fd, 0); \
+ EXPECT_EQ(fstat(__fd, &__st), 0); \
+ EXPECT_TRUE(S_ISDIR(__st.st_mode)); \
+ __fd; \
+})
+
+static inline int sys_mkdirat_fd(int dfd, const char *pathname, mode_t mode,
+ unsigned int flags)
+{
+ return syscall(__NR_mkdirat_fd, dfd, pathname, mode, flags);
+}
+
+FIXTURE(mkdirat_fd) {
+ char dirpath[PATH_MAX];
+ int dfd;
+};
+
+FIXTURE_SETUP(mkdirat_fd)
+{
+ snprintf(self->dirpath, sizeof(self->dirpath),
+ "/tmp/mkdirat_fd_test.%d", getpid());
+ ASSERT_EQ(mkdir(self->dirpath, S_IRWXU), 0);
+
+ self->dfd = open(self->dirpath, O_DIRECTORY);
+ ASSERT_GE(self->dfd, 0);
+}
+
+FIXTURE_TEARDOWN(mkdirat_fd)
+{
+ close(self->dfd);
+ rmdir(self->dirpath);
+}
+
+/* mkdirat_fd() must return a valid fd that refers to a directory. */
+TEST_F(mkdirat_fd, returns_fd)
+{
+ int fd = mkdirat_fd_checked(self->dfd, "newdir");
+ EXPECT_EQ(close(fd), 0);
+ EXPECT_EQ(unlinkat(self->dfd, "newdir", AT_REMOVEDIR), 0);
+}
+
+/* The fd must refer to the directory that was just created. */
+TEST_F(mkdirat_fd, fd_is_created_dir)
+{
+ int fd;
+ struct stat st_via_fd, st_via_path;
+ char path[PATH_MAX];
+
+ fd = mkdirat_fd_checked(self->dfd, "checkdir");
+
+ ASSERT_EQ(fstat(fd, &st_via_fd), 0);
+
+ snprintf(path, sizeof(path), "%s/checkdir", self->dirpath);
+ ASSERT_EQ(stat(path, &st_via_path), 0);
+
+ EXPECT_EQ(st_via_fd.st_ino, st_via_path.st_ino);
+ EXPECT_EQ(st_via_fd.st_dev, st_via_path.st_dev);
+
+ EXPECT_EQ(close(fd), 0);
+ EXPECT_EQ(rmdir(path), 0);
+}
+
+
+/* Missing parent component must fail with ENOENT. */
+TEST_F(mkdirat_fd, enoent_missing_parent)
+{
+ EXPECT_EQ(sys_mkdirat_fd(self->dfd, "nonexistent/child", S_IRWXU, MKDIRAT_FD_NEED_FD), -1);
+ EXPECT_EQ(errno, ENOENT);
+}
+
+/* An invalid dfd must fail with EBADF. */
+TEST_F(mkdirat_fd, ebadf)
+{
+ EXPECT_EQ(sys_mkdirat_fd(-42, "badfdir", S_IRWXU, MKDIRAT_FD_NEED_FD), -1);
+ EXPECT_EQ(errno, EBADF);
+}
+
+/* A dfd that points to a file (not a directory) must fail with ENOTDIR. */
+TEST_F(mkdirat_fd, enotdir_dfd)
+{
+ int file_fd;
+
+ file_fd = openat(self->dfd, "file",
+ O_CREAT | O_WRONLY, S_IRWXU);
+ ASSERT_GE(file_fd, 0);
+
+ EXPECT_EQ(sys_mkdirat_fd(file_fd, "subdir", S_IRWXU, MKDIRAT_FD_NEED_FD), -1);
+ EXPECT_EQ(errno, ENOTDIR);
+
+ EXPECT_EQ(close(file_fd), 0);
+ EXPECT_EQ(unlinkat(self->dfd, "file", 0), 0);
+}
+
+/*
+ * The returned fd must be usable as a dfd for further *at() calls.
+ */
+TEST_F(mkdirat_fd, fd_usable_as_dfd)
+{
+ int parent_fd, child_fd;
+
+ parent_fd = mkdirat_fd_checked(self->dfd, "parent");
+ child_fd = mkdirat_fd_checked(parent_fd, "child");
+
+ EXPECT_EQ(close(child_fd), 0);
+ EXPECT_EQ(close(parent_fd), 0);
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/parent/child", self->dirpath);
+ EXPECT_EQ(rmdir(path), 0);
+ snprintf(path, sizeof(path), "%s/parent", self->dirpath);
+ EXPECT_EQ(rmdir(path), 0);
+}
+
+/* Unknown flags must be rejected with EINVAL. */
+TEST_F(mkdirat_fd, einval_unknown_flags)
+{
+ EXPECT_EQ(sys_mkdirat_fd(self->dfd, "flagsdir", S_IRWXU, ~MKDIRAT_FD_NEED_FD), -1);
+ EXPECT_EQ(errno, EINVAL);
+}
+
+TEST_HARNESS_MAIN
--
2.53.0
^ permalink raw reply related
* [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Jori Koolstra @ 2026-03-31 17:19 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner, Jeff Layton,
Chuck Lever, Arnd Bergmann, Shuah Khan, Greg Kroah-Hartman,
H. Peter Anvin, Jan Kara, Alexander Aring
Cc: Peter Zijlstra, Oleg Nesterov, Andrey Albershteyn, Jiri Olsa,
Mathieu Desnoyers, Thomas Weißschuh, Namhyung Kim,
Arnaldo Carvalho de Melo, Aleksa Sarai, linux-kernel,
linux-fsdevel, linux-api, linux-arch, linux-kselftest, cmirabil,
Jori Koolstra, Masami Hiramatsu (Google)
In-Reply-To: <20260331172011.3512876-1-jkoolstra@xs4all.nl>
Currently there is no way to race-freely create and open a directory.
For regular files we have open(O_CREAT) for creating a new file inode,
and returning a pinning fd to it. The lack of such functionality for
directories means that when populating a directory tree there's always
a race involved: the inodes first need to be created, and then opened
to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
but in the time window between the creation and the opening they might
be replaced by something else.
Addressing this race without proper APIs is possible (by immediately
fstat()ing what was opened, to verify that it has the right inode type),
but difficult to get right. Hence, mkdirat_fd() that creates a directory
and returns an O_DIRECTORY fd is useful.
This feature idea (and description) is taken from the UAPI group:
https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/internal.h | 1 +
fs/namei.c | 26 ++++++++++++++++++++++++--
include/linux/fcntl.h | 2 ++
include/linux/syscalls.h | 2 ++
include/uapi/asm-generic/fcntl.h | 3 +++
include/uapi/asm-generic/unistd.h | 5 ++++-
scripts/syscall.tbl | 1 +
8 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 524155d655da..dda920c26941 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -396,6 +396,7 @@
469 common file_setattr sys_file_setattr
470 common listns sys_listns
471 common rseq_slice_yield sys_rseq_slice_yield
+472 common mkdirat_fd sys_mkdirat_fd
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/internal.h b/fs/internal.h
index cbc384a1aa09..2885a3e4ebdd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -58,6 +58,7 @@ int filename_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
+int filename_mkdirat_fd(int dfd, struct filename *name, umode_t mode, unsigned int flags);
int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
diff --git a/fs/namei.c b/fs/namei.c
index 1eb9db055292..93252937983e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -5256,6 +5256,11 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
EXPORT_SYMBOL(vfs_mkdir);
int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
+{
+ return filename_mkdirat_fd(dfd, name, mode, 0);
+}
+
+int filename_mkdirat_fd(int dfd, struct filename *name, umode_t mode, unsigned int flags)
{
struct dentry *dentry;
struct path path;
@@ -5263,7 +5268,7 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
unsigned int lookup_flags = LOOKUP_DIRECTORY;
struct delegated_inode delegated_inode = { };
-retry:
+start:
dentry = filename_create(dfd, name, &path, lookup_flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -5276,7 +5281,6 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
- end_creating_path(&path, dentry);
if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
@@ -5286,7 +5290,25 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+
+ if (!error && (flags & MKDIRAT_FD_NEED_FD)) {
+ struct path new_path = { .mnt = path.mnt, .dentry = dentry };
+ error = FD_ADD(0, dentry_open(&new_path, O_DIRECTORY, current_cred()));
+ }
+ end_creating_path(&path, dentry);
return error;
+retry:
+ end_creating_path(&path, dentry);
+ goto start;
+}
+
+SYSCALL_DEFINE4(mkdirat_fd, int, dfd, const char __user *, pathname, umode_t, mode,
+ unsigned int, flags)
+{
+ CLASS(filename, name)(pathname);
+ if (flags & ~VALID_MKDIRAT_FD_FLAGS)
+ return -EINVAL;
+ return filename_mkdirat_fd(dfd, name, mode, flags | MKDIRAT_FD_NEED_FD);
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a332e79b3207..d2f0fdb82847 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -25,6 +25,8 @@
#define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
#endif
+#define VALID_MKDIRAT_FD_FLAGS (MKDIRAT_FD_NEED_FD)
+
#if BITS_PER_LONG == 32
#define IS_GETLK32(cmd) ((cmd) == F_GETLK)
#define IS_SETLK32(cmd) ((cmd) == F_SETLK)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 02bd6ddb6278..52e7f09d5525 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -999,6 +999,8 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
u32 size, u32 flags);
asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
+asmlinkage long sys_mkdirat_fd(int dfd, const char __user *pathname, umode_t mode,
+ unsigned int flags)
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 613475285643..621458bf1fbf 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -95,6 +95,9 @@
#define O_NDELAY O_NONBLOCK
#endif
+/* Flags for mkdirat_fd */
+#define MKDIRAT_FD_NEED_FD 0x01
+
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
#define F_SETFD 2 /* set/clear close_on_exec */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a627acc8fb5f..5bae1029f5d9 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -863,8 +863,11 @@ __SYSCALL(__NR_listns, sys_listns)
#define __NR_rseq_slice_yield 471
__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+#define __NR_mkdirat_fd 472
+__SYSCALL(__NR_mkdirat_fd, sys_mkdirat_fd)
+
#undef __NR_syscalls
-#define __NR_syscalls 472
+#define __NR_syscalls 473
/*
* 32 bit systems traditionally used different
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 7a42b32b6577..db3bd97d4a1a 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -412,3 +412,4 @@
469 common file_setattr sys_file_setattr
470 common listns sys_listns
471 common rseq_slice_yield sys_rseq_slice_yield
+472 common mkdirat_fd sys_mkdirat_fd
--
2.53.0
^ permalink raw reply related
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Arnd Bergmann @ 2026-03-31 19:13 UTC (permalink / raw)
To: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Christian Brauner, Jeff Layton, Chuck Lever, shuah,
Greg Kroah-Hartman, H. Peter Anvin, Jan Kara, Alexander Aring
Cc: Peter Zijlstra, Oleg Nesterov, Andrey Albershteyn, Jiri Olsa,
Mathieu Desnoyers, Thomas Weißschuh, Namhyung Kim,
Arnaldo Carvalho de Melo, Aleksa Sarai, linux-kernel,
linux-fsdevel, linux-api, Linux-Arch, linux-kselftest, cmirabil,
Masami Hiramatsu
In-Reply-To: <20260331172011.3512876-2-jkoolstra@xs4all.nl>
On Tue, Mar 31, 2026, at 19:19, Jori Koolstra wrote:
> Currently there is no way to race-freely create and open a directory.
> For regular files we have open(O_CREAT) for creating a new file inode,
> and returning a pinning fd to it. The lack of such functionality for
> directories means that when populating a directory tree there's always
> a race involved: the inodes first need to be created, and then opened
> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> but in the time window between the creation and the opening they might
> be replaced by something else.
>
> Addressing this race without proper APIs is possible (by immediately
> fstat()ing what was opened, to verify that it has the right inode type),
> but difficult to get right. Hence, mkdirat_fd() that creates a directory
> and returns an O_DIRECTORY fd is useful.
>
> This feature idea (and description) is taken from the UAPI group:
> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
I checked that the calling conventions are fine, i.e. this will work
as expected across all architectures. I assume you are also aware
that the non-RFC patch will need to add the syscall number to all
.tbl files.
The hardest problem here does seem to be the naming of the
new syscall, and I'm sorry to not be able to offer any solution
either, just two observations:
- mkdirat/mkdirat_fd sounds similar to the existing
quotactl/quotactl_fd pair, but quotactl_fd() takes a file
descriptor argument rather than returning it, which makes
this addition quite confusing.
- the nicest interface IMO would have been a variation of
openat(dfd, filename, O_CREAT | O_DIRECTORY, mode)
but that is a minefield of incompatible implementations[1],
so we can't do that without changing the behavior for
existing callers that currently run into an error.
Arnd
[1] https://lwn.net/Articles/926782/
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Yann Droneaud @ 2026-03-31 20:25 UTC (permalink / raw)
To: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Christian Brauner, Jeff Layton, Chuck Lever, Arnd Bergmann,
Shuah Khan, Greg Kroah-Hartman, H. Peter Anvin, Jan Kara,
Alexander Aring
Cc: Peter Zijlstra, Oleg Nesterov, Andrey Albershteyn, Jiri Olsa,
Mathieu Desnoyers, Thomas Weißschuh, Namhyung Kim,
Arnaldo Carvalho de Melo, Aleksa Sarai, linux-kernel,
linux-fsdevel, linux-api, linux-arch, linux-kselftest, cmirabil,
Masami Hiramatsu (Google)
In-Reply-To: <20260331172011.3512876-2-jkoolstra@xs4all.nl>
Hi,
Le 31/03/2026 à 19:19, Jori Koolstra a écrit :
> Currently there is no way to race-freely create and open a directory.
> For regular files we have open(O_CREAT) for creating a new file inode,
> and returning a pinning fd to it. The lack of such functionality for
> directories means that when populating a directory tree there's always
> a race involved: the inodes first need to be created, and then opened
> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> but in the time window between the creation and the opening they might
> be replaced by something else.
>
> Addressing this race without proper APIs is possible (by immediately
> fstat()ing what was opened, to verify that it has the right inode type),
> but difficult to get right. Hence, mkdirat_fd() that creates a directory
> and returns an O_DIRECTORY fd is useful.
>
> This feature idea (and description) is taken from the UAPI group:
> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/internal.h | 1 +
> fs/namei.c | 26 ++++++++++++++++++++++++--
> include/linux/fcntl.h | 2 ++
> include/linux/syscalls.h | 2 ++
> include/uapi/asm-generic/fcntl.h | 3 +++
> include/uapi/asm-generic/unistd.h | 5 ++++-
> scripts/syscall.tbl | 1 +
> 8 files changed, 38 insertions(+), 3 deletions(-)
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index a332e79b3207..d2f0fdb82847 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -25,6 +25,8 @@
> #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
> #endif
>
> +#define VALID_MKDIRAT_FD_FLAGS (MKDIRAT_FD_NEED_FD)
> +
I don't see support for an O_CLOEXEC-ish flag; is the file descriptor in
close-on-exec mode by default? If so, it should be mentioned.
> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> index 613475285643..621458bf1fbf 100644
> --- a/include/uapi/asm-generic/fcntl.h
> +++ b/include/uapi/asm-generic/fcntl.h
> @@ -95,6 +95,9 @@
> #define O_NDELAY O_NONBLOCK
> #endif
>
> +/* Flags for mkdirat_fd */
> +#define MKDIRAT_FD_NEED_FD 0x01
> +
Regards.
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: H. Peter Anvin @ 2026-03-31 20:42 UTC (permalink / raw)
To: Yann Droneaud, Jori Koolstra, Andy Lutomirski, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Christian Brauner, Jeff Layton, Chuck Lever, Arnd Bergmann,
Shuah Khan, Greg Kroah-Hartman, Jan Kara, Alexander Aring
Cc: Peter Zijlstra, Oleg Nesterov, Andrey Albershteyn, Jiri Olsa,
Mathieu Desnoyers, Thomas Weißschuh, Namhyung Kim,
Arnaldo Carvalho de Melo, Aleksa Sarai, linux-kernel,
linux-fsdevel, linux-api, linux-arch, linux-kselftest, cmirabil,
Masami Hiramatsu (Google)
In-Reply-To: <df5a6fec-ca67-4196-9e7b-cd129c79578e@droneaud.fr>
On March 31, 2026 1:25:03 PM PDT, Yann Droneaud <yann@droneaud.fr> wrote:
>Hi,
>
>Le 31/03/2026 à 19:19, Jori Koolstra a écrit :
>> Currently there is no way to race-freely create and open a directory.
>> For regular files we have open(O_CREAT) for creating a new file inode,
>> and returning a pinning fd to it. The lack of such functionality for
>> directories means that when populating a directory tree there's always
>> a race involved: the inodes first need to be created, and then opened
>> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
>> but in the time window between the creation and the opening they might
>> be replaced by something else.
>>
>> Addressing this race without proper APIs is possible (by immediately
>> fstat()ing what was opened, to verify that it has the right inode type),
>> but difficult to get right. Hence, mkdirat_fd() that creates a directory
>> and returns an O_DIRECTORY fd is useful.
>>
>> This feature idea (and description) is taken from the UAPI group:
>> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>>
>> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>> ---
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/internal.h | 1 +
>> fs/namei.c | 26 ++++++++++++++++++++++++--
>> include/linux/fcntl.h | 2 ++
>> include/linux/syscalls.h | 2 ++
>> include/uapi/asm-generic/fcntl.h | 3 +++
>> include/uapi/asm-generic/unistd.h | 5 ++++-
>> scripts/syscall.tbl | 1 +
>> 8 files changed, 38 insertions(+), 3 deletions(-)
>
>> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
>> index a332e79b3207..d2f0fdb82847 100644
>> --- a/include/linux/fcntl.h
>> +++ b/include/linux/fcntl.h
>> @@ -25,6 +25,8 @@
>> #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
>> #endif
>> +#define VALID_MKDIRAT_FD_FLAGS (MKDIRAT_FD_NEED_FD)
>> +
>
>I don't see support for an O_CLOEXEC-ish flag; is the file descriptor in close-on-exec mode by default? If so, it should be mentioned.
>
>
>> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
>> index 613475285643..621458bf1fbf 100644
>> --- a/include/uapi/asm-generic/fcntl.h
>> +++ b/include/uapi/asm-generic/fcntl.h
>> @@ -95,6 +95,9 @@
>> #define O_NDELAY O_NONBLOCK
>> #endif
>> +/* Flags for mkdirat_fd */
>> +#define MKDIRAT_FD_NEED_FD 0x01
>> +
>
>
>Regards.
>
>
And even if it is, POSIX already has O_CLOFORK and we should expect that that will be needed, too.
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Mateusz Guzik @ 2026-04-01 4:19 UTC (permalink / raw)
To: Jori Koolstra
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner, Jeff Layton,
Chuck Lever, Arnd Bergmann, Shuah Khan, Greg Kroah-Hartman,
H. Peter Anvin, Jan Kara, Alexander Aring, Peter Zijlstra,
Oleg Nesterov, Andrey Albershteyn, Jiri Olsa, Mathieu Desnoyers,
Thomas Weißschuh, Namhyung Kim, Arnaldo Carvalho de Melo,
Aleksa Sarai, linux-kernel, linux-fsdevel, linux-api, linux-arch,
linux-kselftest, cmirabil, Masami Hiramatsu (Google)
In-Reply-To: <20260331172011.3512876-2-jkoolstra@xs4all.nl>
On Tue, Mar 31, 2026 at 07:19:58PM +0200, Jori Koolstra wrote:
> @@ -5286,7 +5290,25 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
> lookup_flags |= LOOKUP_REVAL;
> goto retry;
> }
> +
> + if (!error && (flags & MKDIRAT_FD_NEED_FD)) {
> + struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> + error = FD_ADD(0, dentry_open(&new_path, O_DIRECTORY, current_cred()));
> + }
> + end_creating_path(&path, dentry);
> return error;
You can't do it like this. Should it turn out no fd can be allocated,
the entire thing is going to error out while keeping the newly created
directory behind. You need to allocate the fd first, then do the hard
work, and only then fd_install and/or free the fd. The FD_ADD machinery
can probably still be used provided proper wrapping of the real new
mkdir.
It should be perfectly feasible to de facto wrap existing mkdir
functionality by this syscall.
On top of that similarly to what other people mentioned the new syscall
will definitely want to support O_CLOEXEC and probably other flags down
the line.
Trying to handle this in open() is a no-go. openat2 is rather
problematic.
I tend to agree mkdirat_fd is not a good name for the syscall either,
but I don't have a suggestion I'm happy with. I think least bad name
would follow the existing stuff and be mkdirat2 or similar.
The routine would have to start with validating the passed O_ flags, for
now only allowing O_CLOEXEC and EINVAL-ing otherwise.
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Cyril Hrubis @ 2026-04-01 9:44 UTC (permalink / raw)
To: Mateusz Guzik
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Christian Brauner, Jeff Layton, Chuck Lever, Arnd Bergmann,
Shuah Khan, Greg Kroah-Hartman, H. Peter Anvin, Jan Kara,
Alexander Aring, Peter Zijlstra, Oleg Nesterov,
Andrey Albershteyn, Jiri Olsa, Mathieu Desnoyers,
Thomas Weißschuh, Namhyung Kim, Arnaldo Carvalho de Melo,
Aleksa Sarai, linux-kernel, linux-fsdevel, linux-api, linux-arch,
linux-kselftest, cmirabil, Masami Hiramatsu (Google)
In-Reply-To: <pbobkjhtuli53o3z34ajyxztaosmztwlygxfxhhjq5ajt47inc@ngtoge3ucdm5>
Hi!
> I tend to agree mkdirat_fd is not a good name for the syscall either,
> but I don't have a suggestion I'm happy with. I think least bad name
> would follow the existing stuff and be mkdirat2 or similar.
Why not mkdirat_open() as it does combine these two syscalls into one?
--
Cyril Hrubis
chrubis@suse.cz
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Jori Koolstra @ 2026-04-01 10:25 UTC (permalink / raw)
To: Mateusz Guzik
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner, Jeff Layton,
Chuck Lever, Arnd Bergmann, Shuah Khan, Greg Kroah-Hartman,
H. Peter Anvin, Jan Kara, Alexander Aring, Peter Zijlstra,
Oleg Nesterov, Andrey Albershteyn, Jiri Olsa, Mathieu Desnoyers,
Thomas Weißschuh, Namhyung Kim, Arnaldo Carvalho de Melo,
Aleksa Sarai, linux-kernel, linux-fsdevel, linux-api, linux-arch,
linux-kselftest, cmirabil, Masami Hiramatsu (Google)
In-Reply-To: <pbobkjhtuli53o3z34ajyxztaosmztwlygxfxhhjq5ajt47inc@ngtoge3ucdm5>
> Op 01-04-2026 06:19 CEST schreef Mateusz Guzik <mjguzik@gmail.com>:
>
>
> On Tue, Mar 31, 2026 at 07:19:58PM +0200, Jori Koolstra wrote:
> > @@ -5286,7 +5290,25 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
> > lookup_flags |= LOOKUP_REVAL;
> > goto retry;
> > }
> > +
> > + if (!error && (flags & MKDIRAT_FD_NEED_FD)) {
> > + struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> > + error = FD_ADD(0, dentry_open(&new_path, O_DIRECTORY, current_cred()));
> > + }
> > + end_creating_path(&path, dentry);
> > return error;
>
>
> You can't do it like this. Should it turn out no fd can be allocated,
> the entire thing is going to error out while keeping the newly created
> directory behind. You need to allocate the fd first, then do the hard
> work, and only then fd_install and/or free the fd. The FD_ADD machinery
> can probably still be used provided proper wrapping of the real new
> mkdir.
But isn't this exactly what happens in open(O_CREAT) too? Eventually we
call
error = dir_inode->i_op->create(idmap, dir_inode, dentry,
mode, open_flag & O_EXCL);
and only then do we assign and install the fd. AFAIK there is no cleanup
happening there either if the FD_ADD step fails. You will just have a
regular file and no descriptor. But I would have to test this to be sure.
>
> On top of that similarly to what other people mentioned the new syscall
> will definitely want to support O_CLOEXEC and probably other flags down
> the line.
>
I agree, and perhaps O_PATH too. Maybe just all open flags relevant to
directories?
> Trying to handle this in open() is a no-go. openat2 is rather
> problematic.
I don't think that is necessarily true. It turned out O_CREAT | O_DIRECTORY
was bugged for a very long time. Christian Brauner fixed it eventually, and
that combination now returns EINVAL. But I think there is nothing really
stopping us from implementing that combination in the expected way, apart
from whatever reasons there were for not allowing this in the first place,
which I don't know about (maybe mixing semantics?)
>
> I tend to agree mkdirat_fd is not a good name for the syscall either,
> but I don't have a suggestion I'm happy with. I think least bad name
> would follow the existing stuff and be mkdirat2 or similar.
>
> The routine would have to start with validating the passed O_ flags, for
> now only allowing O_CLOEXEC and EINVAL-ing otherwise.
Thanks,
Jori
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: David Laight @ 2026-04-01 14:09 UTC (permalink / raw)
To: Arnd Bergmann
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Christian Brauner, Jeff Layton, Chuck Lever, shuah,
Greg Kroah-Hartman, H. Peter Anvin, Jan Kara, Alexander Aring,
Peter Zijlstra, Oleg Nesterov, Andrey Albershteyn, Jiri Olsa,
Mathieu Desnoyers, Thomas Weißschuh, Namhyung Kim,
Arnaldo Carvalho de Melo, Aleksa Sarai, linux-kernel,
linux-fsdevel, linux-api, Linux-Arch, linux-kselftest, cmirabil,
Masami Hiramatsu
In-Reply-To: <c2ea52f2-b232-404b-9ec6-75d8efae6bea@app.fastmail.com>
On Tue, 31 Mar 2026 21:13:34 +0200
"Arnd Bergmann" <arnd@arndb.de> wrote:
> On Tue, Mar 31, 2026, at 19:19, Jori Koolstra wrote:
> > Currently there is no way to race-freely create and open a directory.
> > For regular files we have open(O_CREAT) for creating a new file inode,
> > and returning a pinning fd to it. The lack of such functionality for
> > directories means that when populating a directory tree there's always
> > a race involved: the inodes first need to be created, and then opened
> > to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> > but in the time window between the creation and the opening they might
> > be replaced by something else.
> >
> > Addressing this race without proper APIs is possible (by immediately
> > fstat()ing what was opened, to verify that it has the right inode type),
> > but difficult to get right. Hence, mkdirat_fd() that creates a directory
> > and returns an O_DIRECTORY fd is useful.
> >
> > This feature idea (and description) is taken from the UAPI group:
> > https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
> >
> > Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>
> I checked that the calling conventions are fine, i.e. this will work
> as expected across all architectures. I assume you are also aware
> that the non-RFC patch will need to add the syscall number to all
> .tbl files.
>
> The hardest problem here does seem to be the naming of the
> new syscall, and I'm sorry to not be able to offer any solution
> either, just two observations:
>
> - mkdirat/mkdirat_fd sounds similar to the existing
> quotactl/quotactl_fd pair, but quotactl_fd() takes a file
> descriptor argument rather than returning it, which makes
> this addition quite confusing.
>
> - the nicest interface IMO would have been a variation of
> openat(dfd, filename, O_CREAT | O_DIRECTORY, mode)
> but that is a minefield of incompatible implementations[1],
> so we can't do that without changing the behavior for
> existing callers that currently run into an error.
Just require O_TMPFILE to be set as well :-)
You know you'll never regret it once Apr-1 is over.
Can something be done with the flags to openat2()?
That might save allocating an extra system call.
David
>
> Arnd
>
> [1] https://lwn.net/Articles/926782/
>
^ permalink raw reply
* Re: [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Jeff Layton @ 2026-04-01 19:02 UTC (permalink / raw)
To: Dorjoy Chowdhury
Cc: linux-fsdevel, linux-kernel, linux-api, ceph-devel, gfs2,
linux-nfs, linux-cifs, v9fs, linux-kselftest, viro, brauner, jack,
chuck.lever, alex.aring, arnd, adilger, mjguzik, smfrench,
richard.henderson, mattst88, linmag7, tsbogend, James.Bottomley,
deller, davem, andreas, idryomov, amarkuze, slava, agruenba,
trondmy, anna, sfrench, pc, ronniesahlberg, sprasad, tom,
bharathsm, shuah, miklos, hansg
In-Reply-To: <CAFfO_h75dF2s83VNtUaNuRmto1NVVcxo7kN6eAtNtN3ME8mPiQ@mail.gmail.com>
On Mon, 2026-03-30 at 21:07 +0600, Dorjoy Chowdhury wrote:
> On Mon, Mar 30, 2026 at 5:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Sat, 2026-03-28 at 23:22 +0600, Dorjoy Chowdhury wrote:
> > > This flag indicates the path should be opened if it's a regular file.
> > > This is useful to write secure programs that want to avoid being
> > > tricked into opening device nodes with special semantics while thinking
> > > they operate on regular files. This is a requested feature from the
> > > uapi-group[1].
> > >
> > > A corresponding error code EFTYPE has been introduced. For example, if
> > > openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> > > param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> > > like FreeBSD, macOS.
> > >
> > > When used in combination with O_CREAT, either the regular file is
> > > created, or if the path already exists, it is opened if it's a regular
> > > file. Otherwise, -EFTYPE is returned.
> > >
> > > When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> > > as it doesn't make sense to open a path that is both a directory and a
> > > regular file.
> > >
> > > [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> > >
> > > Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> > > ---
> > > arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > arch/alpha/include/uapi/asm/fcntl.h | 1 +
> > > arch/mips/include/uapi/asm/errno.h | 2 ++
> > > arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > arch/parisc/include/uapi/asm/fcntl.h | 1 +
> > > arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > arch/sparc/include/uapi/asm/fcntl.h | 1 +
> > > fs/ceph/file.c | 4 ++++
> > > fs/fcntl.c | 4 ++--
> > > fs/gfs2/inode.c | 6 ++++++
> > > fs/namei.c | 4 ++++
> > > fs/nfs/dir.c | 4 ++++
> > > fs/open.c | 8 +++++---
> > > fs/smb/client/dir.c | 14 +++++++++++++-
> > > include/linux/fcntl.h | 2 ++
> > > include/uapi/asm-generic/errno.h | 2 ++
> > > include/uapi/asm-generic/fcntl.h | 4 ++++
> > > tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > tools/arch/mips/include/uapi/asm/errno.h | 2 ++
> > > tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > tools/include/uapi/asm-generic/errno.h | 2 ++
> > > 22 files changed, 67 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
> > > index 6791f6508632..1a99f38813c7 100644
> > > --- a/arch/alpha/include/uapi/asm/errno.h
> > > +++ b/arch/alpha/include/uapi/asm/errno.h
> > > @@ -127,4 +127,6 @@
> > >
> > > #define EHWPOISON 139 /* Memory page has hardware error */
> > >
> > > +#define EFTYPE 140 /* Wrong file type for the intended operation */
> > > +
> > > #endif
> > > diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> > > index 50bdc8e8a271..fe488bf7c18e 100644
> > > --- a/arch/alpha/include/uapi/asm/fcntl.h
> > > +++ b/arch/alpha/include/uapi/asm/fcntl.h
> > > @@ -34,6 +34,7 @@
> > >
> > > #define O_PATH 040000000
> > > #define __O_TMPFILE 0100000000
> > > +#define OPENAT2_REGULAR 0200000000
> > >
> > > #define F_GETLK 7
> > > #define F_SETLK 8
> > > diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
> > > index c01ed91b1ef4..1835a50b69ce 100644
> > > --- a/arch/mips/include/uapi/asm/errno.h
> > > +++ b/arch/mips/include/uapi/asm/errno.h
> > > @@ -126,6 +126,8 @@
> > >
> > > #define EHWPOISON 168 /* Memory page has hardware error */
> > >
> > > +#define EFTYPE 169 /* Wrong file type for the intended operation */
> > > +
> > > #define EDQUOT 1133 /* Quota exceeded */
> > >
> > >
> > > diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
> > > index 8cbc07c1903e..93194fbb0a80 100644
> > > --- a/arch/parisc/include/uapi/asm/errno.h
> > > +++ b/arch/parisc/include/uapi/asm/errno.h
> > > @@ -124,4 +124,6 @@
> > >
> > > #define EHWPOISON 257 /* Memory page has hardware error */
> > >
> > > +#define EFTYPE 258 /* Wrong file type for the intended operation */
> > > +
> > > #endif
> > > diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> > > index 03dee816cb13..d46812f2f0f4 100644
> > > --- a/arch/parisc/include/uapi/asm/fcntl.h
> > > +++ b/arch/parisc/include/uapi/asm/fcntl.h
> > > @@ -19,6 +19,7 @@
> > >
> > > #define O_PATH 020000000
> > > #define __O_TMPFILE 040000000
> > > +#define OPENAT2_REGULAR 0100000000
> > >
> > > #define F_GETLK64 8
> > > #define F_SETLK64 9
> > > diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
> > > index 4a41e7835fd5..71940ec9130b 100644
> > > --- a/arch/sparc/include/uapi/asm/errno.h
> > > +++ b/arch/sparc/include/uapi/asm/errno.h
> > > @@ -117,4 +117,6 @@
> > >
> > > #define EHWPOISON 135 /* Memory page has hardware error */
> > >
> > > +#define EFTYPE 136 /* Wrong file type for the intended operation */
> > > +
> > > #endif
> > > diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> > > index 67dae75e5274..bb6e9fa94bc9 100644
> > > --- a/arch/sparc/include/uapi/asm/fcntl.h
> > > +++ b/arch/sparc/include/uapi/asm/fcntl.h
> > > @@ -37,6 +37,7 @@
> > >
> > > #define O_PATH 0x1000000
> > > #define __O_TMPFILE 0x2000000
> > > +#define OPENAT2_REGULAR 0x4000000
> > >
> > > #define F_GETOWN 5 /* for sockets. */
> > > #define F_SETOWN 6 /* for sockets. */
> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > index 66bbf6d517a9..6d8d4c7765e6 100644
> > > --- a/fs/ceph/file.c
> > > +++ b/fs/ceph/file.c
> > > @@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > ceph_init_inode_acls(newino, &as_ctx);
> > > file->f_mode |= FMODE_CREATED;
> > > }
> > > + if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
> > > + err = -EFTYPE;
> > > + goto out_req;
> > > + }
> >
> > ^^^
> > This doesn't look quite right. Here's a larger chunk of the code:
> >
> > -------------------------8<--------------------------
> > if (d_in_lookup(dentry)) {
> > dn = ceph_finish_lookup(req, dentry, err);
> > if (IS_ERR(dn))
> > err = PTR_ERR(dn);
> > } else {
> > /* we were given a hashed negative dentry */
> > dn = NULL;
> > }
> > if (err)
> > goto out_req;
> > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
> > /* make vfs retry on splice, ENOENT, or symlink */
> > doutc(cl, "finish_no_open on dn %p\n", dn);
> > err = finish_no_open(file, dn);
> > } else {
> > if (IS_ENCRYPTED(dir) &&
> > !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
> > pr_warn_client(cl,
> > "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
> > ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
> > goto out_req;
> > }
> >
> > doutc(cl, "finish_open on dn %p\n", dn);
> > if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > struct inode *newino = d_inode(dentry);
> >
> > cache_file_layout(dir, newino);
> > ceph_init_inode_acls(newino, &as_ctx);
> > file->f_mode |= FMODE_CREATED;
> > }
> > err = finish_open(file, dentry, ceph_open);
> > }
> > -------------------------8<--------------------------
> >
> > It looks like this won't handle it correctly if the pathwalk terminates
> > on a symlink (re: d_is_symlink() case). You should either set up a test
> > ceph cluster on your own, or reach out to the ceph community and ask
> > them to test this.
> >
>
> Thanks for reviewing. The d_is_symlink() case seems to be calling
> finish_no_open so shouldn't this be okay?
>
My mistake -- you're correct. I keep forgetting that finish_no_open()
will handle this case regardless of what else happens.
> > > err = finish_open(file, dentry, ceph_open);
> > > }
> > > out_req:
> > > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > > index beab8080badf..240bb511557a 100644
> > > --- a/fs/fcntl.c
> > > +++ b/fs/fcntl.c
> > > @@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
> > > * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> > > * is defined as O_NONBLOCK on some platforms and not on others.
> > > */
> > > - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> > > + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > > HWEIGHT32(
> > > - (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > + (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > __FMODE_EXEC));
> > >
> > > fasync_cache = kmem_cache_create("fasync_cache",
> > > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > > index 8344040ecaf7..4604e2e8a9cc 100644
> > > --- a/fs/gfs2/inode.c
> > > +++ b/fs/gfs2/inode.c
> > > @@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> > > inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
> > > error = PTR_ERR(inode);
> > > if (!IS_ERR(inode)) {
> > > + if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
> >
> > Isn't OPENAT2_REGULAR getting masked off in ->f_flags now?
> >
> Yes, I thought the masking off was happening after this codepath got
> executed. Maybe it's better anyway to pass another flags param to this
> function and forward the flags from the gfs2_atomic_open function and
> in other call sites pass 0 ? What do you think?
>
Also my mistake. That happens in do_dentry_open() which happens in
finish_open(), so you should be OK here.
Reviewed-by: Jeff Layton <jlayton@kernel.org>
^ permalink raw reply
* Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
From: Aleksa Sarai @ 2026-04-02 2:52 UTC (permalink / raw)
To: Mateusz Guzik
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Christian Brauner, Jeff Layton, Chuck Lever, Arnd Bergmann,
Shuah Khan, Greg Kroah-Hartman, H. Peter Anvin, Jan Kara,
Alexander Aring, Peter Zijlstra, Oleg Nesterov,
Andrey Albershteyn, Jiri Olsa, Mathieu Desnoyers,
Thomas Weißschuh, Namhyung Kim, Arnaldo Carvalho de Melo,
linux-kernel, linux-fsdevel, linux-api, linux-arch,
linux-kselftest, cmirabil, Masami Hiramatsu (Google)
In-Reply-To: <pbobkjhtuli53o3z34ajyxztaosmztwlygxfxhhjq5ajt47inc@ngtoge3ucdm5>
[-- Attachment #1: Type: text/plain, Size: 2688 bytes --]
On 2026-04-01, Mateusz Guzik <mjguzik@gmail.com> wrote:
> On Tue, Mar 31, 2026 at 07:19:58PM +0200, Jori Koolstra wrote:
> > @@ -5286,7 +5290,25 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
> > lookup_flags |= LOOKUP_REVAL;
> > goto retry;
> > }
> > +
> > + if (!error && (flags & MKDIRAT_FD_NEED_FD)) {
> > + struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> > + error = FD_ADD(0, dentry_open(&new_path, O_DIRECTORY, current_cred()));
> > + }
> > + end_creating_path(&path, dentry);
> > return error;
>
>
> You can't do it like this. Should it turn out no fd can be allocated,
> the entire thing is going to error out while keeping the newly created
> directory behind. You need to allocate the fd first, then do the hard
> work, and only then fd_install and/or free the fd. The FD_ADD machinery
> can probably still be used provided proper wrapping of the real new
> mkdir.
>
> It should be perfectly feasible to de facto wrap existing mkdir
> functionality by this syscall.
>
> On top of that similarly to what other people mentioned the new syscall
> will definitely want to support O_CLOEXEC and probably other flags down
> the line.
>
> Trying to handle this in open() is a no-go. openat2 is rather
> problematic.
I'm interested in what makes you say that. It would be very nice to be able
to do mkdir + RESOLVE_IN_ROOT and get an fd back all in one syscall. :D
To be fair, build_open_how() will need some more magic to keep openat()
working, and that won't be particularly pretty. If we went with
O_CREAT|O_DIRECTORY we would need to be quite careful to make sure
O_TMPFILE continues to work for both openat() and openat2()...
> I tend to agree mkdirat_fd is not a good name for the syscall either,
> but I don't have a suggestion I'm happy with. I think least bad name
> would follow the existing stuff and be mkdirat2 or similar.
>
> The routine would have to start with validating the passed O_ flags, for
> now only allowing O_CLOEXEC and EINVAL-ing otherwise.
Please do not use O_* flags! O_CLOEXEC takes up 3 flag bits on different
architectures which makes adding new flags a nightmare.
I think this should take AT_* flags and (like most newer syscalls)
O_CLOEXEC should be automatically set. Userspace can unset it with
fcntl(F_SETFD) in the relatively rare case where they don't want
O_CLOEXEC. Alternatively, we could just bite the bullet and make
AT_NO_CLOEXEC a thing...
But yes, new syscalls *absolutely* need to take some kind of flag
argument. I'd hoped we finally learned our lesson on that one...
--
Aleksa Sarai
https://www.cyphar.com/
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply
* Re: [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Dorjoy Chowdhury @ 2026-04-04 15:17 UTC (permalink / raw)
To: Jeff Layton
Cc: linux-fsdevel, linux-kernel, linux-api, ceph-devel, gfs2,
linux-nfs, linux-cifs, v9fs, linux-kselftest, viro, brauner, jack,
chuck.lever, alex.aring, arnd, adilger, mjguzik, smfrench,
richard.henderson, mattst88, linmag7, tsbogend, James.Bottomley,
deller, davem, andreas, idryomov, amarkuze, slava, agruenba,
trondmy, anna, sfrench, pc, ronniesahlberg, sprasad, tom,
bharathsm, shuah, miklos, hansg
In-Reply-To: <4385168f2147efb8131d5fe4209e88d2d15a60bf.camel@kernel.org>
On Thu, Apr 2, 2026 at 1:02 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2026-03-30 at 21:07 +0600, Dorjoy Chowdhury wrote:
> > On Mon, Mar 30, 2026 at 5:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Sat, 2026-03-28 at 23:22 +0600, Dorjoy Chowdhury wrote:
> > > > This flag indicates the path should be opened if it's a regular file.
> > > > This is useful to write secure programs that want to avoid being
> > > > tricked into opening device nodes with special semantics while thinking
> > > > they operate on regular files. This is a requested feature from the
> > > > uapi-group[1].
> > > >
> > > > A corresponding error code EFTYPE has been introduced. For example, if
> > > > openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> > > > param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> > > > like FreeBSD, macOS.
> > > >
> > > > When used in combination with O_CREAT, either the regular file is
> > > > created, or if the path already exists, it is opened if it's a regular
> > > > file. Otherwise, -EFTYPE is returned.
> > > >
> > > > When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> > > > as it doesn't make sense to open a path that is both a directory and a
> > > > regular file.
> > > >
> > > > [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> > > >
> > > > Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> > > > ---
> > > > arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > > arch/alpha/include/uapi/asm/fcntl.h | 1 +
> > > > arch/mips/include/uapi/asm/errno.h | 2 ++
> > > > arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > > arch/parisc/include/uapi/asm/fcntl.h | 1 +
> > > > arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > > arch/sparc/include/uapi/asm/fcntl.h | 1 +
> > > > fs/ceph/file.c | 4 ++++
> > > > fs/fcntl.c | 4 ++--
> > > > fs/gfs2/inode.c | 6 ++++++
> > > > fs/namei.c | 4 ++++
> > > > fs/nfs/dir.c | 4 ++++
> > > > fs/open.c | 8 +++++---
> > > > fs/smb/client/dir.c | 14 +++++++++++++-
> > > > include/linux/fcntl.h | 2 ++
> > > > include/uapi/asm-generic/errno.h | 2 ++
> > > > include/uapi/asm-generic/fcntl.h | 4 ++++
> > > > tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > > tools/arch/mips/include/uapi/asm/errno.h | 2 ++
> > > > tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > > tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > > tools/include/uapi/asm-generic/errno.h | 2 ++
> > > > 22 files changed, 67 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
> > > > index 6791f6508632..1a99f38813c7 100644
> > > > --- a/arch/alpha/include/uapi/asm/errno.h
> > > > +++ b/arch/alpha/include/uapi/asm/errno.h
> > > > @@ -127,4 +127,6 @@
> > > >
> > > > #define EHWPOISON 139 /* Memory page has hardware error */
> > > >
> > > > +#define EFTYPE 140 /* Wrong file type for the intended operation */
> > > > +
> > > > #endif
> > > > diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> > > > index 50bdc8e8a271..fe488bf7c18e 100644
> > > > --- a/arch/alpha/include/uapi/asm/fcntl.h
> > > > +++ b/arch/alpha/include/uapi/asm/fcntl.h
> > > > @@ -34,6 +34,7 @@
> > > >
> > > > #define O_PATH 040000000
> > > > #define __O_TMPFILE 0100000000
> > > > +#define OPENAT2_REGULAR 0200000000
> > > >
> > > > #define F_GETLK 7
> > > > #define F_SETLK 8
> > > > diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
> > > > index c01ed91b1ef4..1835a50b69ce 100644
> > > > --- a/arch/mips/include/uapi/asm/errno.h
> > > > +++ b/arch/mips/include/uapi/asm/errno.h
> > > > @@ -126,6 +126,8 @@
> > > >
> > > > #define EHWPOISON 168 /* Memory page has hardware error */
> > > >
> > > > +#define EFTYPE 169 /* Wrong file type for the intended operation */
> > > > +
> > > > #define EDQUOT 1133 /* Quota exceeded */
> > > >
> > > >
> > > > diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
> > > > index 8cbc07c1903e..93194fbb0a80 100644
> > > > --- a/arch/parisc/include/uapi/asm/errno.h
> > > > +++ b/arch/parisc/include/uapi/asm/errno.h
> > > > @@ -124,4 +124,6 @@
> > > >
> > > > #define EHWPOISON 257 /* Memory page has hardware error */
> > > >
> > > > +#define EFTYPE 258 /* Wrong file type for the intended operation */
> > > > +
> > > > #endif
> > > > diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> > > > index 03dee816cb13..d46812f2f0f4 100644
> > > > --- a/arch/parisc/include/uapi/asm/fcntl.h
> > > > +++ b/arch/parisc/include/uapi/asm/fcntl.h
> > > > @@ -19,6 +19,7 @@
> > > >
> > > > #define O_PATH 020000000
> > > > #define __O_TMPFILE 040000000
> > > > +#define OPENAT2_REGULAR 0100000000
> > > >
> > > > #define F_GETLK64 8
> > > > #define F_SETLK64 9
> > > > diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
> > > > index 4a41e7835fd5..71940ec9130b 100644
> > > > --- a/arch/sparc/include/uapi/asm/errno.h
> > > > +++ b/arch/sparc/include/uapi/asm/errno.h
> > > > @@ -117,4 +117,6 @@
> > > >
> > > > #define EHWPOISON 135 /* Memory page has hardware error */
> > > >
> > > > +#define EFTYPE 136 /* Wrong file type for the intended operation */
> > > > +
> > > > #endif
> > > > diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> > > > index 67dae75e5274..bb6e9fa94bc9 100644
> > > > --- a/arch/sparc/include/uapi/asm/fcntl.h
> > > > +++ b/arch/sparc/include/uapi/asm/fcntl.h
> > > > @@ -37,6 +37,7 @@
> > > >
> > > > #define O_PATH 0x1000000
> > > > #define __O_TMPFILE 0x2000000
> > > > +#define OPENAT2_REGULAR 0x4000000
> > > >
> > > > #define F_GETOWN 5 /* for sockets. */
> > > > #define F_SETOWN 6 /* for sockets. */
> > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > index 66bbf6d517a9..6d8d4c7765e6 100644
> > > > --- a/fs/ceph/file.c
> > > > +++ b/fs/ceph/file.c
> > > > @@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > > ceph_init_inode_acls(newino, &as_ctx);
> > > > file->f_mode |= FMODE_CREATED;
> > > > }
> > > > + if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
> > > > + err = -EFTYPE;
> > > > + goto out_req;
> > > > + }
> > >
> > > ^^^
> > > This doesn't look quite right. Here's a larger chunk of the code:
> > >
> > > -------------------------8<--------------------------
> > > if (d_in_lookup(dentry)) {
> > > dn = ceph_finish_lookup(req, dentry, err);
> > > if (IS_ERR(dn))
> > > err = PTR_ERR(dn);
> > > } else {
> > > /* we were given a hashed negative dentry */
> > > dn = NULL;
> > > }
> > > if (err)
> > > goto out_req;
> > > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
> > > /* make vfs retry on splice, ENOENT, or symlink */
> > > doutc(cl, "finish_no_open on dn %p\n", dn);
> > > err = finish_no_open(file, dn);
> > > } else {
> > > if (IS_ENCRYPTED(dir) &&
> > > !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
> > > pr_warn_client(cl,
> > > "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
> > > ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
> > > goto out_req;
> > > }
> > >
> > > doutc(cl, "finish_open on dn %p\n", dn);
> > > if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > > struct inode *newino = d_inode(dentry);
> > >
> > > cache_file_layout(dir, newino);
> > > ceph_init_inode_acls(newino, &as_ctx);
> > > file->f_mode |= FMODE_CREATED;
> > > }
> > > err = finish_open(file, dentry, ceph_open);
> > > }
> > > -------------------------8<--------------------------
> > >
> > > It looks like this won't handle it correctly if the pathwalk terminates
> > > on a symlink (re: d_is_symlink() case). You should either set up a test
> > > ceph cluster on your own, or reach out to the ceph community and ask
> > > them to test this.
> > >
> >
> > Thanks for reviewing. The d_is_symlink() case seems to be calling
> > finish_no_open so shouldn't this be okay?
> >
>
> My mistake -- you're correct. I keep forgetting that finish_no_open()
> will handle this case regardless of what else happens.
>
> > > > err = finish_open(file, dentry, ceph_open);
> > > > }
> > > > out_req:
> > > > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > > > index beab8080badf..240bb511557a 100644
> > > > --- a/fs/fcntl.c
> > > > +++ b/fs/fcntl.c
> > > > @@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
> > > > * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> > > > * is defined as O_NONBLOCK on some platforms and not on others.
> > > > */
> > > > - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> > > > + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > > > HWEIGHT32(
> > > > - (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > > + (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > > __FMODE_EXEC));
> > > >
> > > > fasync_cache = kmem_cache_create("fasync_cache",
> > > > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > > > index 8344040ecaf7..4604e2e8a9cc 100644
> > > > --- a/fs/gfs2/inode.c
> > > > +++ b/fs/gfs2/inode.c
> > > > @@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> > > > inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
> > > > error = PTR_ERR(inode);
> > > > if (!IS_ERR(inode)) {
> > > > + if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
> > >
> > > Isn't OPENAT2_REGULAR getting masked off in ->f_flags now?
> > >
> > Yes, I thought the masking off was happening after this codepath got
> > executed. Maybe it's better anyway to pass another flags param to this
> > function and forward the flags from the gfs2_atomic_open function and
> > in other call sites pass 0 ? What do you think?
> >
>
> Also my mistake. That happens in do_dentry_open() which happens in
> finish_open(), so you should be OK here.
>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
Thanks for patiently reviewing this! I am planning on sending patches
for man-pages and looking into some xfs-tests for this. But I am not
sure if this patch series will get more reviews from others or if it
will be picked up in the vfs branch?
Regards,
Dorjoy
^ permalink raw reply
* Re: [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Jeff Layton @ 2026-04-05 23:27 UTC (permalink / raw)
To: Dorjoy Chowdhury
Cc: linux-fsdevel, linux-kernel, linux-api, ceph-devel, gfs2,
linux-nfs, linux-cifs, v9fs, linux-kselftest, viro, brauner, jack,
chuck.lever, alex.aring, arnd, adilger, mjguzik, smfrench,
richard.henderson, mattst88, linmag7, tsbogend, James.Bottomley,
deller, davem, andreas, idryomov, amarkuze, slava, agruenba,
trondmy, anna, sfrench, pc, ronniesahlberg, sprasad, tom,
bharathsm, shuah, miklos, hansg
In-Reply-To: <CAFfO_h4dhsXji=+FjO9EikX0_oUUDkWe8tC1F7u4WqhNAjRB=g@mail.gmail.com>
On Sat, 2026-04-04 at 21:17 +0600, Dorjoy Chowdhury wrote:
> On Thu, Apr 2, 2026 at 1:02 AM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Mon, 2026-03-30 at 21:07 +0600, Dorjoy Chowdhury wrote:
> > > On Mon, Mar 30, 2026 at 5:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Sat, 2026-03-28 at 23:22 +0600, Dorjoy Chowdhury wrote:
> > > > > This flag indicates the path should be opened if it's a regular file.
> > > > > This is useful to write secure programs that want to avoid being
> > > > > tricked into opening device nodes with special semantics while thinking
> > > > > they operate on regular files. This is a requested feature from the
> > > > > uapi-group[1].
> > > > >
> > > > > A corresponding error code EFTYPE has been introduced. For example, if
> > > > > openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> > > > > param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> > > > > like FreeBSD, macOS.
> > > > >
> > > > > When used in combination with O_CREAT, either the regular file is
> > > > > created, or if the path already exists, it is opened if it's a regular
> > > > > file. Otherwise, -EFTYPE is returned.
> > > > >
> > > > > When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> > > > > as it doesn't make sense to open a path that is both a directory and a
> > > > > regular file.
> > > > >
> > > > > [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> > > > >
> > > > > Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> > > > > ---
> > > > > arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > > > arch/alpha/include/uapi/asm/fcntl.h | 1 +
> > > > > arch/mips/include/uapi/asm/errno.h | 2 ++
> > > > > arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > > > arch/parisc/include/uapi/asm/fcntl.h | 1 +
> > > > > arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > > > arch/sparc/include/uapi/asm/fcntl.h | 1 +
> > > > > fs/ceph/file.c | 4 ++++
> > > > > fs/fcntl.c | 4 ++--
> > > > > fs/gfs2/inode.c | 6 ++++++
> > > > > fs/namei.c | 4 ++++
> > > > > fs/nfs/dir.c | 4 ++++
> > > > > fs/open.c | 8 +++++---
> > > > > fs/smb/client/dir.c | 14 +++++++++++++-
> > > > > include/linux/fcntl.h | 2 ++
> > > > > include/uapi/asm-generic/errno.h | 2 ++
> > > > > include/uapi/asm-generic/fcntl.h | 4 ++++
> > > > > tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > > > tools/arch/mips/include/uapi/asm/errno.h | 2 ++
> > > > > tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > > > tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > > > tools/include/uapi/asm-generic/errno.h | 2 ++
> > > > > 22 files changed, 67 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
> > > > > index 6791f6508632..1a99f38813c7 100644
> > > > > --- a/arch/alpha/include/uapi/asm/errno.h
> > > > > +++ b/arch/alpha/include/uapi/asm/errno.h
> > > > > @@ -127,4 +127,6 @@
> > > > >
> > > > > #define EHWPOISON 139 /* Memory page has hardware error */
> > > > >
> > > > > +#define EFTYPE 140 /* Wrong file type for the intended operation */
> > > > > +
> > > > > #endif
> > > > > diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> > > > > index 50bdc8e8a271..fe488bf7c18e 100644
> > > > > --- a/arch/alpha/include/uapi/asm/fcntl.h
> > > > > +++ b/arch/alpha/include/uapi/asm/fcntl.h
> > > > > @@ -34,6 +34,7 @@
> > > > >
> > > > > #define O_PATH 040000000
> > > > > #define __O_TMPFILE 0100000000
> > > > > +#define OPENAT2_REGULAR 0200000000
> > > > >
> > > > > #define F_GETLK 7
> > > > > #define F_SETLK 8
> > > > > diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
> > > > > index c01ed91b1ef4..1835a50b69ce 100644
> > > > > --- a/arch/mips/include/uapi/asm/errno.h
> > > > > +++ b/arch/mips/include/uapi/asm/errno.h
> > > > > @@ -126,6 +126,8 @@
> > > > >
> > > > > #define EHWPOISON 168 /* Memory page has hardware error */
> > > > >
> > > > > +#define EFTYPE 169 /* Wrong file type for the intended operation */
> > > > > +
> > > > > #define EDQUOT 1133 /* Quota exceeded */
> > > > >
> > > > >
> > > > > diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
> > > > > index 8cbc07c1903e..93194fbb0a80 100644
> > > > > --- a/arch/parisc/include/uapi/asm/errno.h
> > > > > +++ b/arch/parisc/include/uapi/asm/errno.h
> > > > > @@ -124,4 +124,6 @@
> > > > >
> > > > > #define EHWPOISON 257 /* Memory page has hardware error */
> > > > >
> > > > > +#define EFTYPE 258 /* Wrong file type for the intended operation */
> > > > > +
> > > > > #endif
> > > > > diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> > > > > index 03dee816cb13..d46812f2f0f4 100644
> > > > > --- a/arch/parisc/include/uapi/asm/fcntl.h
> > > > > +++ b/arch/parisc/include/uapi/asm/fcntl.h
> > > > > @@ -19,6 +19,7 @@
> > > > >
> > > > > #define O_PATH 020000000
> > > > > #define __O_TMPFILE 040000000
> > > > > +#define OPENAT2_REGULAR 0100000000
> > > > >
> > > > > #define F_GETLK64 8
> > > > > #define F_SETLK64 9
> > > > > diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
> > > > > index 4a41e7835fd5..71940ec9130b 100644
> > > > > --- a/arch/sparc/include/uapi/asm/errno.h
> > > > > +++ b/arch/sparc/include/uapi/asm/errno.h
> > > > > @@ -117,4 +117,6 @@
> > > > >
> > > > > #define EHWPOISON 135 /* Memory page has hardware error */
> > > > >
> > > > > +#define EFTYPE 136 /* Wrong file type for the intended operation */
> > > > > +
> > > > > #endif
> > > > > diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> > > > > index 67dae75e5274..bb6e9fa94bc9 100644
> > > > > --- a/arch/sparc/include/uapi/asm/fcntl.h
> > > > > +++ b/arch/sparc/include/uapi/asm/fcntl.h
> > > > > @@ -37,6 +37,7 @@
> > > > >
> > > > > #define O_PATH 0x1000000
> > > > > #define __O_TMPFILE 0x2000000
> > > > > +#define OPENAT2_REGULAR 0x4000000
> > > > >
> > > > > #define F_GETOWN 5 /* for sockets. */
> > > > > #define F_SETOWN 6 /* for sockets. */
> > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > > index 66bbf6d517a9..6d8d4c7765e6 100644
> > > > > --- a/fs/ceph/file.c
> > > > > +++ b/fs/ceph/file.c
> > > > > @@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > > > ceph_init_inode_acls(newino, &as_ctx);
> > > > > file->f_mode |= FMODE_CREATED;
> > > > > }
> > > > > + if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
> > > > > + err = -EFTYPE;
> > > > > + goto out_req;
> > > > > + }
> > > >
> > > > ^^^
> > > > This doesn't look quite right. Here's a larger chunk of the code:
> > > >
> > > > -------------------------8<--------------------------
> > > > if (d_in_lookup(dentry)) {
> > > > dn = ceph_finish_lookup(req, dentry, err);
> > > > if (IS_ERR(dn))
> > > > err = PTR_ERR(dn);
> > > > } else {
> > > > /* we were given a hashed negative dentry */
> > > > dn = NULL;
> > > > }
> > > > if (err)
> > > > goto out_req;
> > > > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
> > > > /* make vfs retry on splice, ENOENT, or symlink */
> > > > doutc(cl, "finish_no_open on dn %p\n", dn);
> > > > err = finish_no_open(file, dn);
> > > > } else {
> > > > if (IS_ENCRYPTED(dir) &&
> > > > !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
> > > > pr_warn_client(cl,
> > > > "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
> > > > ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
> > > > goto out_req;
> > > > }
> > > >
> > > > doutc(cl, "finish_open on dn %p\n", dn);
> > > > if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > > > struct inode *newino = d_inode(dentry);
> > > >
> > > > cache_file_layout(dir, newino);
> > > > ceph_init_inode_acls(newino, &as_ctx);
> > > > file->f_mode |= FMODE_CREATED;
> > > > }
> > > > err = finish_open(file, dentry, ceph_open);
> > > > }
> > > > -------------------------8<--------------------------
> > > >
> > > > It looks like this won't handle it correctly if the pathwalk terminates
> > > > on a symlink (re: d_is_symlink() case). You should either set up a test
> > > > ceph cluster on your own, or reach out to the ceph community and ask
> > > > them to test this.
> > > >
> > >
> > > Thanks for reviewing. The d_is_symlink() case seems to be calling
> > > finish_no_open so shouldn't this be okay?
> > >
> >
> > My mistake -- you're correct. I keep forgetting that finish_no_open()
> > will handle this case regardless of what else happens.
> >
> > > > > err = finish_open(file, dentry, ceph_open);
> > > > > }
> > > > > out_req:
> > > > > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > > > > index beab8080badf..240bb511557a 100644
> > > > > --- a/fs/fcntl.c
> > > > > +++ b/fs/fcntl.c
> > > > > @@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
> > > > > * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> > > > > * is defined as O_NONBLOCK on some platforms and not on others.
> > > > > */
> > > > > - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> > > > > + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > > > > HWEIGHT32(
> > > > > - (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > > > + (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > > > __FMODE_EXEC));
> > > > >
> > > > > fasync_cache = kmem_cache_create("fasync_cache",
> > > > > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > > > > index 8344040ecaf7..4604e2e8a9cc 100644
> > > > > --- a/fs/gfs2/inode.c
> > > > > +++ b/fs/gfs2/inode.c
> > > > > @@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> > > > > inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
> > > > > error = PTR_ERR(inode);
> > > > > if (!IS_ERR(inode)) {
> > > > > + if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
> > > >
> > > > Isn't OPENAT2_REGULAR getting masked off in ->f_flags now?
> > > >
> > > Yes, I thought the masking off was happening after this codepath got
> > > executed. Maybe it's better anyway to pass another flags param to this
> > > function and forward the flags from the gfs2_atomic_open function and
> > > in other call sites pass 0 ? What do you think?
> > >
> >
> > Also my mistake. That happens in do_dentry_open() which happens in
> > finish_open(), so you should be OK here.
> >
> > Reviewed-by: Jeff Layton <jlayton@kernel.org>
>
> Thanks for patiently reviewing this! I am planning on sending patches
> for man-pages and looking into some xfs-tests for this. But I am not
> sure if this patch series will get more reviews from others or if it
> will be picked up in the vfs branch?
>
This is a change to rather core VFS infrastructure so yes, you should
expect some more review. Assuming no major issues are found, then yes,
this should eventually get picked up by the VFS maintainers.
Cheers,
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply
* Re: [PATCH v6 1/4] openat2: new OPENAT2_REGULAR flag support
From: Dorjoy Chowdhury @ 2026-04-06 15:30 UTC (permalink / raw)
To: linux-fsdevel, brauner
Cc: Jeff Layton, linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs,
linux-cifs, v9fs, linux-kselftest, viro, jack, chuck.lever,
alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
miklos, hansg
In-Reply-To: <ce36e877adf7a639bc4e61090d148c06fed63bf7.camel@kernel.org>
On Mon, Apr 6, 2026 at 5:27 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2026-04-04 at 21:17 +0600, Dorjoy Chowdhury wrote:
> > On Thu, Apr 2, 2026 at 1:02 AM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Mon, 2026-03-30 at 21:07 +0600, Dorjoy Chowdhury wrote:
> > > > On Mon, Mar 30, 2026 at 5:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > >
> > > > > On Sat, 2026-03-28 at 23:22 +0600, Dorjoy Chowdhury wrote:
> > > > > > This flag indicates the path should be opened if it's a regular file.
> > > > > > This is useful to write secure programs that want to avoid being
> > > > > > tricked into opening device nodes with special semantics while thinking
> > > > > > they operate on regular files. This is a requested feature from the
> > > > > > uapi-group[1].
> > > > > >
> > > > > > A corresponding error code EFTYPE has been introduced. For example, if
> > > > > > openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> > > > > > param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> > > > > > like FreeBSD, macOS.
> > > > > >
> > > > > > When used in combination with O_CREAT, either the regular file is
> > > > > > created, or if the path already exists, it is opened if it's a regular
> > > > > > file. Otherwise, -EFTYPE is returned.
> > > > > >
> > > > > > When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> > > > > > as it doesn't make sense to open a path that is both a directory and a
> > > > > > regular file.
> > > > > >
> > > > > > [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> > > > > >
> > > > > > Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> > > > > > ---
> > > > > > arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > > > > arch/alpha/include/uapi/asm/fcntl.h | 1 +
> > > > > > arch/mips/include/uapi/asm/errno.h | 2 ++
> > > > > > arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > > > > arch/parisc/include/uapi/asm/fcntl.h | 1 +
> > > > > > arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > > > > arch/sparc/include/uapi/asm/fcntl.h | 1 +
> > > > > > fs/ceph/file.c | 4 ++++
> > > > > > fs/fcntl.c | 4 ++--
> > > > > > fs/gfs2/inode.c | 6 ++++++
> > > > > > fs/namei.c | 4 ++++
> > > > > > fs/nfs/dir.c | 4 ++++
> > > > > > fs/open.c | 8 +++++---
> > > > > > fs/smb/client/dir.c | 14 +++++++++++++-
> > > > > > include/linux/fcntl.h | 2 ++
> > > > > > include/uapi/asm-generic/errno.h | 2 ++
> > > > > > include/uapi/asm-generic/fcntl.h | 4 ++++
> > > > > > tools/arch/alpha/include/uapi/asm/errno.h | 2 ++
> > > > > > tools/arch/mips/include/uapi/asm/errno.h | 2 ++
> > > > > > tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
> > > > > > tools/arch/sparc/include/uapi/asm/errno.h | 2 ++
> > > > > > tools/include/uapi/asm-generic/errno.h | 2 ++
> > > > > > 22 files changed, 67 insertions(+), 6 deletions(-)
> > > > > >
> > > > > > diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
> > > > > > index 6791f6508632..1a99f38813c7 100644
> > > > > > --- a/arch/alpha/include/uapi/asm/errno.h
> > > > > > +++ b/arch/alpha/include/uapi/asm/errno.h
> > > > > > @@ -127,4 +127,6 @@
> > > > > >
> > > > > > #define EHWPOISON 139 /* Memory page has hardware error */
> > > > > >
> > > > > > +#define EFTYPE 140 /* Wrong file type for the intended operation */
> > > > > > +
> > > > > > #endif
> > > > > > diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> > > > > > index 50bdc8e8a271..fe488bf7c18e 100644
> > > > > > --- a/arch/alpha/include/uapi/asm/fcntl.h
> > > > > > +++ b/arch/alpha/include/uapi/asm/fcntl.h
> > > > > > @@ -34,6 +34,7 @@
> > > > > >
> > > > > > #define O_PATH 040000000
> > > > > > #define __O_TMPFILE 0100000000
> > > > > > +#define OPENAT2_REGULAR 0200000000
> > > > > >
> > > > > > #define F_GETLK 7
> > > > > > #define F_SETLK 8
> > > > > > diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
> > > > > > index c01ed91b1ef4..1835a50b69ce 100644
> > > > > > --- a/arch/mips/include/uapi/asm/errno.h
> > > > > > +++ b/arch/mips/include/uapi/asm/errno.h
> > > > > > @@ -126,6 +126,8 @@
> > > > > >
> > > > > > #define EHWPOISON 168 /* Memory page has hardware error */
> > > > > >
> > > > > > +#define EFTYPE 169 /* Wrong file type for the intended operation */
> > > > > > +
> > > > > > #define EDQUOT 1133 /* Quota exceeded */
> > > > > >
> > > > > >
> > > > > > diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
> > > > > > index 8cbc07c1903e..93194fbb0a80 100644
> > > > > > --- a/arch/parisc/include/uapi/asm/errno.h
> > > > > > +++ b/arch/parisc/include/uapi/asm/errno.h
> > > > > > @@ -124,4 +124,6 @@
> > > > > >
> > > > > > #define EHWPOISON 257 /* Memory page has hardware error */
> > > > > >
> > > > > > +#define EFTYPE 258 /* Wrong file type for the intended operation */
> > > > > > +
> > > > > > #endif
> > > > > > diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> > > > > > index 03dee816cb13..d46812f2f0f4 100644
> > > > > > --- a/arch/parisc/include/uapi/asm/fcntl.h
> > > > > > +++ b/arch/parisc/include/uapi/asm/fcntl.h
> > > > > > @@ -19,6 +19,7 @@
> > > > > >
> > > > > > #define O_PATH 020000000
> > > > > > #define __O_TMPFILE 040000000
> > > > > > +#define OPENAT2_REGULAR 0100000000
> > > > > >
> > > > > > #define F_GETLK64 8
> > > > > > #define F_SETLK64 9
> > > > > > diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
> > > > > > index 4a41e7835fd5..71940ec9130b 100644
> > > > > > --- a/arch/sparc/include/uapi/asm/errno.h
> > > > > > +++ b/arch/sparc/include/uapi/asm/errno.h
> > > > > > @@ -117,4 +117,6 @@
> > > > > >
> > > > > > #define EHWPOISON 135 /* Memory page has hardware error */
> > > > > >
> > > > > > +#define EFTYPE 136 /* Wrong file type for the intended operation */
> > > > > > +
> > > > > > #endif
> > > > > > diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> > > > > > index 67dae75e5274..bb6e9fa94bc9 100644
> > > > > > --- a/arch/sparc/include/uapi/asm/fcntl.h
> > > > > > +++ b/arch/sparc/include/uapi/asm/fcntl.h
> > > > > > @@ -37,6 +37,7 @@
> > > > > >
> > > > > > #define O_PATH 0x1000000
> > > > > > #define __O_TMPFILE 0x2000000
> > > > > > +#define OPENAT2_REGULAR 0x4000000
> > > > > >
> > > > > > #define F_GETOWN 5 /* for sockets. */
> > > > > > #define F_SETOWN 6 /* for sockets. */
> > > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > > > index 66bbf6d517a9..6d8d4c7765e6 100644
> > > > > > --- a/fs/ceph/file.c
> > > > > > +++ b/fs/ceph/file.c
> > > > > > @@ -977,6 +977,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > > > > ceph_init_inode_acls(newino, &as_ctx);
> > > > > > file->f_mode |= FMODE_CREATED;
> > > > > > }
> > > > > > + if ((flags & OPENAT2_REGULAR) && !d_is_reg(dentry)) {
> > > > > > + err = -EFTYPE;
> > > > > > + goto out_req;
> > > > > > + }
> > > > >
> > > > > ^^^
> > > > > This doesn't look quite right. Here's a larger chunk of the code:
> > > > >
> > > > > -------------------------8<--------------------------
> > > > > if (d_in_lookup(dentry)) {
> > > > > dn = ceph_finish_lookup(req, dentry, err);
> > > > > if (IS_ERR(dn))
> > > > > err = PTR_ERR(dn);
> > > > > } else {
> > > > > /* we were given a hashed negative dentry */
> > > > > dn = NULL;
> > > > > }
> > > > > if (err)
> > > > > goto out_req;
> > > > > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
> > > > > /* make vfs retry on splice, ENOENT, or symlink */
> > > > > doutc(cl, "finish_no_open on dn %p\n", dn);
> > > > > err = finish_no_open(file, dn);
> > > > > } else {
> > > > > if (IS_ENCRYPTED(dir) &&
> > > > > !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
> > > > > pr_warn_client(cl,
> > > > > "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
> > > > > ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
> > > > > goto out_req;
> > > > > }
> > > > >
> > > > > doutc(cl, "finish_open on dn %p\n", dn);
> > > > > if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > > > > struct inode *newino = d_inode(dentry);
> > > > >
> > > > > cache_file_layout(dir, newino);
> > > > > ceph_init_inode_acls(newino, &as_ctx);
> > > > > file->f_mode |= FMODE_CREATED;
> > > > > }
> > > > > err = finish_open(file, dentry, ceph_open);
> > > > > }
> > > > > -------------------------8<--------------------------
> > > > >
> > > > > It looks like this won't handle it correctly if the pathwalk terminates
> > > > > on a symlink (re: d_is_symlink() case). You should either set up a test
> > > > > ceph cluster on your own, or reach out to the ceph community and ask
> > > > > them to test this.
> > > > >
> > > >
> > > > Thanks for reviewing. The d_is_symlink() case seems to be calling
> > > > finish_no_open so shouldn't this be okay?
> > > >
> > >
> > > My mistake -- you're correct. I keep forgetting that finish_no_open()
> > > will handle this case regardless of what else happens.
> > >
> > > > > > err = finish_open(file, dentry, ceph_open);
> > > > > > }
> > > > > > out_req:
> > > > > > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > > > > > index beab8080badf..240bb511557a 100644
> > > > > > --- a/fs/fcntl.c
> > > > > > +++ b/fs/fcntl.c
> > > > > > @@ -1169,9 +1169,9 @@ static int __init fcntl_init(void)
> > > > > > * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> > > > > > * is defined as O_NONBLOCK on some platforms and not on others.
> > > > > > */
> > > > > > - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> > > > > > + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > > > > > HWEIGHT32(
> > > > > > - (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > > > > + (VALID_OPENAT2_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > > > > > __FMODE_EXEC));
> > > > > >
> > > > > > fasync_cache = kmem_cache_create("fasync_cache",
> > > > > > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > > > > > index 8344040ecaf7..4604e2e8a9cc 100644
> > > > > > --- a/fs/gfs2/inode.c
> > > > > > +++ b/fs/gfs2/inode.c
> > > > > > @@ -738,6 +738,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> > > > > > inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
> > > > > > error = PTR_ERR(inode);
> > > > > > if (!IS_ERR(inode)) {
> > > > > > + if (file && (file->f_flags & OPENAT2_REGULAR) && !S_ISREG(inode->i_mode)) {
> > > > >
> > > > > Isn't OPENAT2_REGULAR getting masked off in ->f_flags now?
> > > > >
> > > > Yes, I thought the masking off was happening after this codepath got
> > > > executed. Maybe it's better anyway to pass another flags param to this
> > > > function and forward the flags from the gfs2_atomic_open function and
> > > > in other call sites pass 0 ? What do you think?
> > > >
> > >
> > > Also my mistake. That happens in do_dentry_open() which happens in
> > > finish_open(), so you should be OK here.
> > >
> > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> >
> > Thanks for patiently reviewing this! I am planning on sending patches
> > for man-pages and looking into some xfstests for this. But I am not
> > sure whether this patch series will get more reviews from others or
> > whether it will be picked up in the vfs branch.
> >
>
> This is a change to rather core VFS infrastructure so yes, you should
> expect some more review. Assuming no major issues are found, then yes,
> this should eventually get picked up by the VFS maintainers.
>
> Cheers,
> --
> Jeff Layton <jlayton@kernel.org>
Ping....
This patch series got a "Reviewed-by" from Jeff Layton, but it probably
requires more reviews from other maintainers/reviewers as well. So I am
requesting review of this patch series. Thanks!
Regards,
Dorjoy
^ permalink raw reply
* [RFC PATCH v3 1/6] uapi: add goldfish_address_space userspace ABI header
From: Wenzhao Liao @ 2026-04-06 16:51 UTC (permalink / raw)
To: rust-for-linux, linux-pci
Cc: ojeda, dakr, bhelgaas, kwilczynski, arnd, gregkh, linux-kernel,
linux-api
In-Reply-To: <20260406165120.166928-1-wenzhaoliao@ruc.edu.cn>
The external goldfish address-space driver exposes its userspace
contract through a dedicated header. Land the ioctl definitions in
include/uapi/linux so the Rust driver can depend on an in-tree UAPI
surface instead of carrying an external private header.
This RFC intentionally narrows the first upstream step to the
open/release/ioctl ABI subset. Userspace mmap and PING_WITH_DATA stay
out of this series until they have their own review and validation
story.
Signed-off-by: Wenzhao Liao <wenzhaoliao@ruc.edu.cn>
---
MAINTAINERS | 8 +++
include/uapi/linux/goldfish_address_space.h | 54 +++++++++++++++++++++
2 files changed, 62 insertions(+)
create mode 100644 include/uapi/linux/goldfish_address_space.h
diff --git a/MAINTAINERS b/MAINTAINERS
index a62f6af55c3a..800b2fe0e648 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1882,6 +1882,14 @@ S: Supported
F: Documentation/devicetree/bindings/interrupt-controller/google,goldfish-pic.yaml
F: drivers/irqchip/irq-goldfish-pic.c
+ANDROID GOLDFISH ADDRESS SPACE DRIVER
+M: Wenzhao Liao <wenzhaoliao@ruc.edu.cn>
+L: linux-kernel@vger.kernel.org
+L: linux-pci@vger.kernel.org
+L: rust-for-linux@vger.kernel.org
+S: Maintained
+F: include/uapi/linux/goldfish_address_space.h
+
ANDROID GOLDFISH RTC DRIVER
M: Jiaxun Yang <jiaxun.yang@flygoat.com>
S: Supported
diff --git a/include/uapi/linux/goldfish_address_space.h b/include/uapi/linux/goldfish_address_space.h
new file mode 100644
index 000000000000..b782d82f53df
--- /dev/null
+++ b/include/uapi/linux/goldfish_address_space.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_LINUX_GOLDFISH_ADDRESS_SPACE_H
+#define _UAPI_LINUX_GOLDFISH_ADDRESS_SPACE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define GOLDFISH_ADDRESS_SPACE_DEVICE_NAME "goldfish_address_space"
+
+struct goldfish_address_space_allocate_block {
+ __u64 size;
+ __u64 offset;
+ __u64 phys_addr;
+};
+
+struct goldfish_address_space_ping {
+ __u64 offset;
+ __u64 size;
+ __u64 metadata;
+ __u32 version;
+ __u32 wait_fd;
+ __u32 wait_flags;
+ __u32 direction;
+};
+
+struct goldfish_address_space_claim_shared {
+ __u64 offset;
+ __u64 size;
+};
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_MAGIC 'G'
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_OP(OP, T) \
+ _IOWR(GOLDFISH_ADDRESS_SPACE_IOCTL_MAGIC, OP, T)
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_ALLOCATE_BLOCK \
+ GOLDFISH_ADDRESS_SPACE_IOCTL_OP(10, \
+ struct goldfish_address_space_allocate_block)
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_DEALLOCATE_BLOCK \
+ GOLDFISH_ADDRESS_SPACE_IOCTL_OP(11, __u64)
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_PING \
+ GOLDFISH_ADDRESS_SPACE_IOCTL_OP(12, \
+ struct goldfish_address_space_ping)
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_CLAIM_SHARED \
+ GOLDFISH_ADDRESS_SPACE_IOCTL_OP(13, \
+ struct goldfish_address_space_claim_shared)
+
+#define GOLDFISH_ADDRESS_SPACE_IOCTL_UNCLAIM_SHARED \
+ GOLDFISH_ADDRESS_SPACE_IOCTL_OP(14, __u64)
+
+#endif /* _UAPI_LINUX_GOLDFISH_ADDRESS_SPACE_H */
--
2.34.1
^ permalink raw reply related
* [RFC PATCH v3 0/6] Rust goldfish_address_space driver (ioctl-only subset)
From: Wenzhao Liao @ 2026-04-06 16:51 UTC (permalink / raw)
To: rust-for-linux, linux-pci
Cc: ojeda, dakr, bhelgaas, kwilczynski, arnd, gregkh, linux-kernel,
linux-api
In-Reply-To: <cover.1775456181.git.wenzhaoliao@ruc.edu.cn>
This respin narrows the Rust goldfish_address_space RFC to the
open/release/ioctl ABI subset. Userspace mmap and PING_WITH_DATA are
not part of this series.
I would like to send this as a small first upstream step for the Rust
driver, instead of asking reviewers to take the mmap/VMA lifecycle work
in the same round.
The goal of the respin is to keep only the pieces that are still
required by the current driver:
- the goldfish UAPI header and Rust bindings exposure,
- minimal page helpers for the ping page,
- a small SharedMemoryBar abstraction for shared BAR reservation,
memremap() lifetime, and physical base discovery,
- hardened miscdevice registration/open boundaries,
- and the Rust goldfish_address_space driver itself.
Compared to the previous round, this drops the Rust VMA/BAR-to-VMA
mapping work from the series and rewrites the driver and miscdevice
pieces around the current teardown and publication model. The driver
remains #![forbid(unsafe_code)].
Feedback would be especially helpful on:
- whether the ioctl-only ABI subset is a reasonable first upstream step
for goldfish_address_space;
- whether SharedMemoryBar is the right minimal Rust abstraction for
shared-memory BAR reservation plus memremap() lifetime;
- whether the miscdevice hardening direction makes sense, especially the
publication-safe open context and the THIS_MODULE-owned safe
file_operations path.
Changes since v2:
- dropped the userspace mmap portion of the RFC and removed the unused
Rust VMA/BAR-to-VMA mapping patch from the series;
- narrowed the goldfish Kconfig help text and driver description to the
open/release/ioctl ABI subset;
- reworked miscdevice so safe open() only sees publication-safe state
and safe drivers no longer have a raw file_operations escape hatch;
- reworked goldfish teardown around deregister() -> shutdown() ->
disable_device(), with live-file revocation before PCI disable and
explicit enable_device_mem() probe unwind;
- kept the in-tree Rust VMA helpers still used by binder out of this
series, so the respin only carries code with a current caller.
Behavior exercised for the RFC-limited ABI subset:
- open / release
- allocate_block / deallocate_block
- ping
- claim_shared / unclaim_shared
- unknown ioctl
- reopen
No claim is made beyond that subset in this respin.
Build-tested:
- make LLVM=1 rust/kernel.o
- make LLVM=1 drivers/platform/goldfish/goldfish_address_space.o
- make LLVM=1 samples/rust/rust_misc_device.o
Wenzhao Liao (6):
uapi: add goldfish_address_space userspace ABI header
rust: bindings: expose goldfish address-space headers
rust: page: add helpers for page-backed ping state
rust: pci: add shared BAR memremap support
rust: miscdevice: harden registration and safe file_operations
invariants
platform/goldfish: add Rust goldfish_address_space driver
MAINTAINERS | 10 +
drivers/platform/goldfish/Kconfig | 11 +
drivers/platform/goldfish/Makefile | 1 +
.../goldfish/goldfish_address_space.rs | 917 ++++++++++++++++++
include/uapi/linux/goldfish_address_space.h | 54 ++
rust/bindings/bindings_helper.h | 1 +
rust/helpers/page.c | 5 +
rust/kernel/miscdevice.rs | 409 +++++---
rust/kernel/page.rs | 52 +-
rust/kernel/pci.rs | 8 +
rust/kernel/pci/id.rs | 2 +-
rust/kernel/pci/io.rs | 112 ++-
rust/uapi/uapi_helper.h | 1 +
samples/rust/rust_misc_device.rs | 9 +-
14 files changed, 1453 insertions(+), 139 deletions(-)
create mode 100644 drivers/platform/goldfish/goldfish_address_space.rs
create mode 100644 include/uapi/linux/goldfish_address_space.h
--
2.34.1
^ permalink raw reply
* [RFC PATCH v3 4/6] rust: pci: add shared BAR memremap support
From: Wenzhao Liao @ 2026-04-06 16:51 UTC (permalink / raw)
To: rust-for-linux, linux-pci
Cc: ojeda, dakr, bhelgaas, kwilczynski, arnd, gregkh, linux-kernel,
linux-api
In-Reply-To: <20260406165120.166928-1-wenzhaoliao@ruc.edu.cn>
Add a small Rust-owned abstraction for PCI BARs that back shared memory
instead of register MMIO.
The new SharedMemoryBar type owns both the BAR reservation and the
memremap() lifetime, exposes the physical BAR start needed by the
address-space ping path, and keeps the resource bookkeeping out of the
Rust driver.
The current RFC no longer exposes userspace mmap, but the driver still
needs an owned shared-BAR reservation and the BAR's physical base for
the ping path. Keeping the reservation/memremap() pairing in a Rust
abstraction avoids pushing that lifetime bookkeeping back into driver
code.
Signed-off-by: Wenzhao Liao <wenzhaoliao@ruc.edu.cn>
---
rust/kernel/pci.rs | 8 +++
rust/kernel/pci/id.rs | 2 +-
rust/kernel/pci/io.rs | 112 +++++++++++++++++++++++++++++++++++++++++-
3 files changed, 120 insertions(+), 2 deletions(-)
diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs
index af74ddff6114..4c63c931ffb2 100644
--- a/rust/kernel/pci.rs
+++ b/rust/kernel/pci.rs
@@ -47,6 +47,7 @@
ConfigSpaceSize,
Extended,
Normal, //
+ SharedMemoryBar,
};
pub use self::irq::{
IrqType,
@@ -458,6 +459,13 @@ pub fn set_master(&self) {
// SAFETY: `self.as_raw` is guaranteed to be a pointer to a valid `struct pci_dev`.
unsafe { bindings::pci_set_master(self.as_raw()) };
}
+
+ /// Disable this PCI device.
+ #[inline]
+ pub fn disable_device(&self) {
+ // SAFETY: `self.as_raw` is guaranteed to be a pointer to a valid `struct pci_dev`.
+ unsafe { bindings::pci_disable_device(self.as_raw()) };
+ }
}
// SAFETY: `pci::Device` is a transparent wrapper of `struct pci_dev`.
diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs
index 50005d176561..bd3cf17fd8de 100644
--- a/rust/kernel/pci/id.rs
+++ b/rust/kernel/pci/id.rs
@@ -156,7 +156,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
impl Vendor {
/// Create a Vendor from a raw 16-bit vendor ID.
#[inline]
- pub(super) fn from_raw(vendor_id: u16) -> Self {
+ pub const fn from_raw(vendor_id: u16) -> Self {
Self(vendor_id)
}
diff --git a/rust/kernel/pci/io.rs b/rust/kernel/pci/io.rs
index fb6edab2aea7..89bf882b9634 100644
--- a/rust/kernel/pci/io.rs
+++ b/rust/kernel/pci/io.rs
@@ -7,6 +7,7 @@
bindings,
device,
devres::Devres,
+ ffi::{c_ulong, c_void},
io::{
io_define_read,
io_define_write,
@@ -17,11 +18,13 @@
MmioRaw, //
},
prelude::*,
- sync::aref::ARef, //
+ sync::aref::ARef,
+ types::ScopeGuard,
};
use core::{
marker::PhantomData,
ops::Deref, //
+ ptr::NonNull,
};
/// Represents the size of a PCI configuration space.
@@ -285,6 +288,104 @@ fn deref(&self) -> &Self::Target {
}
}
+/// A cacheable shared-memory mapping of a PCI BAR created via `memremap()`.
+///
+/// This is intended for BARs that back shared memory rather than device register MMIO. The
+/// mapping owns both the underlying PCI region reservation and the `memremap()` lifetime, so
+/// driver code does not need to keep raw pointers or manually pair teardown calls.
+pub struct SharedMemoryBar {
+ pdev: ARef<Device>,
+ addr: NonNull<c_void>,
+ phys_start: bindings::resource_size_t,
+ len: usize,
+ num: i32,
+}
+
+// SAFETY: `SharedMemoryBar` owns a stable BAR reservation plus its `memremap()` mapping. Moving
+// the owner to another thread does not change the validity of the underlying PCI resource.
+unsafe impl Send for SharedMemoryBar {}
+
+// SAFETY: Shared references only expose immutable metadata queries; the mapped pointer itself is
+// not exposed for dereferencing.
+unsafe impl Sync for SharedMemoryBar {}
+
+impl SharedMemoryBar {
+ fn new(pdev: &Device, num: u32, name: &CStr) -> Result<Self> {
+ if !Bar::index_is_valid(num) {
+ return Err(EINVAL);
+ }
+
+ let len = pdev.resource_len(num)?;
+ if len == 0 {
+ return Err(ENXIO);
+ }
+
+ let len = usize::try_from(len)?;
+ let phys_start = pdev.resource_start(num)?;
+ let num = i32::try_from(num)?;
+
+ // SAFETY:
+ // - `pdev` is valid by the invariants of `Device`.
+ // - `num` is checked above.
+ // - `name` is a valid NUL-terminated string.
+ let ret = unsafe { bindings::pci_request_region(pdev.as_raw(), num, name.as_char_ptr()) };
+ if ret != 0 {
+ return Err(EBUSY);
+ }
+
+ let release_region = ScopeGuard::new(|| {
+ // SAFETY:
+ // - `pdev` is still valid for the duration of this constructor.
+ // - `num` has just been successfully reserved.
+ unsafe { bindings::pci_release_region(pdev.as_raw(), num) };
+ });
+
+ // SAFETY:
+ // - `phys_start`/`len` describe the BAR range we just reserved.
+ // - `MEMREMAP_WB` matches the external goldfish driver behaviour.
+ let addr = unsafe { bindings::memremap(phys_start, len, bindings::MEMREMAP_WB as c_ulong) };
+ let addr = NonNull::new(addr.cast()).ok_or(ENOMEM)?;
+
+ release_region.dismiss();
+
+ Ok(Self {
+ pdev: pdev.into(),
+ addr,
+ phys_start,
+ len,
+ num,
+ })
+ }
+
+ /// Returns the physical start address of the BAR.
+ #[inline]
+ pub fn phys_start(&self) -> bindings::resource_size_t {
+ self.phys_start
+ }
+
+ /// Returns the BAR size in bytes.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
+ fn release(&self) {
+ // SAFETY:
+ // - `self.addr` is a valid `memremap()` result owned by `self`.
+ // - `self.num` is the BAR region successfully reserved by `Self::new`.
+ unsafe {
+ bindings::memunmap(self.addr.as_ptr().cast());
+ bindings::pci_release_region(self.pdev.as_raw(), self.num);
+ }
+ }
+}
+
+impl Drop for SharedMemoryBar {
+ fn drop(&mut self) {
+ self.release();
+ }
+}
+
impl Device<device::Bound> {
/// Maps an entire PCI BAR after performing a region-request on it. I/O operation bound checks
/// can be performed on compile time for offsets (plus the requested type size) < SIZE.
@@ -305,6 +406,15 @@ pub fn iomap_region<'a>(
self.iomap_region_sized::<0>(bar, name)
}
+ /// Reserve and `memremap()` an entire PCI BAR as cacheable shared memory.
+ pub fn memremap_bar<'a>(
+ &'a self,
+ bar: u32,
+ name: &'a CStr,
+ ) -> impl PinInit<Devres<SharedMemoryBar>, Error> + 'a {
+ Devres::new(self.as_ref(), SharedMemoryBar::new(self, bar, name))
+ }
+
/// Returns the size of configuration space.
pub fn cfg_size(&self) -> ConfigSpaceSize {
// SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`.
--
2.34.1
^ permalink raw reply related
* [RFC PATCH v3 2/6] rust: bindings: expose goldfish address-space headers
From: Wenzhao Liao @ 2026-04-06 16:51 UTC (permalink / raw)
To: rust-for-linux, linux-pci
Cc: ojeda, dakr, bhelgaas, kwilczynski, arnd, gregkh, linux-kernel,
linux-api
In-Reply-To: <20260406165120.166928-1-wenzhaoliao@ruc.edu.cn>
Expose the UAPI header and the Linux I/O declarations needed by the Rust goldfish address-space driver.
This keeps the driver-side code on typed Rust interfaces while still allowing the binding and helper layers to see the header and memremap support required by the abstraction patches that follow.
Signed-off-by: Wenzhao Liao <wenzhaoliao@ruc.edu.cn>
---
rust/bindings/bindings_helper.h | 1 +
rust/uapi/uapi_helper.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index 083cc44aa952..b0baff4c6349 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -59,6 +59,7 @@
#include <linux/fs.h>
#include <linux/i2c.h>
#include <linux/interrupt.h>
+#include <linux/io.h>
#include <linux/io-pgtable.h>
#include <linux/ioport.h>
#include <linux/jiffies.h>
diff --git a/rust/uapi/uapi_helper.h b/rust/uapi/uapi_helper.h
index 06d7d1a2e8da..ff19edab81da 100644
--- a/rust/uapi/uapi_helper.h
+++ b/rust/uapi/uapi_helper.h
@@ -11,6 +11,7 @@
#include <uapi/drm/nova_drm.h>
#include <uapi/drm/panthor_drm.h>
#include <uapi/linux/android/binder.h>
+#include <uapi/linux/goldfish_address_space.h>
#include <uapi/linux/mdio.h>
#include <uapi/linux/mii.h>
#include <uapi/linux/ethtool.h>
--
2.34.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox