* [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-12 13:54 [RFC PATCH v2 0/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd Jori Koolstra
@ 2026-04-12 13:54 ` Jori Koolstra
2026-04-24 10:09 ` Mateusz Guzik
2026-04-27 15:48 ` Christian Brauner
2026-04-12 13:54 ` [RFC PATCH v2 2/2] selftest: add tests for mkdirat2() Jori Koolstra
1 sibling, 2 replies; 15+ messages in thread
From: Jori Koolstra @ 2026-04-12 13:54 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner,
Arnd Bergmann
Cc: H . Peter Anvin, Jan Kara, Peter Zijlstra, Andrey Albershteyn,
Masami Hiramatsu, Jori Koolstra, Jiri Olsa, Thomas Weißschuh,
Mathieu Desnoyers, Jeff Layton, Aleksa Sarai, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
Currently there is no way to race-freely create and open a directory.
For regular files we have open(O_CREAT) for creating a new file inode,
and returning a pinning fd to it. The lack of such functionality for
directories means that when populating a directory tree there's always
a race involved: the inodes first need to be created, and then opened
to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
but in the time window between the creation and the opening they might
be replaced by something else.
Addressing this race without proper APIs is possible (by immediately
fstat()ing what was opened, to verify that it has the right inode type),
but difficult to get right. Hence, mkdirat2() that creates a directory
and returns an O_DIRECTORY fd is useful.
This feature idea (and description) is taken from the UAPI group:
https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/internal.h | 2 ++
fs/namei.c | 44 +++++++++++++++++++++++---
include/linux/syscalls.h | 2 ++
include/uapi/asm-generic/unistd.h | 5 ++-
scripts/syscall.tbl | 1 +
6 files changed, 50 insertions(+), 5 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 524155d655da..e200ca2067a4 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -396,6 +396,7 @@
469 common file_setattr sys_file_setattr
470 common listns sys_listns
471 common rseq_slice_yield sys_rseq_slice_yield
+472 common mkdirat2 sys_mkdirat2
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/internal.h b/fs/internal.h
index cbc384a1aa09..c6a79afadacf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -59,6 +59,8 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
+struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
+ unsigned int flags, bool open);
int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
int filename_linkat(int olddfd, struct filename *old, int newdfd,
diff --git a/fs/namei.c b/fs/namei.c
index a880454a6415..6451e96dc225 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -5255,18 +5255,36 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
}
EXPORT_SYMBOL(vfs_mkdir);
-int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
+static int mkdirat_lookup_flags(unsigned int flags)
+{
+ int lookup_flags = LOOKUP_DIRECTORY;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+ if (!(flags & AT_NO_AUTOMOUNT))
+ lookup_flags |= LOOKUP_AUTOMOUNT;
+
+ return lookup_flags;
+}
+
+int filename_mkdirat(int dfd, struct filename *name, umode_t mode) {
+ return PTR_ERR_OR_ZERO(do_file_mkdirat(dfd, name, mode, 0, false));
+}
+
+struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
+ unsigned int flags, bool open)
{
struct dentry *dentry;
struct path path;
int error;
- unsigned int lookup_flags = LOOKUP_DIRECTORY;
+ struct file *filp = NULL;
+ unsigned int lookup_flags = mkdirat_lookup_flags(flags);
struct delegated_inode delegated_inode = { };
retry:
dentry = filename_create(dfd, name, &path, lookup_flags);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ return ERR_CAST(dentry);
error = security_path_mkdir(&path, dentry,
mode_strip_umask(path.dentry->d_inode, mode));
@@ -5276,6 +5294,10 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
+ if (open && !error && !is_delegated(&delegated_inode)) {
+ const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
+ filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
+ }
end_creating_path(&path, dentry);
if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
@@ -5286,7 +5308,21 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
- return error;
+ if (error)
+ return ERR_PTR(error);
+ return filp;
+}
+
+#define VALID_MKDIRAT2_FLAGS (AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)
+
+SYSCALL_DEFINE4(mkdirat2, int, dfd, const char __user *, pathname, umode_t, mode,
+ unsigned int, flags)
+{
+ CLASS(filename, name)(pathname);
+ if (flags & ~VALID_MKDIRAT2_FLAGS)
+ return -EINVAL;
+
+ return FD_ADD(O_CLOEXEC, do_file_mkdirat(dfd, name, mode, flags, true));
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 02bd6ddb6278..b3b4ae26dbdd 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -999,6 +999,8 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
u32 size, u32 flags);
asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
+asmlinkage long sys_mkdirat2(int dfd, const char __user *pathname, umode_t mode,
+ unsigned int flags)
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a627acc8fb5f..6efc21779b62 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -863,8 +863,11 @@ __SYSCALL(__NR_listns, sys_listns)
#define __NR_rseq_slice_yield 471
__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+#define __NR_mkdirat2 472
+__SYSCALL(__NR_mkdirat2, sys_mkdirat2)
+
#undef __NR_syscalls
-#define __NR_syscalls 472
+#define __NR_syscalls 473
/*
* 32 bit systems traditionally used different
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 7a42b32b6577..9d86f29762ae 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -412,3 +412,4 @@
469 common file_setattr sys_file_setattr
470 common listns sys_listns
471 common rseq_slice_yield sys_rseq_slice_yield
+472 common mkdirat2 sys_mkdirat2
--
2.53.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-12 13:54 ` [RFC PATCH v2 1/2] " Jori Koolstra
@ 2026-04-24 10:09 ` Mateusz Guzik
2026-04-27 15:14 ` Christian Brauner
2026-04-27 15:48 ` Christian Brauner
1 sibling, 1 reply; 15+ messages in thread
From: Mateusz Guzik @ 2026-04-24 10:09 UTC (permalink / raw)
To: Jori Koolstra
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner,
Arnd Bergmann, H . Peter Anvin, Jan Kara, Peter Zijlstra,
Andrey Albershteyn, Masami Hiramatsu, Jiri Olsa,
Thomas Weißschuh, Mathieu Desnoyers, Jeff Layton,
Aleksa Sarai, cmirabil, Greg Kroah-Hartman, linux-kernel,
linux-fsdevel, linux-api, linux-arch
On Sun, Apr 12, 2026 at 03:54:33PM +0200, Jori Koolstra wrote:
> Currently there is no way to race-freely create and open a directory.
> For regular files we have open(O_CREAT) for creating a new file inode,
> and returning a pinning fd to it. The lack of such functionality for
> directories means that when populating a directory tree there's always
> a race involved: the inodes first need to be created, and then opened
> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> but in the time window between the creation and the opening they might
> be replaced by something else.
>
> Addressing this race without proper APIs is possible (by immediately
> fstat()ing what was opened, to verify that it has the right inode type),
> but difficult to get right. Hence, mkdirat2() that creates a directory
> and returns an O_DIRECTORY fd is useful.
>
> This feature idea (and description) is taken from the UAPI group:
> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>
> @@ -5276,6 +5294,10 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
> if (IS_ERR(dentry))
> error = PTR_ERR(dentry);
> }
> + if (open && !error && !is_delegated(&delegated_inode)) {
> + const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> + filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
> + }
> end_creating_path(&path, dentry);
> if (is_delegated(&delegated_inode)) {
> error = break_deleg_wait(&delegated_inode);
> 2.53.0
>
Last time around I pointed out fd allocation being an issue.
The general problem is introduction of a failure point after mkdir
itself succeeds as there is no way to backpedal from it.
With the patch as proposed this remains a factor -- dentry_open itself
can fail due to inability to allocate a file obj, and even if that
succeeds there are several ways for do_dentry_open to error out.
For the patch to be viable some rototilling is needed to make it so that
all the prep is done before issuing the mkdir. The only thing which can
legally happen after is installation of the file obj in the fd table.
Now that I said it, the open handling is already buggy in that way.
do_open has the following:
error = may_open(idmap, &nd->path, acc_mode, open_flag);
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file);
if (!error)
error = security_file_post_open(file, op->acc_mode);
if (!error && do_truncate)
error = handle_truncate(idmap, file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
Suppose O_CREAT was passed.
There is no attempt to recover from the LSM returning an error, in which
case the file is left on the fs. The only LSM even using the hook is
ima. Even if the user being able to create the file implies the LSM
check will pass anyway, the inode itself is not locked so root can sneak
in to chmod it and trigger a failure. Suppose that's not important.
Things proceed to handle_truncate:
int error = get_write_access(inode);
if (error)
return error;
error = security_file_truncate(filp);
if (!error) {
error = do_truncate(idmap, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
filp);
}
I'm going to ignore the LSM situation and do_truncate failure modes in this one.
AFAICS nothing prevents the same user from racing against file creation to
execve it, which starts with exe_file_deny_write_access. Should the
other thread win the race, get_write_access will fail and the WARN_ON
splat will be generated. That is definitely a problem.
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-24 10:09 ` Mateusz Guzik
@ 2026-04-27 15:14 ` Christian Brauner
2026-04-27 16:30 ` Mateusz Guzik
0 siblings, 1 reply; 15+ messages in thread
From: Christian Brauner @ 2026-04-27 15:14 UTC (permalink / raw)
To: Mateusz Guzik
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro, Arnd Bergmann,
H . Peter Anvin, Jan Kara, Peter Zijlstra, Andrey Albershteyn,
Masami Hiramatsu, Jiri Olsa, Thomas Weißschuh,
Mathieu Desnoyers, Jeff Layton, Aleksa Sarai, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
> Things proceed to handle_truncate:
> int error = get_write_access(inode);
> if (error)
> return error;
>
> error = security_file_truncate(filp);
> if (!error) {
> error = do_truncate(idmap, path->dentry, 0,
> ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
> filp);
> }
>
> I'm going to ignore the LSM situation and do_truncate failure modes in this one.
>
> AFAICS nothing prevents the same user from racing against file creation to
> execve it, which starts with exe_file_deny_write_access. Should the
> other thread win the race, get_write_access will fail and the WARN_ON
> splat will be generated. That is definitely a problem.
That can't happen:
static inline int get_write_access(struct inode *inode)
{
return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
}
and the check is:
error = handle_truncate(idmap, file);
if (unlikely(error > 0)) {
This was a catch all for broken LSM hook or ->open() instance.
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-27 15:14 ` Christian Brauner
@ 2026-04-27 16:30 ` Mateusz Guzik
2026-04-28 8:55 ` Christian Brauner
0 siblings, 1 reply; 15+ messages in thread
From: Mateusz Guzik @ 2026-04-27 16:30 UTC (permalink / raw)
To: Christian Brauner
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro, Arnd Bergmann,
H . Peter Anvin, Jan Kara, Peter Zijlstra, Andrey Albershteyn,
Masami Hiramatsu, Jiri Olsa, Thomas Weißschuh,
Mathieu Desnoyers, Jeff Layton, Aleksa Sarai, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
On Mon, Apr 27, 2026 at 5:14 PM Christian Brauner <brauner@kernel.org> wrote:
>
> > Things proceed to handle_truncate:
> > int error = get_write_access(inode);
> > if (error)
> > return error;
> >
> > error = security_file_truncate(filp);
> > if (!error) {
> > error = do_truncate(idmap, path->dentry, 0,
> > ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
> > filp);
> > }
> >
> > I'm going to ignore the LSM situation and do_truncate failure modes in this one.
> >
> > AFAICS nothing prevents the same user from racing against file creation to
> > execve it, which starts with exe_file_deny_write_access. Should the
> > other thread win the race, get_write_access will fail and the WARN_ON
> > splat will be generated. That is definitely a problem.
>
> That can't happen:
>
> static inline int get_write_access(struct inode *inode)
> {
> return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
> }
>
> and the check is:
>
> error = handle_truncate(idmap, file);
> if (unlikely(error > 0)) {
>
> This was a catch all for broken LSM hook or ->open() instance.
>
So with this prog:
#include <fcntl.h>
int main(void)
{
open("test", O_TRUNC);
}
I verified writecount is 0 on entry to handle_truncate like so:
bpftrace -e 'kprobe:security_file_truncate { @[comm, (int64)((struct
file *)arg0)->f_path.dentry->d_inode->i_writecount.counter] = count();
}'
@[a.out, 1]: 1
i.e., get_write_access in handle_truncate transitioned the count 0 -> 1
but then what prevents the following race:
CPU0 CPU1
open("test") execve("test")
handle_truncate do_open_execat
exe_file_deny_write_access # should
succeed as count is 0?
get_write_access # should fail as the count is now -1?
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-27 16:30 ` Mateusz Guzik
@ 2026-04-28 8:55 ` Christian Brauner
2026-04-28 14:39 ` Mateusz Guzik
0 siblings, 1 reply; 15+ messages in thread
From: Christian Brauner @ 2026-04-28 8:55 UTC (permalink / raw)
To: Mateusz Guzik
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro, Arnd Bergmann,
H . Peter Anvin, Jan Kara, Peter Zijlstra, Andrey Albershteyn,
Masami Hiramatsu, Jiri Olsa, Thomas Weißschuh,
Mathieu Desnoyers, Jeff Layton, Aleksa Sarai, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
On Mon, Apr 27, 2026 at 06:30:42PM +0200, Mateusz Guzik wrote:
> On Mon, Apr 27, 2026 at 5:14 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> > > Things proceed to handle_truncate:
> > > int error = get_write_access(inode);
> > > if (error)
> > > return error;
> > >
> > > error = security_file_truncate(filp);
> > > if (!error) {
> > > error = do_truncate(idmap, path->dentry, 0,
> > > ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
> > > filp);
> > > }
> > >
> > > I'm going to ignore the LSM situation and do_truncate failure modes in this one.
> > >
> > > AFAICS nothing prevents the same user from racing against file creation to
> > > execve it, which starts with exe_file_deny_write_access. Should the
> > > other thread win the race, get_write_access will fail and the WARN_ON
> > > splat will be generated. That is definitely a problem.
> >
> > That can't happen:
> >
> > static inline int get_write_access(struct inode *inode)
> > {
> > return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
> > }
> >
> > and the check is:
> >
> > error = handle_truncate(idmap, file);
> > if (unlikely(error > 0)) {
> >
> > This was a catch all for broken LSM hook or ->open() instance.
> >
>
> So with this prog:
> #include <fcntl.h>
>
> int main(void)
> {
> open("test", O_TRUNC);
> }
>
> I verified writecount is 0 on entry to handle_truncate like so:
>
> bpftrace -e 'kprobe:security_file_truncate { @[comm, (int64)((struct
> file *)arg0)->f_path.dentry->d_inode->i_writecount.counter] = count();
> }'
>
> @[a.out, 1]: 1
>
> i.e., get_write_access in handle_truncate transitioned the count 0 -> 1
>
> but then what prevents the following race:
>
> CPU0 CPU1
> open("test") execve("test")
> handle_truncate do_open_execat
> exe_file_deny_write_access # should
> succeed as count is 0?
> get_write_access # should fail as the count is now -1?
I'm not arguing that get_write_access() cannot fail. I'm arguing that it
cannot hit that WARN_ON() as you said above because get_write_access()
returns either 0 or -ETXTBSY.
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-28 8:55 ` Christian Brauner
@ 2026-04-28 14:39 ` Mateusz Guzik
0 siblings, 0 replies; 15+ messages in thread
From: Mateusz Guzik @ 2026-04-28 14:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Jori Koolstra, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Alexander Viro, Arnd Bergmann,
H . Peter Anvin, Jan Kara, Peter Zijlstra, Andrey Albershteyn,
Masami Hiramatsu, Jiri Olsa, Thomas Weißschuh,
Mathieu Desnoyers, Jeff Layton, Aleksa Sarai, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
On Tue, Apr 28, 2026 at 10:55 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Apr 27, 2026 at 06:30:42PM +0200, Mateusz Guzik wrote:
> > On Mon, Apr 27, 2026 at 5:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > > Things proceed to handle_truncate:
> > > > int error = get_write_access(inode);
> > > > if (error)
> > > > return error;
> > > >
> > > > error = security_file_truncate(filp);
> > > > if (!error) {
> > > > error = do_truncate(idmap, path->dentry, 0,
> > > > ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
> > > > filp);
> > > > }
> > > >
> > > > I'm going to ignore the LSM situation and do_truncate failure modes in this one.
> > > >
> > > > AFAICS nothing prevents the same user from racing against file creation to
> > > > execve it, which starts with exe_file_deny_write_access. Should the
> > > > other thread win the race, get_write_access will fail and the WARN_ON
> > > > splat will be generated. That is definitely a problem.
> > >
> > > That can't happen:
> > >
> > > static inline int get_write_access(struct inode *inode)
> > > {
> > > return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
> > > }
> > >
> > > and the check is:
> > >
> > > error = handle_truncate(idmap, file);
> > > if (unlikely(error > 0)) {
> > >
> > > This was a catch all for broken LSM hook or ->open() instance.
> > >
> >
> > So with this prog:
> > #include <fcntl.h>
> >
> > int main(void)
> > {
> > open("test", O_TRUNC);
> > }
> >
> > I verified writecount is 0 on entry to handle_truncate like so:
> >
> > bpftrace -e 'kprobe:security_file_truncate { @[comm, (int64)((struct
> > file *)arg0)->f_path.dentry->d_inode->i_writecount.counter] = count();
> > }'
> >
> > @[a.out, 1]: 1
> >
> > i.e., get_write_access in handle_truncate transitioned the count 0 -> 1
> >
> > but then what prevents the following race:
> >
> > CPU0 CPU1
> > open("test") execve("test")
> > handle_truncate do_open_execat
> > exe_file_deny_write_access # should
> > succeed as count is 0?
> > get_write_access # should fail as the count is now -1?
>
> I'm not arguing that get_write_access() cannot fail. I'm arguing that it
> cannot hit that WARN_ON() as you said above because get_write_access()
> > returns either 0 or -ETXTBSY.
oops, right:
4681 │ error = handle_truncate(idmap, file);
4682 if (unlikely(error > 0)) {
4683 WARN_ON(1);
4684 error = -EINVAL;
4685 }
I mentally had it warn on any error.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-12 13:54 ` [RFC PATCH v2 1/2] " Jori Koolstra
2026-04-24 10:09 ` Mateusz Guzik
@ 2026-04-27 15:48 ` Christian Brauner
2026-04-28 1:14 ` Aleksa Sarai
` (2 more replies)
1 sibling, 3 replies; 15+ messages in thread
From: Christian Brauner @ 2026-04-27 15:48 UTC (permalink / raw)
To: Jori Koolstra, Jeff Layton
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Arnd Bergmann, H . Peter Anvin,
Jan Kara, Peter Zijlstra, Andrey Albershteyn, Masami Hiramatsu,
Jiri Olsa, Thomas Weißschuh, Mathieu Desnoyers, Aleksa Sarai,
cmirabil, Greg Kroah-Hartman, linux-kernel, linux-fsdevel,
linux-api, linux-arch
On Sun, Apr 12, 2026 at 03:54:33PM +0200, Jori Koolstra wrote:
> Currently there is no way to race-freely create and open a directory.
> For regular files we have open(O_CREAT) for creating a new file inode,
> and returning a pinning fd to it. The lack of such functionality for
> directories means that when populating a directory tree there's always
> a race involved: the inodes first need to be created, and then opened
> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> but in the time window between the creation and the opening they might
> be replaced by something else.
>
> Addressing this race without proper APIs is possible (by immediately
> fstat()ing what was opened, to verify that it has the right inode type),
> but difficult to get right. Hence, mkdirat2() that creates a directory
> and returns an O_DIRECTORY fd is useful.
>
> This feature idea (and description) is taken from the UAPI group:
> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/internal.h | 2 ++
> fs/namei.c | 44 +++++++++++++++++++++++---
> include/linux/syscalls.h | 2 ++
> include/uapi/asm-generic/unistd.h | 5 ++-
> scripts/syscall.tbl | 1 +
> 6 files changed, 50 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 524155d655da..e200ca2067a4 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -396,6 +396,7 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common mkdirat2 sys_mkdirat2
>
> #
> # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/internal.h b/fs/internal.h
> index cbc384a1aa09..c6a79afadacf 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -59,6 +59,8 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link);
> int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
> struct filename *newname, unsigned int flags);
> int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
> + unsigned int flags, bool open);
> int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
> int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
> int filename_linkat(int olddfd, struct filename *old, int newdfd,
> diff --git a/fs/namei.c b/fs/namei.c
> index a880454a6415..6451e96dc225 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -5255,18 +5255,36 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
> }
> EXPORT_SYMBOL(vfs_mkdir);
>
> -int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
> +static int mkdirat_lookup_flags(unsigned int flags)
> +{
> + int lookup_flags = LOOKUP_DIRECTORY;
> +
> + if (!(flags & AT_SYMLINK_NOFOLLOW))
> + lookup_flags |= LOOKUP_FOLLOW;
> + if (!(flags & AT_NO_AUTOMOUNT))
> + lookup_flags |= LOOKUP_AUTOMOUNT;
> +
> + return lookup_flags;
> +}
> +
> +int filename_mkdirat(int dfd, struct filename *name, umode_t mode) {
> + return PTR_ERR_OR_ZERO(do_file_mkdirat(dfd, name, mode, 0, false));
> +}
> +
> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
> + unsigned int flags, bool open)
> {
> struct dentry *dentry;
> struct path path;
> int error;
> - unsigned int lookup_flags = LOOKUP_DIRECTORY;
> + struct file *filp = NULL;
> + unsigned int lookup_flags = mkdirat_lookup_flags(flags);
> struct delegated_inode delegated_inode = { };
>
> retry:
> dentry = filename_create(dfd, name, &path, lookup_flags);
> if (IS_ERR(dentry))
> - return PTR_ERR(dentry);
> + return ERR_CAST(dentry);
>
> error = security_path_mkdir(&path, dentry,
> mode_strip_umask(path.dentry->d_inode, mode));
> @@ -5276,6 +5294,10 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
> if (IS_ERR(dentry))
> error = PTR_ERR(dentry);
> }
> + if (open && !error && !is_delegated(&delegated_inode)) {
> + const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> + filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
> + }
So definitely a patchset worth doing but this will be hairy. And
Mateusz is right. As written this doesn't work. The canonical pattern
how e.g., dentry_open() does it is to preallocate the file.
I do wonder though whether we shouldn't just make O_CREAT | O_DIRECTORY
work. I remember that I had a vague comment about this in [1] a few
years ago (cf. [1]). It might even be less hairy to get that one right
as all the thinking for O_CREAT is already there.
What was the rationale for mkdirat2() instead of threading this through
openat()/openat2() with O_CREAT?
And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
O_DIRECTORY?
[1]: 43b450632676 ("open: return EINVAL for O_DIRECTORY | O_CREAT")
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-27 15:48 ` Christian Brauner
@ 2026-04-28 1:14 ` Aleksa Sarai
2026-04-28 6:39 ` Jeff Layton
2026-04-28 13:39 ` Stefan Metzmacher
2 siblings, 0 replies; 15+ messages in thread
From: Aleksa Sarai @ 2026-04-28 1:14 UTC (permalink / raw)
To: Christian Brauner
Cc: Jori Koolstra, Jeff Layton, Andy Lutomirski, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, Alexander Viro,
Arnd Bergmann, H . Peter Anvin, Jan Kara, Peter Zijlstra,
Andrey Albershteyn, Masami Hiramatsu, Jiri Olsa,
Thomas Weißschuh, Mathieu Desnoyers, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
[-- Attachment #1: Type: text/plain, Size: 1380 bytes --]
On 2026-04-27, Christian Brauner <brauner@kernel.org> wrote:
> On Sun, Apr 12, 2026 at 03:54:33PM +0200, Jori Koolstra wrote:
> > + if (open && !error && !is_delegated(&delegated_inode)) {
> > + const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> > + filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
> > + }
>
> So definitely a patchset worth doing but this will be hairy. And
> Mateusz is right. As written this doesn't work. The canonical pattern
> how e.g., dentry_open() does it is to preallocate the file.
>
> I do wonder though whether we shouldn't just make O_CREAT | O_DIRECTORY
> work. I remember that I had a vague comment about this in [1] a few
> years ago (cf. [1]). It might even be less hairy to get that one right
> as all the thinking for O_CREAT is already there.
That would be my preference, as it would also allow us to use RESOLVE_*
flags nicely.
> What was the rationale for mkdirat2() instead of threading this through
> openat()/openat2() with O_CREAT?
Mateusz said that he didn't like the idea of having more branches in
the open() paths; I think that ship has long since sailed tbh.
> And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
> O_DIRECTORY?
>
> [1]: 43b450632676 ("open: return EINVAL for O_DIRECTORY | O_CREAT")
--
Aleksa Sarai
https://www.cyphar.com/
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-27 15:48 ` Christian Brauner
2026-04-28 1:14 ` Aleksa Sarai
@ 2026-04-28 6:39 ` Jeff Layton
2026-04-28 7:01 ` Jeff Layton
2026-04-28 13:39 ` Stefan Metzmacher
2 siblings, 1 reply; 15+ messages in thread
From: Jeff Layton @ 2026-04-28 6:39 UTC (permalink / raw)
To: Christian Brauner, Jori Koolstra
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Arnd Bergmann, H . Peter Anvin,
Jan Kara, Peter Zijlstra, Andrey Albershteyn, Masami Hiramatsu,
Jiri Olsa, Thomas Weißschuh, Mathieu Desnoyers, Aleksa Sarai,
cmirabil, Greg Kroah-Hartman, linux-kernel, linux-fsdevel,
linux-api, linux-arch
On Mon, 2026-04-27 at 17:48 +0200, Christian Brauner wrote:
>
>
> And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
> O_DIRECTORY?
>
No, it can't. OPEN calls only work on regular files. This is why
O_DIRECTORY works on NFS. If we end up issuing an OPEN against a
directory, it'll fail, which is what we want in that situation.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-28 6:39 ` Jeff Layton
@ 2026-04-28 7:01 ` Jeff Layton
0 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2026-04-28 7:01 UTC (permalink / raw)
To: Christian Brauner, Jori Koolstra
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Arnd Bergmann, H . Peter Anvin,
Jan Kara, Peter Zijlstra, Andrey Albershteyn, Masami Hiramatsu,
Jiri Olsa, Thomas Weißschuh, Mathieu Desnoyers, Aleksa Sarai,
cmirabil, Greg Kroah-Hartman, linux-kernel, linux-fsdevel,
linux-api, linux-arch
On Tue, 2026-04-28 at 07:39 +0100, Jeff Layton wrote:
> On Mon, 2026-04-27 at 17:48 +0200, Christian Brauner wrote:
> >
> >
> > And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
> > O_DIRECTORY?
> >
>
> No, it can't. OPEN calls only work on regular files. This is why
> O_DIRECTORY works on NFS. If we end up issuing an OPEN against a
> directory, it'll fail, which is what we want in that situation.
To be clear, we could make that work by sending a second RPC:
PUTFH+OPEN+.... (OPEN fails with NFS4ERR_ISDIR)
...and then send:
PUTFH+CREATE...
...for a directory (which is how mkdir works in v4). If the calls race
with something else being created in its place, we could just open it
if it's a directory, or fail.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-27 15:48 ` Christian Brauner
2026-04-28 1:14 ` Aleksa Sarai
2026-04-28 6:39 ` Jeff Layton
@ 2026-04-28 13:39 ` Stefan Metzmacher
2026-04-28 13:49 ` Stefan Metzmacher
2026-04-28 14:01 ` Paulo Alcantara
2 siblings, 2 replies; 15+ messages in thread
From: Stefan Metzmacher @ 2026-04-28 13:39 UTC (permalink / raw)
To: Christian Brauner, Jori Koolstra, Jeff Layton
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Arnd Bergmann, H . Peter Anvin,
Jan Kara, Peter Zijlstra, Andrey Albershteyn, Masami Hiramatsu,
Jiri Olsa, Thomas Weißschuh, Mathieu Desnoyers, Aleksa Sarai,
cmirabil, Greg Kroah-Hartman, linux-kernel, linux-fsdevel,
linux-api, linux-arch
Am 27.04.26 um 17:48 schrieb Christian Brauner:
> On Sun, Apr 12, 2026 at 03:54:33PM +0200, Jori Koolstra wrote:
>> Currently there is no way to race-freely create and open a directory.
>> For regular files we have open(O_CREAT) for creating a new file inode,
>> and returning a pinning fd to it. The lack of such functionality for
>> directories means that when populating a directory tree there's always
>> a race involved: the inodes first need to be created, and then opened
>> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
>> but in the time window between the creation and the opening they might
>> be replaced by something else.
>>
>> Addressing this race without proper APIs is possible (by immediately
>> fstat()ing what was opened, to verify that it has the right inode type),
>> but difficult to get right. Hence, mkdirat2() that creates a directory
>> and returns an O_DIRECTORY fd is useful.
>>
>> This feature idea (and description) is taken from the UAPI group:
>> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>>
>> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>> ---
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/internal.h | 2 ++
>> fs/namei.c | 44 +++++++++++++++++++++++---
>> include/linux/syscalls.h | 2 ++
>> include/uapi/asm-generic/unistd.h | 5 ++-
>> scripts/syscall.tbl | 1 +
>> 6 files changed, 50 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 524155d655da..e200ca2067a4 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -396,6 +396,7 @@
>> 469 common file_setattr sys_file_setattr
>> 470 common listns sys_listns
>> 471 common rseq_slice_yield sys_rseq_slice_yield
>> +472 common mkdirat2 sys_mkdirat2
>>
>> #
>> # Due to a historical design error, certain syscalls are numbered differently
>> diff --git a/fs/internal.h b/fs/internal.h
>> index cbc384a1aa09..c6a79afadacf 100644
>> --- a/fs/internal.h
>> +++ b/fs/internal.h
>> @@ -59,6 +59,8 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link);
>> int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
>> struct filename *newname, unsigned int flags);
>> int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
>> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
>> + unsigned int flags, bool open);
>> int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
>> int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
>> int filename_linkat(int olddfd, struct filename *old, int newdfd,
>> diff --git a/fs/namei.c b/fs/namei.c
>> index a880454a6415..6451e96dc225 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -5255,18 +5255,36 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
>> }
>> EXPORT_SYMBOL(vfs_mkdir);
>>
>> -int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>> +static int mkdirat_lookup_flags(unsigned int flags)
>> +{
>> + int lookup_flags = LOOKUP_DIRECTORY;
>> +
>> + if (!(flags & AT_SYMLINK_NOFOLLOW))
>> + lookup_flags |= LOOKUP_FOLLOW;
>> + if (!(flags & AT_NO_AUTOMOUNT))
>> + lookup_flags |= LOOKUP_AUTOMOUNT;
>> +
>> + return lookup_flags;
>> +}
>> +
>> +int filename_mkdirat(int dfd, struct filename *name, umode_t mode) {
>> + return PTR_ERR_OR_ZERO(do_file_mkdirat(dfd, name, mode, 0, false));
>> +}
>> +
>> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
>> + unsigned int flags, bool open)
>> {
>> struct dentry *dentry;
>> struct path path;
>> int error;
>> - unsigned int lookup_flags = LOOKUP_DIRECTORY;
>> + struct file *filp = NULL;
>> + unsigned int lookup_flags = mkdirat_lookup_flags(flags);
>> struct delegated_inode delegated_inode = { };
>>
>> retry:
>> dentry = filename_create(dfd, name, &path, lookup_flags);
>> if (IS_ERR(dentry))
>> - return PTR_ERR(dentry);
>> + return ERR_CAST(dentry);
>>
>> error = security_path_mkdir(&path, dentry,
>> mode_strip_umask(path.dentry->d_inode, mode));
>> @@ -5276,6 +5294,10 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>> if (IS_ERR(dentry))
>> error = PTR_ERR(dentry);
>> }
>> + if (open && !error && !is_delegated(&delegated_inode)) {
>> + const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
>> + filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
>> + }
>
> So definitely a patchset worthing doing but this will be hairy. And
> Mateusz is right. As written this doesn't work. The canonical pattern
> how e.g., dentry_open() does it is to preallocate the file.
>
> I do wonder though whether we shouldn't just make O_CREAT | O_DIRECTORY
> work. I remember that I had a vague comment about this in [1] a few
> years ago (cf. [1]). It might even be less hairy to get that one right
> as all the thinking for O_CREAT is already there.
>
> What was the rationale for mkdirat2() instead of threading this through
> openat()/openat2() with O_CREAT?
>
> And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
> O_DIRECTORY?
If it helps the SMB2/3 protocol only has a single SMB2 Create operation
that uses FILE_CREATE+FILE_NON_DIRECTORY_FILE or FILE_CREATE+FILE_DIRECTORY_FILE.
Given that openat() ignores unknown flags or combinations, maybe this
should be openat2 only and even a new flag (at least for the userspace interface).
Or do_sys_open() will reject it for open and openat.
While we're there an O_TMPDIR would also be wonderful to have.
Currently samba works around it by using a hidden directory name, invisible
for SMB clients, but nfs and local users see it.
metze
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-28 13:39 ` Stefan Metzmacher
@ 2026-04-28 13:49 ` Stefan Metzmacher
2026-04-28 14:01 ` Paulo Alcantara
1 sibling, 0 replies; 15+ messages in thread
From: Stefan Metzmacher @ 2026-04-28 13:49 UTC (permalink / raw)
To: Christian Brauner, Jori Koolstra, Jeff Layton
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Arnd Bergmann, H . Peter Anvin,
Jan Kara, Peter Zijlstra, Andrey Albershteyn, Masami Hiramatsu,
Jiri Olsa, Thomas Weißschuh, Mathieu Desnoyers, Aleksa Sarai,
cmirabil, Greg Kroah-Hartman, linux-kernel, linux-fsdevel,
linux-api, linux-arch
Am 28.04.26 um 15:39 schrieb Stefan Metzmacher:
> Am 27.04.26 um 17:48 schrieb Christian Brauner:
>> On Sun, Apr 12, 2026 at 03:54:33PM +0200, Jori Koolstra wrote:
>>> Currently there is no way to race-freely create and open a directory.
>>> For regular files we have open(O_CREAT) for creating a new file inode,
>>> and returning a pinning fd to it. The lack of such functionality for
>>> directories means that when populating a directory tree there's always
>>> a race involved: the inodes first need to be created, and then opened
>>> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
>>> but in the time window between the creation and the opening they might
>>> be replaced by something else.
>>>
>>> Addressing this race without proper APIs is possible (by immediately
>>> fstat()ing what was opened, to verify that it has the right inode type),
>>> but difficult to get right. Hence, mkdirat2() that creates a directory
>>> and returns an O_DIRECTORY fd is useful.
>>>
>>> This feature idea (and description) is taken from the UAPI group:
>>> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>>>
>>> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>>> ---
>>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>>> fs/internal.h | 2 ++
>>> fs/namei.c | 44 +++++++++++++++++++++++---
>>> include/linux/syscalls.h | 2 ++
>>> include/uapi/asm-generic/unistd.h | 5 ++-
>>> scripts/syscall.tbl | 1 +
>>> 6 files changed, 50 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>>> index 524155d655da..e200ca2067a4 100644
>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>> @@ -396,6 +396,7 @@
>>> 469 common file_setattr sys_file_setattr
>>> 470 common listns sys_listns
>>> 471 common rseq_slice_yield sys_rseq_slice_yield
>>> +472 common mkdirat2 sys_mkdirat2
>>> #
>>> # Due to a historical design error, certain syscalls are numbered differently
>>> diff --git a/fs/internal.h b/fs/internal.h
>>> index cbc384a1aa09..c6a79afadacf 100644
>>> --- a/fs/internal.h
>>> +++ b/fs/internal.h
>>> @@ -59,6 +59,8 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link);
>>> int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
>>> struct filename *newname, unsigned int flags);
>>> int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
>>> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
>>> + unsigned int flags, bool open);
>>> int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
>>> int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
>>> int filename_linkat(int olddfd, struct filename *old, int newdfd,
>>> diff --git a/fs/namei.c b/fs/namei.c
>>> index a880454a6415..6451e96dc225 100644
>>> --- a/fs/namei.c
>>> +++ b/fs/namei.c
>>> @@ -5255,18 +5255,36 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
>>> }
>>> EXPORT_SYMBOL(vfs_mkdir);
>>> -int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>>> +static int mkdirat_lookup_flags(unsigned int flags)
>>> +{
>>> + int lookup_flags = LOOKUP_DIRECTORY;
>>> +
>>> + if (!(flags & AT_SYMLINK_NOFOLLOW))
>>> + lookup_flags |= LOOKUP_FOLLOW;
>>> + if (!(flags & AT_NO_AUTOMOUNT))
>>> + lookup_flags |= LOOKUP_AUTOMOUNT;
>>> +
>>> + return lookup_flags;
>>> +}
>>> +
>>> +int filename_mkdirat(int dfd, struct filename *name, umode_t mode) {
>>> + return PTR_ERR_OR_ZERO(do_file_mkdirat(dfd, name, mode, 0, false));
>>> +}
>>> +
>>> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
>>> + unsigned int flags, bool open)
>>> {
>>> struct dentry *dentry;
>>> struct path path;
>>> int error;
>>> - unsigned int lookup_flags = LOOKUP_DIRECTORY;
>>> + struct file *filp = NULL;
>>> + unsigned int lookup_flags = mkdirat_lookup_flags(flags);
>>> struct delegated_inode delegated_inode = { };
>>> retry:
>>> dentry = filename_create(dfd, name, &path, lookup_flags);
>>> if (IS_ERR(dentry))
>>> - return PTR_ERR(dentry);
>>> + return ERR_CAST(dentry);
>>> error = security_path_mkdir(&path, dentry,
>>> mode_strip_umask(path.dentry->d_inode, mode));
>>> @@ -5276,6 +5294,10 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>>> if (IS_ERR(dentry))
>>> error = PTR_ERR(dentry);
>>> }
>>> + if (open && !error && !is_delegated(&delegated_inode)) {
>>> + const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
>>> + filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
>>> + }
>>
>> So definitely a patchset worthing doing but this will be hairy. And
>> Mateusz is right. As written this doesn't work. The canonical pattern
>> how e.g., dentry_open() does it is to preallocate the file.
>>
>> I do wonder though whether we shouldn't just make O_CREAT | O_DIRECTORY
>> work. I remember that I had a vague comment about this in [1] a few
>> years ago (cf. [1]). It might even be less hairy to get that one right
>> as all the thinking for O_CREAT is already there.
>>
>> What was the rationale for mkdirat2() instead of threading this through
>> openat()/openat2() with O_CREAT?
>>
>> And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
>> O_DIRECTORY?
>
> If it helps the SMB2/3 protocol only has a single SMB2 Create operation
> that uses FILE_CREATE+FILE_NON_DIRECTORY_FILE or FILE_CREATE+FILE_DIRECTORY_FILE.
>
> Given all the openat() ignores unknown flags or combinations, maybe this
> should be openat2 only and even a new flag (at the for the userspace interface).
> or do_sys_open() will reject it for open and openat.
I just found the interaction of __O_TMPFILE and O_DIRECTORY;
there should be an O_MKDIR or something similar that's openat2 only.
> While we're there an O_TMPDIR would also be wonderful to have.
> Currently samba works around it by using a hidden directory name, invisible
> for SMB clients, but nfs and local users see it.
That should also be openat2 only if added.
metze
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH v2 1/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd
2026-04-28 13:39 ` Stefan Metzmacher
2026-04-28 13:49 ` Stefan Metzmacher
@ 2026-04-28 14:01 ` Paulo Alcantara
1 sibling, 0 replies; 15+ messages in thread
From: Paulo Alcantara @ 2026-04-28 14:01 UTC (permalink / raw)
To: Stefan Metzmacher, Christian Brauner, Jori Koolstra, Jeff Layton
Cc: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Arnd Bergmann, H . Peter Anvin,
Jan Kara, Peter Zijlstra, Andrey Albershteyn, Masami Hiramatsu,
Jiri Olsa, Thomas Weißschuh, Mathieu Desnoyers, Aleksa Sarai,
cmirabil, Greg Kroah-Hartman, linux-kernel, linux-fsdevel,
linux-api, linux-arch
Stefan Metzmacher <metze@samba.org> writes:
> Am 27.04.26 um 17:48 schrieb Christian Brauner:
>> On Sun, Apr 12, 2026 at 03:54:33PM +0200, Jori Koolstra wrote:
>>> Currently there is no way to race-freely create and open a directory.
>>> For regular files we have open(O_CREAT) for creating a new file inode,
>>> and returning a pinning fd to it. The lack of such functionality for
>>> directories means that when populating a directory tree there's always
>>> a race involved: the inodes first need to be created, and then opened
>>> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
>>> but in the time window between the creation and the opening they might
>>> be replaced by something else.
>>>
>>> Addressing this race without proper APIs is possible (by immediately
>>> fstat()ing what was opened, to verify that it has the right inode type),
>>> but difficult to get right. Hence, mkdirat2() that creates a directory
>>> and returns an O_DIRECTORY fd is useful.
>>>
>>> This feature idea (and description) is taken from the UAPI group:
>>> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>>>
>>> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>>> ---
>>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>>> fs/internal.h | 2 ++
>>> fs/namei.c | 44 +++++++++++++++++++++++---
>>> include/linux/syscalls.h | 2 ++
>>> include/uapi/asm-generic/unistd.h | 5 ++-
>>> scripts/syscall.tbl | 1 +
>>> 6 files changed, 50 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>>> index 524155d655da..e200ca2067a4 100644
>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>> @@ -396,6 +396,7 @@
>>> 469 common file_setattr sys_file_setattr
>>> 470 common listns sys_listns
>>> 471 common rseq_slice_yield sys_rseq_slice_yield
>>> +472 common mkdirat2 sys_mkdirat2
>>>
>>> #
>>> # Due to a historical design error, certain syscalls are numbered differently
>>> diff --git a/fs/internal.h b/fs/internal.h
>>> index cbc384a1aa09..c6a79afadacf 100644
>>> --- a/fs/internal.h
>>> +++ b/fs/internal.h
>>> @@ -59,6 +59,8 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link);
>>> int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
>>> struct filename *newname, unsigned int flags);
>>> int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
>>> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
>>> + unsigned int flags, bool open);
>>> int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
>>> int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
>>> int filename_linkat(int olddfd, struct filename *old, int newdfd,
>>> diff --git a/fs/namei.c b/fs/namei.c
>>> index a880454a6415..6451e96dc225 100644
>>> --- a/fs/namei.c
>>> +++ b/fs/namei.c
>>> @@ -5255,18 +5255,36 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
>>> }
>>> EXPORT_SYMBOL(vfs_mkdir);
>>>
>>> -int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>>> +static int mkdirat_lookup_flags(unsigned int flags)
>>> +{
>>> + int lookup_flags = LOOKUP_DIRECTORY;
>>> +
>>> + if (!(flags & AT_SYMLINK_NOFOLLOW))
>>> + lookup_flags |= LOOKUP_FOLLOW;
>>> + if (!(flags & AT_NO_AUTOMOUNT))
>>> + lookup_flags |= LOOKUP_AUTOMOUNT;
>>> +
>>> + return lookup_flags;
>>> +}
>>> +
>>> +int filename_mkdirat(int dfd, struct filename *name, umode_t mode) {
>>> + return PTR_ERR_OR_ZERO(do_file_mkdirat(dfd, name, mode, 0, false));
>>> +}
>>> +
>>> +struct file *do_file_mkdirat(int dfd, struct filename *name, umode_t mode,
>>> + unsigned int flags, bool open)
>>> {
>>> struct dentry *dentry;
>>> struct path path;
>>> int error;
>>> - unsigned int lookup_flags = LOOKUP_DIRECTORY;
>>> + struct file *filp = NULL;
>>> + unsigned int lookup_flags = mkdirat_lookup_flags(flags);
>>> struct delegated_inode delegated_inode = { };
>>>
>>> retry:
>>> dentry = filename_create(dfd, name, &path, lookup_flags);
>>> if (IS_ERR(dentry))
>>> - return PTR_ERR(dentry);
>>> + return ERR_CAST(dentry);
>>>
>>> error = security_path_mkdir(&path, dentry,
>>> mode_strip_umask(path.dentry->d_inode, mode));
>>> @@ -5276,6 +5294,10 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>>> if (IS_ERR(dentry))
>>> error = PTR_ERR(dentry);
>>> }
>>> + if (open && !error && !is_delegated(&delegated_inode)) {
>>> + const struct path new_path = { .mnt = path.mnt, .dentry = dentry };
>>> + filp = dentry_open(&new_path, O_DIRECTORY, current_cred());
>>> + }
>>
>> So definitely a patchset worthing doing but this will be hairy. And
>> Mateusz is right. As written this doesn't work. The canonical pattern
>> how e.g., dentry_open() does it is to preallocate the file.
>>
>> I do wonder though whether we shouldn't just make O_CREAT | O_DIRECTORY
>> work. I remember that I had a vague comment about this in [1] a few
>> years ago (cf. [1]). It might even be less hairy to get that one right
>> as all the thinking for O_CREAT is already there.
>>
>> What was the rationale for mkdirat2() instead of threading this through
>> openat()/openat2() with O_CREAT?
>>
>> And side-question: @Jeff, can nfs atomic open deal with O_CREAT |
>> O_DIRECTORY?
>
> If it helps the SMB2/3 protocol only has a single SMB2 Create operation
> that uses FILE_CREATE+FILE_NON_DIRECTORY_FILE or FILE_CREATE+FILE_DIRECTORY_FILE.
Yes. However cifs.ko will handle atomic open of regular files only.
IIRC, NFS doesn't handle atomic opens of directories either. Jeff
could confirm that.
^ permalink raw reply [flat|nested] 15+ messages in thread
* [RFC PATCH v2 2/2] selftest: add tests for mkdirat2()
2026-04-12 13:54 [RFC PATCH v2 0/2] vfs: syscalls: add mkdirat2() that returns an O_DIRECTORY fd Jori Koolstra
2026-04-12 13:54 ` [RFC PATCH v2 1/2] " Jori Koolstra
@ 2026-04-12 13:54 ` Jori Koolstra
1 sibling, 0 replies; 15+ messages in thread
From: Jori Koolstra @ 2026-04-12 13:54 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, Alexander Viro, Christian Brauner,
Arnd Bergmann
Cc: H . Peter Anvin, Jan Kara, Peter Zijlstra, Andrey Albershteyn,
Masami Hiramatsu, Jori Koolstra, Jiri Olsa, Thomas Weißschuh,
Mathieu Desnoyers, Jeff Layton, Aleksa Sarai, cmirabil,
Greg Kroah-Hartman, linux-kernel, linux-fsdevel, linux-api,
linux-arch
Add some tests for the new mkdirat2() syscall to test compliance and
to showcase its behaviour.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
tools/include/uapi/asm-generic/unistd.h | 5 +-
.../testing/selftests/filesystems/.gitignore | 1 +
tools/testing/selftests/filesystems/Makefile | 4 +-
.../selftests/filesystems/mkdirat_fd_test.c | 143 ++++++++++++++++++
4 files changed, 150 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/filesystems/mkdirat_fd_test.c
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index a627acc8fb5f..6efc21779b62 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -863,8 +863,11 @@ __SYSCALL(__NR_listns, sys_listns)
#define __NR_rseq_slice_yield 471
__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+#define __NR_mkdirat2 472
+__SYSCALL(__NR_mkdirat2, sys_mkdirat2)
+
#undef __NR_syscalls
-#define __NR_syscalls 472
+#define __NR_syscalls 473
/*
* 32 bit systems traditionally used different
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index 64ac0dfa46b7..84e2175d171f 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -5,3 +5,4 @@ fclog
file_stressor
anon_inode_test
kernfs_test
+mkdirat_fd_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 85427d7f19b9..7357769db57a 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
+CFLAGS += $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog mkdirat_fd_test
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/mkdirat_fd_test.c b/tools/testing/selftests/filesystems/mkdirat_fd_test.c
new file mode 100644
index 000000000000..a02c0223d63b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mkdirat_fd_test.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/stat.h>
+
+#include <asm-generic/unistd.h>
+
+#include "kselftest_harness.h"
+
+#ifndef VALID_MKDIRAT2_FLAGS
+#define VALID_MKDIRAT2_FLAGS (AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)
+#endif
+
+#define mkdirat2_checked_flags(dfd, pathname, flags) ({ \
+ struct stat __st; \
+ int __fd = sys_mkdirat2(dfd, pathname, S_IRWXU, flags); \
+ ASSERT_GE(__fd, 0); \
+ EXPECT_EQ(fstat(__fd, &__st), 0); \
+ EXPECT_TRUE(S_ISDIR(__st.st_mode)); \
+ __fd; \
+})
+
+#define mkdirat2_checked(dfd, pathname) \
+ mkdirat2_checked_flags(dfd, pathname, 0)
+
+
+static inline int sys_mkdirat2(int dfd, const char *pathname, mode_t mode,
+ unsigned int flags)
+{
+ return syscall(__NR_mkdirat2, dfd, pathname, mode, flags);
+}
+
+FIXTURE(mkdirat2) {
+ char dirpath[PATH_MAX];
+ int dfd;
+};
+
+FIXTURE_SETUP(mkdirat2)
+{
+ snprintf(self->dirpath, sizeof(self->dirpath),
+ "/tmp/mkdirat2_test.%d", getpid());
+ ASSERT_EQ(mkdir(self->dirpath, S_IRWXU), 0);
+
+ self->dfd = open(self->dirpath, O_DIRECTORY);
+ ASSERT_GE(self->dfd, 0);
+}
+
+FIXTURE_TEARDOWN(mkdirat2)
+{
+ close(self->dfd);
+ rmdir(self->dirpath);
+}
+
+/* Does mkdirat2 return a fd at all */
+TEST_F(mkdirat2, returns_fd)
+{
+ int fd = mkdirat2_checked(self->dfd, "newdir");
+ EXPECT_EQ(close(fd), 0);
+ EXPECT_EQ(unlinkat(self->dfd, "newdir", AT_REMOVEDIR), 0);
+}
+
+/* The fd must refer to the directory that was just created. */
+TEST_F(mkdirat2, fd_is_created_dir)
+{
+ int fd;
+ struct stat st_via_fd, st_via_path;
+ char path[PATH_MAX];
+
+ fd = mkdirat2_checked(self->dfd, "checkdir");
+
+ ASSERT_EQ(fstat(fd, &st_via_fd), 0);
+
+ snprintf(path, sizeof(path), "%s/checkdir", self->dirpath);
+ ASSERT_EQ(stat(path, &st_via_path), 0);
+
+ EXPECT_EQ(st_via_fd.st_ino, st_via_path.st_ino);
+ EXPECT_EQ(st_via_fd.st_dev, st_via_path.st_dev);
+
+ EXPECT_EQ(close(fd), 0);
+ EXPECT_EQ(rmdir(path), 0);
+}
+
+
+/* Missing parent component must fail with ENOENT. */
+TEST_F(mkdirat2, enoent_missing_parent)
+{
+ EXPECT_EQ(sys_mkdirat2(self->dfd, "nonexistent/child", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, ENOENT);
+}
+
+/* An invalid dfd must fail with EBADF. */
+TEST_F(mkdirat2, ebadf)
+{
+ EXPECT_EQ(sys_mkdirat2(-42, "badfdir", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, EBADF);
+}
+
+/* A dfd that points to a file (not a directory) must fail with ENOTDIR. */
+TEST_F(mkdirat2, enotdir_dfd)
+{
+ int file_fd;
+
+ file_fd = openat(self->dfd, "file",
+ O_CREAT | O_WRONLY, S_IRWXU);
+ ASSERT_GE(file_fd, 0);
+
+ EXPECT_EQ(sys_mkdirat2(file_fd, "subdir", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, ENOTDIR);
+
+ EXPECT_EQ(close(file_fd), 0);
+ EXPECT_EQ(unlinkat(self->dfd, "file", 0), 0);
+}
+
+/*
+ * The returned fd must be usable as a dfd for further *at() calls.
+ */
+TEST_F(mkdirat2, fd_usable_as_dfd)
+{
+ int parent_fd, child_fd;
+
+ parent_fd = mkdirat2_checked(self->dfd, "parent");
+ child_fd = mkdirat2_checked(parent_fd, "child");
+
+ EXPECT_EQ(close(child_fd), 0);
+ EXPECT_EQ(close(parent_fd), 0);
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/parent/child", self->dirpath);
+ EXPECT_EQ(rmdir(path), 0);
+ snprintf(path, sizeof(path), "%s/parent", self->dirpath);
+ EXPECT_EQ(rmdir(path), 0);
+}
+
+/* Unknown flags must be rejected with EINVAL. */
+TEST_F(mkdirat2, einval_unknown_flags)
+{
+ EXPECT_EQ(sys_mkdirat2(self->dfd, "flagsdir", S_IRWXU, ~VALID_MKDIRAT2_FLAGS ), -1);
+ EXPECT_EQ(errno, EINVAL);
+}
+
+TEST_HARNESS_MAIN
--
2.53.0
^ permalink raw reply related [flat|nested] 15+ messages in thread