* [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2)
@ 2026-06-14 16:44 Jori Koolstra
2026-06-14 16:44 ` [PATCH 01/12] fs/namei.c: use trailing_slashes() Jori Koolstra
` (11 more replies)
0 siblings, 12 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
This series implements new semantics for the O_CREAT|O_DIRECTORY flag
combination for open*(2): perform a mkdir and open the resulting
directory; return a pinning fd (which mkdir does not).
Jori Koolstra (12):
fs/namei.c: use trailing_slashes()
fs/namei.c: move create error && negative dentry case in lookup_open
up
vfs: prepare vfs_creat|mkdir_no_perm for reuse in lookup_open()
fs/namei.c: lookup_open(): move audit_inode_child() up
vfs: lookup_open(): move setting FMODE_CREATED up when calling
create()
vfs: lookup_open(): move i_op->create check to before
try_break_deleg()
vfs: lookup_open(): use vfs_create_no_perm()
vfs: add O_CREAT|O_DIRECTORY to open*(2)
vfs: move O_IS_MKDIR check out atomic_open() to individual filesystems
vfs: refuse O_CREAT for directories through a dangling symlink
vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens
selftest: add tests for open*(O_CREAT|O_DIRECTORY)
fs/9p/vfs_inode.c | 3 +
fs/9p/vfs_inode_dotl.c | 3 +
fs/ceph/file.c | 3 +
fs/fuse/dir.c | 3 +
fs/gfs2/inode.c | 3 +
fs/namei.c | 233 ++++++++++++------
fs/nfs/dir.c | 3 +
fs/nfs/file.c | 3 +
fs/open.c | 32 ++-
fs/smb/client/dir.c | 3 +
fs/vboxsf/dir.c | 3 +
include/linux/fcntl.h | 7 +
.../testing/selftests/filesystems/.gitignore | 1 +
tools/testing/selftests/filesystems/Makefile | 4 +-
tools/testing/selftests/filesystems/fclog.c | 1 +
.../filesystems/open_o_creat_o_dir.c | 197 +++++++++++++++
16 files changed, 413 insertions(+), 89 deletions(-)
create mode 100644 tools/testing/selftests/filesystems/open_o_creat_o_dir.c
base-commit: 8cd9520d35a6c38db6567e97dd93b1f11f185dc6
--
2.54.0
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 01/12] fs/namei.c: use trailing_slashes()
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-15 9:26 ` David Laight
2026-06-14 16:44 ` [PATCH 02/12] fs/namei.c: move create error && negative dentry case in lookup_open up Jori Koolstra
` (10 subsequent siblings)
11 siblings, 1 reply; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
There are several places in fs/namei.c that can use the
trailing_slashes() function to improve intent.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 4787244ca4a7..64b91ed9efb7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2777,9 +2777,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
return s;
}
+static inline bool trailing_slashes(const struct qstr last)
+{
+ return (bool)last.name[last.len];
+}
+
static inline const char *lookup_last(struct nameidata *nd)
{
- if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+ if (nd->last_type == LAST_NORM && trailing_slashes(nd->last))
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
return walk_component(nd, WALK_TRAILING);
@@ -4524,17 +4529,12 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
return ERR_PTR(error);
}
-static inline bool trailing_slashes(struct nameidata *nd)
-{
- return (bool)nd->last.name[nd->last.len];
-}
-
static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
{
struct dentry *dentry;
if (open_flag & O_CREAT) {
- if (trailing_slashes(nd))
+ if (trailing_slashes(nd->last))
return ERR_PTR(-EISDIR);
/* Don't bother on an O_EXCL create */
@@ -4542,7 +4542,7 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
return NULL;
}
- if (trailing_slashes(nd))
+ if (trailing_slashes(nd->last))
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
dentry = lookup_fast(nd);
@@ -4945,7 +4945,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* Do the final lookup. Suppress 'create' if there is a trailing
* '/', and a directory wasn't requested.
*/
- if (last.name[last.len] && !want_dir)
+ if (trailing_slashes(last) && !want_dir)
create_flags &= ~LOOKUP_CREATE;
dentry = start_dirop(path->dentry, &last, reval_flag | create_flags);
if (IS_ERR(dentry))
@@ -5562,7 +5562,7 @@ int filename_unlinkat(int dfd, struct filename *name)
goto exit_drop_write;
/* Why not before? Because we want correct error value */
- if (unlikely(last.name[last.len])) {
+ if (unlikely(trailing_slashes(last))) {
if (d_is_dir(dentry))
error = -EISDIR;
else
@@ -6161,16 +6161,16 @@ int filename_renameat2(int olddfd, struct filename *from,
if (flags & RENAME_EXCHANGE) {
if (!d_is_dir(rd.new_dentry)) {
error = -ENOTDIR;
- if (new_last.name[new_last.len])
+ if (trailing_slashes(new_last))
goto exit_unlock;
}
}
/* unless the source is a directory trailing slashes give -ENOTDIR */
if (!d_is_dir(rd.old_dentry)) {
error = -ENOTDIR;
- if (old_last.name[old_last.len])
+ if (trailing_slashes(old_last))
goto exit_unlock;
- if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
+ if (!(flags & RENAME_EXCHANGE) && trailing_slashes(new_last))
goto exit_unlock;
}
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 02/12] fs/namei.c: move create error && negative dentry case in lookup_open up
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
2026-06-14 16:44 ` [PATCH 01/12] fs/namei.c: use trailing_slashes() Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-15 9:37 ` David Laight
2026-06-14 16:44 ` [PATCH 03/12] vfs: prepare vfs_creat|mkdir_no_perm for reuse in lookup_open() Jori Koolstra
` (9 subsequent siblings)
11 siblings, 1 reply; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
O_CREAT is stripped when create_error is set in lookup_open(), so when
lookup does not return an inode, the case
if (!dentry->d_inode && (open_flag & O_CREAT))
is always skipped. We can get rid of this cognitive step by handling the
error case first.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 64b91ed9efb7..f169d1123b17 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4499,6 +4499,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
}
}
+ if (unlikely(create_error) && !dentry->d_inode) {
+ error = create_error;
+ goto out_dput;
+ }
+
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
/* but break the directory lease first! */
@@ -4518,10 +4523,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (error)
goto out_dput;
}
- if (unlikely(create_error) && !dentry->d_inode) {
- error = create_error;
- goto out_dput;
- }
+
return dentry;
out_dput:
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 03/12] vfs: prepare vfs_creat|mkdir_no_perm for reuse in lookup_open()
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
2026-06-14 16:44 ` [PATCH 01/12] fs/namei.c: use trailing_slashes() Jori Koolstra
2026-06-14 16:44 ` [PATCH 02/12] fs/namei.c: move create error && negative dentry case in lookup_open up Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 04/12] fs/namei.c: lookup_open(): move audit_inode_child() up Jori Koolstra
` (8 subsequent siblings)
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
To implement O_CREAT|O_DIRECTORY we will have to repeat some of the
logic that is now in vfs_mkdir() (e.g. do error checks in the same
order). Separate this out in vfs_mkdir_no_perm(), which does all the
non-permission related work of vfs_mkdir(). Permission checking for the
lookup_open() path is timed differently because we may just be doing an
open and no create. Similar considerations give rise to
vfs_create_no_perm().
Moving the fsnotify_* calls also allows us to deal with this in one
place for each type of operation. This does mean that we also need
to move the fsnotify_* calls into atomic_open() for the atomic open
case, but this actually reduces duplicate code in open_last_lookups()
and dentry_create().
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 105 ++++++++++++++++++++++++++++++++++-------------------
1 file changed, 68 insertions(+), 37 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index f169d1123b17..6bf1ded26377 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4171,6 +4171,24 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
return mode;
}
+static inline
+int vfs_create_no_perm(struct mnt_idmap *idmap, struct dentry *dentry,
+ umode_t mode, struct delegated_inode *di)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ int error;
+
+ error = try_break_deleg(dir, di);
+ if (error)
+ return error;
+
+ error = dir->i_op->create(idmap, dir, dentry, mode, true);
+ if (!error)
+ fsnotify_create(dir, dentry);
+
+ return error;
+}
+
/**
* vfs_create - create new file
* @idmap: idmap of the mount the inode was found from
@@ -4203,13 +4221,8 @@ int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- error = try_break_deleg(dir, di);
- if (error)
- return error;
- error = dir->i_op->create(idmap, dir, dentry, mode, true);
- if (!error)
- fsnotify_create(dir, dentry);
- return error;
+
+ return vfs_create_no_perm(idmap, dentry, mode, di);
}
EXPORT_SYMBOL(vfs_create);
@@ -4386,10 +4399,17 @@ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry
error = -ENOENT;
}
}
+
if (error) {
dput(dentry);
dentry = ERR_PTR(error);
+ } else {
+ if (file->f_mode & FMODE_CREATED)
+ fsnotify_create(dir, dentry);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
}
+
return dentry;
}
@@ -4522,6 +4542,8 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
mode, open_flag & O_EXCL);
if (error)
goto out_dput;
+
+ fsnotify_create(dir_inode, dentry);
}
return dentry;
@@ -4610,13 +4632,9 @@ static const char *open_last_lookups(struct nameidata *nd,
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
+
dentry = lookup_open(nd, file, op, got_write, &delegated_inode);
- if (!IS_ERR(dentry)) {
- if (file->f_mode & FMODE_CREATED)
- fsnotify_create(dir->d_inode, dentry);
- if (file->f_mode & FMODE_OPENED)
- fsnotify_open(file);
- }
+
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
@@ -5061,13 +5079,6 @@ struct file *dentry_create(struct path *path, int flags, umode_t mode,
if (unlikely(create_error) && error == -ENOENT)
error = create_error;
- if (!error) {
- if (file->f_mode & FMODE_CREATED)
- fsnotify_create(dir->d_inode, dentry);
- if (file->f_mode & FMODE_OPENED)
- fsnotify_open(file);
- }
-
path->dentry = dentry;
} else {
@@ -5219,6 +5230,39 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
return filename_mknodat(AT_FDCWD, name, mode, dev);
}
+static inline
+struct dentry *vfs_mkdir_no_perm(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
+ struct delegated_inode *di)
+{
+ int error;
+ struct dentry *de;
+ unsigned max_links = dir->i_sb->s_max_links;
+
+ error = -EMLINK;
+ if (max_links && dir->i_nlink >= max_links)
+ goto err;
+
+ error = try_break_deleg(dir, di);
+ if (error)
+ goto err;
+
+ de = dir->i_op->mkdir(idmap, dir, dentry, mode);
+ if (IS_ERR(de)) {
+ error = PTR_ERR(de);
+ goto err;
+ }
+ if (de) {
+ dput(dentry);
+ dentry = de;
+ }
+ fsnotify_mkdir(dir, dentry);
+ return dentry;
+
+err:
+ return ERR_PTR(error);
+}
+
/**
* vfs_mkdir - create directory returning correct dentry if possible
* @idmap: idmap of the mount the inode was found from
@@ -5246,7 +5290,6 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct delegated_inode *delegated_inode)
{
int error;
- unsigned max_links = dir->i_sb->s_max_links;
struct dentry *de;
error = may_create_dentry(idmap, dir, dentry);
@@ -5262,24 +5305,12 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (error)
goto err;
- error = -EMLINK;
- if (max_links && dir->i_nlink >= max_links)
- goto err;
-
- error = try_break_deleg(dir, delegated_inode);
- if (error)
- goto err;
-
- de = dir->i_op->mkdir(idmap, dir, dentry, mode);
- error = PTR_ERR(de);
- if (IS_ERR(de))
+ de = vfs_mkdir_no_perm(idmap, dir, dentry, mode, delegated_inode);
+ if (IS_ERR(de)) {
+ error = PTR_ERR(de);
goto err;
- if (de) {
- dput(dentry);
- dentry = de;
}
- fsnotify_mkdir(dir, dentry);
- return dentry;
+ return de;
err:
end_creating(dentry);
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 04/12] fs/namei.c: lookup_open(): move audit_inode_child() up
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (2 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 03/12] vfs: prepare vfs_creat|mkdir_no_perm for reuse in lookup_open() Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-15 21:43 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 05/12] vfs: lookup_open(): move setting FMODE_CREATED up when calling create() Jori Koolstra
` (7 subsequent siblings)
11 siblings, 1 reply; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
In the mknod(2) path of calling vfs_create() we call audit_inode_child()
before permission checks in may_create_dentry() (but after path-based
LSM check). Copy this behaviour to lookup_open() and move
audit_inode_child() to may_o_create().
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/namei.c b/fs/namei.c
index 6bf1ded26377..a4a8cdbb48e2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4345,6 +4345,8 @@ static int may_o_create(struct mnt_idmap *idmap,
if (error)
return error;
+ audit_inode_child(dir->dentry->d_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+
if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
return -EOVERFLOW;
@@ -4532,7 +4534,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
goto out_dput;
file->f_mode |= FMODE_CREATED;
- audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
if (!dir_inode->i_op->create) {
error = -EACCES;
goto out_dput;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 05/12] vfs: lookup_open(): move setting FMODE_CREATED up when calling create()
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (3 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 04/12] fs/namei.c: lookup_open(): move audit_inode_child() up Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 06/12] vfs: lookup_open(): move i_op->create check to before try_break_deleg() Jori Koolstra
` (6 subsequent siblings)
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
In preparation for using vfs_create_no_perm() in lookup_open() we need
to move setting FMODE_CREATED on the file mode to either before or after
that call, as currently it is in the middle. If try_break_deleg() fails
it is currently not set, but vfs_create_no_perm() includes a
try_break_deleg().
Going up the call chain of lookup_open() we see that it is only used in
open_last_lookups() if no error is returned from lookup_open(), so we
can safely move it to after the filesystem create() call. This also
makes more sense when reading the code as you don't have to wonder what
the implications are of setting FMODE_CREATED before the create() call.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/namei.c b/fs/namei.c
index a4a8cdbb48e2..e21f23502e8d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4533,7 +4533,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (error)
goto out_dput;
- file->f_mode |= FMODE_CREATED;
if (!dir_inode->i_op->create) {
error = -EACCES;
goto out_dput;
@@ -4545,6 +4544,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
goto out_dput;
fsnotify_create(dir_inode, dentry);
+ file->f_mode |= FMODE_CREATED;
}
return dentry;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 06/12] vfs: lookup_open(): move i_op->create check to before try_break_deleg()
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (4 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 05/12] vfs: lookup_open(): move setting FMODE_CREATED up when calling create() Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 07/12] vfs: lookup_open(): use vfs_create_no_perm() Jori Koolstra
` (5 subsequent siblings)
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
The i_op->create check in lookup_open() takes place after the
try_break_deleg() call. This does not match the order when doing a
regular file create via mknod(2). There the call order is:
filename_mknodat()
vfs_create()
i_op->create check
try_break_deleg()
Move the i_op->create check to before try_break_deleg() in
lookup_open().
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index e21f23502e8d..15bf28b85a4e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4341,6 +4341,7 @@ static int may_o_create(struct mnt_idmap *idmap,
const struct path *dir, struct dentry *dentry,
umode_t mode)
{
+
int error = security_path_mknod(dir, dentry, mode, 0);
if (error)
return error;
@@ -4528,16 +4529,17 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
- /* but break the directory lease first! */
- error = try_break_deleg(dir_inode, delegated_inode);
- if (error)
- goto out_dput;
if (!dir_inode->i_op->create) {
error = -EACCES;
goto out_dput;
}
+ /* but break the directory lease first! */
+ error = try_break_deleg(dir_inode, delegated_inode);
+ if (error)
+ goto out_dput;
+
error = dir_inode->i_op->create(idmap, dir_inode, dentry,
mode, open_flag & O_EXCL);
if (error)
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 07/12] vfs: lookup_open(): use vfs_create_no_perm()
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (5 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 06/12] vfs: lookup_open(): move i_op->create check to before try_break_deleg() Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 08/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (4 subsequent siblings)
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
We can replace the code in the no create_error/negative dentry found
from lookup case in lookup_open() with the vfs_create_no_perm() helper.
It is safe here to always pass excl=true to ->create via
vfs_create_no_perm(). Local filesystems always ignore the excl flag,
and cluster filesystems implement ->atomic_open, so that the ->create
call is never reached in lookup_open().
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 15bf28b85a4e..50eb357fc0ff 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4535,17 +4535,10 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
goto out_dput;
}
- /* but break the directory lease first! */
- error = try_break_deleg(dir_inode, delegated_inode);
+ error = vfs_create_no_perm(idmap, dentry, mode, delegated_inode);
if (error)
goto out_dput;
- error = dir_inode->i_op->create(idmap, dir_inode, dentry,
- mode, open_flag & O_EXCL);
- if (error)
- goto out_dput;
-
- fsnotify_create(dir_inode, dentry);
file->f_mode |= FMODE_CREATED;
}
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 08/12] vfs: add O_CREAT|O_DIRECTORY to open*(2)
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (6 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 07/12] vfs: lookup_open(): use vfs_create_no_perm() Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 09/12] vfs: move O_IS_MKDIR check out atomic_open() to individual filesystems Jori Koolstra
` (3 subsequent siblings)
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
Currently there is no way to race-freely create and open a directory.
For regular files we have open(O_CREAT) for creating a new file inode,
and returning a pinning fd to it. The lack of such functionality for
directories means that when populating a directory tree there's always
a race involved: the inodes first need to be created, and then opened
to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
but in the time window between the creation and the opening they might
be replaced by something else.
Addressing this race without proper APIs is possible (by immediately
fstat()ing what was opened, to verify that it has the right inode type),
but difficult to get right. Hence, adding support for a new flag combo
O_CREAT|O_DIRECTORY to open*(2) that creates a directory (if it does not
exist already) and returns an O_DIRECTORY fd is very useful.
Historically, the O_CREAT|O_DIRECTORY behaviour was to return ENOTDIR if
a regular file exists at the open path; EISDIR if a directory exists at
the path; and to create a regular file if no file exists at the path.
This behaviour changed accidentally with 973d4b73fbaf ("do_last(): rejoin
the common path even earlier in FMODE_{OPENED,CREATED} case") causing
ENOTDIR to return in the last case while still creating the file. As
this change was not detected for a long time, Brauner proposed to adopt
the more consistent NetBSD behaviour, i.e. to return EINVAL on the the
O_CREAT|O_DIRECTORY combination. This change was applied in 43b450632676
("open: return EINVAL for O_DIRECTORY | O_CREAT") in March, 2023. As
the EINVAL behaviour has been in the kernel for about 3 year now, no
rollback is expected as a result of userspace reliance on old
behaviour, leaving us free to reassign the O_CREAT|O_DIRECTORY semantics.
O_CREAT|O_DIRECTORY is blocked in atomic_open() until individual
filesytems that support i_op->atomic_open() are given tailored support.
This feature idea (and some of its description) is taken from the
UAPI group:
https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 87 ++++++++++++++++++++++++++++++++++---------
fs/open.c | 25 +++++++------
include/linux/fcntl.h | 7 ++++
3 files changed, 90 insertions(+), 29 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 50eb357fc0ff..eec6a1ea369e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1377,13 +1377,12 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link)
/**
* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
- * should be allowed, or not, on files that already
- * exist.
+ * should be allowed, or not
* @idmap: idmap of the mount the inode was found from
* @nd: nameidata pathwalk data
* @inode: the inode of the file to open
*
- * Block an O_CREAT open of a FIFO (or a regular file) when:
+ * Block an O_CREAT open of a FIFO (or a regular file/directory) when:
* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
* - the file already exists
* - we are in a sticky directory
@@ -1411,6 +1410,12 @@ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
if (likely(!(dir_mode & S_ISVTX)))
return 0;
+ // There is no separate sysctl for directory creation in sticky
+ // folders. Therefore, for the S_ISDIR case, disabling
+ // sysctl_protected_regular is not enough to allow creating a
+ // directory in a sticky folder, because that may suprise users
+ // not expecting that O_CREAT|O_DIRECTORY is possible on newer
+ // kernels.
if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
return 0;
@@ -1442,6 +1447,12 @@ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
"sticky_create_regular");
return -EACCES;
}
+
+ if (sysctl_protected_regular >= 2 && S_ISDIR(inode->i_mode)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT,
+ "sticky_create_dir");
+ return -EACCES;
+ }
}
return 0;
@@ -4339,10 +4350,15 @@ static inline int open_to_namei_flags(int flag)
static int may_o_create(struct mnt_idmap *idmap,
const struct path *dir, struct dentry *dentry,
- umode_t mode)
+ umode_t mode, bool create_dir)
{
+ struct inode *dir_inode = dir->dentry->d_inode;
+ int error;
- int error = security_path_mknod(dir, dentry, mode, 0);
+ if (create_dir)
+ error = security_path_mkdir(dir, dentry, mode);
+ else
+ error = security_path_mknod(dir, dentry, mode, 0);
if (error)
return error;
@@ -4351,12 +4367,25 @@ static int may_o_create(struct mnt_idmap *idmap,
if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
return -EOVERFLOW;
- error = inode_permission(idmap, dir->dentry->d_inode,
- MAY_WRITE | MAY_EXEC);
+ error = inode_permission(idmap, dir_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
- return security_inode_create(dir->dentry->d_inode, dentry, mode);
+ if (create_dir)
+ error = security_inode_mkdir(dir_inode, dentry, mode);
+ else
+ error = security_inode_create(dir_inode, dentry, mode);
+
+ return error;
+}
+
+static inline umode_t o_create_mode(struct mnt_idmap *idmap,
+ const struct inode *dir, umode_t mode, bool create_dir)
+{
+ if (create_dir)
+ return vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
+ else
+ return vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
}
/*
@@ -4382,8 +4411,12 @@ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry
file->__f_path.dentry = DENTRY_NOT_SET;
file->__f_path.mnt = path->mnt;
- error = dir->i_op->atomic_open(dir, dentry, file,
- open_to_namei_flags(open_flag), mode);
+
+ if (O_IS_MKDIR(open_flag))
+ error = EINVAL;
+ else
+ error = dir->i_op->atomic_open(dir, dentry, file,
+ open_to_namei_flags(open_flag), mode);
d_lookup_done(dentry);
if (!error) {
if (file->f_mode & FMODE_OPENED) {
@@ -4416,6 +4449,10 @@ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry
return dentry;
}
+static inline
+struct dentry *vfs_mkdir_no_perm(struct mnt_idmap *, struct inode *,
+ struct dentry *, umode_t,
+ struct delegated_inode *);
/*
* Look up and maybe create and open the last component.
*
@@ -4442,6 +4479,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
struct dentry *dentry;
int error, create_error = 0;
umode_t mode = op->mode;
+ bool create_dir = O_IS_MKDIR(open_flag);
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (unlikely(IS_DEADDIR(dir_inode)))
@@ -4490,10 +4528,10 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL)
open_flag &= ~O_TRUNC;
- mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
+ mode = o_create_mode(idmap, dir_inode, mode, create_dir);
if (likely(got_write))
create_error = may_o_create(idmap, &nd->path,
- dentry, mode);
+ dentry, mode, create_dir);
else
create_error = -EROFS;
}
@@ -4530,12 +4568,24 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
- if (!dir_inode->i_op->create) {
+ if ((create_dir && !dir_inode->i_op->mkdir)
+ || (!create_dir && !dir_inode->i_op->create)) {
error = -EACCES;
goto out_dput;
}
- error = vfs_create_no_perm(idmap, dentry, mode, delegated_inode);
+ if (create_dir) {
+ struct dentry *res = vfs_mkdir_no_perm(idmap, dir_inode, dentry,
+ mode, delegated_inode);
+ if (IS_ERR(res)) {
+ error = PTR_ERR(res);
+ } else {
+ error = 0;
+ dentry = res;
+ }
+ } else {
+ error = vfs_create_no_perm(idmap, dentry, mode, delegated_inode);
+ }
if (error)
goto out_dput;
@@ -4554,7 +4604,7 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
struct dentry *dentry;
if (open_flag & O_CREAT) {
- if (trailing_slashes(nd->last))
+ if (trailing_slashes(nd->last) && !(open_flag & O_DIRECTORY))
return ERR_PTR(-EISDIR);
/* Don't bother on an O_EXCL create */
@@ -4625,7 +4675,7 @@ static const char *open_last_lookups(struct nameidata *nd,
*/
}
if (open_flag & O_CREAT)
- inode_lock(dir->d_inode);
+ inode_lock_nested(dir->d_inode, I_MUTEX_PARENT);
else
inode_lock_shared(dir->d_inode);
@@ -4688,8 +4738,9 @@ static int do_open(struct nameidata *nd,
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
- if (d_is_dir(nd->path.dentry))
+ if (!(open_flag & O_DIRECTORY) && d_is_dir(nd->path.dentry))
return -EISDIR;
+
error = may_create_in_sticky(idmap, nd,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
@@ -5056,7 +5107,7 @@ struct file *dentry_create(struct path *path, int flags, umode_t mode,
path->dentry = dir;
mode = vfs_prepare_mode(idmap, dir_inode, mode, S_IALLUGO, S_IFREG);
- create_error = may_o_create(idmap, path, dentry, mode);
+ create_error = may_o_create(idmap, path, dentry, mode, /*create_dir=*/ false);
if (create_error)
flags &= ~O_CREAT;
diff --git a/fs/open.c b/fs/open.c
index 681d405bc61e..5cf8ada58483 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1209,29 +1209,30 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
if (WILL_CREATE(flags)) {
if (how->mode & ~S_IALLUGO)
return -EINVAL;
- op->mode = how->mode | S_IFREG;
+ if (O_IS_MKDIR(flags))
+ op->mode = how->mode | S_IFDIR;
+ else
+ op->mode = how->mode | S_IFREG;
} else {
if (how->mode != 0)
return -EINVAL;
op->mode = 0;
}
- /*
- * Block bugs where O_DIRECTORY | O_CREAT created regular files.
- * Note, that blocking O_DIRECTORY | O_CREAT here also protects
- * O_TMPFILE below which requires O_DIRECTORY being raised.
- */
- if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
- return -EINVAL;
-
/* Now handle the creative implementation of O_TMPFILE. */
if (flags & __O_TMPFILE) {
/*
* In order to ensure programs get explicit errors when trying
* to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
- * is raised alongside __O_TMPFILE.
+ * is raised alongside __O_TMPFILE, but without O_CREAT. The
+ * reason for disallowing O_CREAT|O_TMPFILE is that
+ * O_DIRECTORY|O_CREAT used to work and created a regular file
+ * if nothing existed at the open path. Hence, allowing the
+ * combination would have caused O_CREAT|O_TMPFILE to create a
+ * regular (non-temporary) file on old kernels, while the caller
+ * would believe they created an actual O_TMPFILE.
*/
- if (!(flags & O_DIRECTORY))
+ if (!(flags & O_DIRECTORY) || (flags & O_CREAT))
return -EINVAL;
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
@@ -1268,6 +1269,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
if (flags & O_CREAT) {
+ if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
+ return -EISDIR;
op->intent |= LOOKUP_CREATE;
if (flags & O_EXCL) {
op->intent |= LOOKUP_EXCL;
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a332e79b3207..79843e8add6e 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -12,6 +12,13 @@
FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+
+#define O_MKDIR_MASK (O_CREAT | O_DIRECTORY)
+static inline bool O_IS_MKDIR(unsigned flags)
+{
+ return (flags & O_MKDIR_MASK) == O_MKDIR_MASK;
+}
+
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 09/12] vfs: move O_IS_MKDIR check out atomic_open() to individual filesystems
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (7 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 08/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 10/12] vfs: refuse O_CREAT for directories through a dangling symlink Jori Koolstra
` (2 subsequent siblings)
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
Individual filesystems need to get the chance to implement
O_CREAT|O_DIRECTORY or not, rather than decide this at the VFS level in
atomic_open().
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/9p/vfs_inode.c | 3 +++
fs/9p/vfs_inode_dotl.c | 3 +++
fs/ceph/file.c | 3 +++
fs/fuse/dir.c | 3 +++
fs/gfs2/inode.c | 3 +++
fs/namei.c | 8 ++------
fs/nfs/dir.c | 3 +++
fs/nfs/file.c | 3 +++
fs/smb/client/dir.c | 3 +++
fs/vboxsf/dir.c | 3 +++
10 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f468acb8ee7d..8eff9320aa8a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -771,6 +771,9 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int p9_omode;
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
if (d_in_lookup(dentry)) {
struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
if (res || d_really_is_positive(dentry))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 141fb54db65d..9a63ae0f3b58 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -239,6 +239,9 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
if (d_in_lookup(dentry)) {
struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
if (res || d_really_is_positive(dentry))
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d54d71669176..a82a711a86e6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -813,6 +813,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
err = ceph_wait_on_conflict_unlink(dentry);
if (err)
return err;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b658b6baf72f..58f3310f828f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -940,6 +940,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (fuse_is_bad(dir))
return -EIO;
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
if (d_in_lookup(entry)) {
struct dentry *res = fuse_lookup(dir, entry, 0);
if (res || d_really_is_positive(entry))
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e9bf4879c07f..df66ac2d0c15 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1384,6 +1384,9 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
{
bool excl = !!(flags & O_EXCL);
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
if (d_in_lookup(dentry)) {
struct dentry *d = __gfs2_lookup(dir, dentry, file);
if (file->f_mode & FMODE_OPENED) {
diff --git a/fs/namei.c b/fs/namei.c
index eec6a1ea369e..f91de2038321 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4411,12 +4411,8 @@ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry
file->__f_path.dentry = DENTRY_NOT_SET;
file->__f_path.mnt = path->mnt;
-
- if (O_IS_MKDIR(open_flag))
- error = EINVAL;
- else
- error = dir->i_op->atomic_open(dir, dentry, file,
- open_to_namei_flags(open_flag), mode);
+ error = dir->i_op->atomic_open(dir, dentry, file,
+ open_to_namei_flags(open_flag), mode);
d_lookup_done(dentry);
if (!error) {
if (file->f_mode & FMODE_OPENED) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e9ce1883288c..cbbc61788d0e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2314,6 +2314,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
return -ENAMETOOLONG;
+ if (O_IS_MKDIR(open_flags))
+ return -EINVAL;
+
if (open_flags & O_CREAT) {
error = nfs_do_create(dir, dentry, mode, open_flags);
if (!error) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 25048a3c2364..b885d8facaf5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -52,6 +52,9 @@ int nfs_check_flags(int flags)
if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
return -EINVAL;
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
return 0;
}
EXPORT_SYMBOL_GPL(nfs_check_flags);
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index e4295a5b55b3..b282753713d6 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -526,6 +526,9 @@ int cifs_atomic_open(struct inode *dir, struct dentry *direntry,
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return smb_EIO(smb_eio_trace_forced_shutdown);
+ if (O_IS_MKDIR(oflags))
+ return -EINVAL;
+
/*
* Posix open is only called (at lookup time) for file create now. For
* opens (rather than creates), because we do not know if it is a file
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 42bedc4ec7af..cc999c1ab7cf 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -318,6 +318,9 @@ static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
u64 handle;
int err;
+ if (O_IS_MKDIR(flags))
+ return -EINVAL;
+
if (d_in_lookup(dentry)) {
struct dentry *res = vboxsf_dir_lookup(parent, dentry, 0);
if (res || d_really_is_positive(dentry))
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 10/12] vfs: refuse O_CREAT for directories through a dangling symlink
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (8 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 09/12] vfs: move O_IS_MKDIR check out atomic_open() to individual filesystems Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens Jori Koolstra
2026-06-14 16:44 ` [PATCH 12/12] selftest: add tests for open*(O_CREAT|O_DIRECTORY) Jori Koolstra
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
open(O_CREAT) without O_EXCL follows a trailing symlink and, when the
symlink target does not exist, creates it. Refuse to create through a
dangling symlink for directories.
In lookup_open() a negative target reached with nd->depth > 0 was
arrived at by following a trailing symlink; since the dentry is negative
the symlink is dangling. Set create_error to -ELOOP in that case.
Reusing the existing create_error path strips O_CREAT for both the
generic and ->atomic_open create paths and only reports the error when
the target is actually negative, so opening an existing target through a
symlink, interior symlinks, and O_EXCL (which never follows the trailing
link) are all unaffected.
Suggested-by: Christian Brauner (Amutable) <brauner@kernel.org>
Reviewed-by: Jori Koolstra <jkoolstra@xs4all.nl>
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/namei.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/fs/namei.c b/fs/namei.c
index f91de2038321..6029259cc9bd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4530,6 +4530,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
dentry, mode, create_dir);
else
create_error = -EROFS;
+ /* Refuse to create a directory through a dangling (trailing)
+ * symlink. For regular files this has been allowed historically
+ * on O_CREAT without O_EXCL. */
+ if (unlikely(nd->depth) && create_dir && !create_error)
+ create_error = -ELOOP;
}
if (create_error)
open_flag &= ~O_CREAT;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (9 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 10/12] vfs: refuse O_CREAT for directories through a dangling symlink Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
2026-06-14 17:01 ` Jori Koolstra
2026-06-15 12:56 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 12/12] selftest: add tests for open*(O_CREAT|O_DIRECTORY) Jori Koolstra
11 siblings, 2 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
Requesting write access on a directory can never succeed. Rather
than performing a path-walk to determine whether the target is
actually a directory (-EISDIR) or not (-ENOTDIR), we short-circuit
to -ENOTDIR.
Currently O_WRONLY for directories is only blocked in may_open(),
which happens after we have the inode for the target, so after any
create via O_CREAT|O_DIRECTORY.
The advantage of short-circuiting is that we don't have to add even
more logic to lookup_open() to differentiate -EISDIR/-ENOTDIR. Also,
for filesystems that define atomic_open(), handling this cannot even be
done at the VFS level, as we can't know ahead of calling
->atomic_open() what the result of the lookup is.
Suggested-by: Christian Brauner (Amutable) <brauner@kernel.org>
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/open.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 5cf8ada58483..be980a737c82 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1268,9 +1268,16 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+ /*
+ * Requesting write access on a directory can never succeed. Rather
+ * than performing a path-walk to determine whether the target is
+ * actually a directory (-EISDIR) or not (-ENOTDIR), we short-circuit
+ * to -ENOTDIR.
+ */
+ if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
+ return -ENOTDIR;
+
if (flags & O_CREAT) {
- if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
- return -EISDIR;
op->intent |= LOOKUP_CREATE;
if (flags & O_EXCL) {
op->intent |= LOOKUP_EXCL;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 12/12] selftest: add tests for open*(O_CREAT|O_DIRECTORY)
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
` (10 preceding siblings ...)
2026-06-14 16:44 ` [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens Jori Koolstra
@ 2026-06-14 16:44 ` Jori Koolstra
11 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 16:44 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, Jori Koolstra
Add some tests for the new valid O_CREAT|O_DIRECTORY flag combination for
open*(2) to test compliance and to showcase its behaviour.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
.../testing/selftests/filesystems/.gitignore | 1 +
tools/testing/selftests/filesystems/Makefile | 4 +-
tools/testing/selftests/filesystems/fclog.c | 1 +
.../filesystems/open_o_creat_o_dir.c | 197 ++++++++++++++++++
4 files changed, 201 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/filesystems/open_o_creat_o_dir.c
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index 64ac0dfa46b7..f257b3ddb479 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
+open_o_creat_o_dir
dnotify_test
devpts_pts
fclog
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 85427d7f19b9..ec7f93b700d2 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
+CFLAGS += $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+TEST_GEN_PROGS := open_o_creat_o_dir devpts_pts file_stressor anon_inode_test kernfs_test fclog
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/fclog.c b/tools/testing/selftests/filesystems/fclog.c
index 551c4a0f395a..33ed59286a2d 100644
--- a/tools/testing/selftests/filesystems/fclog.c
+++ b/tools/testing/selftests/filesystems/fclog.c
@@ -4,6 +4,7 @@
* Copyright (C) 2025 SUSE LLC.
*/
+#include <fcntl.h>
#include <assert.h>
#include <errno.h>
#include <sched.h>
diff --git a/tools/testing/selftests/filesystems/open_o_creat_o_dir.c b/tools/testing/selftests/filesystems/open_o_creat_o_dir.c
new file mode 100644
index 000000000000..df3cdbae2c85
--- /dev/null
+++ b/tools/testing/selftests/filesystems/open_o_creat_o_dir.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <errno.h>
+#include <limits.h>
+#include <fcntl.h>
+
+#include "kselftest_harness.h"
+
+static inline int open_o_creat_o_dir(int dfd, const char *pathname,
+ mode_t mode, unsigned int flags)
+{
+ return syscall(__NR_openat, dfd, pathname,
+ flags | O_DIRECTORY | O_CREAT, mode);
+}
+
+#define open_o_creat_o_dir_checked_flags(dfd, pathname, flags) ({ \
+ struct stat __st; \
+ int __fd = open_o_creat_o_dir(dfd, pathname, S_IRWXU, flags); \
+ ASSERT_GE(__fd, 0); \
+ ASSERT_EQ(fstat(__fd, &__st), 0); \
+ EXPECT_TRUE(S_ISDIR(__st.st_mode)); \
+ __fd; \
+})
+
+#define open_o_creat_o_dir_checked(dfd, pathname) \
+ open_o_creat_o_dir_checked_flags(dfd, pathname, 0)
+
+FIXTURE(open_o_creat_o_dir) {
+ char dirpath[PATH_MAX];
+ int dfd;
+};
+
+FIXTURE_SETUP(open_o_creat_o_dir)
+{
+ strcpy(self->dirpath, "/tmp/open_o_creat_o_dir_test.XXXXXX");
+ ASSERT_NE(mkdtemp(self->dirpath), NULL);
+ self->dfd = open(self->dirpath, O_DIRECTORY);
+ ASSERT_GE(self->dfd, 0);
+}
+
+FIXTURE_TEARDOWN(open_o_creat_o_dir)
+{
+ close(self->dfd);
+ rmdir(self->dirpath);
+}
+
+/* Does open_o_creat_o_dir return a fd at all? */
+TEST_F(open_o_creat_o_dir, returns_fd)
+{
+ int fd = open_o_creat_o_dir_checked(self->dfd, "newdir");
+ EXPECT_EQ(close(fd), 0);
+ EXPECT_EQ(unlinkat(self->dfd, "newdir", AT_REMOVEDIR), 0);
+}
+
+/* The fd must refer to the directory that was just created. */
+TEST_F(open_o_creat_o_dir, fd_is_created_dir)
+{
+ int fd;
+ struct stat st_via_fd, st_via_path;
+ char path[PATH_MAX];
+
+ fd = open_o_creat_o_dir_checked(self->dfd, "checkdir");
+
+ ASSERT_EQ(fstat(fd, &st_via_fd), 0);
+
+ snprintf(path, sizeof(path), "%s/checkdir", self->dirpath);
+ ASSERT_EQ(stat(path, &st_via_path), 0);
+
+ EXPECT_EQ(st_via_fd.st_ino, st_via_path.st_ino);
+ EXPECT_EQ(st_via_fd.st_dev, st_via_path.st_dev);
+
+ EXPECT_EQ(close(fd), 0);
+ EXPECT_EQ(rmdir(path), 0);
+}
+
+/* Missing parent component must fail with ENOENT. */
+TEST_F(open_o_creat_o_dir, enoent_missing_parent)
+{
+ EXPECT_EQ(open_o_creat_o_dir(self->dfd, "nonexistent/child", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, ENOENT);
+}
+
+/* An invalid dfd must fail with EBADF. */
+TEST_F(open_o_creat_o_dir, ebadf)
+{
+ EXPECT_EQ(open_o_creat_o_dir(-42, "badfdir", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, EBADF);
+}
+
+/* A dfd that points to a file (not a directory) must fail with ENOTDIR. */
+TEST_F(open_o_creat_o_dir, enotdir_dfd)
+{
+ int file_fd;
+
+ file_fd = openat(self->dfd, "file",
+ O_CREAT | O_WRONLY, S_IRWXU);
+ ASSERT_GE(file_fd, 0);
+
+ EXPECT_EQ(open_o_creat_o_dir(file_fd, "subdir", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, ENOTDIR);
+
+ EXPECT_EQ(close(file_fd), 0);
+ EXPECT_EQ(unlinkat(self->dfd, "file", 0), 0);
+}
+
+/*
+ * O_EXCL together with O_CREAT|O_DIRECTORY must fail with EEXIST when
+ * the target directory already exists.
+ */
+TEST_F(open_o_creat_o_dir, o_excl_eexist)
+{
+ int fd;
+
+ fd = open_o_creat_o_dir_checked_flags(self->dfd, "excldir", O_EXCL);
+ EXPECT_EQ(close(fd), 0);
+
+ EXPECT_EQ(open_o_creat_o_dir(self->dfd, "excldir", S_IRWXU, O_EXCL), -1);
+ EXPECT_EQ(errno, EEXIST);
+
+ EXPECT_EQ(unlinkat(self->dfd, "excldir", AT_REMOVEDIR), 0);
+}
+
+/*
+ * O_CREAT|O_DIRECTORY on a path that already exists as a regular file
+ * must fail with ENOTDIR.
+ */
+TEST_F(open_o_creat_o_dir, existing_file_enotdir)
+{
+ int file_fd;
+
+ file_fd = openat(self->dfd, "regfile",
+ O_CREAT | O_WRONLY, S_IRWXU);
+ ASSERT_GE(file_fd, 0);
+ EXPECT_EQ(close(file_fd), 0);
+
+ EXPECT_EQ(open_o_creat_o_dir(self->dfd, "regfile", S_IRWXU, 0), -1);
+ EXPECT_EQ(errno, ENOTDIR);
+
+ EXPECT_EQ(unlinkat(self->dfd, "regfile", 0), 0);
+}
+
+/*
+ * O_CREAT|O_DIRECTORY combined with a writable access mode must be
+ * rejected: a directory cannot be opened for writing.
+ */
+TEST_F(open_o_creat_o_dir, rejects_writable_acc_mode)
+{
+ EXPECT_EQ(open_o_creat_o_dir(self->dfd, "rdwrdir", S_IRWXU, O_RDWR), -1);
+ EXPECT_EQ(errno, ENOTDIR);
+ /* Clean up if the kernel created the directory anyway. */
+ unlinkat(self->dfd, "rdwrdir", AT_REMOVEDIR);
+}
+
+/*
+ * openat(O_CREAT) with a trailing slash but without O_DIRECTORY
+ * must fail with EISDIR and must not create anything at the path.
+ */
+TEST_F(open_o_creat_o_dir, trailing_slash_no_o_dir)
+{
+ int fd;
+ struct stat st;
+
+ fd = openat(self->dfd, "trailing/", O_CREAT | O_WRONLY, S_IRWXU);
+ EXPECT_EQ(fd, -1);
+ EXPECT_EQ(errno, EISDIR);
+
+ EXPECT_EQ(fstatat(self->dfd, "trailing", &st, 0), -1);
+ EXPECT_EQ(errno, ENOENT);
+
+ /* Best-effort cleanup in case the kernel left a file behind. */
+ if (fd >= 0)
+ close(fd);
+ unlinkat(self->dfd, "trailing", 0);
+}
+
+/*
+ * The returned fd must be usable as a dfd for further *at() calls.
+ */
+TEST_F(open_o_creat_o_dir, fd_usable_as_dfd)
+{
+ int parent_fd, child_fd;
+ char path[PATH_MAX];
+
+ parent_fd = open_o_creat_o_dir_checked(self->dfd, "parent");
+ child_fd = open_o_creat_o_dir_checked(parent_fd, "child");
+
+ EXPECT_EQ(close(child_fd), 0);
+ EXPECT_EQ(close(parent_fd), 0);
+
+ snprintf(path, sizeof(path), "%s/parent/child", self->dirpath);
+ EXPECT_EQ(rmdir(path), 0);
+ snprintf(path, sizeof(path), "%s/parent", self->dirpath);
+ EXPECT_EQ(rmdir(path), 0);
+}
+
+TEST_HARNESS_MAIN
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens
2026-06-14 16:44 ` [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens Jori Koolstra
@ 2026-06-14 17:01 ` Jori Koolstra
2026-06-15 12:56 ` Jori Koolstra
1 sibling, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-14 17:01 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel, linux-api@vger.kernel.org
> Op 14-06-2026 18:44 CEST schreef Jori Koolstra <jkoolstra@xs4all.nl>:
>
>
> Requesting write access on a directory can never succeed. Rather
> than performing a path-walk to determine whether the target is
> actually a directory (-EISDIR) or not (-ENOTDIR), we short-circuit
> to -ENOTDIR.
>
> Currently O_WRONLY for directories is only blocked in may_open(),
> which happens after we have the inode for the target, so after any
> create via O_CREAT|O_DIRECTORY.
>
> The advantage of short-circuiting is that we don't have to add even
> more logic to lookup_open() to differentiate -EISDIR/-ENOTDIR. Also,
> for filesystems that define atomic_open(), handling this cannot even be
> done at the VFS level, as we can't know ahead of calling
> ->atomic_open() what the result of the lookup is.
>
> Suggested-by: Christian Brauner (Amutable) <brauner@kernel.org>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/open.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/fs/open.c b/fs/open.c
> index 5cf8ada58483..be980a737c82 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1268,9 +1268,16 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
>
> op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
>
> + /*
> + * Requesting write access on a directory can never succeed. Rather
> + * than performing a path-walk to determine whether the target is
> + * actually a directory (-EISDIR) or not (-ENOTDIR), we short-circuit
> + * to -ENOTDIR.
> + */
> + if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
> + return -ENOTDIR;
> +
> if (flags & O_CREAT) {
> - if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
> - return -EISDIR;
> op->intent |= LOOKUP_CREATE;
> if (flags & O_EXCL) {
> op->intent |= LOOKUP_EXCL;
> --
> 2.54.0
Forgot to cc this to linux-api@vger.kernel.org. Hereby cc'ed.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 01/12] fs/namei.c: use trailing_slashes()
2026-06-14 16:44 ` [PATCH 01/12] fs/namei.c: use trailing_slashes() Jori Koolstra
@ 2026-06-15 9:26 ` David Laight
0 siblings, 0 replies; 18+ messages in thread
From: David Laight @ 2026-06-15 9:26 UTC (permalink / raw)
To: Jori Koolstra
Cc: Christian Brauner, Jan Kara, Al Viro, NeilBrown, linux-fsdevel,
linux-kernel
On Sun, 14 Jun 2026 18:44:27 +0200
Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> There are several places in fs/namei.c that can use the
> trailing_slashes() function to improve intent.
I'm not sure it does anything for the readability.
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/namei.c | 26 +++++++++++++-------------
> 1 file changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 4787244ca4a7..64b91ed9efb7 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2777,9 +2777,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
> return s;
> }
>
> +static inline bool trailing_slashes(const struct qstr last)
You are passing a struct by value.
If the function isn't inlined then you've blown the stack.
David
> +{
> + return (bool)last.name[last.len];
> +}
> +
> static inline const char *lookup_last(struct nameidata *nd)
> {
> - if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
> + if (nd->last_type == LAST_NORM && trailing_slashes(nd->last))
> nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
>
> return walk_component(nd, WALK_TRAILING);
> @@ -4524,17 +4529,12 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
> return ERR_PTR(error);
> }
>
> -static inline bool trailing_slashes(struct nameidata *nd)
> -{
> - return (bool)nd->last.name[nd->last.len];
> -}
> -
> static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
> {
> struct dentry *dentry;
>
> if (open_flag & O_CREAT) {
> - if (trailing_slashes(nd))
> + if (trailing_slashes(nd->last))
> return ERR_PTR(-EISDIR);
>
> /* Don't bother on an O_EXCL create */
> @@ -4542,7 +4542,7 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
> return NULL;
> }
>
> - if (trailing_slashes(nd))
> + if (trailing_slashes(nd->last))
> nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
>
> dentry = lookup_fast(nd);
> @@ -4945,7 +4945,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
> * Do the final lookup. Suppress 'create' if there is a trailing
> * '/', and a directory wasn't requested.
> */
> - if (last.name[last.len] && !want_dir)
> + if (trailing_slashes(last) && !want_dir)
> create_flags &= ~LOOKUP_CREATE;
> dentry = start_dirop(path->dentry, &last, reval_flag | create_flags);
> if (IS_ERR(dentry))
> @@ -5562,7 +5562,7 @@ int filename_unlinkat(int dfd, struct filename *name)
> goto exit_drop_write;
>
> /* Why not before? Because we want correct error value */
> - if (unlikely(last.name[last.len])) {
> + if (unlikely(trailing_slashes(last))) {
> if (d_is_dir(dentry))
> error = -EISDIR;
> else
> @@ -6161,16 +6161,16 @@ int filename_renameat2(int olddfd, struct filename *from,
> if (flags & RENAME_EXCHANGE) {
> if (!d_is_dir(rd.new_dentry)) {
> error = -ENOTDIR;
> - if (new_last.name[new_last.len])
> + if (trailing_slashes(new_last))
> goto exit_unlock;
> }
> }
> /* unless the source is a directory trailing slashes give -ENOTDIR */
> if (!d_is_dir(rd.old_dentry)) {
> error = -ENOTDIR;
> - if (old_last.name[old_last.len])
> + if (trailing_slashes(old_last))
> goto exit_unlock;
> - if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
> + if (!(flags & RENAME_EXCHANGE) && trailing_slashes(new_last))
> goto exit_unlock;
> }
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 02/12] fs/namei.c: move create error && negative dentry case in lookup_open up
2026-06-14 16:44 ` [PATCH 02/12] fs/namei.c: move create error && negative dentry case in lookup_open up Jori Koolstra
@ 2026-06-15 9:37 ` David Laight
0 siblings, 0 replies; 18+ messages in thread
From: David Laight @ 2026-06-15 9:37 UTC (permalink / raw)
To: Jori Koolstra
Cc: Christian Brauner, Jan Kara, Al Viro, NeilBrown, linux-fsdevel,
linux-kernel
On Sun, 14 Jun 2026 18:44:28 +0200
Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> O_CREAT is stripped when create_error is set in lookup_open(), so when
> lookup does not return an inode, the case
>
> if (!dentry->d_inode && (open_flag & O_CREAT))
>
> is always skipped. We can get rid of this cognitive step by handling the
> error case first.
You've just added an extra test into a normal/fast path.
OTOH the test isn't ideal the compiler needs to reverse the order of
the arguments to the && in spite of the unlikely().
Perhaps:
if (dentry->d_inode)
return dentry;
if (open_flag & O_CREAT) {
...
} else {
if (unlikely(create_error)) {
error = create_error();
goto out_dput;
}
}
return dentry;
-- David
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/namei.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 64b91ed9efb7..f169d1123b17 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -4499,6 +4499,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
> }
> }
>
> + if (unlikely(create_error) && !dentry->d_inode) {
> + error = create_error;
> + goto out_dput;
> + }
> +
> /* Negative dentry, just create the file */
> if (!dentry->d_inode && (open_flag & O_CREAT)) {
> /* but break the directory lease first! */
> @@ -4518,10 +4523,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
> if (error)
> goto out_dput;
> }
> - if (unlikely(create_error) && !dentry->d_inode) {
> - error = create_error;
> - goto out_dput;
> - }
> +
> return dentry;
>
> out_dput:
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens
2026-06-14 16:44 ` [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens Jori Koolstra
2026-06-14 17:01 ` Jori Koolstra
@ 2026-06-15 12:56 ` Jori Koolstra
1 sibling, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-15 12:56 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown
Cc: linux-fsdevel, linux-kernel
> Op 14-06-2026 18:44 CEST schreef Jori Koolstra <jkoolstra@xs4all.nl>:
>
>
> Requesting write access on a directory can never succeed. Rather
> than performing a path-walk to determine whether the target is
> actually a directory (-EISDIR) or not (-ENOTDIR), we short-circuit
> to -ENOTDIR.
>
> Currently O_WRONLY for directories is only blocked in may_open(),
> which happens after we have the inode for the target, so after any
> create via O_CREAT|O_DIRECTORY.
>
> The advantage of short-circuiting is that we don't have to add even
> more logic to lookup_open() to differentiate -EISDIR/-ENOTDIR. Also,
> for filesystems that define atomic_open(), handling this cannot even be
> done at the VFS level, as we can't know ahead of calling
> ->atomic_open() what the result of the lookup is.
>
> Suggested-by: Christian Brauner (Amutable) <brauner@kernel.org>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/open.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/fs/open.c b/fs/open.c
> index 5cf8ada58483..be980a737c82 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1268,9 +1268,16 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
>
> op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
>
> + /*
> + * Requesting write access on a directory can never succeed. Rather
> + * than performing a path-walk to determine whether the target is
> + * actually a directory (-EISDIR) or not (-ENOTDIR), we short-circuit
> + * to -ENOTDIR.
> + */
> + if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
> + return -ENOTDIR;
> +
I just realized this is not going to work. It should also check that __O_TMPFILE
isn't set, because of how O_TMPFILE is implemented. I'll fix this later.
> if (flags & O_CREAT) {
> - if ((flags & O_DIRECTORY) && (acc_mode & MAY_WRITE))
> - return -EISDIR;
> op->intent |= LOOKUP_CREATE;
> if (flags & O_EXCL) {
> op->intent |= LOOKUP_EXCL;
> --
> 2.54.0
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 04/12] fs/namei.c: lookup_open(): move audit_inode_child() up
2026-06-14 16:44 ` [PATCH 04/12] fs/namei.c: lookup_open(): move audit_inode_child() up Jori Koolstra
@ 2026-06-15 21:43 ` Jori Koolstra
0 siblings, 0 replies; 18+ messages in thread
From: Jori Koolstra @ 2026-06-15 21:43 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Al Viro, NeilBrown, Paul Moore
Cc: linux-fsdevel, linux-kernel, audit@vger.kernel.org
> Op 14-06-2026 18:44 CEST schreef Jori Koolstra <jkoolstra@xs4all.nl>:
>
>
> In the mknod(2) path of calling vfs_create() we call audit_inode_child()
> before permission checks in may_create_dentry() (but after path-based
> LSM check). Copy this behaviour to lookup_open() and move
> audit_inode_child() to may_o_create().
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/namei.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 6bf1ded26377..a4a8cdbb48e2 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -4345,6 +4345,8 @@ static int may_o_create(struct mnt_idmap *idmap,
> if (error)
> return error;
>
> + audit_inode_child(dir->dentry->d_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
> +
> if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
> return -EOVERFLOW;
>
> @@ -4532,7 +4534,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
> goto out_dput;
>
> file->f_mode |= FMODE_CREATED;
> - audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
> if (!dir_inode->i_op->create) {
> error = -EACCES;
> goto out_dput;
> --
> 2.54.0
CC, audit@vger.kernel.org
Went too quick with this one... audit_inode_child() probably shouldn't be called
if we are in the lookup case. So there isn't really a way to do this exactly
symmetrical to the vfs_create()/vfs_mkdir() paths.
But certainly the current implementation is also wrong. In the atomic_open case
audit_inode_child() is called only once (in the final fsnotify call in
open_last_lookups()), but in the regular ->create case audit_inode_child() is
called twice.
What behavior is actually wanted here?
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2026-06-15 21:43 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-14 16:44 [PATCH 00/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
2026-06-14 16:44 ` [PATCH 01/12] fs/namei.c: use trailing_slashes() Jori Koolstra
2026-06-15 9:26 ` David Laight
2026-06-14 16:44 ` [PATCH 02/12] fs/namei.c: move create error && negative dentry case in lookup_open up Jori Koolstra
2026-06-15 9:37 ` David Laight
2026-06-14 16:44 ` [PATCH 03/12] vfs: prepare vfs_creat|mkdir_no_perm for reuse in lookup_open() Jori Koolstra
2026-06-14 16:44 ` [PATCH 04/12] fs/namei.c: lookup_open(): move audit_inode_child() up Jori Koolstra
2026-06-15 21:43 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 05/12] vfs: lookup_open(): move setting FMODE_CREATED up when calling create() Jori Koolstra
2026-06-14 16:44 ` [PATCH 06/12] vfs: lookup_open(): move i_op->create check to before try_break_deleg() Jori Koolstra
2026-06-14 16:44 ` [PATCH 07/12] vfs: lookup_open(): use vfs_create_no_perm() Jori Koolstra
2026-06-14 16:44 ` [PATCH 08/12] vfs: add O_CREAT|O_DIRECTORY to open*(2) Jori Koolstra
2026-06-14 16:44 ` [PATCH 09/12] vfs: move O_IS_MKDIR check out atomic_open() to individual filesystems Jori Koolstra
2026-06-14 16:44 ` [PATCH 10/12] vfs: refuse O_CREAT for directories through a dangling symlink Jori Koolstra
2026-06-14 16:44 ` [PATCH 11/12] vfs: short-circuit MAY_WRITE access for O_DIRECTORY opens Jori Koolstra
2026-06-14 17:01 ` Jori Koolstra
2026-06-15 12:56 ` Jori Koolstra
2026-06-14 16:44 ` [PATCH 12/12] selftest: add tests for open*(O_CREAT|O_DIRECTORY) Jori Koolstra
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.