[RFC PATCH v1 1/2] fs: Add O_DENY

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-22 17:07 [RFC PATCH v1 0/2] Add O_DENY_WRITE (complement AT_EXECVE_CHECK) Mickaël Salaün
@ 2025-08-22 17:07 ` Mickaël Salaün
  2025-08-22 19:45   ` Jann Horn
  2025-08-27 10:29   ` Aleksa Sarai
  0 siblings, 2 replies; 18+ messages in thread
From: Mickaël Salaün @ 2025-08-22 17:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Kees Cook, Paul Moore, Serge Hallyn
  Cc: Mickaël Salaün, Andy Lutomirski, Arnd Bergmann,
	Christian Heimes, Dmitry Vyukov, Elliott Hughes, Fan Wu,
	Florian Weimer, Jann Horn, Jeff Xu, Jonathan Corbet,
	Jordan R Abrahams, Lakshmi Ramasubramanian, Luca Boccassi,
	Matt Bobrowski, Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet,
	Robert Waite, Roberto Sassu, Scott Shell, Steve Dower,
	Steve Grubb, kernel-hardening, linux-api, linux-fsdevel,
	linux-integrity, linux-kernel, linux-security-module,
	Andy Lutomirski, Jeff Xu

Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
passed file descriptors).  This changes the state of the opened file by
making it read-only until it is closed.  The main use case is for script
interpreters to get the guarantee that script' content cannot be altered
while being read and interpreted.  This is useful for generic distros
that may not have a write-xor-execute policy.  See commit a5874fde3c08
("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")

Both execve(2) and the IOCTL to enable fsverity can already set this
property on files with deny_write_access().  This new O_DENY_WRITE make
it widely available.  This is similar to what other OSs may provide
e.g., opening a file with only FILE_SHARE_READ on Windows.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Serge Hallyn <serge@hallyn.com>
Reported-by: Robert Waite <rowait@microsoft.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Link: https://lore.kernel.org/r/20250822170800.2116980-2-mic@digikod.net
---
 fs/fcntl.c                       | 26 ++++++++++++++++++++++++--
 fs/file_table.c                  |  2 ++
 fs/namei.c                       |  6 ++++++
 include/linux/fcntl.h            |  2 +-
 include/uapi/asm-generic/fcntl.h |  4 ++++
 5 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5598e4d57422..0c80c0fbc706 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -34,7 +34,8 @@
 
 #include "internal.h"
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
+	O_DENY_WRITE)
 
 static int setfl(int fd, struct file * filp, unsigned int arg)
 {
@@ -80,8 +81,29 @@ static int setfl(int fd, struct file * filp, unsigned int arg)
 			error = 0;
 	}
 	spin_lock(&filp->f_lock);
+
+	if (arg & O_DENY_WRITE) {
+		/* Only regular files. */
+		if (!S_ISREG(inode->i_mode)) {
+			error = -EINVAL;
+			goto unlock;
+		}
+
+		/* Only sets once. */
+		if (!(filp->f_flags & O_DENY_WRITE)) {
+			error = exe_file_deny_write_access(filp);
+			if (error)
+				goto unlock;
+		}
+	} else {
+		if (filp->f_flags & O_DENY_WRITE)
+			exe_file_allow_write_access(filp);
+	}
+
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 	filp->f_iocb_flags = iocb_flags(filp);
+
+unlock:
 	spin_unlock(&filp->f_lock);
 
  out:
@@ -1158,7 +1180,7 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
+	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
 		HWEIGHT32(
 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
 			__FMODE_EXEC));
diff --git a/fs/file_table.c b/fs/file_table.c
index 81c72576e548..6ba896b6a53f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -460,6 +460,8 @@ static void __fput(struct file *file)
 	locks_remove_file(file);
 
 	security_file_release(file);
+	if (unlikely(file->f_flags & O_DENY_WRITE))
+		exe_file_allow_write_access(file);
 	if (unlikely(file->f_flags & FASYNC)) {
 		if (file->f_op->fasync)
 			file->f_op->fasync(-1, file, 0);
diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa..366530bf937d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3885,6 +3885,12 @@ static int do_open(struct nameidata *nd,
 	error = may_open(idmap, &nd->path, acc_mode, open_flag);
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
+	if (!error && (open_flag & O_DENY_WRITE)) {
+		if (S_ISREG(file_inode(file)->i_mode))
+			error = exe_file_deny_write_access(file);
+		else
+			error = -EINVAL;
+	}
 	if (!error)
 		error = security_file_post_open(file, op->acc_mode);
 	if (!error && do_truncate)
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a332e79b3207..dad14101686f 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -10,7 +10,7 @@
 	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
 	 O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
-	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_DENY_WRITE)
 
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 613475285643..facd9136f5af 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -91,6 +91,10 @@
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
 
+#ifndef O_DENY_WRITE
+#define O_DENY_WRITE	040000000
+#endif
+
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
 #endif
-- 
2.50.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-22 17:07 ` [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE Mickaël Salaün
@ 2025-08-22 19:45   ` Jann Horn
  2025-08-24 11:03     ` Mickaël Salaün
  2025-08-27 10:18     ` Aleksa Sarai
  2025-08-27 10:29   ` Aleksa Sarai
  1 sibling, 2 replies; 18+ messages in thread
From: Jann Horn @ 2025-08-22 19:45 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Al Viro, Christian Brauner, Kees Cook, Paul Moore, Serge Hallyn,
	Andy Lutomirski, Arnd Bergmann, Christian Heimes, Dmitry Vyukov,
	Elliott Hughes, Fan Wu, Florian Weimer, Jeff Xu, Jonathan Corbet,
	Jordan R Abrahams, Lakshmi Ramasubramanian, Luca Boccassi,
	Matt Bobrowski, Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet,
	Robert Waite, Roberto Sassu, Scott Shell, Steve Dower,
	Steve Grubb, kernel-hardening, linux-api, linux-fsdevel,
	linux-integrity, linux-kernel, linux-security-module,
	Andy Lutomirski, Jeff Xu

On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> passed file descriptors).  This changes the state of the opened file by
> making it read-only until it is closed.  The main use case is for script
> interpreters to get the guarantee that script' content cannot be altered
> while being read and interpreted.  This is useful for generic distros
> that may not have a write-xor-execute policy.  See commit a5874fde3c08
> ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
>
> Both execve(2) and the IOCTL to enable fsverity can already set this
> property on files with deny_write_access().  This new O_DENY_WRITE make

The kernel actually tried to get rid of this behavior on execve() in
commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
because it broke userspace assumptions.

> it widely available.  This is similar to what other OSs may provide
> e.g., opening a file with only FILE_SHARE_READ on Windows.

We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
removed for security reasons; as
https://man7.org/linux/man-pages/man2/mmap.2.html says:

|        MAP_DENYWRITE
|               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
|               signaled that attempts to write to the underlying file
|               should fail with ETXTBSY.  But this was a source of denial-
|               of-service attacks.)"

It seems to me that the same issue applies to your patch - it would
allow unprivileged processes to essentially lock files such that other
processes can't write to them anymore. This might allow unprivileged
users to prevent root from updating config files or stuff like that if
they're updated in-place.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-22 19:45   ` Jann Horn
@ 2025-08-24 11:03     ` Mickaël Salaün
  2025-08-24 18:04       ` Andy Lutomirski
  2025-08-27 10:18     ` Aleksa Sarai
  1 sibling, 1 reply; 18+ messages in thread
From: Mickaël Salaün @ 2025-08-24 11:03 UTC (permalink / raw)
  To: Jann Horn
  Cc: Al Viro, Christian Brauner, Kees Cook, Paul Moore, Serge Hallyn,
	Andy Lutomirski, Arnd Bergmann, Christian Heimes, Dmitry Vyukov,
	Elliott Hughes, Fan Wu, Florian Weimer, Jeff Xu, Jonathan Corbet,
	Jordan R Abrahams, Lakshmi Ramasubramanian, Luca Boccassi,
	Matt Bobrowski, Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet,
	Robert Waite, Roberto Sassu, Scott Shell, Steve Dower,
	Steve Grubb, kernel-hardening, linux-api, linux-fsdevel,
	linux-integrity, linux-kernel, linux-security-module,
	Andy Lutomirski, Jeff Xu

On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > passed file descriptors).  This changes the state of the opened file by
> > making it read-only until it is closed.  The main use case is for script
> > interpreters to get the guarantee that script' content cannot be altered
> > while being read and interpreted.  This is useful for generic distros
> > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> >
> > Both execve(2) and the IOCTL to enable fsverity can already set this
> > property on files with deny_write_access().  This new O_DENY_WRITE make
> 
> The kernel actually tried to get rid of this behavior on execve() in
> commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> because it broke userspace assumptions.

Oh, good to know.

> 
> > it widely available.  This is similar to what other OSs may provide
> > e.g., opening a file with only FILE_SHARE_READ on Windows.
> 
> We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> removed for security reasons; as
> https://man7.org/linux/man-pages/man2/mmap.2.html says:
> 
> |        MAP_DENYWRITE
> |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> |               signaled that attempts to write to the underlying file
> |               should fail with ETXTBSY.  But this was a source of denial-
> |               of-service attacks.)"
> 
> It seems to me that the same issue applies to your patch - it would
> allow unprivileged processes to essentially lock files such that other
> processes can't write to them anymore. This might allow unprivileged
> users to prevent root from updating config files or stuff like that if
> they're updated in-place.

Yes, I agree, but since it is the case for executed files I though it
was worth starting a discussion on this topic.  This new flag could be
restricted to executable files, but we should avoid system-wide locks
like this.  I'm not sure how Windows handle these issues though.

Anyway, we should rely on the access control policy to control write and
execute access in a consistent way (e.g. write-xor-execute).  Thanks for
the references and the background!

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-24 11:03     ` Mickaël Salaün
@ 2025-08-24 18:04       ` Andy Lutomirski
  2025-08-25  9:31         ` Mickaël Salaün
  0 siblings, 1 reply; 18+ messages in thread
From: Andy Lutomirski @ 2025-08-24 18:04 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Jann Horn, Al Viro, Christian Brauner, Kees Cook, Paul Moore,
	Serge Hallyn, Andy Lutomirski, Arnd Bergmann, Christian Heimes,
	Dmitry Vyukov, Elliott Hughes, Fan Wu, Florian Weimer, Jeff Xu,
	Jonathan Corbet, Jordan R Abrahams, Lakshmi Ramasubramanian,
	Luca Boccassi, Matt Bobrowski, Miklos Szeredi, Mimi Zohar,
	Nicolas Bouchinet, Robert Waite, Roberto Sassu, Scott Shell,
	Steve Dower, Steve Grubb, kernel-hardening, linux-api,
	linux-fsdevel, linux-integrity, linux-kernel,
	linux-security-module, Jeff Xu

On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
>
> On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > passed file descriptors).  This changes the state of the opened file by
> > > making it read-only until it is closed.  The main use case is for script
> > > interpreters to get the guarantee that script' content cannot be altered
> > > while being read and interpreted.  This is useful for generic distros
> > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > >
> > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > property on files with deny_write_access().  This new O_DENY_WRITE make
> >
> > The kernel actually tried to get rid of this behavior on execve() in
> > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > because it broke userspace assumptions.
>
> Oh, good to know.
>
> >
> > > it widely available.  This is similar to what other OSs may provide
> > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> >
> > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > removed for security reasons; as
> > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> >
> > |        MAP_DENYWRITE
> > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > |               signaled that attempts to write to the underlying file
> > |               should fail with ETXTBSY.  But this was a source of denial-
> > |               of-service attacks.)"
> >
> > It seems to me that the same issue applies to your patch - it would
> > allow unprivileged processes to essentially lock files such that other
> > processes can't write to them anymore. This might allow unprivileged
> > users to prevent root from updating config files or stuff like that if
> > they're updated in-place.
>
> Yes, I agree, but since it is the case for executed files I though it
> was worth starting a discussion on this topic.  This new flag could be
> restricted to executable files, but we should avoid system-wide locks
> like this.  I'm not sure how Windows handle these issues though.
>
> Anyway, we should rely on the access control policy to control write and
> execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> the references and the background!

I'm confused.  I understand that there are many contexts in which one
would want to prevent execution of unapproved content, which might
include preventing a given process from modifying some code and then
executing it.

I don't understand what these deny-write features have to do with it.
These features merely prevent someone from modifying code *that is
currently in use*, which is not at all the same thing as preventing
modifying code that might get executed -- one can often modify
contents *before* executing those contents.

In any case, IMO it's rather sad that the elimination of ETXTBSY had
to be reverted -- it's really quite a nasty feature.  But it occurs to
me that Linux can more or less do what is IMO the actually desired
thing: snapshot the contents of a file and execute the snapshot.  The
hack at the end of the email works!  (Well, it works if the chosen
filesystem supports it.)

$ ./silly_tmp /tmp/test /tmp vim /proc/self/fd/3

emacs is apparently far, far too clever and can't save if you do:

$ ./silly_tmp /tmp/test /tmp emacs /proc/self/fd/3

I'm not seriously suggesting that anyone should execute binaries or
scripts on Linux exactly like this, for a whole bunch of reasons:

- It needs filesystem support (but maybe this isn't so bad)

- It needs write access to a directory on the correct filesystem (a
showstopper for serious use)

- It is wildly incompatible with write-xor-execute, so this would be a
case of one step forward, ten steps back.

- It would defeat a lot of tools that inspect /proc, which would be
quite annoying to say the least.

But maybe a less kludgy version could be used for real.  What if there
was a syscall that would take an fd and make a snapshot of the file?
It would, at least by default, produce a *read-only* snapshot (fully
sealed a la F_SEAL_*), inherit any integrity data that came with the
source (e.g. LSMs could understand it), would not require a writable
directory on the filesystem, and would maybe even come with an extra
seal-like thing that prevents it from being linkat-ed.  (I'm not sure
that linkat would actually be a problem, but I'm also not immediately
sure that LSMs would be as comfortable with it if linkat were
allowed.)  And there could probably be an extremely efficient
implementation that might even reuse the existing deny-write mechanism
to optimize the common case where the file is never written.

For that matter, the actual common case would be to execute stuff in
/usr or similar, and those files really ought never to be modified.
So there could be a file attribute or something that means "this file
CANNOT be modified, but it can still be unlinked or replaced as
usual", and snapshotting such a file would be a no-op.  Distributions
and container tools could set that attribute.  Overlayfs could also
provide an efficient implementation if the file currently comes from
an immutable source.

Hmm, maybe it's not strictly necessary that it be immutable -- maybe
it's sometimes okay if reads start to fail if the contents change.
Let's call this a "weak snapshot" -- reads of a weak snapshot either
return the original contents or fail.  fsverity would give weak
snapshots for at no additional cost.

It's worth noting that the common case doesn't actually need an fd.
We have mmap(..., MAP_PRIVATE, ...).  What we would actually want for
mmap use cases is mmap(..., MAP_SNAPSHOT, ...), with the semantics
that the kernel promises that future writes to the source would either
not be reflected in the mapping or would cause SIGBUS.  One might
reasonably debate what forced-writes would do (I think forced-writes
should be allowed just like they currently are, since anyone who can
force-write to process memory is already assumed to be permitted to
bypass write-xor-execute).

---

/* Written by Claude Sonnet 4 with a surprisingly small amount of help
from Andy */

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>

int main(int argc, char *argv[]) {
    if (argc < 4) {
        fprintf(stderr, "Usage: %s <source_file> <temp_dir>
[exec_args...]\n", argv[0]);
        exit(1);
    }

    const char *source_file = argv[1];
    const char *temp_dir = argv[2];

    // Open source file
    int source_fd = open(source_file, O_RDONLY);
    if (source_fd == -1) {
        perror("Failed to open source file");
        exit(1);
    }

    // Create temporary file
    int temp_fd = open(temp_dir, O_TMPFILE | O_RDWR, 0600);
    if (temp_fd == -1) {
        perror("Failed to create temporary file");
        close(source_fd);
        exit(1);
    }

    // Clone the file contents using FICLONE
    if (ioctl(temp_fd, FICLONE, source_fd) == -1) {
        perror("Failed to clone file");
        close(source_fd);
        close(temp_fd);
        exit(1);
    }

    // Close source file
    close(source_fd);

    // Make sure temp file is on fd 3
    if (temp_fd != 3) {
        if (dup2(temp_fd, 3) == -1) {
            perror("Failed to move temp file to fd 3");
            close(temp_fd);
            exit(1);
        }
        close(temp_fd);
    }

    // Execute the remaining arguments
    if (argc >= 3) {
        execvp(argv[3], &argv[3]);
        perror("Failed to execute command");
        exit(1);
    }

    return 0;
}

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-24 18:04       ` Andy Lutomirski
@ 2025-08-25  9:31         ` Mickaël Salaün
  2025-08-25  9:39           ` Florian Weimer
                             ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Mickaël Salaün @ 2025-08-25  9:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Jann Horn, Al Viro, Christian Brauner, Kees Cook, Paul Moore,
	Serge Hallyn, Andy Lutomirski, Arnd Bergmann, Christian Heimes,
	Dmitry Vyukov, Elliott Hughes, Fan Wu, Florian Weimer, Jeff Xu,
	Jonathan Corbet, Jordan R Abrahams, Lakshmi Ramasubramanian,
	Luca Boccassi, Matt Bobrowski, Miklos Szeredi, Mimi Zohar,
	Nicolas Bouchinet, Robert Waite, Roberto Sassu, Scott Shell,
	Steve Dower, Steve Grubb, kernel-hardening, linux-api,
	linux-fsdevel, linux-integrity, linux-kernel,
	linux-security-module, Jeff Xu

On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> >
> > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > passed file descriptors).  This changes the state of the opened file by
> > > > making it read-only until it is closed.  The main use case is for script
> > > > interpreters to get the guarantee that script' content cannot be altered
> > > > while being read and interpreted.  This is useful for generic distros
> > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > >
> > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > >
> > > The kernel actually tried to get rid of this behavior on execve() in
> > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > because it broke userspace assumptions.
> >
> > Oh, good to know.
> >
> > >
> > > > it widely available.  This is similar to what other OSs may provide
> > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > >
> > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > removed for security reasons; as
> > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > >
> > > |        MAP_DENYWRITE
> > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > |               signaled that attempts to write to the underlying file
> > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > |               of-service attacks.)"
> > >
> > > It seems to me that the same issue applies to your patch - it would
> > > allow unprivileged processes to essentially lock files such that other
> > > processes can't write to them anymore. This might allow unprivileged
> > > users to prevent root from updating config files or stuff like that if
> > > they're updated in-place.
> >
> > Yes, I agree, but since it is the case for executed files I though it
> > was worth starting a discussion on this topic.  This new flag could be
> > restricted to executable files, but we should avoid system-wide locks
> > like this.  I'm not sure how Windows handle these issues though.
> >
> > Anyway, we should rely on the access control policy to control write and
> > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > the references and the background!
> 
> I'm confused.  I understand that there are many contexts in which one
> would want to prevent execution of unapproved content, which might
> include preventing a given process from modifying some code and then
> executing it.
> 
> I don't understand what these deny-write features have to do with it.
> These features merely prevent someone from modifying code *that is
> currently in use*, which is not at all the same thing as preventing
> modifying code that might get executed -- one can often modify
> contents *before* executing those contents.

The order of checks would be:
1. open script with O_DENY_WRITE
2. check executability with AT_EXECVE_CHECK
3. read the content and interpret it

The deny-write feature was to guarantee that there is no race condition
between step 2 and 3.  All these checks are supposed to be done by a
trusted interpreter (which is allowed to be executed).  The
AT_EXECVE_CHECK call enables the caller to know if the kernel (and
associated security policies) allowed the *current* content of the file
to be executed.  Whatever happen before or after that (wrt.
O_DENY_WRITE) should be covered by the security policy.

> 
> In any case, IMO it's rather sad that the elimination of ETXTBSY had
> to be reverted -- it's really quite a nasty feature.  But it occurs to
> me that Linux can more or less do what is IMO the actually desired
> thing: snapshot the contents of a file and execute the snapshot.  The
> hack at the end of the email works!  (Well, it works if the chosen
> filesystem supports it.)
> 
> $ ./silly_tmp /tmp/test /tmp vim /proc/self/fd/3
> 
> emacs is apparently far, far too clever and can't save if you do:
> 
> $ ./silly_tmp /tmp/test /tmp emacs /proc/self/fd/3
> 
> 
> I'm not seriously suggesting that anyone should execute binaries or
> scripts on Linux exactly like this, for a whole bunch of reasons:
> 
> - It needs filesystem support (but maybe this isn't so bad)
> 
> - It needs write access to a directory on the correct filesystem (a
> showstopper for serious use)
> 
> - It is wildly incompatible with write-xor-execute, so this would be a
> case of one step forward, ten steps back.
> 
> - It would defeat a lot of tools that inspect /proc, which would be
> quite annoying to say the least.
> 
> 
> But maybe a less kludgy version could be used for real.  What if there
> was a syscall that would take an fd and make a snapshot of the file?

Yes, that would be a clean solution.  I don't think this is achievable
in an efficient way without involving filesystem implementations though.

> It would, at least by default, produce a *read-only* snapshot (fully
> sealed a la F_SEAL_*), inherit any integrity data that came with the
> source (e.g. LSMs could understand it), would not require a writable
> directory on the filesystem, and would maybe even come with an extra
> seal-like thing that prevents it from being linkat-ed.  (I'm not sure
> that linkat would actually be a problem, but I'm also not immediately
> sure that LSMs would be as comfortable with it if linkat were
> allowed.)  And there could probably be an extremely efficient
> implementation that might even reuse the existing deny-write mechanism
> to optimize the common case where the file is never written.
> 
> For that matter, the actual common case would be to execute stuff in
> /usr or similar, and those files really ought never to be modified.
> So there could be a file attribute or something that means "this file
> CANNOT be modified, but it can still be unlinked or replaced as
> usual", and snapshotting such a file would be a no-op.  Distributions
> and container tools could set that attribute.  Overlayfs could also
> provide an efficient implementation if the file currently comes from
> an immutable source.
> 
> Hmm, maybe it's not strictly necessary that it be immutable -- maybe
> it's sometimes okay if reads start to fail if the contents change.
> Let's call this a "weak snapshot" -- reads of a weak snapshot either
> return the original contents or fail.  fsverity would give weak
> snapshots for at no additional cost.
> 
> 
> It's worth noting that the common case doesn't actually need an fd.
> We have mmap(..., MAP_PRIVATE, ...).  What we would actually want for
> mmap use cases is mmap(..., MAP_SNAPSHOT, ...), with the semantics
> that the kernel promises that future writes to the source would either
> not be reflected in the mapping or would cause SIGBUS.  One might
> reasonably debate what forced-writes would do (I think forced-writes
> should be allowed just like they currently are, since anyone who can
> force-write to process memory is already assumed to be permitted to
> bypass write-xor-execute).
> 
> 
> ---
> 
> /* Written by Claude Sonnet 4 with a surprisingly small amount of help
> from Andy */
> 
> #define _GNU_SOURCE
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <sys/ioctl.h>
> #include <linux/fs.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <errno.h>
> #include <string.h>
> 
> int main(int argc, char *argv[]) {
>     if (argc < 4) {
>         fprintf(stderr, "Usage: %s <source_file> <temp_dir>
> [exec_args...]\n", argv[0]);
>         exit(1);
>     }
> 
>     const char *source_file = argv[1];
>     const char *temp_dir = argv[2];
> 
>     // Open source file
>     int source_fd = open(source_file, O_RDONLY);
>     if (source_fd == -1) {
>         perror("Failed to open source file");
>         exit(1);
>     }
> 
>     // Create temporary file
>     int temp_fd = open(temp_dir, O_TMPFILE | O_RDWR, 0600);
>     if (temp_fd == -1) {
>         perror("Failed to create temporary file");
>         close(source_fd);
>         exit(1);
>     }
> 
>     // Clone the file contents using FICLONE
>     if (ioctl(temp_fd, FICLONE, source_fd) == -1) {
>         perror("Failed to clone file");
>         close(source_fd);
>         close(temp_fd);
>         exit(1);
>     }
> 
>     // Close source file
>     close(source_fd);
> 
>     // Make sure temp file is on fd 3
>     if (temp_fd != 3) {
>         if (dup2(temp_fd, 3) == -1) {
>             perror("Failed to move temp file to fd 3");
>             close(temp_fd);
>             exit(1);
>         }
>         close(temp_fd);
>     }
> 
>     // Execute the remaining arguments
>     if (argc >= 3) {
>         execvp(argv[3], &argv[3]);
>         perror("Failed to execute command");
>         exit(1);
>     }
> 
>     return 0;
> }

As you said, this doesn't work if temp_dir is not allowed for execution,
and it doesn't allow the kernel to check/track the content of the
script, which is the purpose of AT_EXECVE_CHECK.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25  9:31         ` Mickaël Salaün
@ 2025-08-25  9:39           ` Florian Weimer
  2025-08-26 12:35             ` Mickaël Salaün
  2025-08-25 16:43           ` Andy Lutomirski
  2025-08-25 17:57           ` Jeff Xu
  2 siblings, 1 reply; 18+ messages in thread
From: Florian Weimer @ 2025-08-25  9:39 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner, Kees Cook,
	Paul Moore, Serge Hallyn, Andy Lutomirski, Arnd Bergmann,
	Christian Heimes, Dmitry Vyukov, Elliott Hughes, Fan Wu, Jeff Xu,
	Jonathan Corbet, Jordan R Abrahams, Lakshmi Ramasubramanian,
	Luca Boccassi, Matt Bobrowski, Miklos Szeredi, Mimi Zohar,
	Nicolas Bouchinet, Robert Waite, Roberto Sassu, Scott Shell,
	Steve Dower, Steve Grubb, kernel-hardening, linux-api,
	linux-fsdevel, linux-integrity, linux-kernel,
	linux-security-module, Jeff Xu

* Mickaël Salaün:

> The order of checks would be:
> 1. open script with O_DENY_WRITE
> 2. check executability with AT_EXECVE_CHECK
> 3. read the content and interpret it
>
> The deny-write feature was to guarantee that there is no race condition
> between step 2 and 3.  All these checks are supposed to be done by a
> trusted interpreter (which is allowed to be executed).  The
> AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> associated security policies) allowed the *current* content of the file
> to be executed.  Whatever happen before or after that (wrt.
> O_DENY_WRITE) should be covered by the security policy.

Why isn't it an improper system configuration if the script file is
writable?

In the past, the argument was that making a file (writable and)
executable was an auditable even, and that provided enough coverage for
those people who are interested in this.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25  9:31         ` Mickaël Salaün
  2025-08-25  9:39           ` Florian Weimer
@ 2025-08-25 16:43           ` Andy Lutomirski
  2025-08-25 18:10             ` Jeff Xu
  2025-08-25 17:57           ` Jeff Xu
  2 siblings, 1 reply; 18+ messages in thread
From: Andy Lutomirski @ 2025-08-25 16:43 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Jann Horn, Al Viro, Christian Brauner, Kees Cook, Paul Moore,
	Serge Hallyn, Andy Lutomirski, Arnd Bergmann, Christian Heimes,
	Dmitry Vyukov, Elliott Hughes, Fan Wu, Florian Weimer, Jeff Xu,
	Jonathan Corbet, Jordan R Abrahams, Lakshmi Ramasubramanian,
	Luca Boccassi, Matt Bobrowski, Miklos Szeredi, Mimi Zohar,
	Nicolas Bouchinet, Robert Waite, Roberto Sassu, Scott Shell,
	Steve Dower, Steve Grubb, kernel-hardening, linux-api,
	linux-fsdevel, linux-integrity, linux-kernel,
	linux-security-module, Jeff Xu

On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
>
> On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > >
> > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > making it read-only until it is closed.  The main use case is for script
> > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > while being read and interpreted.  This is useful for generic distros
> > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > >
> > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > >
> > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > because it broke userspace assumptions.
> > >
> > > Oh, good to know.
> > >
> > > >
> > > > > it widely available.  This is similar to what other OSs may provide
> > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > >
> > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > removed for security reasons; as
> > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > >
> > > > |        MAP_DENYWRITE
> > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > |               signaled that attempts to write to the underlying file
> > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > |               of-service attacks.)"
> > > >
> > > > It seems to me that the same issue applies to your patch - it would
> > > > allow unprivileged processes to essentially lock files such that other
> > > > processes can't write to them anymore. This might allow unprivileged
> > > > users to prevent root from updating config files or stuff like that if
> > > > they're updated in-place.
> > >
> > > Yes, I agree, but since it is the case for executed files I though it
> > > was worth starting a discussion on this topic.  This new flag could be
> > > restricted to executable files, but we should avoid system-wide locks
> > > like this.  I'm not sure how Windows handle these issues though.
> > >
> > > Anyway, we should rely on the access control policy to control write and
> > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > the references and the background!
> >
> > I'm confused.  I understand that there are many contexts in which one
> > would want to prevent execution of unapproved content, which might
> > include preventing a given process from modifying some code and then
> > executing it.
> >
> > I don't understand what these deny-write features have to do with it.
> > These features merely prevent someone from modifying code *that is
> > currently in use*, which is not at all the same thing as preventing
> > modifying code that might get executed -- one can often modify
> > contents *before* executing those contents.
>
> The order of checks would be:
> 1. open script with O_DENY_WRITE
> 2. check executability with AT_EXECVE_CHECK
> 3. read the content and interpret it

Hmm.  Common LSM configurations should be able to handle this without
deny write, I think.  If you don't want a program to be able to make
their own scripts, then don't allow AT_EXECVE_CHECK to succeed on a
script that the program can write.

Keep in mind that trying to lock this down too hard is pointless for
users who are allowed to to ptrace-write to their own processes.  Or
for users who can do JIT, or for users who can run a REPL, etc.

> > But maybe a less kludgy version could be used for real.  What if there
> > was a syscall that would take an fd and make a snapshot of the file?
>
> Yes, that would be a clean solution.  I don't think this is achievable
> in an efficient way without involving filesystem implementations though.

It wouldn't be so terrible to involve filesystem implementations.
Most of the filesystems that people who care at all about security run
their binaries from either support reflinks or are immutable.  Things
like OCI implementations may already fit meet those criteria, and it
would be pretty nifty if the kernel was actually aware that OCI layers
are intended to be immutable.  We could even have an API to
generically query the hash of an immutable file and to ask the kernel
if it's validating the hash on reads.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25  9:31         ` Mickaël Salaün
  2025-08-25  9:39           ` Florian Weimer
  2025-08-25 16:43           ` Andy Lutomirski
@ 2025-08-25 17:57           ` Jeff Xu
  2025-08-26 12:39             ` Mickaël Salaün
  2 siblings, 1 reply; 18+ messages in thread
From: Jeff Xu @ 2025-08-25 17:57 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner, Kees Cook,
	Paul Moore, Serge Hallyn, Andy Lutomirski, Arnd Bergmann,
	Christian Heimes, Dmitry Vyukov, Elliott Hughes, Fan Wu,
	Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module, Jeff Xu

Hi Mickaël

On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
>
> On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > >
> > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > making it read-only until it is closed.  The main use case is for script
> > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > while being read and interpreted.  This is useful for generic distros
> > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > >
> > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > >
> > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > because it broke userspace assumptions.
> > >
> > > Oh, good to know.
> > >
> > > >
> > > > > it widely available.  This is similar to what other OSs may provide
> > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > >
> > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > removed for security reasons; as
> > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > >
> > > > |        MAP_DENYWRITE
> > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > |               signaled that attempts to write to the underlying file
> > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > |               of-service attacks.)"
> > > >
> > > > It seems to me that the same issue applies to your patch - it would
> > > > allow unprivileged processes to essentially lock files such that other
> > > > processes can't write to them anymore. This might allow unprivileged
> > > > users to prevent root from updating config files or stuff like that if
> > > > they're updated in-place.
> > >
> > > Yes, I agree, but since it is the case for executed files I though it
> > > was worth starting a discussion on this topic.  This new flag could be
> > > restricted to executable files, but we should avoid system-wide locks
> > > like this.  I'm not sure how Windows handle these issues though.
> > >
> > > Anyway, we should rely on the access control policy to control write and
> > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > the references and the background!
> >
> > I'm confused.  I understand that there are many contexts in which one
> > would want to prevent execution of unapproved content, which might
> > include preventing a given process from modifying some code and then
> > executing it.
> >
> > I don't understand what these deny-write features have to do with it.
> > These features merely prevent someone from modifying code *that is
> > currently in use*, which is not at all the same thing as preventing
> > modifying code that might get executed -- one can often modify
> > contents *before* executing those contents.
>
> The order of checks would be:
> 1. open script with O_DENY_WRITE
> 2. check executability with AT_EXECVE_CHECK
> 3. read the content and interpret it
>
I'm not sure about the O_DENY_WRITE approach, but the problem is worth solving.

AT_EXECVE_CHECK is not just for scripting languages. It could also
work with bytecodes like Java, for example. If we let the Java runtime
call AT_EXECVE_CHECK before loading the bytecode, the LSM could
develop a policy based on that.

> The deny-write feature was to guarantee that there is no race condition
> between step 2 and 3.  All these checks are supposed to be done by a
> trusted interpreter (which is allowed to be executed).  The
> AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> associated security policies) allowed the *current* content of the file
> to be executed.  Whatever happen before or after that (wrt.
> O_DENY_WRITE) should be covered by the security policy.
>
Agree, the race problem needs to be solved in order for AT_EXECVE_CHECK.

Enforcing non-write for the path that stores scripts or bytecodes can
be challenging due to historical or backward compatibility reasons.
Since AT_EXECVE_CHECK provides a mechanism to check the file right
before it is used, we can assume it will detect any "problem" that
happened before that, (e.g. the file was overwritten). However, that
also imposes two additional requirements:
1> the file doesn't change while AT_EXECVE_CHECK does the check.
2>The file content kept by the process remains unchanged after passing
the AT_EXECVE_CHECK.

I imagine, the complete solution for AT_EXECVE_CHECK would include
those two grantees.

Thanks
-Jeff

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25 16:43           ` Andy Lutomirski
@ 2025-08-25 18:10             ` Jeff Xu
  0 siblings, 0 replies; 18+ messages in thread
From: Jeff Xu @ 2025-08-25 18:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Mickaël Salaün, Jann Horn, Al Viro, Christian Brauner,
	Kees Cook, Paul Moore, Serge Hallyn, Andy Lutomirski,
	Arnd Bergmann, Christian Heimes, Dmitry Vyukov, Elliott Hughes,
	Fan Wu, Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module, Jeff Xu

On Mon, Aug 25, 2025 at 9:43 AM Andy Lutomirski <luto@amacapital.net> wrote:
>
> On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
> >
> > On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > >
> > > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > > making it read-only until it is closed.  The main use case is for script
> > > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > > while being read and interpreted.  This is useful for generic distros
> > > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > > >
> > > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > > >
> > > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > > because it broke userspace assumptions.
> > > >
> > > > Oh, good to know.
> > > >
> > > > >
> > > > > > it widely available.  This is similar to what other OSs may provide
> > > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > > >
> > > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > > removed for security reasons; as
> > > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > > >
> > > > > |        MAP_DENYWRITE
> > > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > > |               signaled that attempts to write to the underlying file
> > > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > > |               of-service attacks.)"
> > > > >
> > > > > It seems to me that the same issue applies to your patch - it would
> > > > > allow unprivileged processes to essentially lock files such that other
> > > > > processes can't write to them anymore. This might allow unprivileged
> > > > > users to prevent root from updating config files or stuff like that if
> > > > > they're updated in-place.
> > > >
> > > > Yes, I agree, but since it is the case for executed files I though it
> > > > was worth starting a discussion on this topic.  This new flag could be
> > > > restricted to executable files, but we should avoid system-wide locks
> > > > like this.  I'm not sure how Windows handle these issues though.
> > > >
> > > > Anyway, we should rely on the access control policy to control write and
> > > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > > the references and the background!
> > >
> > > I'm confused.  I understand that there are many contexts in which one
> > > would want to prevent execution of unapproved content, which might
> > > include preventing a given process from modifying some code and then
> > > executing it.
> > >
> > > I don't understand what these deny-write features have to do with it.
> > > These features merely prevent someone from modifying code *that is
> > > currently in use*, which is not at all the same thing as preventing
> > > modifying code that might get executed -- one can often modify
> > > contents *before* executing those contents.
> >
> > The order of checks would be:
> > 1. open script with O_DENY_WRITE
> > 2. check executability with AT_EXECVE_CHECK
> > 3. read the content and interpret it
>
> Hmm.  Common LSM configurations should be able to handle this without
> deny write, I think.  If you don't want a program to be able to make
> their own scripts, then don't allow AT_EXECVE_CHECK to succeed on a
> script that the program can write.
>
Yes, Common LSM could handle this, however, due to historic and app
backward compability reason, sometimes it is impossible to enforce
that kind of policy in practice, therefore as an alternative, a
machinism such as AT_EXECVE_CHECK is really useful.

> Keep in mind that trying to lock this down too hard is pointless for
> users who are allowed to to ptrace-write to their own processes.  Or
> for users who can do JIT, or for users who can run a REPL, etc.
>
The ptrace-write and /proc/pid/mem writing are on my radar, at least
for ChomeOS and Android.
AT_EXECVE_CHECK is orthogonal to those IMO, I hope eventually all
those paths will be hardened.

Thanks and regards,
-Jeff

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
@ 2025-08-25 21:56 Andy Lutomirski
  2025-08-25 23:06 ` Jeff Xu
  0 siblings, 1 reply; 18+ messages in thread
From: Andy Lutomirski @ 2025-08-25 21:56 UTC (permalink / raw)
  To: Jeff Xu
  Cc: Mickaël Salaün, Jann Horn, Al Viro, Christian Brauner,
	Kees Cook, Paul Moore, Serge Hallyn, Andy Lutomirski,
	Arnd Bergmann, Christian Heimes, Dmitry Vyukov, Elliott Hughes,
	Fan Wu, Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module, Jeff Xu


> On Aug 25, 2025, at 11:10 AM, Jeff Xu <jeffxu@google.com> wrote:
> 
> On Mon, Aug 25, 2025 at 9:43 AM Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
>>> On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
>>>> On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
>>>>> On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
>>>>>> On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
>>>>>>> Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
>>>>>>> passed file descriptors).  This changes the state of the opened file by
>>>>>>> making it read-only until it is closed.  The main use case is for script
>>>>>>> interpreters to get the guarantee that script' content cannot be altered
>>>>>>> while being read and interpreted.  This is useful for generic distros
>>>>>>> that may not have a write-xor-execute policy.  See commit a5874fde3c08
>>>>>>> ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
>>>>>>> Both execve(2) and the IOCTL to enable fsverity can already set this
>>>>>>> property on files with deny_write_access().  This new O_DENY_WRITE make
>>>>>> The kernel actually tried to get rid of this behavior on execve() in
>>>>>> commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
>>>>>> to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
>>>>>> because it broke userspace assumptions.
>>>>> Oh, good to know.
>>>>>>> it widely available.  This is similar to what other OSs may provide
>>>>>>> e.g., opening a file with only FILE_SHARE_READ on Windows.
>>>>>> We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
>>>>>> removed for security reasons; as
>>>>>> https://man7.org/linux/man-pages/man2/mmap.2.html says:
>>>>>> |        MAP_DENYWRITE
>>>>>> |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
>>>>>> |               signaled that attempts to write to the underlying file
>>>>>> |               should fail with ETXTBSY.  But this was a source of denial-
>>>>>> |               of-service attacks.)"
>>>>>> It seems to me that the same issue applies to your patch - it would
>>>>>> allow unprivileged processes to essentially lock files such that other
>>>>>> processes can't write to them anymore. This might allow unprivileged
>>>>>> users to prevent root from updating config files or stuff like that if
>>>>>> they're updated in-place.
>>>>> Yes, I agree, but since it is the case for executed files I though it
>>>>> was worth starting a discussion on this topic.  This new flag could be
>>>>> restricted to executable files, but we should avoid system-wide locks
>>>>> like this.  I'm not sure how Windows handle these issues though.
>>>>> Anyway, we should rely on the access control policy to control write and
>>>>> execute access in a consistent way (e.g. write-xor-execute).  Thanks for
>>>>> the references and the background!
>>>> I'm confused.  I understand that there are many contexts in which one
>>>> would want to prevent execution of unapproved content, which might
>>>> include preventing a given process from modifying some code and then
>>>> executing it.
>>>> I don't understand what these deny-write features have to do with it.
>>>> These features merely prevent someone from modifying code *that is
>>>> currently in use*, which is not at all the same thing as preventing
>>>> modifying code that might get executed -- one can often modify
>>>> contents *before* executing those contents.
>>> The order of checks would be:
>>> 1. open script with O_DENY_WRITE
>>> 2. check executability with AT_EXECVE_CHECK
>>> 3. read the content and interpret it
>> Hmm.  Common LSM configurations should be able to handle this without
>> deny write, I think.  If you don't want a program to be able to make
>> their own scripts, then don't allow AT_EXECVE_CHECK to succeed on a
>> script that the program can write.
> Yes, Common LSM could handle this, however, due to historic and app
> backward compability reason, sometimes it is impossible to enforce
> that kind of policy in practice, therefore as an alternative, a
> machinism such as AT_EXECVE_CHECK is really useful.

Can you clarify?  I’m suspicious that we’re taking past each other.

AT_EXECVE_CHECK solves a problem that there are actions that effectively “execute” a file that don’t execute literal CPU instructions for it. Sometimes open+read has the effect of interpreting the contents of the file as something code-like.

But, as I see it, deny-write is almost entirely orthogonal. If you open a file with the intent of executing it (mmap-execute or interpret — makes little practical difference here), then the kernel can enforce some policy. If the file is writable by a process that ought not have permission to execute code in the context of the opening-for-execute process, then LSMs need deny-write to be enforced so that they can verify the contents at the time of opening.

But let’s step back a moment: is there any actual sensible security policy that does this?  If I want to *enforce* that a process only execute approved code, then wouldn’t I do it be only allowing executing files that the process can’t write?

The reason that the removal of deny-write wasn’t security — it was a functionality issue: a linker accidentally modified an in-use binary. If you have permission to use gcc or lld, etc to create binaries, and you have permission to run them, then you pretty much have permission to run whatever code you like.

So, if there’s a real security use case for deny-write, I’m still not seeing it.

>> Keep in mind that trying to lock this down too hard is pointless for
>> users who are allowed to to ptrace-write to their own processes.  Or
>> for users who can do JIT, or for users who can run a REPL, etc.
> The ptrace-write and /proc/pid/mem writing are on my radar, at least
> for ChomeOS and Android.
> AT_EXECVE_CHECK is orthogonal to those IMO, I hope eventually all
> those paths will be hardened.
> 
> Thanks and regards,
> -Jeff

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25 21:56 [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE Andy Lutomirski
@ 2025-08-25 23:06 ` Jeff Xu
  0 siblings, 0 replies; 18+ messages in thread
From: Jeff Xu @ 2025-08-25 23:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Mickaël Salaün, Jann Horn, Al Viro, Christian Brauner,
	Kees Cook, Paul Moore, Serge Hallyn, Andy Lutomirski,
	Arnd Bergmann, Christian Heimes, Dmitry Vyukov, Elliott Hughes,
	Fan Wu, Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module, Jeff Xu

On Mon, Aug 25, 2025 at 2:56 PM Andy Lutomirski <luto@amacapital.net> wrote:
>
>
> > On Aug 25, 2025, at 11:10 AM, Jeff Xu <jeffxu@google.com> wrote:
> >
> > On Mon, Aug 25, 2025 at 9:43 AM Andy Lutomirski <luto@amacapital.net> wrote:
> >>> On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
> >>> On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> >>>> On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> >>>>> On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> >>>>>> On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> >>>>>>> Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> >>>>>>> passed file descriptors).  This changes the state of the opened file by
> >>>>>>> making it read-only until it is closed.  The main use case is for script
> >>>>>>> interpreters to get the guarantee that script' content cannot be altered
> >>>>>>> while being read and interpreted.  This is useful for generic distros
> >>>>>>> that may not have a write-xor-execute policy.  See commit a5874fde3c08
> >>>>>>> ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> >>>>>>> Both execve(2) and the IOCTL to enable fsverity can already set this
> >>>>>>> property on files with deny_write_access().  This new O_DENY_WRITE make
> >>>>>> The kernel actually tried to get rid of this behavior on execve() in
> >>>>>> commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> >>>>>> to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> >>>>>> because it broke userspace assumptions.
> >>>>> Oh, good to know.
> >>>>>>> it widely available.  This is similar to what other OSs may provide
> >>>>>>> e.g., opening a file with only FILE_SHARE_READ on Windows.
> >>>>>> We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> >>>>>> removed for security reasons; as
> >>>>>> https://man7.org/linux/man-pages/man2/mmap.2.html says:
> >>>>>> |        MAP_DENYWRITE
> >>>>>> |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> >>>>>> |               signaled that attempts to write to the underlying file
> >>>>>> |               should fail with ETXTBSY.  But this was a source of denial-
> >>>>>> |               of-service attacks.)"
> >>>>>> It seems to me that the same issue applies to your patch - it would
> >>>>>> allow unprivileged processes to essentially lock files such that other
> >>>>>> processes can't write to them anymore. This might allow unprivileged
> >>>>>> users to prevent root from updating config files or stuff like that if
> >>>>>> they're updated in-place.
> >>>>> Yes, I agree, but since it is the case for executed files I though it
> >>>>> was worth starting a discussion on this topic.  This new flag could be
> >>>>> restricted to executable files, but we should avoid system-wide locks
> >>>>> like this.  I'm not sure how Windows handle these issues though.
> >>>>> Anyway, we should rely on the access control policy to control write and
> >>>>> execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> >>>>> the references and the background!
> >>>> I'm confused.  I understand that there are many contexts in which one
> >>>> would want to prevent execution of unapproved content, which might
> >>>> include preventing a given process from modifying some code and then
> >>>> executing it.
> >>>> I don't understand what these deny-write features have to do with it.
> >>>> These features merely prevent someone from modifying code *that is
> >>>> currently in use*, which is not at all the same thing as preventing
> >>>> modifying code that might get executed -- one can often modify
> >>>> contents *before* executing those contents.
> >>> The order of checks would be:
> >>> 1. open script with O_DENY_WRITE
> >>> 2. check executability with AT_EXECVE_CHECK
> >>> 3. read the content and interpret it
> >> Hmm.  Common LSM configurations should be able to handle this without
> >> deny write, I think.  If you don't want a program to be able to make
> >> their own scripts, then don't allow AT_EXECVE_CHECK to succeed on a
> >> script that the program can write.
> > Yes, Common LSM could handle this, however, due to historic and app
> > backward compability reason, sometimes it is impossible to enforce
> > that kind of policy in practice, therefore as an alternative, a
> > machinism such as AT_EXECVE_CHECK is really useful.
>
> Can you clarify?  I’m suspicious that we’re taking past each other.
>
Apology, my response isn't clear.

> AT_EXECVE_CHECK solves a problem that there are actions that effectively “execute” a file that don’t execute literal CPU instructions for it. Sometimes open+read has the effect of interpreting the contents of the file as something code-like.
>
Yes. We have the same understanding of this.
As an example, shell script or java byte code, their file permission
can be rw, but no x bit set. The interpreter reads those and executes
them.

> But, as I see it, deny-write is almost entirely orthogonal. If you open a file with the intent of executing it (mmap-execute or interpret — makes little practical difference here), then the kernel can enforce some policy. If the file is writable by a process that ought not have permission to execute code in the context of the opening-for-execute process, then LSMs need deny-write to be enforced so that they can verify the contents at the time of opening.
>
> But let’s step back a moment: is there any actual sensible security policy that does this?  If I want to *enforce* that a process only execute approved code, then wouldn’t I do it be only allowing executing files that the process can’t write?
>
I imagine the following situation: an app has both "rw" access to the
file that holds the script code, the "w" is needed because the app
updates the script sometimes.

What is a reasonable sandbox solution for such an app? There are maybe
two options:

1> split the app as two processes: processA has "w" access to the
script for updating when needed. Process B has "r" access but no "w",
for executing. ProcessA and ProcessB will coordinate to avoid racing
on the script update.

2> The process will use AT_EXECVE_CHECK (added by interpreter) to
validate the file before opening , and the file content held by the
process should be immutable while being validated and executed later
by interpreter.

option 1 is the ideal, and IIUC, you promote this too. However, that
requires refactoring the app as two processes.
option 2 is an alternative. Because it doesn't require the change from
the apps, therefore a solution worth considering.

> The reason that the removal of deny-write wasn’t security — it was a functionality issue: a linker accidentally modified an in-use binary. If you have permission to use gcc or lld, etc to create binaries, and you have permission to run them, then you pretty much have permission to run whatever code you like.
>
> So, if there’s a real security use case for deny-write, I’m still not seeing it.
>
Although the current patch might not be ideal due to the potential DOS
attack, it does offer a starting point to address the needs. Let's
continue the discussion based on this patch and explore different
ideas.

Thanks and regards,
-Jeff

> >> Keep in mind that trying to lock this down too hard is pointless for
> >> users who are allowed to to ptrace-write to their own processes.  Or
> >> for users who can do JIT, or for users who can run a REPL, etc.
> > The ptrace-write and /proc/pid/mem writing are on my radar, at least
> > for ChomeOS and Android.
> > AT_EXECVE_CHECK is orthogonal to those IMO, I hope eventually all
> > those paths will be hardened.
> >
> > Thanks and regards,
> > -Jeff

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25  9:39           ` Florian Weimer
@ 2025-08-26 12:35             ` Mickaël Salaün
  0 siblings, 0 replies; 18+ messages in thread
From: Mickaël Salaün @ 2025-08-26 12:35 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner, Kees Cook,
	Paul Moore, Serge Hallyn, Andy Lutomirski, Arnd Bergmann,
	Christian Heimes, Dmitry Vyukov, Elliott Hughes, Fan Wu, Jeff Xu,
	Jonathan Corbet, Jordan R Abrahams, Lakshmi Ramasubramanian,
	Luca Boccassi, Matt Bobrowski, Miklos Szeredi, Mimi Zohar,
	Nicolas Bouchinet, Robert Waite, Roberto Sassu, Scott Shell,
	Steve Dower, Steve Grubb, kernel-hardening, linux-api,
	linux-fsdevel, linux-integrity, linux-kernel,
	linux-security-module, Jeff Xu

On Mon, Aug 25, 2025 at 11:39:11AM +0200, Florian Weimer wrote:
> * Mickaël Salaün:
> 
> > The order of checks would be:
> > 1. open script with O_DENY_WRITE
> > 2. check executability with AT_EXECVE_CHECK
> > 3. read the content and interpret it
> >
> > The deny-write feature was to guarantee that there is no race condition
> > between step 2 and 3.  All these checks are supposed to be done by a
> > trusted interpreter (which is allowed to be executed).  The
> > AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> > associated security policies) allowed the *current* content of the file
> > to be executed.  Whatever happen before or after that (wrt.
> > O_DENY_WRITE) should be covered by the security policy.
> 
> Why isn't it an improper system configuration if the script file is
> writable?

It is, except if the system only wants to track executions (e.g. record
checksum of scripts) without restricting file modifications.

> 
> In the past, the argument was that making a file (writable and)
> executable was an auditable even, and that provided enough coverage for
> those people who are interested in this.

Yes, but in this case there is a race condition that this patch tried to
fix.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-25 17:57           ` Jeff Xu
@ 2025-08-26 12:39             ` Mickaël Salaün
  2025-08-26 20:29               ` Jeff Xu
  0 siblings, 1 reply; 18+ messages in thread
From: Mickaël Salaün @ 2025-08-26 12:39 UTC (permalink / raw)
  To: Jeff Xu
  Cc: Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner, Kees Cook,
	Paul Moore, Serge Hallyn, Andy Lutomirski, Arnd Bergmann,
	Christian Heimes, Dmitry Vyukov, Elliott Hughes, Fan Wu,
	Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module, Jeff Xu

On Mon, Aug 25, 2025 at 10:57:57AM -0700, Jeff Xu wrote:
> Hi Mickaël
> 
> On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
> >
> > On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > >
> > > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > > making it read-only until it is closed.  The main use case is for script
> > > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > > while being read and interpreted.  This is useful for generic distros
> > > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > > >
> > > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > > >
> > > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > > because it broke userspace assumptions.
> > > >
> > > > Oh, good to know.
> > > >
> > > > >
> > > > > > it widely available.  This is similar to what other OSs may provide
> > > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > > >
> > > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > > removed for security reasons; as
> > > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > > >
> > > > > |        MAP_DENYWRITE
> > > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > > |               signaled that attempts to write to the underlying file
> > > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > > |               of-service attacks.)"
> > > > >
> > > > > It seems to me that the same issue applies to your patch - it would
> > > > > allow unprivileged processes to essentially lock files such that other
> > > > > processes can't write to them anymore. This might allow unprivileged
> > > > > users to prevent root from updating config files or stuff like that if
> > > > > they're updated in-place.
> > > >
> > > > Yes, I agree, but since it is the case for executed files I though it
> > > > was worth starting a discussion on this topic.  This new flag could be
> > > > restricted to executable files, but we should avoid system-wide locks
> > > > like this.  I'm not sure how Windows handle these issues though.
> > > >
> > > > Anyway, we should rely on the access control policy to control write and
> > > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > > the references and the background!
> > >
> > > I'm confused.  I understand that there are many contexts in which one
> > > would want to prevent execution of unapproved content, which might
> > > include preventing a given process from modifying some code and then
> > > executing it.
> > >
> > > I don't understand what these deny-write features have to do with it.
> > > These features merely prevent someone from modifying code *that is
> > > currently in use*, which is not at all the same thing as preventing
> > > modifying code that might get executed -- one can often modify
> > > contents *before* executing those contents.
> >
> > The order of checks would be:
> > 1. open script with O_DENY_WRITE
> > 2. check executability with AT_EXECVE_CHECK
> > 3. read the content and interpret it
> >
> I'm not sure about the O_DENY_WRITE approach, but the problem is worth solving.
> 
> AT_EXECVE_CHECK is not just for scripting languages. It could also
> work with bytecodes like Java, for example. If we let the Java runtime
> call AT_EXECVE_CHECK before loading the bytecode, the LSM could
> develop a policy based on that.

Sure, I'm using "script" to make it simple, but this applies to other
use cases.

> 
> > The deny-write feature was to guarantee that there is no race condition
> > between step 2 and 3.  All these checks are supposed to be done by a
> > trusted interpreter (which is allowed to be executed).  The
> > AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> > associated security policies) allowed the *current* content of the file
> > to be executed.  Whatever happen before or after that (wrt.
> > O_DENY_WRITE) should be covered by the security policy.
> >
> Agree, the race problem needs to be solved in order for AT_EXECVE_CHECK.
> 
> Enforcing non-write for the path that stores scripts or bytecodes can
> be challenging due to historical or backward compatibility reasons.
> Since AT_EXECVE_CHECK provides a mechanism to check the file right
> before it is used, we can assume it will detect any "problem" that
> happened before that, (e.g. the file was overwritten). However, that
> also imposes two additional requirements:
> 1> the file doesn't change while AT_EXECVE_CHECK does the check.

This is already the case, so any kind of LSM checks are good.

> 2>The file content kept by the process remains unchanged after passing
> the AT_EXECVE_CHECK.

The goal of this patch was to avoid such race condition in the case
where executable files can be updated.  But in most cases it should not
be a security issue (because processes allowed to write to executable
files should be trusted), but this could still lead to bugs (because of
inconsistent file content, half-updated).

> 
> I imagine, the complete solution for AT_EXECVE_CHECK would include
> those two grantees.

There is no issue directly with AT_EXECVE_CHECK, but according to the
system configuration, interpreters could read a file that was updated
after the AT_EXECVE_CHECK.  This should not be an issue for secure
systems where executable files are only updated with trusted code,
except if the update mechanism is not atomic.  The main use case for
this patch series was for generic distros that may not have the
write-xor-execute guarantees e.g., for developers.

The only viable solution I see would be to have some kind of snapshot of
files, requested by interpreters, but I'm not sure if it is worth it.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-26 12:39             ` Mickaël Salaün
@ 2025-08-26 20:29               ` Jeff Xu
  2025-08-27  8:19                 ` Mickaël Salaün
  0 siblings, 1 reply; 18+ messages in thread
From: Jeff Xu @ 2025-08-26 20:29 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Jeff Xu, Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner,
	Kees Cook, Paul Moore, Serge Hallyn, Andy Lutomirski,
	Arnd Bergmann, Christian Heimes, Dmitry Vyukov, Elliott Hughes,
	Fan Wu, Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module

Hi Mickaël

On Tue, Aug 26, 2025 at 5:39 AM Mickaël Salaün <mic@digikod.net> wrote:
>
> On Mon, Aug 25, 2025 at 10:57:57AM -0700, Jeff Xu wrote:
> > Hi Mickaël
> >
> > On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
> > >
> > > On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > > > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > > >
> > > > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > > > making it read-only until it is closed.  The main use case is for script
> > > > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > > > while being read and interpreted.  This is useful for generic distros
> > > > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > > > >
> > > > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > > > >
> > > > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > > > because it broke userspace assumptions.
> > > > >
> > > > > Oh, good to know.
> > > > >
> > > > > >
> > > > > > > it widely available.  This is similar to what other OSs may provide
> > > > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > > > >
> > > > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > > > removed for security reasons; as
> > > > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > > > >
> > > > > > |        MAP_DENYWRITE
> > > > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > > > |               signaled that attempts to write to the underlying file
> > > > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > > > |               of-service attacks.)"
> > > > > >
> > > > > > It seems to me that the same issue applies to your patch - it would
> > > > > > allow unprivileged processes to essentially lock files such that other
> > > > > > processes can't write to them anymore. This might allow unprivileged
> > > > > > users to prevent root from updating config files or stuff like that if
> > > > > > they're updated in-place.
> > > > >
> > > > > Yes, I agree, but since it is the case for executed files I though it
> > > > > was worth starting a discussion on this topic.  This new flag could be
> > > > > restricted to executable files, but we should avoid system-wide locks
> > > > > like this.  I'm not sure how Windows handle these issues though.
> > > > >
> > > > > Anyway, we should rely on the access control policy to control write and
> > > > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > > > the references and the background!
> > > >
> > > > I'm confused.  I understand that there are many contexts in which one
> > > > would want to prevent execution of unapproved content, which might
> > > > include preventing a given process from modifying some code and then
> > > > executing it.
> > > >
> > > > I don't understand what these deny-write features have to do with it.
> > > > These features merely prevent someone from modifying code *that is
> > > > currently in use*, which is not at all the same thing as preventing
> > > > modifying code that might get executed -- one can often modify
> > > > contents *before* executing those contents.
> > >
> > > The order of checks would be:
> > > 1. open script with O_DENY_WRITE
> > > 2. check executability with AT_EXECVE_CHECK
> > > 3. read the content and interpret it
> > >
> > I'm not sure about the O_DENY_WRITE approach, but the problem is worth solving.
> >
> > AT_EXECVE_CHECK is not just for scripting languages. It could also
> > work with bytecodes like Java, for example. If we let the Java runtime
> > call AT_EXECVE_CHECK before loading the bytecode, the LSM could
> > develop a policy based on that.
>
> Sure, I'm using "script" to make it simple, but this applies to other
> use cases.
>
That makes sense.

> >
> > > The deny-write feature was to guarantee that there is no race condition
> > > between step 2 and 3.  All these checks are supposed to be done by a
> > > trusted interpreter (which is allowed to be executed).  The
> > > AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> > > associated security policies) allowed the *current* content of the file
> > > to be executed.  Whatever happen before or after that (wrt.
> > > O_DENY_WRITE) should be covered by the security policy.
> > >
> > Agree, the race problem needs to be solved in order for AT_EXECVE_CHECK.
> >
> > Enforcing non-write for the path that stores scripts or bytecodes can
> > be challenging due to historical or backward compatibility reasons.
> > Since AT_EXECVE_CHECK provides a mechanism to check the file right
> > before it is used, we can assume it will detect any "problem" that
> > happened before that, (e.g. the file was overwritten). However, that
> > also imposes two additional requirements:
> > 1> the file doesn't change while AT_EXECVE_CHECK does the check.
>
> This is already the case, so any kind of LSM checks are good.
>
May I ask how this is done? some code in do_open_execat() does this ?
Apologies if this is a basic question.

> > 2>The file content kept by the process remains unchanged after passing
> > the AT_EXECVE_CHECK.
>
> The goal of this patch was to avoid such race condition in the case
> where executable files can be updated.  But in most cases it should not
> be a security issue (because processes allowed to write to executable
> files should be trusted), but this could still lead to bugs (because of
> inconsistent file content, half-updated).
>
There is also a time gap between:
a> the time of AT_EXECVE_CHECK
b> the time that the app opens the file for execution.
right ? another potential attack path (though this is not the case I
mentioned previously).

For the case I mentioned previously, I have to think more if the race
condition is a bug or security issue.
IIUC, two solutions are discussed so far:
1> the process could write to fs to update the script.  However, for
execution, the process still uses the copy that passed the
AT_EXECVE_CHECK. (snapshot solution by Andy Lutomirski)
or 2> the process blocks the write while opening the file as read only
and executing the script. (this seems to be the approach of this
patch).

I wonder if there are other ideas.

Thanks and regards,
-Jeff

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-26 20:29               ` Jeff Xu
@ 2025-08-27  8:19                 ` Mickaël Salaün
  2025-08-28 20:17                   ` Jeff Xu
  0 siblings, 1 reply; 18+ messages in thread
From: Mickaël Salaün @ 2025-08-27  8:19 UTC (permalink / raw)
  To: Jeff Xu
  Cc: Jeff Xu, Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner,
	Kees Cook, Paul Moore, Serge Hallyn, Andy Lutomirski,
	Arnd Bergmann, Christian Heimes, Dmitry Vyukov, Elliott Hughes,
	Fan Wu, Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module

On Tue, Aug 26, 2025 at 01:29:55PM -0700, Jeff Xu wrote:
> Hi Mickaël
> 
> On Tue, Aug 26, 2025 at 5:39 AM Mickaël Salaün <mic@digikod.net> wrote:
> >
> > On Mon, Aug 25, 2025 at 10:57:57AM -0700, Jeff Xu wrote:
> > > Hi Mickaël
> > >
> > > On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > >
> > > > On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > > > > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > >
> > > > > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > > > > making it read-only until it is closed.  The main use case is for script
> > > > > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > > > > while being read and interpreted.  This is useful for generic distros
> > > > > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > > > > >
> > > > > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > > > > >
> > > > > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > > > > because it broke userspace assumptions.
> > > > > >
> > > > > > Oh, good to know.
> > > > > >
> > > > > > >
> > > > > > > > it widely available.  This is similar to what other OSs may provide
> > > > > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > > > > >
> > > > > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > > > > removed for security reasons; as
> > > > > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > > > > >
> > > > > > > |        MAP_DENYWRITE
> > > > > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > > > > |               signaled that attempts to write to the underlying file
> > > > > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > > > > |               of-service attacks.)"
> > > > > > >
> > > > > > > It seems to me that the same issue applies to your patch - it would
> > > > > > > allow unprivileged processes to essentially lock files such that other
> > > > > > > processes can't write to them anymore. This might allow unprivileged
> > > > > > > users to prevent root from updating config files or stuff like that if
> > > > > > > they're updated in-place.
> > > > > >
> > > > > > Yes, I agree, but since it is the case for executed files I though it
> > > > > > was worth starting a discussion on this topic.  This new flag could be
> > > > > > restricted to executable files, but we should avoid system-wide locks
> > > > > > like this.  I'm not sure how Windows handle these issues though.
> > > > > >
> > > > > > Anyway, we should rely on the access control policy to control write and
> > > > > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > > > > the references and the background!
> > > > >
> > > > > I'm confused.  I understand that there are many contexts in which one
> > > > > would want to prevent execution of unapproved content, which might
> > > > > include preventing a given process from modifying some code and then
> > > > > executing it.
> > > > >
> > > > > I don't understand what these deny-write features have to do with it.
> > > > > These features merely prevent someone from modifying code *that is
> > > > > currently in use*, which is not at all the same thing as preventing
> > > > > modifying code that might get executed -- one can often modify
> > > > > contents *before* executing those contents.
> > > >
> > > > The order of checks would be:
> > > > 1. open script with O_DENY_WRITE
> > > > 2. check executability with AT_EXECVE_CHECK
> > > > 3. read the content and interpret it
> > > >
> > > I'm not sure about the O_DENY_WRITE approach, but the problem is worth solving.
> > >
> > > AT_EXECVE_CHECK is not just for scripting languages. It could also
> > > work with bytecodes like Java, for example. If we let the Java runtime
> > > call AT_EXECVE_CHECK before loading the bytecode, the LSM could
> > > develop a policy based on that.
> >
> > Sure, I'm using "script" to make it simple, but this applies to other
> > use cases.
> >
> That makes sense.
> 
> > >
> > > > The deny-write feature was to guarantee that there is no race condition
> > > > between step 2 and 3.  All these checks are supposed to be done by a
> > > > trusted interpreter (which is allowed to be executed).  The
> > > > AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> > > > associated security policies) allowed the *current* content of the file
> > > > to be executed.  Whatever happen before or after that (wrt.
> > > > O_DENY_WRITE) should be covered by the security policy.
> > > >
> > > Agree, the race problem needs to be solved in order for AT_EXECVE_CHECK.
> > >
> > > Enforcing non-write for the path that stores scripts or bytecodes can
> > > be challenging due to historical or backward compatibility reasons.
> > > Since AT_EXECVE_CHECK provides a mechanism to check the file right
> > > before it is used, we can assume it will detect any "problem" that
> > > happened before that, (e.g. the file was overwritten). However, that
> > > also imposes two additional requirements:
> > > 1> the file doesn't change while AT_EXECVE_CHECK does the check.
> >
> > This is already the case, so any kind of LSM checks are good.
> >
> May I ask how this is done? some code in do_open_execat() does this ?
> Apologies if this is a basic question.

do_open_execat() calls exe_file_deny_write_access()

> 
> > > 2>The file content kept by the process remains unchanged after passing
> > > the AT_EXECVE_CHECK.
> >
> > The goal of this patch was to avoid such race condition in the case
> > where executable files can be updated.  But in most cases it should not
> > be a security issue (because processes allowed to write to executable
> > files should be trusted), but this could still lead to bugs (because of
> > inconsistent file content, half-updated).
> >
> There is also a time gap between:
> a> the time of AT_EXECVE_CHECK
> b> the time that the app opens the file for execution.
> right ? another potential attack path (though this is not the case I
> mentioned previously).

As explained in the documentation, to avoid this specific race
condition, interpreters should open the script once, check the FD with
AT_EXECVE_CHECK, and then read the content with the same FD.

> 
> For the case I mentioned previously, I have to think more if the race
> condition is a bug or security issue.
> IIUC, two solutions are discussed so far:
> 1> the process could write to fs to update the script.  However, for
> execution, the process still uses the copy that passed the
> AT_EXECVE_CHECK. (snapshot solution by Andy Lutomirski)

Yes, the snapshot solution would be the best, but I guess it would rely
on filesystems to support this feature.

> or 2> the process blocks the write while opening the file as read only
> and executing the script. (this seems to be the approach of this
> patch).

Yes, and this is not something we want anymore.

> 
> I wonder if there are other ideas.

I don't see other efficient ways do give the same guarantees.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-22 19:45   ` Jann Horn
  2025-08-24 11:03     ` Mickaël Salaün
@ 2025-08-27 10:18     ` Aleksa Sarai
  1 sibling, 0 replies; 18+ messages in thread
From: Aleksa Sarai @ 2025-08-27 10:18 UTC (permalink / raw)
  To: Jann Horn
  Cc: Mickaël Salaün, Al Viro, Christian Brauner, Kees Cook,
	Paul Moore, Serge Hallyn, Andy Lutomirski, Arnd Bergmann,
	Christian Heimes, Dmitry Vyukov, Elliott Hughes, Fan Wu,
	Florian Weimer, Jeff Xu, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module, Andy Lutomirski, Jeff Xu

[-- Attachment #1: Type: text/plain, Size: 2558 bytes --]

On 2025-08-22, Jann Horn <jannh@google.com> wrote:
> On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > passed file descriptors).  This changes the state of the opened file by
> > making it read-only until it is closed.  The main use case is for script
> > interpreters to get the guarantee that script' content cannot be altered
> > while being read and interpreted.  This is useful for generic distros
> > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> >
> > Both execve(2) and the IOCTL to enable fsverity can already set this
> > property on files with deny_write_access().  This new O_DENY_WRITE make
> 
> The kernel actually tried to get rid of this behavior on execve() in
> commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> because it broke userspace assumptions.

Also the ETXTBSY behaviour for binaries is not always guaranteed to
block writes to the file. When we were discussing this back in 2021 and
when we initially removed it, I remember there being some fairly trivial
ways to get around it anyway (but because process mm is mapped with
MAP_PRIVATE, writes aren't seen by the actual process).

> > it widely available.  This is similar to what other OSs may provide
> > e.g., opening a file with only FILE_SHARE_READ on Windows.
> 
> We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> removed for security reasons; as
> https://man7.org/linux/man-pages/man2/mmap.2.html says:
> 
> |        MAP_DENYWRITE
> |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> |               signaled that attempts to write to the underlying file
> |               should fail with ETXTBSY.  But this was a source of denial-
> |               of-service attacks.)"
> 
> It seems to me that the same issue applies to your patch - it would
> allow unprivileged processes to essentially lock files such that other
> processes can't write to them anymore. This might allow unprivileged
> users to prevent root from updating config files or stuff like that if
> they're updated in-place.

Agreed, and this was one of the major issues with the also-now-removed
mandatory locking as well.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-22 17:07 ` [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE Mickaël Salaün
  2025-08-22 19:45   ` Jann Horn
@ 2025-08-27 10:29   ` Aleksa Sarai
  1 sibling, 0 replies; 18+ messages in thread
From: Aleksa Sarai @ 2025-08-27 10:29 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Al Viro, Christian Brauner, Kees Cook, Paul Moore, Serge Hallyn,
	Andy Lutomirski, Arnd Bergmann, Christian Heimes, Dmitry Vyukov,
	Elliott Hughes, Fan Wu, Florian Weimer, Jann Horn, Jeff Xu,
	Jonathan Corbet, Jordan R Abrahams, Lakshmi Ramasubramanian,
	Luca Boccassi, Matt Bobrowski, Miklos Szeredi, Mimi Zohar,
	Nicolas Bouchinet, Robert Waite, Roberto Sassu, Scott Shell,
	Steve Dower, Steve Grubb, kernel-hardening, linux-api,
	linux-fsdevel, linux-integrity, linux-kernel,
	linux-security-module, Andy Lutomirski, Jeff Xu

[-- Attachment #1: Type: text/plain, Size: 6799 bytes --]

On 2025-08-22, Mickaël Salaün <mic@digikod.net> wrote:
> Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> passed file descriptors).  This changes the state of the opened file by
> making it read-only until it is closed.  The main use case is for script
> interpreters to get the guarantee that script' content cannot be altered
> while being read and interpreted.  This is useful for generic distros
> that may not have a write-xor-execute policy.  See commit a5874fde3c08
> ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> 
> Both execve(2) and the IOCTL to enable fsverity can already set this
> property on files with deny_write_access().  This new O_DENY_WRITE make
> it widely available.  This is similar to what other OSs may provide
> e.g., opening a file with only FILE_SHARE_READ on Windows.
> 
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Andy Lutomirski <luto@amacapital.net>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Jeff Xu <jeffxu@chromium.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Paul Moore <paul@paul-moore.com>
> Cc: Serge Hallyn <serge@hallyn.com>
> Reported-by: Robert Waite <rowait@microsoft.com>
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> Link: https://lore.kernel.org/r/20250822170800.2116980-2-mic@digikod.net
> ---
>  fs/fcntl.c                       | 26 ++++++++++++++++++++++++--
>  fs/file_table.c                  |  2 ++
>  fs/namei.c                       |  6 ++++++
>  include/linux/fcntl.h            |  2 +-
>  include/uapi/asm-generic/fcntl.h |  4 ++++
>  5 files changed, 37 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 5598e4d57422..0c80c0fbc706 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -34,7 +34,8 @@
>  
>  #include "internal.h"
>  
> -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
> +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
> +	O_DENY_WRITE)
>  
>  static int setfl(int fd, struct file * filp, unsigned int arg)
>  {
> @@ -80,8 +81,29 @@ static int setfl(int fd, struct file * filp, unsigned int arg)
>  			error = 0;
>  	}
>  	spin_lock(&filp->f_lock);
> +
> +	if (arg & O_DENY_WRITE) {
> +		/* Only regular files. */
> +		if (!S_ISREG(inode->i_mode)) {
> +			error = -EINVAL;
> +			goto unlock;
> +		}
> +
> +		/* Only sets once. */
> +		if (!(filp->f_flags & O_DENY_WRITE)) {
> +			error = exe_file_deny_write_access(filp);
> +			if (error)
> +				goto unlock;
> +		}
> +	} else {
> +		if (filp->f_flags & O_DENY_WRITE)
> +			exe_file_allow_write_access(filp);
> +	}

I appreciate the goal of making this something that can be cleared
(presumably for interpreters that mmap(MAP_PRIVATE) their scripts), but
making a security-related flag this easy to clear seems like a footgun
(any library function could mask O_DENY_WRITE or forget to copy the old
flag values).

> +
>  	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
>  	filp->f_iocb_flags = iocb_flags(filp);
> +
> +unlock:
>  	spin_unlock(&filp->f_lock);
>  
>   out:
> @@ -1158,7 +1180,7 @@ static int __init fcntl_init(void)
>  	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
>  	 * is defined as O_NONBLOCK on some platforms and not on others.
>  	 */
> -	BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> +	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
>  		HWEIGHT32(
>  			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
>  			__FMODE_EXEC));
> diff --git a/fs/file_table.c b/fs/file_table.c
> index 81c72576e548..6ba896b6a53f 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -460,6 +460,8 @@ static void __fput(struct file *file)
>  	locks_remove_file(file);
>  
>  	security_file_release(file);
> +	if (unlikely(file->f_flags & O_DENY_WRITE))
> +		exe_file_allow_write_access(file);
>  	if (unlikely(file->f_flags & FASYNC)) {
>  		if (file->f_op->fasync)
>  			file->f_op->fasync(-1, file, 0);
> diff --git a/fs/namei.c b/fs/namei.c
> index cd43ff89fbaa..366530bf937d 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3885,6 +3885,12 @@ static int do_open(struct nameidata *nd,
>  	error = may_open(idmap, &nd->path, acc_mode, open_flag);
>  	if (!error && !(file->f_mode & FMODE_OPENED))
>  		error = vfs_open(&nd->path, file);
> +	if (!error && (open_flag & O_DENY_WRITE)) {
> +		if (S_ISREG(file_inode(file)->i_mode))
> +			error = exe_file_deny_write_access(file);
> +		else
> +			error = -EINVAL;
> +	}
>  	if (!error)
>  		error = security_file_post_open(file, op->acc_mode);
>  	if (!error && do_truncate)
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index a332e79b3207..dad14101686f 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -10,7 +10,7 @@
>  	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
>  	 O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
>  	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
> -	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
> +	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_DENY_WRITE)

I don't like this patch for the same reasons Christian has already said,
but in addition -- you cannot just add new open(2) flags like this.

Unlike openat2(2), classic open(2) does not verify invalid flag bits, so
any new flag must be designed so that old kernels will return an error
for that flag combination, which ensures that:

 * No old programs set those bits inadvertently, which lets us avoid
   breaking userspace in some really fun and hard-to-debug ways.
 * For security-related bits, that new programs running on old kernels
   do not think they are getting a security property that they aren't
   actually getting.

O_TMPFILE's bitflag soup is an example of how you can resolve this issue
for open(2), but I would suggest that authors of new O_* flags seriously
consider making their flags openat2(2)-only unless it's trivial to get
the above behaviour.

>  /* List of all valid flags for the how->resolve argument: */
>  #define VALID_RESOLVE_FLAGS \
> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> index 613475285643..facd9136f5af 100644
> --- a/include/uapi/asm-generic/fcntl.h
> +++ b/include/uapi/asm-generic/fcntl.h
> @@ -91,6 +91,10 @@
>  /* a horrid kludge trying to make sure that this will fail on old kernels */
>  #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
>  
> +#ifndef O_DENY_WRITE
> +#define O_DENY_WRITE	040000000
> +#endif
> +
>  #ifndef O_NDELAY
>  #define O_NDELAY	O_NONBLOCK
>  #endif

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE
  2025-08-27  8:19                 ` Mickaël Salaün
@ 2025-08-28 20:17                   ` Jeff Xu
  0 siblings, 0 replies; 18+ messages in thread
From: Jeff Xu @ 2025-08-28 20:17 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Jeff Xu, Andy Lutomirski, Jann Horn, Al Viro, Christian Brauner,
	Kees Cook, Paul Moore, Serge Hallyn, Andy Lutomirski,
	Arnd Bergmann, Christian Heimes, Dmitry Vyukov, Elliott Hughes,
	Fan Wu, Florian Weimer, Jonathan Corbet, Jordan R Abrahams,
	Lakshmi Ramasubramanian, Luca Boccassi, Matt Bobrowski,
	Miklos Szeredi, Mimi Zohar, Nicolas Bouchinet, Robert Waite,
	Roberto Sassu, Scott Shell, Steve Dower, Steve Grubb,
	kernel-hardening, linux-api, linux-fsdevel, linux-integrity,
	linux-kernel, linux-security-module

Hi Mickaël

On Wed, Aug 27, 2025 at 1:19 AM Mickaël Salaün <mic@digikod.net> wrote:
>
> On Tue, Aug 26, 2025 at 01:29:55PM -0700, Jeff Xu wrote:
> > Hi Mickaël
> >
> > On Tue, Aug 26, 2025 at 5:39 AM Mickaël Salaün <mic@digikod.net> wrote:
> > >
> > > On Mon, Aug 25, 2025 at 10:57:57AM -0700, Jeff Xu wrote:
> > > > Hi Mickaël
> > > >
> > > > On Mon, Aug 25, 2025 at 2:31 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > > >
> > > > > On Sun, Aug 24, 2025 at 11:04:03AM -0700, Andy Lutomirski wrote:
> > > > > > On Sun, Aug 24, 2025 at 4:03 AM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 22, 2025 at 09:45:32PM +0200, Jann Horn wrote:
> > > > > > > > On Fri, Aug 22, 2025 at 7:08 PM Mickaël Salaün <mic@digikod.net> wrote:
> > > > > > > > > Add a new O_DENY_WRITE flag usable at open time and on opened file (e.g.
> > > > > > > > > passed file descriptors).  This changes the state of the opened file by
> > > > > > > > > making it read-only until it is closed.  The main use case is for script
> > > > > > > > > interpreters to get the guarantee that script' content cannot be altered
> > > > > > > > > while being read and interpreted.  This is useful for generic distros
> > > > > > > > > that may not have a write-xor-execute policy.  See commit a5874fde3c08
> > > > > > > > > ("exec: Add a new AT_EXECVE_CHECK flag to execveat(2)")
> > > > > > > > >
> > > > > > > > > Both execve(2) and the IOCTL to enable fsverity can already set this
> > > > > > > > > property on files with deny_write_access().  This new O_DENY_WRITE make
> > > > > > > >
> > > > > > > > The kernel actually tried to get rid of this behavior on execve() in
> > > > > > > > commit 2a010c41285345da60cece35575b4e0af7e7bf44.; but sadly that had
> > > > > > > > to be reverted in commit 3b832035387ff508fdcf0fba66701afc78f79e3d
> > > > > > > > because it broke userspace assumptions.
> > > > > > >
> > > > > > > Oh, good to know.
> > > > > > >
> > > > > > > >
> > > > > > > > > it widely available.  This is similar to what other OSs may provide
> > > > > > > > > e.g., opening a file with only FILE_SHARE_READ on Windows.
> > > > > > > >
> > > > > > > > We used to have the analogous mmap() flag MAP_DENYWRITE, and that was
> > > > > > > > removed for security reasons; as
> > > > > > > > https://man7.org/linux/man-pages/man2/mmap.2.html says:
> > > > > > > >
> > > > > > > > |        MAP_DENYWRITE
> > > > > > > > |               This flag is ignored.  (Long ago—Linux 2.0 and earlier—it
> > > > > > > > |               signaled that attempts to write to the underlying file
> > > > > > > > |               should fail with ETXTBSY.  But this was a source of denial-
> > > > > > > > |               of-service attacks.)"
> > > > > > > >
> > > > > > > > It seems to me that the same issue applies to your patch - it would
> > > > > > > > allow unprivileged processes to essentially lock files such that other
> > > > > > > > processes can't write to them anymore. This might allow unprivileged
> > > > > > > > users to prevent root from updating config files or stuff like that if
> > > > > > > > they're updated in-place.
> > > > > > >
> > > > > > > Yes, I agree, but since it is the case for executed files I though it
> > > > > > > was worth starting a discussion on this topic.  This new flag could be
> > > > > > > restricted to executable files, but we should avoid system-wide locks
> > > > > > > like this.  I'm not sure how Windows handle these issues though.
> > > > > > >
> > > > > > > Anyway, we should rely on the access control policy to control write and
> > > > > > > execute access in a consistent way (e.g. write-xor-execute).  Thanks for
> > > > > > > the references and the background!
> > > > > >
> > > > > > I'm confused.  I understand that there are many contexts in which one
> > > > > > would want to prevent execution of unapproved content, which might
> > > > > > include preventing a given process from modifying some code and then
> > > > > > executing it.
> > > > > >
> > > > > > I don't understand what these deny-write features have to do with it.
> > > > > > These features merely prevent someone from modifying code *that is
> > > > > > currently in use*, which is not at all the same thing as preventing
> > > > > > modifying code that might get executed -- one can often modify
> > > > > > contents *before* executing those contents.
> > > > >
> > > > > The order of checks would be:
> > > > > 1. open script with O_DENY_WRITE
> > > > > 2. check executability with AT_EXECVE_CHECK
> > > > > 3. read the content and interpret it
> > > > >
> > > > I'm not sure about the O_DENY_WRITE approach, but the problem is worth solving.
> > > >
> > > > AT_EXECVE_CHECK is not just for scripting languages. It could also
> > > > work with bytecodes like Java, for example. If we let the Java runtime
> > > > call AT_EXECVE_CHECK before loading the bytecode, the LSM could
> > > > develop a policy based on that.
> > >
> > > Sure, I'm using "script" to make it simple, but this applies to other
> > > use cases.
> > >
> > That makes sense.
> >
> > > >
> > > > > The deny-write feature was to guarantee that there is no race condition
> > > > > between step 2 and 3.  All these checks are supposed to be done by a
> > > > > trusted interpreter (which is allowed to be executed).  The
> > > > > AT_EXECVE_CHECK call enables the caller to know if the kernel (and
> > > > > associated security policies) allowed the *current* content of the file
> > > > > to be executed.  Whatever happen before or after that (wrt.
> > > > > O_DENY_WRITE) should be covered by the security policy.
> > > > >
> > > > Agree, the race problem needs to be solved in order for AT_EXECVE_CHECK.
> > > >
> > > > Enforcing non-write for the path that stores scripts or bytecodes can
> > > > be challenging due to historical or backward compatibility reasons.
> > > > Since AT_EXECVE_CHECK provides a mechanism to check the file right
> > > > before it is used, we can assume it will detect any "problem" that
> > > > happened before that, (e.g. the file was overwritten). However, that
> > > > also imposes two additional requirements:
> > > > 1> the file doesn't change while AT_EXECVE_CHECK does the check.
> > >
> > > This is already the case, so any kind of LSM checks are good.
> > >
> > May I ask how this is done? some code in do_open_execat() does this ?
> > Apologies if this is a basic question.
>
> do_open_execat() calls exe_file_deny_write_access()
>
Thanks for pointing.
With that, now I read the full history of discussion regarding this :-)

> >
> > > > 2>The file content kept by the process remains unchanged after passing
> > > > the AT_EXECVE_CHECK.
> > >
> > > The goal of this patch was to avoid such race condition in the case
> > > where executable files can be updated.  But in most cases it should not
> > > be a security issue (because processes allowed to write to executable
> > > files should be trusted), but this could still lead to bugs (because of
> > > inconsistent file content, half-updated).
> > >
> > There is also a time gap between:
> > a> the time of AT_EXECVE_CHECK
> > b> the time that the app opens the file for execution.
> > right ? another potential attack path (though this is not the case I
> > mentioned previously).
>
> As explained in the documentation, to avoid this specific race
> condition, interpreters should open the script once, check the FD with
> AT_EXECVE_CHECK, and then read the content with the same FD.
>
Ya, now I see that in the description of this patch, sorry that I
missed that previously.

> >
> > For the case I mentioned previously, I have to think more if the race
> > condition is a bug or security issue.
> > IIUC, two solutions are discussed so far:
> > 1> the process could write to fs to update the script.  However, for
> > execution, the process still uses the copy that passed the
> > AT_EXECVE_CHECK. (snapshot solution by Andy Lutomirski)
>
> Yes, the snapshot solution would be the best, but I guess it would rely
> on filesystems to support this feature.
>
snapshot seems to be the reasonable direction to go

Is this something related to the VMA ? e.g. preserve the in-memory
copy of the file when the file on fs was updated.

According to man mmap:
       MAP_PRIVATE
              Create a private copy-on-write mapping.  Updates to the
              mapping are not visible to other processes mapping the same
              file, and are not carried through to the underlying file.
              It is unspecified whether changes made to the file after
              the mmap() call are visible in the mapped region.

so the direction here is
the process -> update the vma -> doesn't carry to the file.

What we want is the reverse direction: (the unspecified part in the man page)
file updated on fs -> doesn't carry to the vma of this process.

> > or 2> the process blocks the write while opening the file as read only
> > and executing the script. (this seems to be the approach of this
> > patch).
>
> Yes, and this is not something we want anymore.
>
right. Thank you for clarifying this.

> >
> > I wonder if there are other ideas.
>
> I don't see other efficient ways to give the same guarantees.
right, me neither.

Thanks and regards,
-Jeff

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2025-08-28 20:17 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-25 21:56 [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE Andy Lutomirski
2025-08-25 23:06 ` Jeff Xu
  -- strict thread matches above, loose matches on Subject: below --
2025-08-22 17:07 [RFC PATCH v1 0/2] Add O_DENY_WRITE (complement AT_EXECVE_CHECK) Mickaël Salaün
2025-08-22 17:07 ` [RFC PATCH v1 1/2] fs: Add O_DENY_WRITE Mickaël Salaün
2025-08-22 19:45   ` Jann Horn
2025-08-24 11:03     ` Mickaël Salaün
2025-08-24 18:04       ` Andy Lutomirski
2025-08-25  9:31         ` Mickaël Salaün
2025-08-25  9:39           ` Florian Weimer
2025-08-26 12:35             ` Mickaël Salaün
2025-08-25 16:43           ` Andy Lutomirski
2025-08-25 18:10             ` Jeff Xu
2025-08-25 17:57           ` Jeff Xu
2025-08-26 12:39             ` Mickaël Salaün
2025-08-26 20:29               ` Jeff Xu
2025-08-27  8:19                 ` Mickaël Salaün
2025-08-28 20:17                   ` Jeff Xu
2025-08-27 10:18     ` Aleksa Sarai
2025-08-27 10:29   ` Aleksa Sarai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).