From: Christian Brauner <brauner@kernel.org>
To: linux-fsdevel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>, Jan Kara <jack@suse.cz>,
Jeff Layton <jlayton@kernel.org>,
Amir Goldstein <amir73il@gmail.com>,
Josef Bacik <josef@toxicpanda.com>,
Aleksa Sarai <cyphar@cyphar.com>,
Christian Brauner <brauner@kernel.org>
Subject: [PATCH 3/7] mount: add FSMOUNT_NAMESPACE
Date: Thu, 22 Jan 2026 11:48:48 +0100 [thread overview]
Message-ID: <20260122-work-fsmount-namespace-v1-3-5ef0a886e646@kernel.org> (raw)
In-Reply-To: <20260122-work-fsmount-namespace-v1-0-5ef0a886e646@kernel.org>
Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
namespace with the newly created filesystem attached to a copy of the
real rootfs. This returns a namespace file descriptor instead of an
O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for open_tree().
This allows creating a new filesystem and immediately placing it in a
new mount namespace in a single operation, which is useful for container
runtimes and other namespace-based isolation mechanisms.
The rootfs mount is created before copying the real rootfs for the new
namespace, meaning that the mount namespace id for the mount of the root
of the namespace is bigger than that of the child mounted on top of it.
We've never explicitly given a guarantee for such ordering and I doubt
anyone relies on it. Accepting that lets us avoid copying the mount
again and also avoids having to massage may_copy_tree() to grant an
exception for fsmount->mnt->mnt_ns being NULL.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/namespace.c | 53 +++++++++++++++++++++++++++++++++-------------
include/uapi/linux/mount.h | 1 +
2 files changed, 39 insertions(+), 15 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 46d2eb1c9c3d..30f2991b4a7f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3068,8 +3068,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
+enum open_newns_flags_t {
+ OPEN_NEWNS_RECURSIVE = BIT(0),
+ OPEN_NEWNS_CLONE = BIT(1),
+};
+
static struct mnt_namespace *create_new_namespace(struct path *path,
- bool recurse)
+ enum open_newns_flags_t flags)
{
struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
struct path to_path __free(path_put) = {};
@@ -3080,6 +3085,9 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
unsigned int copy_flags = 0;
bool locked = false;
+ if ((flags & (OPEN_NEWNS_RECURSIVE | OPEN_NEWNS_CLONE)) == OPEN_NEWNS_RECURSIVE)
+ return ERR_PTR(-EINVAL);
+
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
@@ -3122,14 +3130,18 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
if (unlikely(IS_ERR(mp.parent)))
return ERR_CAST(mp.parent);
- /*
- * We don't emulate unshare()ing a mount namespace. We stick to the
- * restrictions of creating detached bind-mounts. It has a lot
- * saner and simpler semantics.
- */
- mnt = __do_loopback(path, recurse, copy_flags);
- if (IS_ERR(mnt))
- return ERR_CAST(mnt);
+ if (flags & OPEN_NEWNS_CLONE) {
+ /*
+ * We don't emulate unshare()ing a mount namespace. We stick to
+ * the restrictions of creating detached bind-mounts. It has a
+ * lot saner and simpler semantics.
+ */
+ mnt = __do_loopback(path, flags & OPEN_NEWNS_RECURSIVE, copy_flags);
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+ } else {
+ mnt = real_mount(mntget(path->mnt));
+ }
scoped_guard(mount_writer) {
if (locked)
@@ -3154,11 +3166,12 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
return no_free_ptr(new_ns);
}
-static struct file *open_new_namespace(struct path *path, bool recurse)
+static struct file *open_new_namespace(struct path *path,
+ enum open_newns_flags_t flags)
{
struct mnt_namespace *new_ns;
- new_ns = create_new_namespace(path, recurse);
+ new_ns = create_new_namespace(path, flags);
if (IS_ERR(new_ns))
return ERR_CAST(new_ns);
return open_namespace_file(to_ns_common(new_ns));
@@ -3208,7 +3221,9 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
return ERR_PTR(ret);
if (flags & OPEN_TREE_NAMESPACE)
- return open_new_namespace(&path, (flags & AT_RECURSIVE));
+ return open_new_namespace(&path,
+ ((flags & AT_RECURSIVE) ? OPEN_NEWNS_RECURSIVE : 0) |
+ OPEN_NEWNS_CLONE);
if (flags & OPEN_TREE_CLONE)
return open_detached_copy(&path, flags);
@@ -4395,11 +4410,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
unsigned int mnt_flags = 0;
long ret;
- if (!may_mount())
+ if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
+ return -EINVAL;
+
+ if ((flags & FSMOUNT_NAMESPACE) &&
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
- if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
- return -EINVAL;
+ if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
+ return -EPERM;
if (attr_flags & ~FSMOUNT_VALID_FLAGS)
return -EINVAL;
@@ -4466,6 +4485,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
*/
vfs_clean_context(fc);
+ if (flags & FSMOUNT_NAMESPACE)
+ return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
+ open_new_namespace(&new_path, 0));
+
ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
if (IS_ERR(ns))
return PTR_ERR(ns);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d9d86598d100..2204708dbf7a 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -110,6 +110,7 @@ enum fsconfig_command {
* fsmount() flags.
*/
#define FSMOUNT_CLOEXEC 0x00000001
+#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */
/*
* Mount attributes.
--
2.47.3
next prev parent reply other threads:[~2026-01-22 10:49 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-22 10:48 [PATCH 0/7] fsmount: add FSMOUNT_NAMESPACE Christian Brauner
2026-01-22 10:48 ` [PATCH 1/7] mount: start iterating from start of rbtree Christian Brauner
2026-01-22 10:48 ` [PATCH 2/7] mount: simplify __do_loopback() Christian Brauner
2026-01-22 10:48 ` Christian Brauner [this message]
2026-02-11 11:47 ` [PATCH 3/7] mount: add FSMOUNT_NAMESPACE Mark Brown
2026-02-11 12:13 ` Christian Brauner
2026-03-18 20:16 ` Mark Brown
2026-03-20 13:40 ` Christian Brauner
2026-03-20 14:04 ` Mark Brown
2026-01-22 10:48 ` [PATCH 4/7] tools: update mount.h header Christian Brauner
2026-01-22 10:48 ` [PATCH 5/7] selftests/statmount: add statmount_alloc() helper Christian Brauner
2026-01-22 10:48 ` [PATCH 6/7] selftests: add FSMOUNT_NAMESPACE tests Christian Brauner
2026-01-22 10:48 ` [PATCH 7/7] selftests/open_tree_ns: fix compilation Christian Brauner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260122-work-fsmount-namespace-v1-3-5ef0a886e646@kernel.org \
--to=brauner@kernel.org \
--cc=amir73il@gmail.com \
--cc=cyphar@cyphar.com \
--cc=jack@suse.cz \
--cc=jlayton@kernel.org \
--cc=josef@toxicpanda.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox